Files
heartbeat/hbd
T
2012-09-22 17:51:16 +00:00

731 lines
14 KiB
Python
Executable File

#!/usr/bin/env python
# $Id: hbd,v 1.32 2012/09/22 17:51:16 andreas Exp $
# Wait for heartbeat messages and act on them (or their absence)
#
import time, os, string, sys, socket, atexit, select, SocketServer, getopt, signal, cPickle, smtplib, traceback
# Boolean aliases (presumably kept for interpreters that predate the
# built-in True/False -- TODO confirm they are still needed).
False=0
True=1
# Default log file; the "logfile" config key overrides it.
LOGFILE="/home/andreas/public_html/messages/andreas"
# State snapshot written by saveandrestart() and reloaded at startup.
PICKFILE="/tmp/hbd.pick"
# Recipients of the notification mails sent by email().
AEMAIL=["andreas@wrede.ca"]
# Daemon name used in the subject line of notification mails.
# NOTE(review): "heatbeat" looks like a typo for "heartbeat" -- confirm
# before changing it, mail filters may match on the current spelling.
NAME="heatbeat"
# SMTP server used by email().
SMTPSERVER="localhost"
# Short host name -> Host instance.
hosts={}
# Address -> short host name.
htab={}
# Formatted log lines, oldest first (shown in the curses and HTML views).
msgs=[]
# Display index handed to the next Host created; Host.__init__ increments it.
num=0
# NOTE(review): not referenced by the visible code (Host has its own upcount).
upcount=0
# Default UDP port on which heartbeats are received ("hb_port" config key).
PORT=50003
# Default TCP port of the HTML status server ("hbd_port" config key).
TPORT=50004
# Default bind host of the HTML status server ("hbd_host" config key).
THOST=""
# Diagnostic switches; verbose is set by -v, both can be toggled from the
# curses UI.
DEBUG=False
verbose=False
# Default values for the "interval" and "grace" config keys.
INTERVAL=10
GRACE=2
# Non-zero when the curses display is enabled (-d).
visual=0
# Time zone used by the localtime()-based timestamps.
# NOTE(review): time.tzset() is not called here -- confirm the setting takes
# effect on the target platform.
os.environ['TZ']='Canada/Eastern'
# curses objects, created by initcurses()/initwin() when visual is set.
stdscr=None
win=None
msgw=None
msgwB=None
# Height of the message window; recomputed by initwin().
msgwHeight=10
def handler(signum, frame):
    """Signal handler: record the signal and ask the main loop to stop.

    Stores *signum* in the global ``sig`` and clears the global ``up``
    flag.  When ``up`` is already 0 the call is ignored, so only the first
    signal is remembered.  *frame* is unused.
    """
    global up, sig
    if up != 0:
        sig = signum
        if verbose:
            print("signal: %s up: %d" % (sig, up))
        up = 0
def shortname(name):
    """Return the part of *name* before its first '.'.

    A name without a dot is returned unchanged; an empty string yields an
    empty string.  Uses the ``str.split`` method (as the rest of the file
    does) instead of the deprecated ``string.split`` module function.
    """
    return name.split('.')[0]
class NullDevice:
    """Write-only sink installed as sys.stdout/sys.stderr after daemonizing."""

    def write(self, s):
        """Discard *s*."""
        pass

    def flush(self):
        """Do nothing; present so code that flushes its output stream works."""
        pass
class Host:
    """State record for one monitored host.

    Attributes set by __init__:
      name       short host name (see shortname())
      addr       address the host was last seen from
      num        display index, taken from the global ``num`` counter
      lastbeat   time.time() of the most recent heartbeat
      upcount    counter, starts at 0
      state      one of Host.up / Host.down / Host.overdue
      statetime  time at which ``state`` was entered
      interval   heartbeat interval announced by the host (0 = unknown)
      doesack    "acks" value reported by the host ("" = none)
    """
    # Possible values of ``state``.
    up="up"
    down="down"
    overdue="overdue"

    def __init__(self, name, addr):
        """Create a host in state ``up`` and take the next display index."""
        global num
        self.name = shortname(name)
        self.addr = addr
        self.num = num
        self.lastbeat = time.time()
        self.upcount = 0
        # Assigned once; the original set it twice with the same value.
        self.state = Host.up
        self.statetime = self.lastbeat
        self.interval = 0
        self.doesack = ""
        num += 1

    def getstate(self):
        """Return the current state string."""
        return self.state

    def dispstate(self):
        """Return the state as an HTML fragment.

        ``down`` and ``overdue`` are wrapped in <b>; a non-empty
        ``doesack`` is appended in parentheses.
        """
        if self.state in (Host.down, Host.overdue):
            state = "<b>%s</b>" % self.state
        else:
            state = "%s" % self.state
        if self.doesack != "":
            return "%s(%s)" % (state, self.doesack)
        return state

    # set new state, return number of secs in previous state
    def newstate(self, state, when=0):
        """Enter *state* and return the seconds spent in the previous one.

        *when* is subtracted from the current time, i.e. it back-dates the
        transition by that many seconds.  When the curses display is
        active the state-time column of this host is redrawn.
        """
        self.state = state
        stamp = time.time() - when
        elapsed = stamp - self.statetime
        self.statetime = stamp
        if visual:
            displaystatetime(self.name)
        return elapsed
def email(s, msg):
    """Mail a notification to the AEMAIL recipients via SMTPSERVER.

    s   -- short event description, appended to the subject line
    msg -- message body

    Returns "OK", or "Fail" when the server refused every recipient (the
    refusal is logged).  Any other error while sending prints a traceback
    and calls saveandrestart(), which re-executes the daemon.
    """
    ret = "OK"
    toaddrs = AEMAIL
    # Named "sender" so the module-level fromaddr() function is not shadowed.
    sender = "aew.heartbeat@wrede.ca"
    subj = "Info from %s: %s" % (NAME, s)
    date = time.strftime("%a, %d %b %Y %H:%M:%S %z", time.localtime())
    # List every recipient in the To: header, not only the first one.
    body = "To: %s\nFrom: %s\nSubject: %s\nDate: %s\n\n%s" % (
        ", ".join(toaddrs), sender, subj, date, msg)
    # Stays None when the connection itself fails, so quit() is skipped.
    server = None
    try:
        server = smtplib.SMTP(SMTPSERVER)
        if DEBUG:
            server.set_debuglevel(1)
        server.sendmail(sender, toaddrs, body)
    except smtplib.SMTPRecipientsRefused:
        errs = sys.exc_info()[1]
        log("cannot send email: %s\n" % (errs))
        ret = "Fail"
    except Exception:
        # SystemExit and KeyboardInterrupt are deliberately not caught here.
        print("smtp error: "+traceback.format_exc())
        saveandrestart()
    if server is not None:
        try:
            server.quit()
        except Exception:
            # Best effort: the mail has already been handed over or failed.
            pass
    return ret
#
#
def dur(sec):
    """Format a duration in seconds as "h:mm:ss", "m:ss" or "0:ss".

    *sec* is truncated to an integer first.  divmod() makes the integer
    arithmetic explicit, so the result does not depend on how the "/"
    operator divides integers.
    """
    sec = int(sec)
    h, rest = divmod(sec, 3600)
    m, s = divmod(rest, 60)
    if h > 0:
        return "%d:%02d:%02d" % (h, m, s)
    if m > 0:
        return "%d:%02d" % (m, s)
    return "0:%02d" % s
#
#
#
def addhost(name, addr):
    """Register host *name* at *addr*, or record its new address.

    For a host that is already known, the address is updated in ``hosts``
    and ``htab``, the change is logged, and a mail is sent when *name* is
    listed in ``watchhosts``.  For a new host, a Host is created and all
    hosts are renumbered in alphabetical order so the display stays sorted.
    """
    sname = shortname(name)
    if sname in hosts:
        host = hosts[sname]
        # The old address may be missing from htab; fromaddr() guards the
        # same deletion, so do not raise KeyError here either.
        htab.pop(host.addr, None)
        host.addr = addr
        if visual:
            displayaddr(sname)
        htab[addr] = sname
        m = "%s, changed address to %s" % (sname, addr)
        log(m)
        if name in watchhosts:
            email("address change", m)
    else:
        hosts[sname] = Host(sname, addr)
        # Renumber so Host.num follows the alphabetical order of the names.
        for x, n in enumerate(sorted(hosts.keys())):
            hosts[n].num = x
        htab[addr] = sname
        if visual:
            display()
#
#
#
def on_exit():
    """atexit hook: leave curses mode if active, close the log, print "exit"."""
    if visual:
        exitcurses()
    logf.close()
    print("exit")
def initlog(logfile):
    """Open *logfile* for appending and return the file object."""
    handle = open(logfile, "a")
    return handle
#
#
#
def initwin():
    """(Re)create the curses windows for the host table and the messages.

    Sets the globals ``win`` (host table, len(htab)+2 rows starting at
    screen row 2), ``msgwB`` (border frame of the message area), ``msgw``
    (scrolling message window inside that frame) and ``msgwHeight``, then
    draws the title line.
    """
    global win, msgw, msgwB, msgwHeight
    rows, cols = stdscr.getmaxyx()
    left = 0
    top = 2
    height = len(htab) + 2
    if DEBUG:
        log("initwin called with %d" % height)

    # Host table; its bottom corners are tee characters.
    win = curses.newwin(height, cols, top, left)
    win.border(0, 0, 0, 0, 0, 0, curses.ACS_LTEE, curses.ACS_RTEE)

    # Frame around the message area; its top corners are tee characters.
    msgwB = curses.newwin(0, 0, height + 1, left)
    msgwB.border(0, 0, 0, 0, curses.ACS_LTEE, curses.ACS_RTEE)

    # Scrolling message window inside the frame.
    msgwHeight = rows - height - 3
    msgw = curses.newwin(msgwHeight, cols - 2, height + 2, left + 1)
    msgw.setscrreg(0, msgwHeight - 1)
    msgw.scrollok(1)

    stdscr.addstr(0, 0, "hbd Version 1.0", curses.A_BOLD)
    stdscr.refresh()
    msgwB.refresh()
#
#
#
def checkoverdue():
    """Mark hosts whose heartbeat is late as overdue.

    A host in state ``up`` becomes ``overdue`` when more than its
    interval plus the grace period has passed since its last beat.  The
    grace period is ``grace`` when ``reportstrict`` is set and 5*grace
    otherwise.  The transition is logged, mailed for watched hosts, and
    back-dated by the grace period.
    """
    if reportstrict:
        gr = grace
    else:
        gr = 5 * grace
    for h, host in hosts.items():
        if host.state == Host.down:
            continue
        timeout = host.interval + gr
        late = now - host.lastbeat > timeout
        if host.state == Host.up and late:
            m = "%s is overdue" % h
            log(m)
            if h in watchhosts:
                email("overdue", m)
            host.newstate(Host.overdue, gr)
#
#
#
#
def displaytime():
    """Redraw the clock and the per-host state column.

    The column shows the state string or, in verbose mode for hosts that
    are not down, the time since the last heartbeat.  Overdue hosts are
    drawn in bold.
    """
    maxY, maxX = stdscr.getmaxyx()
    clock = time.strftime("%H:%M:%S", time.localtime(now))
    stdscr.addstr(0, maxX-8, clock, curses.A_BOLD)
    for host in hosts.values():
        if verbose and host.state != Host.down:
            cell = dur(now - host.lastbeat)
        else:
            cell = host.getstate()
        attr = 0
        if host.state == Host.overdue:
            attr = curses.A_BOLD
        win.addstr(host.num+1, 25, "%8s" % cell, attr)
    win.refresh()
    stdscr.refresh()
#
#
#
def displaystatetime(h, refresh=1):
    """Draw the last state-change time of host *h* in column 60 of its row.

    The host window is refreshed afterwards unless *refresh* is false.
    """
    host = hosts[h]
    stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(host.statetime))
    win.addstr(host.num+1, 60, "%-17s" % stamp)
    if refresh:
        win.refresh()
#
#
#
def displayaddr(h, refresh=1):
    """Draw the address of host *h* in column 35 of its row.

    The host window is refreshed afterwards unless *refresh* is false.
    """
    host = hosts[h]
    win.addstr(host.num+1, 35, "%-16s" % host.addr)
    if refresh:
        win.refresh()
#
#
#
def displaybody():
    """Draw name, address and state-change time of every host, then refresh."""
    for name, host in hosts.items():
        win.addstr(host.num+1, 1, "%-25s" % (name))
        if host.addr is not None:
            displayaddr(name, 0)
        if host.statetime is not None:
            displaystatetime(name, 0)
    win.refresh()
#
#
#
def displaymsgs():
    """Write the most recent messages into the message window, one per row."""
    # Start msgwHeight entries before the end (the whole list if shorter).
    recent = msgs[len(msgs)-msgwHeight:]
    for row, text in enumerate(recent):
        msgw.addstr(row, 0, text)
    msgw.refresh()
#
#
#
def display():
    """Rebuild the whole curses screen; does nothing unless ``visual`` is set."""
    if not visual:
        return
    initwin()
    displaytime()
    displaybody()
    displaymsgs()
def log(m, service="heartbeat"):
    """Record message *m* in memory, in the log file and on screen.

    The timestamped line is appended to ``msgs``.  The log file receives
    either that line or, when ``logfmt`` is "msg", a record of the form
    "<now>|<service>|<m>".  When the curses message window exists the
    line is shown there as well.
    """
    stamp = time.strftime("%b %d %H:%M:%S", time.localtime(time.time()))
    msg = stamp + ": " + m + "\n"
    # NOTE(review): msgs is never trimmed here, so it grows for the
    # lifetime of the process.
    msgs.append(msg)
    if logfmt == "msg":
        record = "%d|%s|%s\n" % (now, service, m)
    else:
        record = msg
    logf.write(record)
    logf.flush()
    if msgw is not None:
        msgw.addstr(msg)
        msgw.clrtoeol()
        msgw.refresh()
#
#
def fromaddr(name, addr, boot, interval, acks):
    """Account for a heartbeat received from host *name* at *addr*.

    name     -- short host name; the host is registered if unknown
    addr     -- sender address; an address change is logged and recorded
    boot     -- unused here
    interval -- announced heartbeat interval; a host that is not ``up``
                only returns to ``up`` when this is greater than 0
    acks     -- "acks" value of the message, stored on the host
    """
    if name not in hosts:
        addhost(name, addr)
    host = hosts[name]
    host.doesack = acks
    if host.addr != addr:
        log("%s changed address to %s" % (host.name, addr))
        if host.addr in htab:
            del htab[host.addr]
        host.addr = addr
        htab[addr] = name
    host.lastbeat = now
    if host.getstate() != Host.up and interval > 0:
        lasts = host.state
        d = host.newstate(Host.up)
        m = "%s, back after being %s for %s" % (host.name, lasts, dur(d))
        log(m)
        if name in watchhosts:
            # Mail the full message like every other notification does;
            # the original passed only the host name as the body.
            email("back", m)
    # NOTE(review): the nesting of this increment was lost with the file's
    # indentation; it is counted per heartbeat here.  upcount is not read
    # anywhere in the visible code -- confirm the intended meaning.
    host.upcount += 1
#
#
#
def _parse_heartbeat(data, now):
    """Parse one "key=value;key=value" heartbeat datagram into a dict.

    Returns a dict with the keys boot, shutdown (occurrence counts),
    name, service, msg, interval, deltaT and acks, pre-filled with the
    defaults used when a key is absent.  A pair without "=" gets the
    value "0".  Unparsable numbers are ignored instead of raising.
    """
    fields = {
        "boot": 0,
        "shutdown": 0,
        "name": "unknown",
        "service": "unknown",
        "msg": None,
        "interval": 0,
        "deltaT": 0.0,
        "acks": "",
    }
    for pair in data.split(';'):
        # Split on the first "=" only, so values may contain "=".
        parts = pair.split("=", 1)
        key = parts[0]
        if len(parts) == 2:
            val = parts[1]
        else:
            val = "0"
        if key in ('boot', 'shutdown'):
            fields[key] += 1
        elif key == 'interval':
            try:
                fields['interval'] = int(val)
            except ValueError:
                # Data comes from the network: a malformed interval must
                # not take the daemon down.
                pass
        elif key == 'name':
            fields['name'] = shortname(val)
        elif key in ('msg', 'service', 'acks'):
            fields[key] = val
        elif key == 'time':
            try:
                fields['deltaT'] = now - float(val)
            except ValueError:
                pass
    return fields


def readsock():
    """Receive one heartbeat datagram, acknowledge it and act on it.

    Logs (and mails, for watched hosts) boot, message and shutdown
    events, updates the sender's record through fromaddr(), marks the
    host down on shutdown and stores an announced interval.
    """
    data, addr = sock.recvfrom(1024)
    sock.sendto("ACK", addr)
    hb = _parse_heartbeat(data, now)
    name = hb["name"]
    interval = hb["interval"]
    acks = hb["acks"]
    watched = name in watchhosts

    if hb["boot"]:
        if acks != "":
            a = "(%s)" % acks
        else:
            a = ""
        m = "%s booted, deltaT %0.2g sec %s" % (name, hb["deltaT"], a)
        log(m)
        if watched:
            email("booted", m)
    if hb["msg"]:
        m = "%s msg: %s" % (name, hb["msg"])
        log(m, service=hb["service"])
        if watched:
            email("msg", m)

    fromaddr(name, addr[0], hb["boot"], interval, acks)

    if hb["shutdown"]:
        m = "%s shutdown" % name
        log(m)
        if watched:
            email("shutdown", m)
        try:
            hosts[name].newstate(Host.down)
        except Exception:
            # Best effort, as in the original: a missing host or a
            # display error must not abort packet handling.
            pass
    if interval > 0:
        try:
            hosts[name].interval = interval
        except KeyError:
            pass
#
#
#
def initcurses():
    """Start curses and store the root window in the global ``stdscr``.

    Echo is turned off, cbreak mode is turned on and keypad handling is
    enabled for the root window.
    """
    global stdscr
    stdscr = curses.initscr()
    curses.noecho()
    curses.cbreak()
    stdscr.keypad(1)
    if DEBUG:
        sys.stderr.write("curses init done: %s\n" % stdscr)
def exitcurses():
    """Undo the terminal settings made by initcurses() and end curses mode."""
    curses.nocbreak()
    stdscr.keypad(0)
    curses.echo()
    curses.endwin()
#
#
#
class HtmlHandler(SocketServer.BaseRequestHandler):
    """Minimal HTTP handler: "GET /" returns the status page, anything else 404."""

    def _esc(self, text):
        """Return *text* as a string with &, < and > escaped for HTML."""
        return str(text).replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")

    def _error_page(self, code, cause, uri):
        """Return the lines of the error document for *uri*."""
        return [
            '<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">',
            '<html><head>',
            '<meta http-equiv=Refresh content=60>',
            '<title>%s %s</title>' % (code, cause),
            '</head><body>',
            '<h1>%s</h1>' % (cause),
            # The URI is client-supplied: escape it before echoing it back.
            '<p>The requested URL %s was not found on this server.</p>' % self._esc(uri),
            '<hr>',
            '<address>hbd (Unix) Server at %s Port %d</address>' % (hbd_host, hbd_port),
            '</body></html>',
        ]

    def _status_page(self):
        """Return the lines of the status document (host table + last 30 log lines)."""
        res = []
        res.append('<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">')
        res.append("<html>")
        res.append("<head>")
        res.append("<meta http-equiv=Refresh content=%d>\n" % 60)
        res.append("</head>")
        res.append('<body BGCOLOR="#FFFFFF" LINK="#008000" VLINK="#008000" BACKGROUND="/~andreas/images/tile.marble.gif">')
        res.append("<H2>Heartbeat status</h2><h4> %s (%s)</H4>" % (time.strftime("%H:%M:%S", time.localtime(now)), os.environ.get('TZ','Europe/Berlin')))
        res.append("<table>")
        res.append("<tr><th>Host</th><th>State</th><th>IP Addr</th><th>Last change</th></tr>\n" )
        for h in sorted(hosts.keys()):
            host = hosts[h]
            changed = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(host.statetime))
            # Host names and addresses originate from network packets, so
            # they are escaped.  dispstate() returns markup and is passed
            # through as is.
            # NOTE(review): the "acks" text inside dispstate() is therefore
            # still unescaped.
            res.append("<tr><td>%-24s</td><td>%-7s</td><td>%-16s</td><td>%-17s</td></tr>\n" % (self._esc(h), host.dispstate(), self._esc(host.addr), changed))
        res.append("</table>")
        res.append("<h4>Log of Events</h4>")
        for m in msgs[len(msgs)-30:]:
            # Log lines can contain text sent by remote hosts.
            res.append("%s<BR>" % self._esc(m))
        res.append("</body></html>")
        return res

    def handle(self):
        """Read the request headers and send the response.

        Only the URI of a GET request line is looked at.  The response is
        sent with a single sendall() call; a client that has gone away is
        ignored.
        """
        f = self.request.makefile()
        uri = '/unknown'
        try:
            while 1:
                line = f.readline().strip()
                if len(line) == 0:
                    # Blank line (end of headers) or EOF.
                    break
                r = line.split()
                # Tolerate short request lines such as "GET /" instead of
                # failing with IndexError.
                if r[0] == "GET" and len(r) > 1:
                    uri = r[1]
        finally:
            f.close()

        if uri != "/":
            code = 404
            cause = "Not Found"
            res = self._error_page(code, cause, uri)
        else:
            code = 200
            cause = "OK"
            res = self._status_page()

        stamp = time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime(now))
        head = [
            "HTTP/1.0 %s %s" % (code, cause),
            "Date: %s" % stamp,
            "Server: hbd",
            "Last-Modified: %s" % stamp,
            "Accept-Ranges: bytes",
            "Connection: close",
            "Content-Type: text/html; charset=ISO-8859-1",
        ]
        try:
            # sendall() retries until everything is written; plain send()
            # may transmit only part of the data.
            self.request.sendall("\r\n".join(head) + "\r\n\r\n" + "\n".join(res))
        except socket.error:
            pass
def saveandrestart():
    """Persist the runtime state to PICKFILE and re-execute the program.

    Closes the heartbeat socket and the status server socket, pickles
    ``hosts``, ``htab`` and ``msgs`` (in that order, through one
    Pickler), then replaces the process via os.execv with the saved
    command-line options.  Does not return when the exec succeeds.
    """
    sock.close()
    serv.socket.close()
    # NOTE(review): PICKFILE is a fixed path that is unpickled again at
    # startup; unpickling a file other users could write is unsafe --
    # confirm the location is trusted.
    pickf = open(PICKFILE, 'w')
    pick = cPickle.Pickler(pickf)
    for state in (hosts, htab, msgs):
        pick.dump(state)
    pickf.close()
    program = sys.argv[0]
    os.execv(program, [program] + cmdargs)
#
# Main
#
# Command-line handling.  cmdargs collects the options that are passed on
# when the daemon re-executes itself (see saveandrestart()).
helpflag=False
forground=False
restart=None
optlist=[]
args=[]
# HOME can be unset (e.g. when started from an init script); fall back to
# an empty prefix instead of failing with KeyError.
home=os.environ.get('HOME', '')
cmdargs=[]
configfile="%s/.hbrc" % home
try:
    # -h takes no argument (the original "h:" made a bare -h a parse
    # error, which showed the help only by accident).
    optlist, args = getopt.getopt(sys.argv[1:], 'c:dfhv')
except getopt.GetoptError:
    helpflag=True
for o,a in optlist:
    if o == '-c':
        configfile=a
        cmdargs+=[o, a]
    elif o == '-d':
        visual=True
        cmdargs+=[o]
    elif o == '-f':
        forground=True
        cmdargs+=[o]
    elif o == '-h':
        helpflag=True
    elif o == '-v':
        verbose=True
        cmdargs+=[o]
if helpflag:
    print("hbc HeartBeatDaemon")
    print("usage: hbd [-dfhv] [-c configfile]")
    print("")
    print(" -c configfile")
    print(" -d display")
    print(" -f run in foreground")
    print(" -h this help")
    print(" -v verbose")
    print("")
    print(""" config file can contain
logfile=/var/log/heartbeat.log
logfmt=[text|msg]
hb_port=50003
interval=20
hbd_port=50004
hbd_host=www.domain.com
grace=1
""")
    sys.exit(1)
# The curses display needs the terminal, so it implies foreground mode.
if visual:
    forground=True
#
# set defaults
# Built-in defaults; each can be overridden by a "key=value" line of the
# config file.
hb_port=PORT
hbd_host=THOST
hbd_port=TPORT
logfile=LOGFILE
logfmt="text"
interval=INTERVAL
grace=GRACE
reportstrict=False
watchhosts=[]
drophosts=[]
try:
    f=open(configfile,"r")
    if verbose: print("notice: using config file %s" % configfile)
except (IOError, OSError):
    print("warning: running without conifig file: %s" % configfile)
    f=None
if f:
    try:
        for rawline in f:
            # Strip only the line terminator; slicing off the last
            # character would corrupt a final line without a newline.
            line=rawline.rstrip("\n")
            if verbose: print(" %s" % line)
            # Split on the first "=" so values may contain "=" themselves.
            r=line.split('=', 1)
            if len(r) != 2:
                # Blank line, comment or key without value: nothing to set.
                continue
            key, val = r
            # NOTE(review): eval() executes whatever the config file
            # contains; acceptable only while that file is trusted.
            if key == 'interval':
                interval=eval(val)
            elif key == 'grace':
                grace=eval(val)
            elif key == 'hbd_port':
                hbd_port=eval(val)
            elif key == 'hbd_host':
                hbd_host=val
            elif key == 'hb_port':
                hb_port=eval(val)
            elif key == 'logfile':
                logfile=val
            elif key == 'logfmt':
                logfmt=val
            elif key == 'reportstrict':
                reportstrict=val in ["True","true","TRUE","1"]
            elif key == 'watchhosts':
                watchhosts=eval(val)
            elif key == 'drophosts':
                drophosts=eval(val)
    finally:
        f.close()
# Positional arguments are not accepted.
if args:
    print("error: args")
    sys.exit(1)
if verbose:
    print("notice: logging to %s" % logfile)
logf=initlog(logfile)
# Reload the state saved by saveandrestart(), if a snapshot exists.  The
# three objects are read back in the order they were written.
# NOTE(review): the snapshot is not removed afterwards, so a plain start
# also picks it up; unpickling a file in /tmp is unsafe if other users
# can write it -- confirm.
if os.path.exists(PICKFILE):
    pickf=open(PICKFILE, 'r')
    pick=cPickle.Unpickler(pickf)
    hosts=pick.load()
    htab=pick.load()
    msgs=pick.load()
    pickf.close()
# Forget the hosts listed under "drophosts" in the config file.
for h in drophosts:
    hosts.pop(h, None)
now=time.time()
startsec=int(now) % interval
# Bring up the curses screen first so the start-up message can be shown.
if visual:
    import curses
    initcurses()
    display()
    stdscr.nodelay(1)
if verbose:
    if restart:
        startmsg="Restarting"
    else:
        startmsg="Starting"
    log(startmsg)
atexit.register(on_exit)
# UDP socket on which the heartbeats arrive.
sock=socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
reuse=sock.getsockopt(socket.SOL_SOCKET,socket.SO_REUSEADDR) | 1
sock.setsockopt(socket.SOL_SOCKET,socket.SO_REUSEADDR, reuse)
sock.bind(("",hb_port))
# TCP server that delivers the HTML status page.
serv=SocketServer.TCPServer((hbd_host,hbd_port),HtmlHandler)
# Objects watched by select() in the main loop.
ilist=[sock, serv.fileno()]
# Detach from the terminal unless running in the foreground (or restarting).
if not (forground or restart):
    pid=os.fork()
    if pid > 0:
        # Parent: report the child's pid and leave.
        if verbose:
            print("daemoinizing... pid=%d" % pid)
        sys.exit(0)
    # Child: there is no terminal to talk to any more.
    verbose=False
    for fd in (0, 1, 2):
        os.close(fd)
    sys.stdin.close()
    sys.stdout = NullDevice()
    sys.stderr = NullDevice()
    os.chdir("/")
    os.setsid()
    os.umask(0)
# up is cleared by handler() on SIGTERM/SIGHUP; sig remembers which signal.
up=1
sig=0
for signo in (signal.SIGTERM, signal.SIGHUP):
    signal.signal(signo, handler)
# Time of the next periodic check and the select() timeout that leads to it.
next=int(now)+1
sleep=next - now
while up:
    if visual:
        # Keyboard commands: c = clear messages, q = quit,
        # d = toggle DEBUG, v = toggle verbose.
        c = stdscr.getch()
        if c == ord('c'):
            msgs=[]
            display()
        elif c == ord('q'):
            break  # Exit the while()
        elif c == ord('d'):
            DEBUG=not DEBUG
        elif c == ord('v'):
            verbose=not verbose
    try:
        sr=select.select(ilist,[],[],sleep)
        now=time.time()
    except KeyboardInterrupt:
        sys.exit(0)
    except select.error:
        value=sys.exc_info()[1]
        if value.args[0] != 4:  # 4: interrupted system call
            print("%s %s" % (select.error, value))
        elif visual:
            # Interrupted by a signal: rebuild the screen from scratch.
            exitcurses()
            initcurses()
            display()
        continue
    for fh in sr[0]:
        if fh == sock:
            readsock()
        if fh == serv.fileno():
            serv.handle_request()
    if now >= next:
        # Periodic work, at most once per second.
        next=now+1
        checkoverdue()
        if visual:
            stdscr.move(1 , 0)
            stdscr.clrtoeol()
            displaytime()
    sleep=next-now
    if sleep < 0:
        sys.stderr.write("sleep is negaitive! %s next=%s\n" % (sleep, next))
        sleep=0
    if DEBUG:
        sys.stderr.write("sleep=%s next=%s\n" % (sleep, next))
# SIGHUP means: save the state and start over.
if sig == signal.SIGHUP:
    saveandrestart()