#!/usr/bin/env python
# $Id: hbd,v 1.7 2005/07/21 18:07:33 andreas Exp $
# Wait for heartbeat messages and act on them (or their absence)
#
# Heartbeats are UDP datagrams made of ';'-separated "key=value" pairs.
# Keys understood by readsock(): boot, shutdown, interval, name, msg,
# service, time.  The current host table is shown in an optional curses
# display (-d) and served as an HTML page on a TCP port.
#
# The code sticks to the syntax subset shared by Python 2 and Python 3
# (parenthesised single-argument print, sys.exc_info() instead of
# "except X, e"), so the file parses on either interpreter.
#
import atexit
import errno
import getopt
import os
import select
import socket
import string   # not used any more; kept so the module namespace is unchanged
import sys
import time

try:
    import SocketServer
except ImportError:
    # The module is called "socketserver" on Python 3.
    import socketserver as SocketServer

_PY3 = sys.version_info[0] >= 3

LOGFILE = "/home/andreas/public_html/messages/andreas"
PORT = 50003        # default UDP port for heartbeats
TPORT = 50004       # default TCP port for the HTML status page
THOST = ""          # default bind address for the status page
INTERVAL = 10       # default statistics interval in seconds
GRACE = 2           # default slack (seconds) added to a host's timeout
MAXMSGS = 1000      # upper bound on log lines kept in memory

hosts = {}          # host name -> Host
num = 0             # running counter used to seed Host.num
upcount = 0         # number of updatestats() rounds so far
DEBUG = False
verbose = False
visual = 0          # true when the curses display is active (-d)
msgs = []           # most recent log lines, oldest first
stdscr = None       # curses root window
win = None          # curses window holding the host table
msgw = None         # curses window holding the message log
msgwB = None        # curses window drawing the border around msgw
msgwHeight = 10     # number of rows in msgw


class NullDevice:
    """Write sink that replaces stdout/stderr once the process is a daemon."""

    def write(self, s):
        pass

    def flush(self):
        pass


class Host:
    """State kept for one host that has sent at least one heartbeat."""

    up = "up"
    down = "down"
    overdue = "overdue"

    def __init__(self, name, addr):
        global num
        self.name = name
        self.addr = addr
        self.num = num                  # display row; renumbered by addhost()
        self.lastbeat = time.time()
        self.upcount = 0                # heartbeats received
        self.state = Host.up
        self.uppercent = "n/a"
        self.statetime = self.lastbeat  # when the current state was entered
        self.interval = 0               # interval announced by the host
        num += 1

    def getstate(self):
        return self.state

    def newstate(self, state):
        """Set a new state; return the number of seconds spent in the old one."""
        self.state = state
        entered = time.time()
        elapsed = entered - self.statetime
        self.statetime = entered
        if visual:
            displaystatetime(self.name)
        return elapsed


def _text(data):
    """Return *data* as a native str; bytes (Python 3 sockets) are decoded."""
    if isinstance(data, str):
        return data
    # latin-1 maps every byte, and is the charset the status page declares.
    return data.decode("latin-1")


def _wire(text):
    """Return *text* in the form socket.sendall() accepts on this interpreter."""
    if _PY3:
        return text.encode("latin-1", "replace")
    return text


def _tail(seq, count):
    """Return the last *count* items of *seq* (all of them if it is shorter)."""
    if count <= 0:
        return []
    return seq[-count:]


def _htmlescape(value):
    """Return str(value) with the HTML special characters replaced by entities."""
    text = str(value)
    for raw, entity in (("&", "&amp;"), ("<", "&lt;"),
                        (">", "&gt;"), ('"', "&quot;")):
        text = text.replace(raw, entity)
    return text


def dur(sec):
    """Format a number of seconds as h:mm:ss, m:ss or 0:ss."""
    minutes, s = divmod(int(sec), 60)
    h, m = divmod(minutes, 60)
    if h > 0:
        return "%d:%02d:%02d" % (h, m, s)
    if m > 0:
        return "%d:%02d" % (m, s)
    return "0:%02d" % s


def addhost(name, addr):
    """Register *name* at *addr*, or record that a known host changed address."""
    if name in hosts:
        old = hosts[name].addr
        if old in htab:
            del htab[old]
        hosts[name].addr = addr
        if visual:
            displayaddr(name)
        htab[addr] = name
        log("%s, changed address to %s" % (name, addr))
    else:
        hosts[name] = Host(name, addr)
        # Keep the display rows in alphabetical order of host name.
        names = list(hosts.keys())
        names.sort()
        row = 0
        for n in names:
            hosts[n].num = row
            row += 1
        htab[addr] = name
        if visual:
            display()


def on_exit():
    """atexit hook: leave curses mode if active and close the log file."""
    if visual:
        exitcurses()
    logf.close()
    print("exit")


def initlog(logfile):
    """Open *logfile* for appending and return the file object."""
    return open(logfile, "a")


def initwin():
    """(Re)create the host-table and message windows from the screen size."""
    global win, msgw, msgwB, msgwHeight
    maxY, maxX = stdscr.getmaxyx()
    begin_x = 0
    begin_y = 2
    height = len(htab) + 2      # one row per known address plus the border
    if DEBUG:
        log("initwin called with %d" % height)
    win = curses.newwin(height, maxX, begin_y, begin_x)
    win.border(0, 0, 0, 0, 0, 0, curses.ACS_LTEE, curses.ACS_RTEE)
    msgwB = curses.newwin(0, 0, height + 1, begin_x)
    msgwB.border(0, 0, 0, 0, curses.ACS_LTEE, curses.ACS_RTEE)
    msgwHeight = maxY - height - 3
    msgw = curses.newwin(msgwHeight, maxX - 2, height + 2, begin_x + 1)
    msgw.setscrreg(0, msgwHeight - 1)
    msgw.scrollok(1)
    stdscr.addstr(0, 0, "hbd Version 1.0", curses.A_BOLD)
    stdscr.refresh()
    msgwB.refresh()


def checkoverdue():
    """Move hosts whose last heartbeat is older than their timeout to 'overdue'."""
    for h in hosts.keys():
        host = hosts[h]
        if host.state == Host.down:
            continue
        # Strict mode allows one missed interval, otherwise five.
        if reportstrict:
            timeout = host.interval + grace
        else:
            timeout = host.interval * 5 + grace
        if host.state == Host.up and now - host.lastbeat > timeout:
            if reportstrict:
                log("%s is overdue" % h)
            host.newstate(Host.overdue)


def displaytime():
    """Redraw the clock and each host's state (or beat age) and up percentage."""
    maxY, maxX = stdscr.getmaxyx()
    stdscr.addstr(0, maxX - 8,
                  time.strftime("%H:%M:%S", time.localtime(now)),
                  curses.A_BOLD)
    for h in hosts.keys():
        host = hosts[h]
        d = host.getstate()
        attr = 0
        if verbose and host.state != Host.down:
            # Verbose mode shows the time since the last beat instead.
            d = dur(now - host.lastbeat)
        if host.state == Host.overdue:
            attr = curses.A_BOLD
        win.addstr(host.num + 1, 25, "%8s" % d, attr)
        win.addstr(host.num + 1, 53, "%3s" % host.uppercent)
    win.refresh()
    stdscr.refresh()


def displaystatetime(h, refresh=1):
    """Draw the time of host *h*'s last state change in its table row."""
    stamp = time.strftime("%Y-%m-%d %H:%M:%S",
                          time.localtime(hosts[h].statetime))
    win.addstr(hosts[h].num + 1, 60, "%-17s" % stamp)
    if refresh:
        win.refresh()


def displayaddr(h, refresh=1):
    """Draw host *h*'s address in its table row."""
    win.addstr(hosts[h].num + 1, 35, "%-16s" % hosts[h].addr)
    if refresh:
        win.refresh()


def displaybody():
    """Draw name, address and state-change time for every host."""
    for h in hosts.keys():
        win.addstr(hosts[h].num + 1, 1, "%-24s" % (h))
        if hosts[h].addr is not None:
            displayaddr(h, 0)
        if hosts[h].statetime is not None:
            displaystatetime(h, 0)
    win.refresh()


def displaymsgs():
    """Fill the message window with the most recent log lines."""
    y = 0
    # _tail() rather than msgs[len(msgs)-msgwHeight:]: a negative start index
    # would drop lines whenever fewer than msgwHeight messages exist.
    for m in _tail(msgs, msgwHeight):
        msgw.addstr(y, 0, m)
        y += 1
    msgw.refresh()


def display():
    """Rebuild the whole curses display; does nothing unless visual is set."""
    if visual:
        initwin()
        displaytime()
        displaybody()
        displaymsgs()


def log(m, service="heartbeat"):
    """Record message *m* in memory, in the log file and on the display.

    With logfmt "msg" the file line is "<epoch>|<service>|<m>"; otherwise
    it is the same timestamped text line that is shown on screen.
    """
    msg = time.strftime("%b %d %H:%M:%S",
                        time.localtime(time.time())) + ": " + m + "\n"
    msgs.append(msg)
    # Only the tail is ever displayed; cap the list so a long-running
    # daemon does not grow without bound.
    if len(msgs) > MAXMSGS:
        del msgs[:-MAXMSGS]
    if logfmt == "msg":
        m2 = "%d|%s|%s\n" % (now, service, m)
    else:
        m2 = msg
    logf.write(m2)
    logf.flush()
    if msgw is not None:
        msgw.addstr(msg)
        msgw.clrtoeol()
        msgw.refresh()


def fromaddr(name, addr, boot, interval):
    """Account for a heartbeat from *addr*; return the Host it was credited to.

    The host is looked up by source address; *name* is only used when the
    address has not been seen before.  *boot* is accepted but not used.
    """
    if addr not in htab:
        addhost(name, addr)
    host = hosts[htab[addr]]
    host.lastbeat = now
    if host.getstate() != Host.up and interval > 0:
        lasts = host.state
        d = host.newstate(Host.up)
        log("%s, back after being %s for %s" % (host.name, lasts, dur(d)))
    host.upcount += 1
    return host


def readsock():
    """Read one heartbeat datagram from the UDP socket and process it."""
    data, addr = sock.recvfrom(1024)
    boot = 0
    shutdown = 0
    name = "unknown"
    msg = None
    service = "heartbeat"   # same default as log(); packets may omit it
    interval = 0
    deltaT = 0.0
    for pair in _text(data).split(';'):
        # Split once only, so a value may itself contain '='.
        l = pair.split("=", 1)
        key = l[0]
        if len(l) != 2:
            val = "0"
        else:
            val = l[1]
        if key == 'boot':
            boot += 1
        elif key == 'shutdown':
            shutdown += 1
        elif key == 'interval':
            try:
                interval = int(val)
            except ValueError:
                # A malformed packet must not take the daemon down.
                interval = 0
        elif key == 'name':
            name = val
        elif key == 'msg':
            msg = val
        elif key == 'service':
            service = val
        elif key == 'time':
            try:
                deltaT = now - float(val)
            except ValueError:
                pass
    if boot:
        log("%s booted, deltaT %0.2g sec" % (name, deltaT))
    if msg:
        log("%s %0.2g msg: %s" % (name, deltaT, msg), service=service)
    host = fromaddr(name, addr[0], boot, interval)
    # The packet's name can differ from the name already bound to its
    # address (or be missing altogether); fall back to the host the beat
    # was credited to instead of failing with a KeyError.
    host = hosts.get(name, host)
    if shutdown:
        log("%s shutdown" % name)
        host.newstate(Host.down)
    if interval > 0:
        host.interval = interval


def updatestats():
    """Recompute each host's up percentage from its heartbeat count."""
    global upcount
    upcount += 1
    for h in hosts.keys():
        host = hosts[h]
        host.uppercent = "%3.0f" % (
            (host.upcount * host.interval * 100.0) / (upcount * interval))


def initcurses():
    """Enter curses mode and store the root window in stdscr."""
    global stdscr
    stdscr = curses.initscr()
    curses.noecho()
    curses.cbreak()
    stdscr.keypad(1)
    if DEBUG:
        sys.stderr.write("curses init done: %s\n" % stdscr)


def exitcurses():
    """Undo initcurses() and give the terminal back."""
    curses.nocbreak()
    stdscr.keypad(0)
    curses.echo()
    curses.endwin()


class HtmlHandler(SocketServer.BaseRequestHandler):
    """Answer one HTTP request with the status page ("/") or an error page."""

    # NOTE(review): the HTML tags in this class were reconstructed; they were
    # missing from the reviewed copy of the file.  Confirm against the page
    # the deployed daemon serves.

    def handle(self):
        # NOTE: this read blocks the single-threaded main loop until the
        # client has sent its header block.
        reader = self.request.makefile("rb")
        uri = None
        try:
            while 1:
                line = _text(reader.readline()).strip()
                if not line:
                    break
                words = line.split()
                if words[0] == "GET" and len(words) > 1:
                    uri = words[1]
        finally:
            reader.close()

        if uri == "/":
            code, cause = 200, "OK"
        elif uri is None:
            # No GET line at all.
            code, cause = 400, "Bad Request"
        else:
            code, cause = 404, "Not Found"

        stamp = time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime(now))
        head = [
            "HTTP/1.0 %s %s" % (code, cause),
            "Date: %s" % stamp,
            "Server: hbd",
            "Last-Modified: %s" % stamp,
            "Accept-Ranges: bytes",
            "Connection: close",
            "Content-Type: text/html; charset=ISO-8859-1",
            "",
            "",
        ]
        if code == 200:
            res = self._statuspage()
        else:
            res = self._errorpage(code, cause, uri)
        try:
            # sendall: plain send() may transmit only part of the data.
            self.request.sendall(_wire("\r\n".join(head) + "\n".join(res)))
        except socket.error:
            # Best effort: the client may have gone away already.
            pass

    def _errorpage(self, code, cause, uri):
        """Return the lines of the HTML error page."""
        if uri is None:
            detail = "The request could not be understood by this server."
        else:
            detail = ("The requested URL %s was not found on this server."
                      % _htmlescape(uri))
        return [
            '<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">',
            '<html><head>',
            '<title>%s %s</title>' % (code, cause),
            '</head><body>',
            '<h1>%s</h1>' % (cause),
            '<p>%s</p>' % detail,
            '<hr>',
            '<address>hbd (Unix) Server at %s Port %d</address>'
            % (_htmlescape(hbd_host), hbd_port),
            '</body></html>',
        ]

    def _statuspage(self):
        """Return the lines of the HTML status page."""
        res = [
            '<html>',
            '<head>',
            '<title>hbd</title>',
            # Ask the browser to reload the page once a minute.
            '<meta http-equiv="refresh" content="%d">\n' % 60,
            '</head>',
            '<body>',
            '<h1>Heartbeat status at %s</h1>'
            % time.strftime("%H:%M:%S", time.localtime(now)),
            '<table>',
            '<tr><th>Host</th><th>State</th><th>IP Addr</th>'
            '<th>Res</th><th>Last change</th></tr>\n',
        ]
        for h in hosts.keys():
            host = hosts[h]
            # Names and addresses come from the network: escape them.
            res.append(
                "<tr><td>%-24s</td><td>%-7s</td><td>%-16s</td>"
                "<td>%-3s</td><td>%-17s</td></tr>\n"
                % (_htmlescape(h), host.state, _htmlescape(host.addr),
                   host.uppercent,
                   time.strftime("%Y-%m-%d %H:%M:%S",
                                 time.localtime(host.statetime))))
        res.append("</table>")
        res.append("<hr>")
        for m in _tail(msgs, 30):
            res.append("%s<br>" % _htmlescape(m))
        res.append("</body></html>")
        return res


#
# Main
#
helpflag = False
forground = False
optlist = []
args = []
# HOME may be unset when the daemon is started outside a login session.
home = os.environ.get('HOME', '')
configfile = "%s/.hbrc" % home
try:
    optlist, args = getopt.getopt(sys.argv[1:], 'c:dfhv')
except getopt.GetoptError:
    helpflag = True
for o, a in optlist:
    if o == '-c':
        configfile = a
    elif o == '-d':
        visual = True
    elif o == '-f':
        forground = True
    elif o == '-h':
        helpflag = True
    elif o == '-v':
        verbose = True

if helpflag:
    print("hbc HeartBeatDaemon")
    print("usage: hbd [-dfhv] [-c configfile]")
    print("")
    print(" -c configfile")
    print(" -d display")
    print(" -f run in foreground")
    print(" -h this help")
    print(" -v verbose")
    print("")
    print("""
 config file can contain
 logfile=/var/log/heartbeat.log
 logfmt=[text|msg]
 hb_port=50003
 interval=20
 hbd_port=50004
 hbd_host=www.domain.com
 grace=1
""")
    sys.exit(1)

if visual:
    # The curses display needs the controlling terminal.
    forground = True

#
# set defaults
hb_port = PORT
hbd_host = THOST
hbd_port = TPORT
logfile = LOGFILE
logfmt = "text"
interval = INTERVAL
grace = GRACE
reportstrict = False

try:
    f = open(configfile, "r")
    if verbose:
        print("notice: using config file %s" % configfile)
except IOError:
    print("warning: running without conifig file: %s" % configfile)
    f = None

if f:
    try:
        for rawline in f.readlines():
            # Strip only the line terminator: the last line of the file
            # may not have one.
            line = rawline.rstrip("\r\n")
            if verbose:
                print(" %s" % line)
            r = line.split('=', 1)
            if len(r) != 2:
                continue
            key = r[0].strip()
            val = r[1].strip()
            try:
                # Numbers are parsed explicitly; the file is never eval()ed.
                if key == 'interval':
                    interval = int(val)
                elif key == 'grace':
                    try:
                        grace = int(val)
                    except ValueError:
                        grace = float(val)
                elif key == 'hbd_port':
                    hbd_port = int(val)
                elif key == 'hbd_host':
                    hbd_host = val
                elif key == 'hb_port':
                    hb_port = int(val)
                elif key == 'logfile':
                    logfile = val
                elif key == 'logfmt':
                    logfmt = val
                elif key == 'reportstrict':
                    # A bare string would make "0" and "no" count as true.
                    reportstrict = val.lower() not in (
                        "", "0", "false", "no", "off")
            except ValueError:
                print("error: bad value for %s in %s: %s"
                      % (key, configfile, val))
                sys.exit(1)
    finally:
        f.close()

if len(args) != 0:
    print("error: args")
    sys.exit(1)

if interval <= 0:
    # interval is used as a divisor below and in updatestats().
    print("error: interval must be greater than 0")
    sys.exit(1)

if verbose:
    print("notice: logging to %s" % logfile)
logf = initlog(logfile)

now = time.time()                   # time of the most recent select() wake-up
startsec = int(now) % interval      # phase at which statistics are updated
htab = {}                           # source address -> host name

if visual:
    import curses
    initcurses()
    display()
    stdscr.nodelay(1)               # getch() must not block the main loop

if verbose:
    log("Starting")

atexit.register(on_exit)

ilist = []
sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
sock.bind(("", hb_port))
ilist.append(sock)

serv = SocketServer.TCPServer((hbd_host, hbd_port), HtmlHandler)
ilist.append(serv.fileno())

if not forground:
    pid = os.fork()
    if pid > 0:
        if verbose:
            print("daemoinizing... pid=%d" % pid)
        sys.exit(0)
    verbose = False
    for fd in (0, 1, 2):
        try:
            os.close(fd)
        except OSError:
            pass                    # already closed by whoever started us
    try:
        sys.stdin.close()
    except (IOError, OSError):
        pass                        # its descriptor is gone already
    sys.stdout = NullDevice()
    sys.stderr = NullDevice()
    os.chdir("/")
    os.setsid()
    os.umask(0)

next_tick = int(now) + 1            # time of the next once-a-second pass
timeout = next_tick - now

while 1:
    if visual:
        c = stdscr.getch()
        if c == ord('c'):
            msgs = []
            display()
        elif c == ord('q'):
            break
        elif c == ord('d'):
            DEBUG = not DEBUG
        elif c == ord('v'):
            verbose = not verbose

    try:
        sr = select.select(ilist, [], [], timeout)
        now = time.time()
    except KeyboardInterrupt:
        sys.exit(0)
    except select.error:
        # sys.exc_info() instead of "except ..., value" / "except ... as
        # value" so the statement parses on both Python 2 and Python 3.
        value = sys.exc_info()[1]
        if value.args[0] != errno.EINTR:
            print("%s %s" % (select.error, value))
            continue
        # Interrupted by a signal: rebuild the screen before retrying.
        if visual:
            exitcurses()
            initcurses()
            display()
        continue

    for fh in sr[0]:
        if fh == sock:
            readsock()
        if fh == serv.fileno():
            serv.handle_request()

    if now >= next_tick:
        next_tick = now + 1
        if int(now) % interval == startsec:
            updatestats()
        checkoverdue()
        if visual:
            stdscr.move(1, 0)
            stdscr.clrtoeol()
            displaytime()

    timeout = next_tick - now
    if timeout < 0:
        sys.stderr.write("sleep is negaitive! %s next=%s\n"
                         % (timeout, next_tick))
        timeout = 0
    if DEBUG:
        sys.stderr.write("sleep=%s next=%s\n" % (timeout, next_tick))