diff --git a/hbc b/hbc index 8a9d22f..9532746 100755 --- a/hbc +++ b/hbc @@ -1,9 +1,13 @@ #!/usr/bin/env python -# $Id: hbc,v 1.2 2005/07/15 14:25:06 andreas Exp $ +# $Id: hbc,v 1.3 2005/07/19 20:31:05 andreas Exp $ import sys, time, socket, os, signal, getopt, string PORT=50003 -INTERVAL=20 +INTERVAL=10 + +class NullDevice: + def write(self, s): + pass def handler(signum, frame): @@ -12,9 +16,10 @@ def handler(signum, frame): return sys.exit(0) -helpflag=0 -verbose=0 -daemon=0 +msgonly=False +helpflag=False +verbose=False +daemon=False optlist=[] args=[] msgboot=[] @@ -24,7 +29,7 @@ configfile="%s/.hbrc" % home try: optlist, args = getopt.getopt(sys.argv[1:], 'bc:dhm:v') except: - helpflag=1 + helpflag=True for o,a in optlist: if o == '-b': @@ -32,24 +37,37 @@ for o,a in optlist: elif o == '-c': configfile=a elif o == '-d': - daemon=1 + daemon=True elif o == '-h': - helpflag=1 + helpflag=True elif o == '-m': msgboot.append("service=%s" % "service") + a.replace(';',':') msgboot.append("msg=%s" % a) + msgonly=True elif o == '-v': - verbose+=1 + verbose=True if helpflag: print "hbc HeartBeatClient" print "usage: hbc [-bdhv] [-c configfile] [-m msg][host1 [..]]" print + print " -b indicate machine boot" + print " -c configfile" + print " -d daemonize" + print " -h this help" + print " -m send a message" + print " -v verbose" + print print """ config file can contain hb_hosts=('host1', 'host2', ..._ hb_port=50003 interval=20 +logfile=... +logfmt={|test|msg} +grace=SECONDS +reportstrict={True|False} """ sys.exit(1) @@ -76,6 +94,8 @@ if f: r=l[:-1].split('=') if r[0] == 'hb_hosts': hb_hosts=eval(r[1]) + if verbose: + print "notice: cfg hb_hosts: %s" % hb_hosts elif r[0] == 'interval': interval=eval(r[1]) elif r[0] == 'hb_port': @@ -99,7 +119,7 @@ if verbose: print "notice: interval: %s" % interval print "notice: iam: %s" % iam -if daemon: +if not msgonly: msgboot.append("interval=%s" % interval) if len(msgboot) > 0: @@ -118,15 +138,27 @@ if len(msgboot) > 0: else: break +if msgonly: + sys.exit(0) -if not daemon: - sys.exit(0) +if daemon: + pid=os.fork() + if pid > 0: + if verbose: + print "daemoinizing... pid=%d" % pid + sys.exit(0) + + + os.close(0) + os.close(1) + os.close(2) + sys.stdin.close() + sys.stdout = NullDevice() + sys.stderr = NullDevice() + os.chdir("/") + os.setsid() + os.umask(0) -pid=os.fork() -if pid > 0: - if verbose: - print "daemoinizing... pid=%d" % pid - sys.exit(0) msg="interval=%s;name=%s" % (interval, iam) up=1 @@ -152,3 +184,5 @@ for hb_host in hb_hosts: sock.sendto(msg, (hb_host, hb_port)) time.sleep(1) sock.close() + + diff --git a/hbd b/hbd index 5c61c32..dc31ff6 100755 --- a/hbd +++ b/hbd @@ -1,23 +1,23 @@ #!/usr/bin/env python -# $Id: hbd,v 1.1 2005/07/14 19:26:01 andreas Exp $ +# $Id: hbd,v 1.2 2005/07/19 20:31:05 andreas Exp $ # Wait for heartbeat messages and act on them (or their absence) # -import time, os, string, sys, socket, curses, atexit, select, SocketServer +import time, os, string, sys, socket, curses, atexit, select, SocketServer, getopt -LOGF="/home/andreas/public_html/messages/andreas" +LOGFILE="/home/andreas/public_html/messages/andreas" hosts={} num=0 upcount=0 PORT=50003 TPORT=50004 -THOST="10.99.1.4" +THOST="" DEBUG=False verbose=False INTERVAL=10 -GRACE=10 +GRACE=2 visual=0 @@ -29,11 +29,14 @@ msgw=None msgwB=None msgwHeight=10 +class NullDevice: + def write(self, s): + pass + class Host: up="up" down="down" overdue="overdue" - OVERDUE="OVERDUE" def __init__(self, name, addr): global num @@ -46,12 +49,13 @@ class Host: self.uppercent="n/a" self.state="up" self.statetime=self.lastbeat - self.interval=INTERVAL + self.interval=0 num+=1 def getstate(self): return self.state + # set new state, return number of secs in previous state def newstate(self, state): self.state=state now=time.time() @@ -109,9 +113,8 @@ def on_exit(): print "exit" -def initlog(): - global logf - logf=open(LOGF,"a") +def initlog(logfile): + return open(logfile,"a") # # # @@ -135,7 +138,7 @@ def initwin(): msgw.setscrreg(0, msgwHeight-1) msgw.scrollok(1) - stdscr.addstr(0,0, "WatchArnsberg Version 1.0", curses.A_BOLD) + stdscr.addstr(0,0, "hbd Version 1.0", curses.A_BOLD) stdscr.refresh() msgwB.refresh() # @@ -146,11 +149,13 @@ def checkoverdue(): for h in hosts: if hosts[h].state == Host.down: continue - if hosts[h].state == Host.up and now-hosts[h].lastbeat > hosts[h].interval+GRACE: + if reportstrict: + timeout=hosts[h].interval+grace + else: + timeout=hosts[h].interval*5+grace + if hosts[h].state == Host.up and now-hosts[h].lastbeat > timeout: + if reportstrict: log("%s is overdue" % h) hosts[h].newstate(Host.overdue) - elif hosts[h].state == Host.overdue and now-hosts[h].lastbeat > hosts[h].interval*5+GRACE: - log("%s is overdue" % h) - hosts[h].newstate(Host.OVERDUE) # # @@ -165,7 +170,7 @@ def displaytime(): attr=0 if verbose and hosts[h].state != Host.down: d=dur(now-hosts[h].lastbeat) - if hosts[h].state == Host.OVERDUE: + if hosts[h].state == Host.overdue: attr=curses.A_BOLD win.addstr(hosts[h].num+1, 25, "%8s" % d, attr) win.addstr(hosts[h].num+1, 53, "%3s" % hosts[h].uppercent ) @@ -223,7 +228,10 @@ def display(): def log(m, service="heartbeat"): msg=time.strftime("%b %d %H:%M:%S")+": "+m+"\n" msgs.append(msg) - m2="%d|%s|%s\n" % (now, service, m) + if logfmt == "msg": + m2="%d|%s|%s\n" % (now, service, m) + else: + m2=msg logf.write(m2) logf.flush() if msgw != None: @@ -236,18 +244,17 @@ def log(m, service="heartbeat"): # # -def fromaddr(name, addr, boot): +def fromaddr(name, addr, boot, interval): global htab if not htab.has_key(addr): addhost(name, addr) host=hosts[htab[addr]] host.lastbeat=now - if host.getstate() != Host.up: + if host.getstate() != Host.up and interval > 0: lasts=host.state d=host.newstate(Host.up) - if lasts != 'overdue': - log("%s, back after being %s for %s" % (host.name, lasts, dur(d))) + log("%s, back after being %s for %s" % (host.name, lasts, dur(d))) host.upcount+=1 # @@ -261,7 +268,7 @@ def readsock(): shutdown=0 name="unknown" msg=None - interval=INTERVAL + interval=0 for pair in pairs: l=string.split(pair,"=") key=l[0] @@ -285,11 +292,12 @@ def readsock(): log("%s booted" % name) if msg: log("%s msg: %s" % (name, msg),service=service) - fromaddr(name, addr[0], boot) + fromaddr(name, addr[0], boot, interval) if shutdown: log("%s shutdown" % name) hosts[name].newstate(Host.down) - hosts[name].interval=interval + if interval > 0: + hosts[name].interval=interval # @@ -300,7 +308,7 @@ def updatestats(): upcount+=1 for h in hosts: if upcount > 0: - hosts[h].uppercent="%3.0f" % ((hosts[h].upcount*hosts[h].interval*100.0)/(upcount*INTERVAL)) + hosts[h].uppercent="%3.0f" % ((hosts[h].upcount*hosts[h].interval*100.0)/(upcount*interval)) # # # @@ -339,7 +347,7 @@ class HtmlHandler(SocketServer.BaseRequestHandler): cause="OK" self.request.send("HTTP/1.0 %s %s\r\n" % (code, cause)) self.request.send("Date: %s\r\n" % time.strftime("%a, %d %b %Y %H:%M:%S GMT",time.gmtime(now))) - self.request.send("Server: WatchArnsberg\r\n") + self.request.send("Server: hbd\r\n") self.request.send("Last-Modified: %s\r\n" % time.strftime("%a, %d %b %Y %H:%M:%S GMT",time.gmtime(now))) self.request.send("Accept-Ranges: bytes\r\n") self.request.send("Connection: close\r\n") @@ -353,7 +361,7 @@ class HtmlHandler(SocketServer.BaseRequestHandler): res.append('

%s

' % (cause)) res.append('

The requested URL %s was not found on this server.

' % uri) res.append('
') - res.append('
WatchArnsberg (Unix) Server at somewhere.planix.com Port %d
' % TPORT) + res.append('
hbd (Unix) Server at %s Port %d
' % (hbd_host, hbd_port)) res.append('') else: @@ -383,11 +391,106 @@ class HtmlHandler(SocketServer.BaseRequestHandler): # Main # +helpflag=False +forground=False +optlist=[] +args=[] +home=os.environ['HOME'] +configfile="%s/.hbrc" % home +try: + optlist, args = getopt.getopt(sys.argv[1:], 'c:dfhv') +except: + helpflag=True -initlog() +for o,a in optlist: + if o == '-c': + configfile=a + if o == '-d': + visual=True + elif o == '-f': + forground=True + elif o == '-h': + helpflag=True + elif o == '-v': + verbose=True + + +if helpflag: + print "hbc HeartBeatDaemon" + print "usage: hbd [-dfhv] [-c configfile]" + print + print " -c configfile" + print " -d display" + print " -f run in foreground" + print " -h this help" + print " -v verbose" + print + print """ config file can contain +logfile=/var/log/heartbeat.log +logfmt=[text|msg] +hb_port=50003 +interval=20 +hbd_port=50004 +hbd_host=www.domain.com +grace=1 +""" + + sys.exit(1) + +if visual: + forground=True +# +# set defaults + +hb_port=PORT +hbd_host=THOST +hbd_port=TPORT +logfile=LOGFILE +logfmt="text" +interval=INTERVAL +grace=GRACE +reportstrict=False + +try: + f=open(configfile,"r") + if verbose: print "notice: using config file %s" % configfile +except: + print "warning: running without conifig file: %s" % configfile + f=None + +if f: + while 1: + l=f.readline() + if len(l) == 0: + break + r=l[:-1].split('=') + if r[0] == 'interval': + interval=eval(r[1]) + elif r[0] == 'grace': + grace=eval(r[1]) + elif r[0] == 'hbd_port': + hbd_port=eval(r[1]) + elif r[0] == 'hbd_host': + hbd_host=r[1] + elif r[0] == 'hb_port': + hb_port=eval(r[1]) + elif r[0] == 'logfile': + logfile=r[1] + elif r[0] == 'logfmt': + logfmt=r[1] + elif r[0] == 'reportstrict': + reportstrict=r[1] + f.close() + +if len(args) != 0: + print "error: args" + sys.exit(1) + + +logf=initlog(logfile) now=time.time() -startsec=int(now) % INTERVAL +startsec=int(now) % interval htab={} if visual: @@ -401,12 +504,30 @@ if DEBUG: log("Starting") ilist=[] sock=socket.socket(socket.AF_INET, socket.SOCK_DGRAM) -sock.bind(("",PORT)) +sock.bind(("",hb_port)) ilist.append(sock) -serv=SocketServer.TCPServer((THOST,TPORT),HtmlHandler) +serv=SocketServer.TCPServer((hbd_host,hbd_port),HtmlHandler) ilist.append(serv.fileno()) +if not forground: + pid=os.fork() + if pid > 0: + if verbose: + print "daemoinizing... pid=%d" % pid + sys.exit(0) + + verbose=False + os.close(0) + os.close(1) + os.close(2) + sys.stdin.close() + sys.stdout = NullDevice() + sys.stderr = NullDevice() + os.chdir("/") + os.setsid() + os.umask(0) + next=int(now)+1 sleep=next - now while 1: @@ -441,7 +562,7 @@ while 1: serv.handle_request() if now >= next: next=now+1 - if int(now) % INTERVAL == startsec: + if int(now) % interval == startsec: updatestats() checkoverdue() if visual: