#!/usr/bin/env python # $Id: hbd,v 1.38 2013/07/14 02:25:05 andreas Exp $ # Wait for heartbeat messages and act on them (or their absence) # VER = 1.51 import time, os, string, sys, socket, atexit, select, SocketServer, getopt, signal, cPickle, smtplib, traceback, urllib from subprocess import Popen, STDOUT, PIPE False = 0 True = 1 LOGFILE = "/home/andreas/public_html/messages/andreas" PICKFILE = "/var/tmp/hbd.pick" AEMAIL = ["andreas@wrede.ca"] NAME = "heatbeat" SMTPSERVER = "localhost" hosts = {} htab = {} msgs = [] num = 0 upcount = 0 PORT = 50003 TPORT = 50004 THOST = "" DEBUG = False verbose = False INTERVAL = 10 GRACE = 2 visual = 0 os.environ['TZ'] = 'EST5EDT' stdscr = None win = None msgw = None msgwB = None msgwHeight = 10 def handler(signum, frame): global running, sig sig = signum if not running: if verbose: print "NOT runing signal: %s running: %d" % (sig, running) return # signal.signal(sig, handler) if verbose: print "signal: %s running: %s frame: %s" % (sig, running, frame) running = False # sys.exit(0) def shortname(name): r = string.split(name, '.') return r[0] class NullDevice: def write(self, s): pass class Host: up = "up" down = "down" overdue = "overdue" def __init__(self, name, addr): global num self.name = shortname(name) self.addr = addr self.num = num self.lastbeat = time.time() self.upcount = 0 self.state = Host.up self.state = "up" self.statetime = self.lastbeat self.interval = 0 self.doesack = -1 self.cmds = [] num += 1 # called when reloading class from pickle def fixup(self): try: a=self.cmds except: self.cmds=[] def getstate(self): return self.state def dispstate(self): if self.state in ["down", "overdue"]: state = "%s" % self.state else: state = "%s" % self.state if self.doesack != -1: return "%s(%s)" % (state, self.doesack) return state # set new state, return number of secs in previous state def newstate(self, state, when=0): self.state = state now = time.time()-when s = now-self.statetime self.statetime = now if visual: displaystatetime(self.name) return s def email(s, msg): ret = "OK" toaddrs = AEMAIL fromaddr = "aew.heartbeat@wrede.ca" subj = "Info from %s: %s" % (NAME, s) date = time.strftime("%a, %d %b %Y %H:%M:%S %z", time.localtime()) body = "To: %s\nFrom: %s\nSubject: %s\nDate: %s\n\n%s" % (toaddrs[0], fromaddr, subj, date, msg) try: server = smtplib.SMTP(SMTPSERVER) if DEBUG: server.set_debuglevel(1) server.sendmail(fromaddr, toaddrs, body) except smtplib.SMTPRecipientsRefused, errs: log("cannot send email: %s\n" % (errs)) ret = "Fail" except: print("smtp error: "+traceback.format_exc()) saveandrestart() try: server.quit() except: pass return ret # # nsupdate: set the DNS A record for a fqdn # return: None if ok, else error text def nsupdate(hostname, newip): D = {} D['domain'] = 'dy.wapanafa.org' D['fqdn'] = '%s.dy.wapanafa.org' % hostname D['dnsttl'] = '5' D['newip'] = newip D['ts'] = time.strftime('%Y-%m-%d.%H:%M:%S', time.gmtime()) nsup = """update delete %(fqdn)s A update add %(fqdn)s %(dnsttl)s A %(newip)s update delete %(fqdn)s TXT update add %(fqdn)s %(dnsttl)s TXT "Created: %(ts)s" send answer """ % D # log("DBG: nsup %s" % nsup) cmd = ["/usr/bin/nsupdate", "-k", "/etc/dhcpc/K%(domain)s.+157+00000." % D, "-v"] # log("DBG: cmd %s" % cmd) try: p = Popen(cmd, shell=False, bufsize=1, stdin=PIPE, stdout=PIPE, stderr=STDOUT) except OSError, e: return "nsupdate: execution failed: %s" % e except: return "nsupdate: some error occured" (output, err) = p.communicate(nsup) if output.find('status: NOERROR') >= 0: return None return output # # def dur(sec): sec = int(sec) h = sec / 3600 m = (sec - h * 3600) / 60 s = (sec - h * 3600) % 60 if h > 0: return "%d:%02d:%02d" % (h, m, s) if m > 0: return "%d:%02d" % (m, s) return "0:%02d" % s # # def addhost(name, addr): sname = shortname(name) if sname in hosts: # was: hosts.has_key(sname): del htab[hosts[sname].addr] hosts[sname].addr = addr if visual: displayaddr(sname) htab[addr] = sname m = "%s, changed address to %s" % (sname, addr) log(m) else: hosts[sname] = Host(sname, addr) s = hosts.keys() s.sort() x = 0 for n in s: hosts[n].num = x x += 1 htab[addr] = sname if visual: display() # def on_exit(): if visual: exitcurses() if DEBUG: sys.stderr.write("on_exit\n") logf.close() print "exit" def initlog(logfile): return open(logfile, "a") # # def initwin(): global win, msgw, msgwB, msgwHeight maxY, maxX = stdscr.getmaxyx() begin_x = 0 begin_y = 2 height = len(htab)+2 if DEBUG: log("initwin called with %d" % height) win = curses.newwin(height, maxX, begin_y, begin_x) a = win.border(0, 0, 0, 0, 0, 0, curses.ACS_LTEE, curses.ACS_RTEE) msgwB = curses.newwin(0, 0, height+1, begin_x) msgwB.border(0, 0, 0, 0, curses.ACS_LTEE, curses.ACS_RTEE) msgwHeight = maxY-height-3 msgw = curses.newwin(msgwHeight, maxX-2, height+2, begin_x+1) msgw.setscrreg(0, msgwHeight-1) msgw.scrollok(1) stdscr.addstr(0, 0, "hbd Version %s" % VER, curses.A_BOLD) stdscr.refresh() msgwB.refresh() # def checkoverdue(): for h in hosts.keys(): if hosts[h].state == Host.down: continue timeout = hosts[h].interval+grace if hosts[h].state == Host.up and now-hosts[h].lastbeat > timeout: m = "%s is overdue" % h if h in watchhosts: email("overdue", m) hosts[h].newstate(Host.overdue, grace) log(m) # # def displaytime(): maxY, maxX = stdscr.getmaxyx() stdscr.addstr(0, maxX-8, time.strftime("%H:%M:%S", time.localtime(now)), curses.A_BOLD) for h in hosts.keys(): d = hosts[h].getstate() attr = 0 if verbose and hosts[h].state != Host.down: d = dur(now-hosts[h].lastbeat) if hosts[h].state == Host.overdue: attr = curses.A_BOLD win.addstr(hosts[h].num+1, 25, "%8s" % d, attr) win.refresh() stdscr.refresh() # # def displaystatetime(h, refresh=1): win.addstr(hosts[h].num+1, 60, "%-17s" % time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(hosts[h].statetime))) if refresh: win.refresh() # # def displayaddr(h, refresh=1): win.addstr(hosts[h].num+1, 35, "%-16s" % hosts[h].addr) if refresh: win.refresh() # # def displaybody(): for h in hosts.keys(): win.addstr(hosts[h].num+1, 1, "%-25s" % (h)) if hosts[h].addr is not None: displayaddr(h, 0) if hosts[h].statetime is not None: displaystatetime(h, 0) win.refresh() # # def displaymsgs(): global msgw, msgs y = 0 for m in msgs[len(msgs)-msgwHeight:]: msgw.addstr(y, 0, m) y += 1 msgw.refresh() # # def display(): if visual: initwin() displaytime() displaybody() displaymsgs() def log(m, service="heartbeat"): if DEBUG: print "Log: %s" % m msg = time.strftime("%b %d %H:%M:%S", time.localtime(time.time()))+": "+m+"\n" msgs.append(msg) if logfmt == "msg": m2 = "%d|%s|%s\n" % (now, service, m) else: m2 = msg logf.write(m2) logf.flush() if msgw is not None: y, x = msgw.getyx() # if y > = msgwHeight-1: # msgw.scroll() msgw.addstr(msg) msgw.clrtoeol() msgw.refresh() pickleit() # # def fromaddr(name, addr, boot, interval, acks): global htab if not name in hosts: # was: hosts.has_key(name): addhost(name, addr) host = hosts[name] host.doesack = acks if host.addr != addr: if host.addr in htab: # was: htab.has_key(host.addr): del htab[host.addr] host.addr = addr htab[addr] = name m = "%s changed address to %s" % (host.name, addr) if name in dyndnshosts: err = nsupdate(name, addr) if err: m += ", DNS failed: %s" % err else: m += ", DNS updated." log(m) if name in watchhosts: email("address change", m) host.lastbeat = now if host.getstate() != Host.up and interval > 0: lasts = host.state d = host.newstate(Host.up) m = "%s, back after being %s for %s" % (host.name, lasts, dur(d)) log(m) if name in watchhosts: email("back", name) host.upcount += 1 # # def readsock(sock): global htab, win data, addr = sock.recvfrom(1024) pairs = string.split(data, ';') boot = 0 shutdown = 0 name = "unknown" service = "unknown" msg = None interval = 0 deltaT = 0.0 acks = -1 for pair in pairs: l = string.split(pair, "=") key = l[0] if len(l) != 2: val = "0" else: val = l[1] if key == 'boot': boot += 1 elif key == 'shutdown': shutdown += 1 elif key == 'interval': interval = int(val) elif key == 'name': name = shortname(val) elif key == 'msg': msg = val elif key == 'service': service = val elif key == 'time': try: deltaT = now-float(val) except: pass elif key == 'acks': try: acks = int(val) except: acks = -1 if boot: if acks == -1: a = "(%s)" % acks else: a = "" m = "%s booted, deltaT %0.2g sec %s" % (name, deltaT, a) log(m) if name in watchhosts: email("booted", m) if msg: m = "%s msg: %s" % (name, msg) log(m, service=service) if name in watchhosts: email("msg", m) fromaddr(name, addr[0], boot, interval, acks) if shutdown: m = "%s shutdown" % name log(m) if name in watchhosts: email("shutdown", m) try: hosts[name].newstate(Host.down) except: pass if interval > 0: try: hosts[name].interval = interval except: pass rmsg="ACK" if len(hosts[name].cmds): rmsg=hosts[name].cmds[0] email("%s cmd exec" % name, "command '%s' initiated" % hosts[name].cmds[0]) del hosts[name].cmds[0] log("%s command initiated" % name) try: ss=sock.sendto(rmsg, addr) if DEBUG: log("msg from %s,%s, sent %s bytes back" % (addr[0], addr[1], ss)) except: pass # # # def initcurses(): global stdscr stdscr = curses.initscr() curses.noecho() curses.cbreak() stdscr.keypad(1) if DEBUG: sys.stderr.write("curses init done: %s\n" % stdscr) def exitcurses(): curses.nocbreak() stdscr.keypad(0) curses.echo() curses.endwin() class HtmlServer(SocketServer.TCPServer): allow_reuse_address = True # # class HtmlHandler(SocketServer.BaseRequestHandler): def buildhead(self, title="Heartbeat", refresh=None): res=[] res.append('') res.append("") res.append("") res.append('%s' % (title)) if refresh: res.append("\n" % refresh) res.append("") res.append('') return res def buildpage(self): res=self.buildhead(refresh=60) res.append("

Heartbeat status

%s (%s)

" % (time.strftime("%H:%M:%S", time.localtime(now)), os.environ.get('TZ', 'CET-1CDT'))) res.append("") res.append("\n") hosts_sorted = hosts.keys() hosts_sorted.sort() for h in hosts_sorted: res.append("\n" % (h, hosts[h].dispstate(), hosts[h].addr, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(hosts[h].statetime)))) res.append("
HostStateIP AddrLast change
%-24s%-7s%-16s%-17s
") res.append("

Log of Events

") for m in msgs[len(msgs)-30:]: res.append("%s
" % m) res.append("") return res def handle(self): global sig, running headers=[] headers.append("Date: %s" % time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime(now))) headers.append("Server: hbd") headers.append("Last-Modified: %s" % time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime(now))) headers.append("Accept-Ranges: bytes") headers.append("Connection: close") headers.append("Content-Type: text/html; charset = ISO-8859-1") uri = '/unknown' f = self.request.makefile() while 1: line = string.strip(f.readline()) if len(line) == 0: break r = line.split() if r[0] == "GET": uri = r[1] html = r[2] upar=string.split(uri,"?") if len(upar) == 1: uarg=[] else: uarg=string.split(upar[1],"&") code = 200 cause = "OK" if uri == "/": res=self.buildpage() elif upar[0] == "/c": # command on host /c?h=melschserver&c=sudo%20ls uname="" ucmd="" if uarg[0][:2] == "h=": uname=uarg[0][2:] if uarg[1][:2] == "c=": ucmd=uarg[1][2:] if ucmd != "" and uname != "" and hosts.has_key(uname): hosts[uname].cmds.append(urllib.unquote(ucmd)) res=self.buildhead() res.append("2Done") elif upar[0] == "/d": # drop host /d?h=melschserver if uarg[0][:2] == "h=": uname=uarg[0][2:] if uname != "" and hosts.has_key(uname): del hosts[uname] log("%s dropped" % uname) res=self.buildhead() res.append("Done") elif upar[0] == "/r": # restart res=self.buildhead() res.append("restart request") sig=signal.SIGHUP running=False log("restart request") else: code = 404 cause = "Not Found" res=[] res.append('') res.append('') res.append('%s %s' % (code, cause)) res.append('') res.append('

%s

' % (cause)) res.append('

The requested URL %s was not found on this server.

' % uri) res.append('
') res.append('
hbd (Unix) Server at %s Port %s
' % (hbd_host, hbd_port)) res.append('') self.request.send("HTTP/1.0 %s %s\r\n" % (code, cause)) for h in headers: self.request.send("%s\r\n" % h) self.request.send("\r\n") try: self.request.send(string.join(res, "\n")) except: pass def saveandrestart(): sock.close() # serv.shutdown() #N.B. dont shutdown() as we don't use serv_forever serv.server_close() log("restarting") os.execv(sys.argv[0], [sys.argv[0]]+cmdargs) def pickleit(): pickf = open(PICKFILE, 'w') pick = cPickle.Pickler(pickf) pick.dump(hosts) pick.dump(htab) pick.dump(msgs) pickf.close() # # Main # helpflag = False forground = False optlist = [] args = [] home = os.environ['HOME'] cmdargs = [] configfile = "%s/.hbrc" % home try: optlist, args = getopt.getopt(sys.argv[1:], 'c:dfh:vx') except: helpflag = True for o, a in optlist: if o == '-c': configfile = a cmdargs += [o, a] if o == '-d': visual = True cmdargs += [o] elif o == '-f': forground = True cmdargs += [o] elif o == '-h': helpflag = True elif o == '-v': verbose = True cmdargs += [o] elif o == '-x': DEBUG = True if helpflag: print "hbc HeartBeatDaemon" print "usage: hbd [-dfhvx] [-c configfile]" print print " -c configfile" print " -d display" print " -f run in foreground" print " -h this help" print " -v verbose" print " -x debug" print print """ config file can contain logfile = /var/log/heartbeat.log logfmt = [text|msg] hb_port = 50003 interval = 20 hbd_port = 50004 hbd_host = www.domain.com grace = 2 """ sys.exit(1) if visual: forground = True # # set defaults hb_port = PORT hbd_host = THOST hbd_port = TPORT logfile = LOGFILE logfmt = "text" interval = INTERVAL grace = GRACE watchhosts = [] dyndnshosts = [] drophosts = [] try: f = open(configfile, "r") if verbose: print "notice: using config file %s" % configfile except: print "warning: running without conifig file: %s" % configfile f = None if f: while 1: l = f.readline() if len(l) == 0: break if verbose: print " %s" % l[:-1] r = l[:-1].split('=') if r[0] == 'interval': interval = eval(r[1]) elif r[0] == 'grace': grace = eval(r[1]) elif r[0] == 'hbd_port': hbd_port = eval(r[1]) elif r[0] == 'hbd_host': hbd_host = r[1] elif r[0] == 'hb_port': hb_port = eval(r[1]) elif r[0] == 'logfile': logfile = r[1] elif r[0] == 'logfmt': logfmt = r[1] elif r[0] == 'watchhosts': watchhosts = eval(r[1]) elif r[0] == 'dyndnshosts': dyndnshosts = eval(r[1]) elif r[0] == 'drophosts': drophosts = eval(r[1]) f.close() if len(args) != 0: print "error: args" sys.exit(1) if verbose: print "notice: logging to %s" % logfile logf = initlog(logfile) if os.path.exists(PICKFILE): pickf = open(PICKFILE, 'r') pick = cPickle.Unpickler(pickf) try: hosts = pick.load() htab = pick.load() msgs = pick.load() pickf.close() except: os.unlink(PICKFILE) for h in hosts.keys(): hosts[h].fixup() for h in drophosts: if h in hosts: # was: hosts.has_key(h): del hosts[h] now = time.time() startsec = int(now) % interval if visual: import curses initcurses() display() stdscr.nodelay(1) log("Starting %s" % VER) atexit.register(on_exit) ilist = [] sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) sock.bind(("", hb_port)) sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) ilist.append(sock) sock6 = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM) sock6.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) sock6.bind(("", hb_port)) ilist.append(sock6) serv = HtmlServer((hbd_host, hbd_port), HtmlHandler) ilist.append(serv.fileno()) if not forground: pid = os.fork() if pid > 0: if verbose: print "daemoinizing... pid = %d" % pid sys.exit(0) verbose = False os.close(0) os.close(1) os.close(2) sys.stdin.close() sys.stdout = NullDevice() sys.stderr = NullDevice() os.chdir("/") os.setsid() os.umask(0) running = True sig = 0 signal.signal(signal.SIGTERM, handler) signal.signal(signal.SIGHUP, handler) next = int(now)+15 # 15 seconds time to settle after (re-)start sleep = next - now while running: if visual: c = stdscr.getch() if c == ord('c'): msgs = [] display() elif c == ord('q'): break # Exit the while() elif c == ord('d'): DEBUG = not DEBUG elif c == ord('v'): verbose = not verbose # elif c == ord('p'): # PrintDocument() # elif c == ord('x'): # x = y = 0 if DEBUG: sys.stderr.write("about to sleep = %s\n" % (sleep)) try: sr = select.select(ilist, [], [], sleep) now = time.time() except KeyboardInterrupt: sys.exit(0) except select.error, value: if value[0] != 4: # interrupted system call print select.error, value #raise os.error, value continue if visual: exitcurses() initcurses() display() continue except: sys.exit(1) for fh in sr[0]: if fh in [sock, sock6]: readsock(fh) elif fh == serv.fileno(): serv.handle_request() else: print("what happend just now") if now >= next: next = now+1 checkoverdue() if visual: stdscr.move(1, 0) stdscr.clrtoeol() displaytime() sleep = next-now if sleep < 0: sys.stderr.write("sleep is negaitive! %s next = %s\n" % (sleep, next)) sleep = 0 if DEBUG: sys.stderr.write("sleep = %s next = %s\n" % (sleep, next)) if sig == signal.SIGHUP: if DEBUG: sys.stderr.write("signal 1 exit\n") saveandrestart()