#!/usr/bin/env python
# $Id: hbd,v 1.38 2013/07/14 02:25:05 andreas Exp $
# Wait for heartbeat messages and act on them (or their absence)
#
# NOTE: Python 2 code (SocketServer, BaseHTTPServer, cPickle, httplib, Queue).
VER = 2.00

import time
import os
import string
import sys
import socket
import atexit
import select
import SocketServer
import BaseHTTPServer
import getopt
import signal
import cPickle
import smtplib
import traceback
import urllib
import httplib
import threading
import Queue
import cgi
from subprocess import Popen, STDOUT, PIPE

SEND_EMAIL = False
SEND_PUSHOVER = True
DEBUG = 0
LOGFILE = "/home/andreas/public_html/messages/andreas"
PICKFILE = "/var/tmp/hbd.pick"
AEMAIL = ["andreas@wrede.ca"]
NAME = "heatbeat"
SMTPSERVER = "localhost"
# Upper bound on the in-memory event list; the status page shows the last 30
# entries and the whole list is pickled on every log() call.
MAX_MSGS = 1000

hosts = {}   # short host name -> Host
htab = {}    # address -> short host name
msgs = []    # formatted log lines, oldest first
num = 0  # AEW
upcount = 0
PORT = 50003
TPORT = 50004
THOST = ""
verbose = False
INTERVAL = 10
GRACE = 2
os.environ['TZ'] = 'EST5EDT'
# strftime formats whose change marks a new hour / day / week, and the values
# last seen for each of them.
tsfm = ["%H", "%d", "%U"]
lastfm = ["", "", ""]


def handler(signum, frame):
    """Signal handler: remember the signal number in the global ``sig``.

    Exits with status 2 when the main loop is not running.
    """
    global running, sig
    sig = signum
    if not running:
        if verbose:
            sys.stderr.write("NOT runing signal: %s running: %d" % (sig, running))
        sys.exit(2)
    if verbose:
        sys.stderr.write("signal: %s running: %s frame: %s" % (sig, running, frame))


def shortname(name):
    """Return the part of *name* before the first dot."""
    return name.split('.')[0]


class NullDevice:
    """File-like sink that discards everything written to it."""

    def write(self, s):
        pass


class Addr:
    """Pair of an IPv4 and an IPv6 address."""

    def __init__(self, a4, a6):
        self.a4 = a4
        self.a6 = a6


class Host:
    """State kept for one monitored host."""

    up = "up"
    down = "down"
    overdue = "overdue"

    def __init__(self, name, addr):
        global num
        self.name = shortname(name)
        self.addr = addr
        self.num = num
        self.lastbeat = time.time()
        self.upcount = 0
        self.state = Host.up
        self.statetime = self.lastbeat
        self.interval = 0
        self.doesack = -1
        self.cmds = []
        # [doesack, upcount] snapshots taken at the last hour/day/week change
        self.hdwcounts = [[0, 0], [0, 0], [0, 0]]
        num += 1

    # called when reloading class from pickle, add new fields here
    def fixup(self):
        """Add attributes that instances from older pickles may lack."""
        if not hasattr(self, "cmds"):
            self.cmds = []
        if not hasattr(self, "hdwcounts"):
            self.hdwcounts = [[self.doesack, self.upcount] for _ in range(3)]

    def getstate(self):
        """Return the current state string."""
        return self.state

    def dispstate(self):
        """Return the state as shown on the status page."""
        # NOTE(review): both branches of the original (down/overdue vs. other)
        # produced the same text; any distinguishing markup is not visible.
        return "%s" % self.state

    def dispstats(self):
        """Return the Hr/Dy/Wk table cells for the status page.

        Each cell holds 100 minus the percentage (self.doesack - a) /
        (self.upcount - u) relative to the snapshot [a, u] for that period:
        blank when that rounds to 0, "-" when upcount has not changed.
        """
        # NOTE(review): the <td> markup below is reconstructed; verify it
        # against the page layout actually deployed.
        if self.doesack == -1:
            return '<td colspan="3">N/A</td>'
        if self.upcount <= 0:
            return '<td colspan="3">(%s)</td>' % (self.doesack)
        # return "(%0.1f%%) %s %s %s " % ((self.doesack * 100.0) / self.upcount, self.doesack, self.upcount, self.hdwcounts)
        cells = []
        for a, u in self.hdwcounts:
            beats = self.upcount - u
            if beats != 0:
                vs = "%0.0f" % (100.0 - (((self.doesack - a) * 100.0) / beats))
                if vs == "0":
                    vs = ""
            else:
                vs = "-"
            cells.append('<td>%s</td>' % vs)
        return "".join(cells)

    # set new state, return number of secs in previous state
    def newstate(self, state, when=0):
        """Switch to *state* as of *when* seconds ago; return seconds spent in the old state."""
        self.state = state
        now = time.time() - when
        s = now - self.statetime
        self.statetime = now
        return s


def htmlesc(text):
    """Escape *text* for inclusion in an HTML page."""
    return cgi.escape(str(text), True)


def email(s, msg):
    """Mail *msg* with subject suffix *s* to AEMAIL.

    Returns None when SEND_EMAIL is off, otherwise "OK" or "Fail".
    """
    if not SEND_EMAIL:
        return
    ret = "OK"
    toaddrs = AEMAIL
    sender = "aew.heartbeat@wrede.ca"
    subj = "Info from %s: %s" % (NAME, s)
    date = time.strftime("%a, %d %b %Y %H:%M:%S %z", time.localtime())
    body = "To: %s\nFrom: %s\nSubject: %s\nDate: %s\n\n%s" % (toaddrs[0], sender, subj, date, msg)
    server = None
    try:
        server = smtplib.SMTP(SMTPSERVER)
        if DEBUG > 0:
            server.set_debuglevel(1)
        server.sendmail(sender, toaddrs, body)
    except smtplib.SMTPRecipientsRefused as errs:
        log("cannot send email: %s\n" % (errs))
        ret = "Fail"
    except Exception:
        # any other SMTP problem restarts the daemon (original behaviour)
        print("smtp error: " + traceback.format_exc())
        saveandrestart()
    if server is not None:
        try:
            server.quit()
        except Exception:
            pass
    return ret


def pushover(msg):
    """Send *msg* through the Pushover HTTP API; failures are ignored."""
    if not SEND_PUSHOVER:
        return
    # NOTE(review): API token and user key are embedded in the source;
    # consider moving them into the config file.
    conn = httplib.HTTPSConnection("api.pushover.net:443")
    try:
        conn.request("POST", "/1/messages.json",
                     urllib.urlencode({
                         "token": "ac7NLX2rPjXFareeDgLpXNoDf4iFmf",
                         "user": "uDhH33UjQQDYtNzJb1ThRiWb9ingGK",
                         "message": msg,
                     }),
                     {"Content-type": "application/x-www-form-urlencoded"})
        conn.getresponse()
    except Exception:
        pass  # notification is best effort
    finally:
        conn.close()


# nsupdate: set the DNS A record for a fqdn
# return: None if ok, else error text
def nsupdate(hostname, newip):
    """Replace the A (or AAAA) and TXT records of *hostname* via nsupdate."""
    D = {}
    D['domain'] = 'dy.wapanafa.org'
    D['fqdn'] = '%s.dy.wapanafa.org' % hostname
    D['dnsttl'] = '5'
    D['newip'] = newip
    D['ts'] = time.strftime('%Y-%m-%d.%H:%M:%S', time.gmtime())
    # any colon means IPv6 (the original find() > 0 missed a leading "::")
    if ":" in newip:
        D['rr'] = 'AAAA'
    else:
        D['rr'] = 'A'
    nsup = """update delete %(fqdn)s %(rr)s
update add %(fqdn)s %(dnsttl)s %(rr)s %(newip)s
update delete %(fqdn)s TXT
update add %(fqdn)s %(dnsttl)s TXT "Created: %(ts)s"
send
answer
""" % D
    if DEBUG > 0:
        log("DBG: nsup %s" % nsup)
    cmd = ["/usr/local/bin/nsupdate", "-k", "/etc/dhcpc/K%(domain)s.+157+00000." % D, "-v"]
    if DEBUG > 0:
        log("DBG: cmd %s" % cmd)
    try:
        p = Popen(cmd, shell=False, bufsize=1, stdin=PIPE, stdout=PIPE, stderr=STDOUT)
    except OSError as e:
        return "nsupdate: execution failed: %s" % e
    except Exception:
        return "nsupdate: some error occured"
    (output, err) = p.communicate(nsup)
    if output.find('status: NOERROR') >= 0:
        return None
    return output


#
def dur(sec):
    """Format a duration in seconds as H:MM:SS, M:SS or 0:SS."""
    sec = int(sec)
    h = sec // 3600
    m = (sec - h * 3600) // 60
    s = (sec - h * 3600) % 60
    if h > 0:
        return "%d:%02d:%02d" % (h, m, s)
    if m > 0:
        return "%d:%02d" % (m, s)
    return "0:%02d" % s


#
def addhost(name, addr):
    """Register host *name* at *addr*, or move a known host to *addr*."""
    sname = shortname(name)
    if sname in hosts:
        htab.pop(hosts[sname].addr, None)
        hosts[sname].addr = addr
        htab[addr] = sname
        log("%s, changed address to %s" % (sname, addr))
    else:
        hosts[sname] = Host(sname, addr)
        # renumber all hosts in name order
        for x, n in enumerate(sorted(hosts.keys())):
            hosts[n].num = x
        htab[addr] = sname


#
def on_exit():
    """atexit hook: close the log file and announce the exit."""
    if DEBUG > 0:
        sys.stderr.write("on_exit\n")
    try:
        logf.close()
    except Exception:
        pass
    print("exit")


def initlog(logfile):
    """Open *logfile* for appending and return the file object."""
    return open(logfile, "a")


#
#
def checkoverdue():
    """Mark every "up" host whose last beat is older than interval+grace as overdue."""
    for h in list(hosts.keys()):
        host = hosts.get(h)
        if host is None or host.state == Host.down:
            continue
        timeout = host.interval + grace
        if host.state == Host.up and now - host.lastbeat > timeout:
            m = "%s is overdue" % h
            if h in watchhosts:
                email("overdue", m)
                pushover(m)
            host.newstate(Host.overdue, grace)
            log(m)


def log(m, service="heartbeat"):
    """Record event *m*: keep it in ``msgs``, append it to the log file, save state."""
    if DEBUG > 0:
        print("Log: %s" % m)
    msg = time.strftime("%b %d %H:%M:%S", time.localtime(time.time())) + ": " + m + "\n"
    msgs.append(msg)
    # trim in place so other references to the list stay valid
    if len(msgs) > MAX_MSGS:
        del msgs[:-MAX_MSGS]
    if logfmt == "msg":
        m2 = "%d|%s|%s\n" % (now, service, m)
    else:
        m2 = msg
    logf.write(m2)
    logf.flush()
    pickleit()


def dnsupdatethread():
    """Worker loop: take (name, addr) pairs from dnsQ and update DNS for them."""
    while True:
        name, addr = dnsQ.get()
        m = "%s changed address to %s" % (name, addr)
        err = nsupdate(name, addr)
        if err:
            m += ", DNS failed: %s" % err
            email("error: nsupdate failed", m)
        else:
            m += ", DNS updated."
        dnsQ.task_done()
        log(m)


#
#
def fromaddr(name, addr, boot, interval, acks):
    """Account for one heartbeat from host *name* received from *addr*."""
    global htab
    newh = False
    if name not in hosts:
        addhost(name, addr)
        newh = True
    host = hosts[name]
    host.doesack = acks
    if host.addr != addr:
        htab.pop(host.addr, None)
        host.addr = addr
        htab[addr] = name
        m = "%s changed address to %s" % (host.name, addr)
        if name in dyndnshosts and ":" not in addr:  # don't try and cat ptr to IPv6 addr
            dnsQ.put((name, addr))  # the DNS thread logs the outcome
        else:
            log(m)
        if name in watchhosts:
            email("address change", m)
            pushover(m)
    host.lastbeat = now
    if host.getstate() != Host.up and interval > 0:
        lasts = host.state
        d = host.newstate(Host.up)
        log("%s, back after being %s for %s" % (host.name, lasts, dur(d)))
        if name in watchhosts:
            email("back", name)
            pushover("%s is back" % name)
    if boot or newh:
        host.upcount = host.doesack
    else:
        host.upcount += 1


#
#
def readsock(sock):
    """Read one heartbeat datagram from *sock*, process it and send a reply.

    The datagram is a ';'-separated list of key=value pairs (boot, shutdown,
    interval, name, msg, service, time, acks).  The reply is "ACK" or the
    next command queued for the host.
    """
    global htab
    if DEBUG > 3:
        sys.stderr.write("readsock recfrom start")
    try:
        data, addr = sock.recvfrom(1024)
    except socket.error as e:
        if DEBUG > 0:
            sys.stderr.write("readsock recvfrom failed: %s\n" % e)
        return
    if DEBUG > 2:
        sys.stderr.write("readsock = %s, %s\n" % (data, addr))
    boot = 0
    shutdown = 0
    name = "unknown"
    service = "unknown"
    msg = None
    interval = 0
    deltaT = 0.0
    acks = -1
    for pair in data.split(';'):
        l = pair.split("=")
        key = l[0]
        if len(l) != 2:
            val = "0"
        else:
            val = l[1]
        if key == 'boot':
            boot += 1
        elif key == 'shutdown':
            shutdown += 1
        elif key == 'interval':
            # a malformed value must not take the daemon down
            try:
                interval = int(val)
            except ValueError:
                pass
        elif key == 'name':
            name = shortname(val)
        elif key == 'msg':
            msg = val
        elif key == 'service':
            service = val
        elif key == 'time':
            try:
                deltaT = now - float(val)
            except ValueError:
                pass
        elif key == 'acks':
            try:
                acks = int(val)
            except ValueError:
                acks = -1
    if boot:
        # NOTE(review): this prints "(-1)" only when acks is unknown; the
        # test looks inverted - confirm the intent before changing it.
        if acks == -1:
            a = "(%s)" % acks
        else:
            a = ""
        m = "%s booted, deltaT %0.2g sec %s" % (name, deltaT, a)
        log(m)
        if name in watchhosts:
            email("booted", m)
            pushover(m)
    if msg:
        m = "%s msg: %s" % (name, msg)
        log(m, service=service)
        if name in watchhosts:
            email("msg", m)
            pushover(m)
    fromaddr(name, addr[0], boot, interval, acks)
    # the HTTP thread may drop the host at any time, hence the lookup
    host = hosts.get(name)
    if shutdown:
        m = "%s shutdown" % name
        log(m)
        if name in watchhosts:
            email("shutdown", m)
            pushover(m)
        if host is not None:
            host.newstate(Host.down)
    if interval > 0 and host is not None:
        host.interval = interval
    rmsg = "ACK"
    if host is not None and len(host.cmds):
        rmsg = host.cmds[0]
        msg = "command '%s' initiated" % host.cmds[0]
        email("%s cmd exec" % name, msg)
        pushover(msg)
        del host.cmds[0]
        log("%s command initiated" % name)
    try:
        ss = sock.sendto(rmsg, addr)
        if DEBUG > 2:
            print("msg from %s,%s, sent %s bytes back" % (addr[0], addr[1], ss))
    except Exception:
        pass  # the reply is best effort


#
#
#class HttpServer(BaseHTTPServer.HTTPServer):
class HttpServer(SocketServer.ThreadingMixIn, BaseHTTPServer.HTTPServer):
    """HTTP server that handles each request in its own thread."""

    allow_reuse_address = True

    def threaded():
        pass


#
#
class HttpHandler(BaseHTTPServer.BaseHTTPRequestHandler):
    """Serves the status page and the /c, /d, /n and /r control URLs.

    NOTE(review): requests are not authenticated; /c queues an arbitrary
    command string for a remote host.
    NOTE(review): the HTML tags in this class are reconstructed around the
    visible text; verify them against the deployed page.
    """

    def _send_headers(self, code, cause=None):
        """Send the status line and the headers common to all responses."""
        self.send_response(code, cause)
        self.send_header("Last-Modified", time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime(now)))
        # self.send_header("Accept-Ranges","bytes")
        # self.send_header("Connection","close")
        self.send_header("Content-Type", "text/html; charset = ISO-8859-1")
        self.end_headers()

    def do_HEAD(self):
        """Answer HEAD with 200 and the standard headers."""
        self._send_headers(200)

    def _arg(self, arg, prefix):
        """Return the value of query argument *arg* if it starts with *prefix*, else ""."""
        if arg[:len(prefix)] == prefix:
            return arg[len(prefix):]
        return ""

    def buildhead(self, title="Heartbeat", refresh=None):
        """Return the opening lines of a page, optionally with a refresh interval in seconds."""
        res = []
        res.append('<!DOCTYPE html>')
        res.append("<html>")
        res.append("<head>")
        res.append('<title>%s</title>' % (title))
        if refresh:
            res.append("<meta http-equiv=\"refresh\" content=\"%s\">\n" % refresh)
        res.append("</head>")
        res.append('<body>')
        return res

    def buildpage(self):
        """Return the status page: one table row per host plus the last 30 log lines."""
        res = self.buildhead(refresh=60)
        res.append("<h1>Heartbeat status %s</h1><h2>%s (%s)</h2>" %
                   (VER, time.strftime("%H:%M:%S", time.localtime(now)), os.environ.get('TZ', 'CET-1CDT')))
        res.append("<table>")
        res.append("<tr><th>Host</th><th>State</th><th>Hr</th><th>Dy</th><th>Wk</th>"
                   "<th>IP Addr</th><th>Last change</th></tr>\n")
        for h in sorted(hosts.keys()):
            host = hosts.get(h)
            if host is None:   # dropped by another request meanwhile
                continue
            # host name and address come from the network, so escape them
            res.append("<tr><td>%-24s</td><td>%-7s</td>%s<td>%-16s</td><td>%-17s</td></tr>\n" %
                       (htmlesc(h), host.dispstate(), host.dispstats(), htmlesc(host.addr),
                        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(host.statetime))))
        res.append("</table>")
        res.append("<h2>Log of Events</h2>")
        for m in msgs[-30:]:
            res.append("%s<br>" % htmlesc(m))
        res.append("</body></html>")
        return res

    def builderror(self, code, cause, lcause):
        """Return an error page for status *code* with short and long explanations."""
        res = []
        res.append('<!DOCTYPE html>')
        res.append('<html><head>')
        res.append('<title>%s %s</title>' % (code, cause))
        res.append('</head><body>')
        res.append('<h1>%s</h1>' % (cause))
        res.append('<p>%s</p>' % lcause)
        res.append('<hr>')
        res.append('<address>hbd (Unix) Server at %s:%s</address>' % (hbd_host, hbd_port))
        res.append('</body></html>')
        return res

    def do_GET(self):
        """Dispatch on the request path and send the resulting page.

        /            status page
        /c?h=H&c=C   queue command C for host H
        /d?h=H       drop host H
        /n?h=H       run nsupdate for host H
        /r           request a restart (sets the global ``sig`` to SIGHUP)
        """
        global sig
        xsig = 0
        if DEBUG > 2:
            sys.stderr.write("handle\n")
        uri = self.path
        upar = uri.split("?")
        if len(upar) == 1:
            uarg = []
        else:
            uarg = upar[1].split("&")
        if DEBUG > 2:
            sys.stderr.write("handle = %s\n" % (uri))
        code = 200
        cause = "OK"
        if uri == "/":
            res = self.buildpage()
        elif upar[0] == "/c":
            # command on host /c?h=melschserver&c=sudo%20ls
            if len(uarg) != 2 or len(uarg[0]) < 3 or len(uarg[1]) < 3:
                code = 400
                cause = 'Argument error'
                res = self.builderror(code, cause, "need h= and c= arguments")
            else:
                uname = self._arg(uarg[0], "h=")
                ucmd = self._arg(uarg[1], "c=")
                if ucmd != "" and uname != "" and uname in hosts:
                    hosts[uname].cmds.append(urllib.unquote(ucmd))
                res = self.buildhead()
                res.append("2Done")
        elif upar[0] == "/d":
            # drop host /d?h=melschserver
            if len(uarg) != 1 or len(uarg[0]) < 3:
                code = 400
                cause = 'Argument error'
                res = self.builderror(code, cause, "need h= argument")
            else:
                uname = self._arg(uarg[0], "h=")
                if uname != "" and uname in hosts:
                    htab.pop(hosts[uname].addr, None)
                    del hosts[uname]
                    log("%s dropped" % uname)
                res = self.buildhead()
                res.append("Done")
        elif upar[0] == "/n":
            # register name
            if len(uarg) != 1 or len(uarg[0]) < 3:
                code = 400
                cause = 'Argument error'
                res = self.builderror(code, cause, "need h= argument")
            else:
                res = self.buildhead()
                uname = self._arg(uarg[0], "h=")
                if uname != "" and uname in hosts:
                    err = nsupdate(uname, hosts[uname].addr)
                    ll = "nsupdate request: %s" % err
                else:
                    ll = "name %s not found" % uname
                res.append(htmlesc(ll))
                log(ll)
        elif upar[0] == "/r":
            # restart
            res = self.buildhead()
            res.append("restart request")
            xsig = signal.SIGHUP
            log("restart request")
        else:
            code = 404
            cause = "Not Found"
            res = self.builderror(code, cause, "The requested URL was not found on this server.")
        # headers go out only now, so the status line carries the real code
        self._send_headers(code, cause)
        self.wfile.write("\n".join(res))
        if xsig:
            sig = xsig


def setrunning(new):
    """Set the global ``running`` flag."""
    global running
    if DEBUG > 0:
        sys.stderr.write("running is now = %s\n" % (new))
    running = new


def closeup():
    """Stop the main loop flag, close sockets, HTTP server and log file, reset SIGHUP."""
    setrunning(False)
    try:
        sock.close()
    except Exception:
        pass
    try:
        sock6.close()
    except Exception:
        pass
    if DEBUG > 0:
        sys.stderr.write("asking http server to stop\n")
    try:
        serv.shutdown()
        if DEBUG > 0:
            sys.stderr.write("http server stopped\n")
    except Exception as e:
        if DEBUG > 0:
            sys.stderr.write("http server did NOT stop: %s\n" % str(e))
    try:
        serv.server_close()
    except Exception:
        pass
    log("restarting")
    try:
        logf.close()
    except Exception:
        pass
    # signal.signal(signal.SIGTERM, 0)
    signal.signal(signal.SIGHUP, signal.SIG_DFL)


def restart():
    """Replace this process with a fresh copy of the program (same options)."""
    print("execv %s %s" % (sys.argv[0], [sys.argv[0]] + cmdargs))
    os.execv(sys.argv[0], [sys.argv[0]] + cmdargs)
    print("should not be here")


def saveandrestart():
    """Shut everything down cleanly, then re-exec the program."""
    closeup()
    restart()


def pickleit():
    """Save hosts, htab, msgs and lastfm to PICKFILE."""
    pickf = open(PICKFILE, 'w')
    try:
        pick = cPickle.Pickler(pickf)
        pick.dump(hosts)
        pick.dump(htab)
        pick.dump(msgs)
        pick.dump(lastfm)
    finally:
        pickf.close()


#
# Main
#
helpflag = False
forground = False
optlist = []
args = []
# HOME may be unset when started from an init script
home = os.environ.get('HOME') or os.path.expanduser('~')
cmdargs = []   # options passed on to the re-exec in restart()
configfile = "%s/.hbrc" % home
try:
    # -h is a plain flag; the original spec "h:" made it require an argument
    optlist, args = getopt.getopt(sys.argv[1:], 'c:dfhvx')
except getopt.GetoptError:
    helpflag = True
for o, a in optlist:
    if o == '-c':
        configfile = a
        cmdargs += [o, a]
    elif o == '-f':
        forground = True
        cmdargs += [o]
    elif o == '-h':
        helpflag = True
    elif o == '-v':
        verbose = True
        cmdargs += [o]
    elif o == '-x':
        DEBUG += 1
        cmdargs += [o]
    # -d is accepted but has no effect

if helpflag:
    print("hbc HeartBeatDaemon")
    print("usage: hbd [-dfhvx] [-c configfile]")
    print("")
    print(" -c configfile")
    print(" -d display")
    print(" -f run in foreground")
    print(" -h this help")
    print(" -v verbose")
    print(" -x increase debug lvl")
    print("")
    print("""
config file can contain
logfile = /var/log/heartbeat.log
logfmt = [text|msg]
hb_port = 50003
interval = 20
hbd_port = 50004
hbd_host = www.domain.com
grace = 2
""")
    sys.exit(1)

#
# set defaults
hb_port = PORT
hbd_host = THOST
hbd_port = TPORT
logfile = LOGFILE
logfmt = "text"
interval = INTERVAL
grace = GRACE
watchhosts = []
dyndnshosts = []
drophosts = []

# Read "key = value" lines from the config file; unknown keys are ignored.
try:
    f = open(configfile, "r")
except IOError:
    print("warning: running without config file: %s" % configfile)
    f = None
else:
    if verbose:
        print("notice: using config file %s" % configfile)

if f:
    try:
        for l in f:
            line = l.rstrip("\n")
            if verbose:
                print(" %s" % line)
            key, sep, val = line.partition('=')
            if not sep:
                continue
            # whitespace around '=' is allowed, as shown in the help text
            key = key.strip()
            val = val.strip()
            # NOTE(review): eval() runs arbitrary expressions from the config
            # file; acceptable only while that file is writable solely by the
            # operator.
            if key == 'interval':
                interval = eval(val)
            elif key == 'grace':
                grace = eval(val)
            elif key == 'hbd_port':
                hbd_port = eval(val)
            elif key == 'hbd_host':
                hbd_host = val
            elif key == 'hb_port':
                hb_port = eval(val)
            elif key == 'logfile':
                logfile = val
            elif key == 'logfmt':
                logfmt = val
            elif key == 'watchhosts':
                watchhosts = eval(val)
            elif key == 'dyndnshosts':
                dyndnshosts = eval(val)
            elif key == 'drophosts':
                drophosts = eval(val)
    finally:
        f.close()

if len(args) != 0:
    print("error: args")
    sys.exit(1)

if verbose:
    print("notice: logging to %s" % logfile)
logf = initlog(logfile)

# Restore state saved by a previous run.
# NOTE(review): unpickling executes code from the file, and PICKFILE is read
# from PICKFILE as configured above - confirm nobody else can write it.
if os.path.exists(PICKFILE):
    pickf = open(PICKFILE, 'r')
    try:
        pick = cPickle.Unpickler(pickf)
        try:
            saved_hosts = pick.load()
            saved_htab = pick.load()
            saved_msgs = pick.load()
            try:
                saved_lastfm = pick.load()
            except Exception:
                saved_lastfm = ["", "", ""]   # older files lack this record
        except Exception:
            # unreadable state file: discard it and start empty
            os.unlink(PICKFILE)
        else:
            # take over the saved state only when all of it loaded
            hosts = saved_hosts
            htab = saved_htab
            msgs = saved_msgs
            lastfm = saved_lastfm
    finally:
        pickf.close()

for h in list(hosts.keys()):
    hosts[h].fixup()

for h in drophosts:
    if h in hosts:
        del hosts[h]

now = time.time()
startsec = int(now) % interval
log("Starting %s" % VER)
atexit.register(on_exit)

# One UDP socket per address family for incoming heartbeats.
ilist = []
sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
# the option must be set before bind() to affect it
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
sock.bind(("", hb_port))
ilist.append(sock)
sock6 = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM)
sock6.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
sock6.bind(("", hb_port))
ilist.append(sock6)

# Create (and thereby bind) the HTTP server now so that errors are still
# reported on the terminal; its thread is started after daemonizing.
serv = HttpServer((hbd_host, hbd_port), HttpHandler)
#ilist.append(serv.fileno())

if not forground:
    pid = os.fork()
    if pid > 0:
        if verbose:
            print("daemoinizing... pid = %d" % pid)
        sys.exit(0)
    verbose = False
    os.close(0)
    os.close(1)
    os.close(2)
    sys.stdin.close()
    sys.stdout = NullDevice()
    sys.stderr = NullDevice()
    os.chdir("/")
    os.setsid()
    os.umask(0)

# Threads are started only after the fork: a forked child keeps just the
# thread that called fork(), so a server thread started earlier would be
# missing in the daemon.
servthread = threading.Thread(target=serv.serve_forever)
servthread.daemon = True
servthread.start()

dnsQ = Queue.Queue()
dnsT = threading.Thread(target=dnsupdatethread)
dnsT.daemon = True
dnsT.start()

running = True
sig = 0
#signal.signal(signal.SIGTERM, handler)
signal.signal(signal.SIGHUP, handler)

nextcheck = int(now) + 15   # 15 seconds time to settle after (re-)start
wait = 1
firstcheck = int(now) + 15

while running:
    sr = None
    if DEBUG > 2:
        sys.stderr.write("about to sleep = %s\n" % (wait))
    try:
        sr = select.select(ilist, [], [], wait)
        now = time.time()
    except KeyboardInterrupt:
        sys.stderr.write("Keyboard Interrupt!\n")
        running = False
        closeup()
        continue
    except select.error as value:
        if value.args[0] != 4:   # 4 = interrupted system call
            sys.stderr.write("select err %s %s" % (select.error, value))
            #raise os.error, value
            continue
        # Interrupted by a signal: nothing is readable, but fall through so
        # that the pending signal is handled at the end of this iteration.
        sr = ([], [], [])
        now = time.time()
    except Exception as e:
        if DEBUG > 2:
            sys.stderr.write("select exception %s\n" % (str(e)))
        sys.exit(1)
    if DEBUG > 2:
        sys.stderr.write("woke from sleep = %s (%s)\n" % (str(sr), str(ilist)))
    for fh in sr[0]:
        if fh in [sock, sock6]:
            readsock(fh)
        # elif fh == serv.fileno():
        #     serv.handle_request()
        else:
            sys.stderr.write("what happend just now?\n")
    if DEBUG > 2:
        sys.stderr.write("done handling, running is %s, sig is %s\n" % (running, sig))

    # check hour/day/week: when one of them changes, snapshot every host's
    # counters as the new baseline for that period
    for v in range(3):
        ts = time.strftime(tsfm[v], time.localtime(now))
        if ts != lastfm[v]:
            lastfm[v] = ts
            for h in list(hosts.keys()):
                if h in hosts:
                    hosts[h].hdwcounts[v] = [hosts[h].doesack, hosts[h].upcount]

    if now >= nextcheck and now >= firstcheck:
        nextcheck = now + 1
        checkoverdue()
    wait = nextcheck - now
    if wait < 0:
        sys.stderr.write("sleep is negative! %s next = %s\n" % (wait, nextcheck))
        wait = 0
    if DEBUG > 2:
        sys.stderr.write("sleep = %s next = %s\n" % (wait, nextcheck))

    if sig != 0:
        setrunning(False)
        if sig == signal.SIGHUP:
            if DEBUG > 0:
                sys.stderr.write("signal 1 saveandrestart\n")
            saveandrestart()