Files
heartbeat/hbd
T
2015-01-06 15:44:22 +01:00

950 lines
19 KiB
Python
Executable File

#!/usr/bin/env python
# $Id: hbd,v 1.38 2013/07/14 02:25:05 andreas Exp $
# Wait for heartbeat messages and act on them (or their absence)
#
# Daemon version; written to the log at startup and shown in the curses title line.
VER = 1.52
import time
import os
import string
import sys
import socket
import atexit
import select
import SocketServer
import getopt
import signal
import cPickle
import smtplib
import traceback
import urllib
import httplib
from subprocess import Popen, STDOUT, PIPE
# --- Notification switches ---------------------------------------------------
SEND_EMAIL = False      # email() returns immediately while this is false
SEND_PUSHOVER = True    # pushover() returns immediately while this is false
# The historical "False = 0" / "True = 1" assignments were removed: both names
# are builtins (keywords on Python 3, where assigning to them is a
# SyntaxError) and rebinding them only turned later flags into plain ints.

# --- Built-in defaults -------------------------------------------------------
LOGFILE = "/home/andreas/public_html/messages/andreas"
PICKFILE = "/var/tmp/hbd.pick"      # state file written by pickleit()
AEMAIL = ["andreas@wrede.ca"]       # recipients used by email()
NAME = "heatbeat"                   # appears in the subject line built by email()
SMTPSERVER = "localhost"

# --- Shared run-time state ---------------------------------------------------
hosts = {}          # short host name -> Host instance
htab = {}           # address -> short host name
msgs = []           # formatted log lines, oldest first
num = 0             # next display row handed out by Host.__init__
upcount = 0
PORT = 50003        # default UDP port for heartbeat datagrams
TPORT = 50004       # default TCP port of the status page
THOST = ""          # default bind host of the status page
DEBUG = False
verbose = False
INTERVAL = 10
GRACE = 2           # default slack, in seconds, added to a host's interval
visual = 0          # non-zero: curses display is active

os.environ['TZ'] = 'EST5EDT'
# Changing TZ has to be followed by tzset() so the time module's conversion
# rules are guaranteed to follow the new value.
time.tzset()

# --- curses handles (remain None unless the display is started) --------------
stdscr = None
win = None          # host table window
msgw = None         # scrolling message window
msgwB = None        # border drawn around msgw
msgwHeight = 10
def handler(signum, frame):
    """Signal handler: remember the signal number and ask the main loop to stop.

    The number is stored in the global ``sig``.  If the loop is still
    running, ``running`` is cleared; otherwise only a notice is printed
    (in verbose mode).
    """
    global running, sig
    sig = signum
    if running:
        if verbose:
            print("signal: %s running: %s frame: %s" % (sig, running, frame))
        running = False
    elif verbose:
        print("NOT runing signal: %s running: %d" % (sig, running))
def shortname(name):
    """Return the host part of *name*: everything before the first dot.

    A name without a dot is returned unchanged.
    """
    # The str method replaces the deprecated string.split() helper, which no
    # longer exists on Python 3.
    return name.split('.', 1)[0]
class NullDevice:
    """File-like sink: accepts write() calls and discards the data."""

    def write(self, s):
        """Ignore *s*; nothing is stored or forwarded."""
        return None
class Host:
    """State kept for one monitored host.

    NOTE(review): instances are written to the state file by pickleit();
    changing the base class (e.g. to ``object``) may affect loading files
    saved by earlier versions -- verify before doing so.
    """

    # Symbolic state names; the values are also what gets displayed.
    up = "up"
    down = "down"
    overdue = "overdue"

    def __init__(self, name, addr):
        """Create a host that counts as "up" as of now.

        name -- host name; stored in its short form (see shortname())
        addr -- address the host was heard from
        """
        global num
        self.name = shortname(name)
        self.addr = addr
        self.num = num                  # display row
        self.lastbeat = time.time()     # time of the most recent heartbeat
        self.upcount = 0
        self.state = Host.up
        self.statetime = self.lastbeat  # time of the last state change
        self.interval = 0
        self.doesack = -1               # -1: no "acks" value known
        self.cmds = []                  # commands queued for the host
        num += 1

    def fixup(self):
        """Add attributes that instances restored from an older pickle lack."""
        if not hasattr(self, "cmds"):
            self.cmds = []

    def getstate(self):
        """Return the current state name."""
        return self.state

    def dispstate(self):
        """Return the state as an HTML fragment.

        "down" and "overdue" are set in bold; a known acks value is appended
        in parentheses.
        """
        if self.state in (Host.down, Host.overdue):
            state = "<b>%s</b>" % self.state
        else:
            state = "%s" % self.state
        if self.doesack != -1:
            return "%s(%s)" % (state, self.doesack)
        return state

    def newstate(self, state, when=0):
        """Switch to *state*; return the seconds spent in the previous state.

        when -- number of seconds by which the change is back-dated
        """
        self.state = state
        now = time.time() - when
        elapsed = now - self.statetime
        self.statetime = now
        if visual:
            displaystatetime(self.name)
        return elapsed
def email(s, msg):
    """Mail a notification to the AEMAIL recipients through SMTPSERVER.

    s   -- short reason; becomes part of the subject line
    msg -- message body

    Returns None when SEND_EMAIL is off, "OK" after the message was handed
    to the server and "Fail" when the server refused the recipients.  Any
    other error while connecting or sending is printed and the daemon
    re-executes itself through saveandrestart().
    """
    if not SEND_EMAIL:
        return
    ret = "OK"
    toaddrs = AEMAIL
    # "sender" rather than "fromaddr": the latter is also the name of a
    # module-level function.
    sender = "aew.heartbeat@wrede.ca"
    subj = "Info from %s: %s" % (NAME, s)
    date = time.strftime("%a, %d %b %Y %H:%M:%S %z", time.localtime())
    body = "To: %s\nFrom: %s\nSubject: %s\nDate: %s\n\n%s" % (toaddrs[0], sender, subj, date, msg)
    # Stays None when the connection itself fails, so the cleanup below does
    # not have to rely on catching a NameError.
    server = None
    try:
        server = smtplib.SMTP(SMTPSERVER)
        if DEBUG:
            server.set_debuglevel(1)
        server.sendmail(sender, toaddrs, body)
    except smtplib.SMTPRecipientsRefused as errs:
        log("cannot send email: %s\n" % (errs))
        ret = "Fail"
    except Exception:
        # Exception, not a bare except: SystemExit and KeyboardInterrupt
        # must not trigger a restart.
        print("smtp error: "+traceback.format_exc())
        saveandrestart()
    if server is not None:
        try:
            server.quit()
        except (smtplib.SMTPException, socket.error):
            pass    # the session is finished either way
    return ret
def pushover(msg):
    """Send *msg* as a push notification through the Pushover HTTPS API.

    Does nothing when SEND_PUSHOVER is off.  Delivery is best effort:
    failures are logged and otherwise ignored.  Nothing is returned.
    """
    if not SEND_PUSHOVER:
        return
    # SECURITY NOTE(review): the API token and user key are hard-coded in the
    # source; consider moving them into the config file.
    # The timeout matters because this runs inside the single-threaded main
    # loop: without it an unresponsive server would stall the daemon.
    conn = httplib.HTTPSConnection("api.pushover.net:443", timeout=10)
    try:
        conn.request("POST", "/1/messages.json",
                     urllib.urlencode({
                         "token": "ac7NLX2rPjXFareeDgLpXNoDf4iFmf",
                         "user": "uDhH33UjQQDYtNzJb1ThRiWb9ingGK",
                         "message": msg, }),
                     {"Content-type": "application/x-www-form-urlencoded"})
        # Read the reply so the exchange is complete before closing.
        conn.getresponse().read()
    except Exception as err:
        # Still best effort, but no longer silent.
        log("pushover failed: %s" % err)
    finally:
        conn.close()
def nsupdate(hostname, newip):
    """Set the DNS A record (and a timestamp TXT record) for a host.

    hostname -- short host name; the record is <hostname>.dy.wapanafa.org
    newip    -- address to publish

    Returns None when the update succeeded, otherwise a text describing the
    failure (for a rejected update: the output of the nsupdate program).
    """
    # Both values are copied verbatim into the command script below and can
    # originate from the network.  Whitespace (a newline in particular) would
    # allow extra commands to be smuggled in, so such values are refused.
    for value in (hostname, newip):
        if not value or any(c.isspace() for c in value):
            return "nsupdate: invalid name or address: %r / %r" % (hostname, newip)
    D = {}
    D['domain'] = 'dy.wapanafa.org'
    D['fqdn'] = '%s.dy.wapanafa.org' % hostname
    D['dnsttl'] = '5'
    D['newip'] = newip
    D['ts'] = time.strftime('%Y-%m-%d.%H:%M:%S', time.gmtime())
    nsup = """update delete %(fqdn)s A
update add %(fqdn)s %(dnsttl)s A %(newip)s
update delete %(fqdn)s TXT
update add %(fqdn)s %(dnsttl)s TXT "Created: %(ts)s"
send
answer
""" % D
    cmd = ["/usr/bin/nsupdate", "-k", "/etc/dhcpc/K%(domain)s.+157+00000." % D, "-v"]
    try:
        p = Popen(cmd, shell=False, bufsize=1, stdin=PIPE, stdout=PIPE, stderr=STDOUT)
    except OSError as e:
        return "nsupdate: execution failed: %s" % e
    except Exception:
        return "nsupdate: some error occured"
    # stderr is merged into stdout, so only the first element carries data.
    output = p.communicate(nsup)[0]
    if 'status: NOERROR' in output:
        return None
    return output
#
def dur(sec):
    """Format a duration in seconds as "H:MM:SS", "M:SS" or "0:SS".

    sec -- number of seconds; fractions are dropped
    """
    sec = int(sec)
    # Explicit floor division: "/" only truncates for ints on Python 2
    # without "from __future__ import division".
    h = sec // 3600
    m, s = divmod(sec - h * 3600, 60)
    if h > 0:
        return "%d:%02d:%02d" % (h, m, s)
    if m > 0:
        return "%d:%02d" % (m, s)
    return "0:%02d" % s
#
def addhost(name, addr):
    """Register host *name* at *addr*, or record a new address for a known host.

    For a known host the address tables are updated and the change is
    logged.  For a new host a Host instance is created and all hosts are
    renumbered so their display rows follow alphabetical order.
    """
    sname = shortname(name)
    if sname in hosts:
        host = hosts[sname]
        # pop() instead of del: the old address may already be missing from
        # htab, and fromaddr() tolerates that case as well.
        htab.pop(host.addr, None)
        host.addr = addr
        if visual:
            displayaddr(sname)
        htab[addr] = sname
        log("%s, changed address to %s" % (sname, addr))
    else:
        hosts[sname] = Host(sname, addr)
        for row, key in enumerate(sorted(hosts)):
            hosts[key].num = row
        htab[addr] = sname
        # NOTE(review): the full redraw is assumed to belong to this branch
        # only; the source indentation was lost -- confirm.
        if visual:
            display()
#
def on_exit():
    """Exit hook (registered with atexit): leave curses and close the log file."""
    if visual:
        exitcurses()
    if DEBUG:
        sys.stderr.write("on_exit\n")
    logf.close()
    print("exit")
def initlog(logfile):
    """Open *logfile* for appending and return the file object."""
    handle = open(logfile, "a")
    return handle
#
#
def initwin():
    """(Re)create the curses windows: host table on top, message area below.

    Sets the globals win, msgwB, msgw and msgwHeight and draws the title line.
    """
    global win, msgw, msgwB, msgwHeight
    rows, cols = stdscr.getmaxyx()
    left = 0
    top = 2
    # two rows more than there are htab entries
    height = len(htab)+2
    if DEBUG:
        log("initwin called with %d" % height)

    # host table
    win = curses.newwin(height, cols, top, left)
    win.border(0, 0, 0, 0, 0, 0, curses.ACS_LTEE, curses.ACS_RTEE)

    # frame around the message area; the text window sits one cell inside it
    msgwB = curses.newwin(0, 0, height+1, left)
    msgwB.border(0, 0, 0, 0, curses.ACS_LTEE, curses.ACS_RTEE)
    msgwHeight = rows-height-3
    msgw = curses.newwin(msgwHeight, cols-2, height+2, left+1)
    msgw.setscrreg(0, msgwHeight-1)
    msgw.scrollok(1)

    stdscr.addstr(0, 0, "hbd Version %s" % VER, curses.A_BOLD)
    stdscr.refresh()
    msgwB.refresh()
#
def checkoverdue():
    """Flag every "up" host whose last heartbeat is too old as overdue.

    Too old means: more than the host's interval plus ``grace`` seconds
    before ``now``.  Watched hosts additionally trigger a notification.
    """
    for name in hosts.keys():
        host = hosts[name]
        # only hosts currently "up" can become overdue
        if host.state != Host.up:
            continue
        if now-host.lastbeat <= host.interval+grace:
            continue
        m = "%s is overdue" % name
        # NOTE(review): both notifications are assumed to be limited to
        # watched hosts; the source indentation was lost -- confirm.
        if name in watchhosts:
            email("overdue", m)
            pushover(m)
        host.newstate(Host.overdue, grace)
        log(m)
#
#
def displaytime():
    """Redraw the clock in the title line and the state column of every host."""
    rows, cols = stdscr.getmaxyx()
    clock = time.strftime("%H:%M:%S", time.localtime(now))
    stdscr.addstr(0, cols-8, clock, curses.A_BOLD)
    for name in hosts.keys():
        host = hosts[name]
        text = host.getstate()
        # verbose mode shows the time since the last beat instead of the
        # state name, except for hosts that are down
        if verbose and host.state != Host.down:
            text = dur(now-host.lastbeat)
        # NOTE(review): bold for overdue hosts is assumed to apply in both
        # modes; the source indentation was lost -- confirm.
        if host.state == Host.overdue:
            attr = curses.A_BOLD
        else:
            attr = 0
        win.addstr(host.num+1, 25, "%8s" % text, attr)
    win.refresh()
    stdscr.refresh()
#
#
def displaystatetime(h, refresh=1):
    """Write the time of host *h*'s last state change into its table row.

    refresh -- when true, the table window is refreshed right away
    """
    host = hosts[h]
    stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(host.statetime))
    win.addstr(host.num+1, 60, "%-17s" % stamp)
    if refresh:
        win.refresh()
#
#
def displayaddr(h, refresh=1):
    """Write the address of host *h* into its table row.

    refresh -- when true, the table window is refreshed right away
    """
    host = hosts[h]
    win.addstr(host.num+1, 35, "%-16s" % host.addr)
    if refresh:
        win.refresh()
#
#
def displaybody():
    """Draw name, address and last-change time of every host, then refresh once."""
    for name, host in hosts.items():
        win.addstr(host.num+1, 1, "%-25s" % (name))
        # the column helpers are told not to refresh; that happens once below
        if host.addr is not None:
            displayaddr(name, 0)
        if host.statetime is not None:
            displaystatetime(name, 0)
    win.refresh()
#
#
def displaymsgs():
    """Fill the message window with the most recent log messages."""
    # max(): with fewer messages than window rows the start index used to go
    # negative, and a negative slice start counts from the end -- e.g. 8
    # messages in a 10-row window showed only the last 2.
    first = max(0, len(msgs)-msgwHeight)
    for y, m in enumerate(msgs[first:]):
        msgw.addstr(y, 0, m)
    msgw.refresh()
#
#
def display():
    """Rebuild and redraw the whole curses screen; a no-op without the display."""
    if not visual:
        return
    for draw in (initwin, displaytime, displaybody, displaymsgs):
        draw()
# Upper bound for the in-memory message list.  Its readers only look at the
# newest entries (30 on the status page, one window-full in curses mode).
MAXMSGS = 1000


def log(m, service="heartbeat"):
    """Record message *m* in memory, in the log file and in the curses window.

    service -- only used with logfmt "msg", where the line written to the
               file has the form "<now>|<service>|<m>"

    Every call ends by saving the daemon state with pickleit().
    """
    if DEBUG:
        print("Log: %s" % m)
    msg = time.strftime("%b %d %H:%M:%S", time.localtime(time.time()))+": "+m+"\n"
    msgs.append(msg)
    # Trim in place: the list is pickled on every call and would otherwise
    # keep growing for as long as the daemon runs.
    del msgs[:-MAXMSGS]
    if logfmt == "msg":
        m2 = "%d|%s|%s\n" % (now, service, m)
    else:
        m2 = msg
    logf.write(m2)
    logf.flush()
    if msgw is not None:
        msgw.addstr(msg)
        msgw.clrtoeol()
        msgw.refresh()
    pickleit()
#
#
def fromaddr(name, addr, boot, interval, acks):
    """Record a heartbeat of host *name* that arrived from address *addr*.

    Unknown hosts are registered first.  An address change updates htab,
    optionally the DNS (hosts listed in dyndnshosts) and is logged; a host
    that was not "up" is switched back to "up" when *interval* is positive.
    *boot* is accepted but not used here.
    """
    if name not in hosts:
        addhost(name, addr)
    host = hosts[name]
    host.doesack = acks

    if host.addr != addr:
        htab.pop(host.addr, None)
        host.addr = addr
        htab[addr] = name
        m = "%s changed address to %s" % (host.name, addr)
        if name in dyndnshosts:
            err = nsupdate(name, addr)
            m += (", DNS failed: %s" % err) if err else ", DNS updated."
        log(m)
        if name in watchhosts:
            email("address change", m)
            pushover(m)

    host.lastbeat = now
    if interval > 0 and host.getstate() != Host.up:
        previous = host.state
        spent = host.newstate(Host.up)
        log("%s, back after being %s for %s" % (host.name, previous, dur(spent)))
        if name in watchhosts:
            email("back", name)
            pushover("%s is back" % name)
        # NOTE(review): counting is assumed to happen only when a host comes
        # back; the source indentation was lost -- confirm.
        host.upcount += 1
#
#
def readsock(sock):
    """Receive one heartbeat datagram from *sock*, act on it and answer it.

    The datagram is a ';'-separated list of key=value pairs; the keys
    handled are boot, shutdown, interval, name, msg, service, time and acks.
    The reply sent back is "ACK", or the oldest command queued for the host.
    """
    data, addr = sock.recvfrom(1024)
    boot = 0
    shutdown = 0
    name = "unknown"
    service = "unknown"
    msg = None
    interval = 0
    deltaT = 0.0
    acks = -1
    for pair in data.split(';'):
        fields = pair.split("=")
        key = fields[0]
        # anything but exactly one "=" counts as "no usable value"
        if len(fields) == 2:
            val = fields[1]
        else:
            val = "0"
        if key == 'boot':
            boot += 1
        elif key == 'shutdown':
            shutdown += 1
        elif key == 'interval':
            # The value comes straight off the network; a malformed number
            # must not raise out of the main loop and stop the daemon.
            try:
                interval = int(val)
            except ValueError:
                pass
        elif key == 'name':
            name = shortname(val)
        elif key == 'msg':
            msg = val
        elif key == 'service':
            service = val
        elif key == 'time':
            try:
                deltaT = now-float(val)
            except ValueError:
                pass
        elif key == 'acks':
            try:
                acks = int(val)
            except ValueError:
                acks = -1
    if boot:
        # NOTE(review): this prints "(-1)" when no acks value is known and
        # nothing when one is; Host.dispstate() does the opposite -- the
        # condition may be inverted.  Left unchanged.
        if acks == -1:
            a = "(%s)" % acks
        else:
            a = ""
        m = "%s booted, deltaT %0.2g sec %s" % (name, deltaT, a)
        log(m)
        if name in watchhosts:
            email("booted", m)
            pushover(m)
    if msg:
        m = "%s msg: %s" % (name, msg)
        log(m, service=service)
        if name in watchhosts:
            email("msg", m)
            pushover(m)
    fromaddr(name, addr[0], boot, interval, acks)
    # fromaddr() has registered the host if it was unknown
    host = hosts[name]
    if shutdown:
        m = "%s shutdown" % name
        log(m)
        if name in watchhosts:
            email("shutdown", m)
            pushover(m)
        try:
            host.newstate(Host.down)
        except Exception:
            pass    # a failing display update must not drop the reply
    if interval > 0:
        host.interval = interval
    rmsg = "ACK"
    if host.cmds:
        rmsg = host.cmds[0]
        msg = "command '%s' initiated" % host.cmds[0]
        email("%s cmd exec" % name, msg)
        pushover(msg)
        del host.cmds[0]
        log("%s command initiated" % name)
    try:
        ss = sock.sendto(rmsg, addr)
    except socket.error:
        pass    # the reply is best effort
    else:
        if DEBUG:
            log("msg from %s,%s, sent %s bytes back" % (addr[0], addr[1], ss))
#
#
#
def initcurses():
    """Start curses: no echo, cbreak input, keypad translation on.

    The screen object is stored in the global ``stdscr``.
    """
    global stdscr
    stdscr = curses.initscr()
    curses.noecho()
    curses.cbreak()
    stdscr.keypad(1)
    if DEBUG:
        sys.stderr.write("curses init done: %s\n" % stdscr)
def exitcurses():
    """Undo the settings made by initcurses() and end curses mode."""
    curses.nocbreak()
    stdscr.keypad(0)
    curses.echo()
    curses.endwin()
class HtmlServer(SocketServer.TCPServer):
    """TCP server for the status page that may rebind its address at once."""

    # makes TCPServer set SO_REUSEADDR before binding
    allow_reuse_address = True
#
#
def _htmlescape(text):
    """Return *text* with &, < and > replaced by their HTML entities."""
    return ("%s" % text).replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")


class HtmlHandler(SocketServer.BaseRequestHandler):
    """Minimal HTTP request handler for the status page and the control URLs.

    URLs:  /               status page
           /c?h=HOST&c=CMD queue CMD for HOST
           /d?h=HOST       drop HOST
           /n?h=HOST       update the DNS record of HOST
           /r              restart the daemon

    SECURITY NOTE(review): none of the control URLs is authenticated;
    anybody who can reach the port can queue commands, drop hosts or
    restart the daemon.
    """

    def buildhead(self, title="Heartbeat", refresh=None):
        """Return the opening lines of an HTML page as a list of strings.

        title   -- page title
        refresh -- if set, auto-refresh period in seconds
        """
        res = []
        res.append('<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">')
        res.append("<html>")
        res.append("<head>")
        res.append('<title>%s</title>' % (title))
        if refresh:
            res.append("<meta http-equiv = Refresh content = %d>\n" % refresh)
        res.append("</head>")
        res.append('<body BGCOLOR = "#FFFFFF" LINK = "#008000" VLINK = "#008000" BACKGROUND = "/~andreas/images/tile.marble.gif">')
        return res

    def buildpage(self):
        """Return the status page (host table plus newest log lines) as a list of strings."""
        res = self.buildhead(refresh=60)
        res.append("<H2>Heartbeat status</h2><h4> %s (%s)</H4>" % (time.strftime("%H:%M:%S", time.localtime(now)), os.environ.get('TZ', 'CET-1CDT')))
        res.append("<table>")
        res.append("<tr><th>Host</th><th>State</th><th>IP Addr</th><th>Last change</th></tr>\n")
        for h in sorted(hosts):
            host = hosts[h]
            # Host names arrive in heartbeat datagrams, hence the escaping;
            # dispstate() returns markup on purpose and is left alone.
            res.append("<tr><td>%-24s</td><td>%-7s</td><td>%-16s</td><td>%-17s</td></tr>\n" % (_htmlescape(h), host.dispstate(), host.addr, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(host.statetime))))
        res.append("</table>")
        res.append("<h4>Log of Events</h4>")
        # [-30:] yields the newest 30 lines, or all of them when there are
        # fewer; the former len(msgs)-30 start index went negative in that
        # case and dropped the oldest lines.
        for m in msgs[-30:]:
            res.append("%s<BR>" % _htmlescape(m))
        res.append("</body></html>")
        return res

    def handle(self):
        """Read one HTTP request, carry out the action it names and reply."""
        global sig, running
        headers = []
        headers.append("Date: %s" % time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime(now)))
        headers.append("Server: hbd")
        headers.append("Last-Modified: %s" % time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime(now)))
        headers.append("Accept-Ranges: bytes")
        headers.append("Connection: close")
        headers.append("Content-Type: text/html; charset = ISO-8859-1")

        # Read the request up to the empty line; only the GET line matters.
        uri = '/unknown'
        f = self.request.makefile()
        try:
            while 1:
                line = f.readline().strip()
                if not line:
                    break
                r = line.split()
                if r[0] == "GET" and len(r) > 1:
                    uri = r[1]
        finally:
            f.close()

        # Query arguments by name.  Missing or malformed arguments read as ""
        # instead of raising IndexError/NameError as before.
        upar = uri.split("?")
        path = upar[0]
        query = {}
        if len(upar) > 1:
            for item in upar[1].split("&"):
                key, sep, value = item.partition("=")
                if sep and key not in query:
                    query[key] = value
        uname = query.get("h", "")

        code = 200
        cause = "OK"
        if uri == "/":
            res = self.buildpage()
        elif path == "/c":      # command on host /c?h=melschserver&c=sudo%20ls
            ucmd = query.get("c", "")
            if ucmd != "" and uname != "" and uname in hosts:
                hosts[uname].cmds.append(urllib.unquote(ucmd))
            res = self.buildhead()
            res.append("2Done")
        elif path == "/d":      # drop host /d?h=melschserver
            if uname != "" and uname in hosts:
                del hosts[uname]
                log("%s dropped" % uname)
            res = self.buildhead()
            res.append("Done")
        elif path == "/n":      # register name
            res = self.buildhead()
            if uname != "" and uname in hosts:
                err = nsupdate(uname, hosts[uname].addr)
                ll = "nsupdate request: %s" % err
            else:
                ll = "name %s not found" % uname
            res.append(_htmlescape(ll))
            log(ll)
        elif path == "/r":      # restart
            res = self.buildhead()
            res.append("restart request")
            sig = signal.SIGHUP
            running = False
            log("restart request")
        else:
            code = 404
            cause = "Not Found"
            res = []
            res.append('<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">')
            res.append('<html><head>')
            res.append('<title>%s %s</title>' % (code, cause))
            res.append('</head><body>')
            res.append('<h1>%s</h1>' % (cause))
            res.append('<p>The requested URL %s was not found on this server.</p>' % _htmlescape(uri))
            res.append('<hr>')
            res.append('<address>hbd (Unix) Server at %s Port %s</address>' % (hbd_host, hbd_port))
            res.append('</body></html>')

        # One sendall() instead of many send() calls: send() may transmit
        # only part of its argument.
        head = ["HTTP/1.0 %s %s" % (code, cause)] + headers
        response = "\r\n".join(head) + "\r\n\r\n" + "\n".join(res)
        try:
            self.request.sendall(response)
        except socket.error:
            pass    # the client went away; nothing left to do
def saveandrestart():
    """Close the listening sockets, log the restart and re-execute the daemon.

    The state is saved as a side effect of log(), which calls pickleit().
    Does not return when the exec succeeds.
    """
    # Both datagram sockets are closed before the exec; sock6 used to be
    # left open, and an open descriptor is inherited by the new program.
    for s in (sock, sock6):
        s.close()
    # N.B. no serv.shutdown(): serve_forever() is not used
    serv.server_close()
    log("restarting")
    # NOTE(review): sys.argv[0] is used as given; a relative path will no
    # longer resolve once the daemon has done chdir("/") -- confirm how the
    # daemon is started.
    os.execv(sys.argv[0], [sys.argv[0]]+cmdargs)
def pickleit():
    """Save hosts, htab and msgs to PICKFILE as three consecutive pickles.

    The data is written to a temporary file next to PICKFILE and then
    renamed into place, so an interrupted write cannot leave a truncated
    state file behind.  Errors while writing are re-raised after the
    temporary file has been removed.
    """
    tmpname = PICKFILE + ".tmp"
    pickf = open(tmpname, 'wb')
    try:
        try:
            pick = cPickle.Pickler(pickf)
            pick.dump(hosts)
            pick.dump(htab)
            pick.dump(msgs)
        finally:
            pickf.close()
    except Exception:
        os.unlink(tmpname)
        raise
    os.rename(tmpname, PICKFILE)
#
# Main
#
helpflag = False
forground = False
optlist = []
args = []
# get(): HOME is not necessarily set, e.g. when started from an init script
home = os.environ.get('HOME', '')
cmdargs = []    # options handed to the new process by saveandrestart()
configfile = "%s/.hbrc" % home
try:
    # "h" takes no argument; it used to be declared as "h:", so "-h" swallowed
    # the following option and only reached the help text via the error path.
    optlist, args = getopt.getopt(sys.argv[1:], 'c:dfhvx')
except getopt.GetoptError:
    helpflag = True
for o, a in optlist:
    if o == '-c':
        configfile = a
        cmdargs += [o, a]
    elif o == '-d':
        visual = True
        cmdargs += [o]
    elif o == '-f':
        forground = True
        cmdargs += [o]
    elif o == '-h':
        helpflag = True
    elif o == '-v':
        verbose = True
        cmdargs += [o]
    elif o == '-x':
        # NOTE(review): unlike the other switches, -x is not added to
        # cmdargs and is therefore lost on a restart -- intended?
        DEBUG = True
if helpflag:
    print("hbc HeartBeatDaemon")
    print("usage: hbd [-dfhvx] [-c configfile]")
    print("")
    print(" -c configfile")
    print(" -d display")
    print(" -f run in foreground")
    print(" -h this help")
    print(" -v verbose")
    print(" -x debug")
    print("")
    print(""" config file can contain
logfile = /var/log/heartbeat.log
logfmt = [text|msg]
hb_port = 50003
interval = 20
hbd_port = 50004
hbd_host = www.domain.com
grace = 2
""")
    sys.exit(1)
# the curses display needs the terminal, so it implies foreground operation
if visual:
    forground = True
#
# set defaults
hb_port = PORT
hbd_host = THOST
hbd_port = TPORT
logfile = LOGFILE
logfmt = "text"
interval = INTERVAL
grace = GRACE
watchhosts = []     # hosts whose events are also sent as notifications
dyndnshosts = []    # hosts whose address changes are pushed to the DNS
drophosts = []      # hosts removed from the saved state at startup
try:
    f = open(configfile, "r")
except IOError:
    print("warning: running without conifig file: %s" % configfile)
    f = None
else:
    if verbose:
        print("notice: using config file %s" % configfile)
if f:
    try:
        for l in f:
            # rstrip: the last line need not end in a newline
            l = l.rstrip("\n")
            if verbose:
                print(" %s" % l)
            # Split at the first "=" only and ignore surrounding blanks, so
            # both "key=value" and the "key = value" form shown in the help
            # text are accepted.  Lines without "=" are skipped.
            key, sep, value = l.partition('=')
            if not sep:
                continue
            key = key.strip()
            value = value.strip()
            # SECURITY NOTE(review): eval() executes whatever expression the
            # config file contains; it is kept for compatibility with
            # existing files.  Only use config files you trust.
            if key == 'interval':
                interval = eval(value)
            elif key == 'grace':
                grace = eval(value)
            elif key == 'hbd_port':
                hbd_port = eval(value)
            elif key == 'hbd_host':
                hbd_host = value
            elif key == 'hb_port':
                hb_port = eval(value)
            elif key == 'logfile':
                logfile = value
            elif key == 'logfmt':
                logfmt = value
            elif key == 'watchhosts':
                watchhosts = eval(value)
            elif key == 'dyndnshosts':
                dyndnshosts = eval(value)
            elif key == 'drophosts':
                drophosts = eval(value)
    finally:
        f.close()
if len(args) != 0:
    print("error: args")
    sys.exit(1)
if verbose:
    print("notice: logging to %s" % logfile)
logf = initlog(logfile)
# Restore the state saved by pickleit(), if there is any.
if os.path.exists(PICKFILE):
    # SECURITY NOTE(review): unpickling runs code chosen by whoever wrote the
    # file; PICKFILE must only be writable by the daemon's own user.
    pickf = open(PICKFILE, 'r')
    try:
        try:
            pick = cPickle.Unpickler(pickf)
            # Load all three objects before assigning any of them, so a
            # damaged file cannot leave hosts restored but htab/msgs not.
            loaded = (pick.load(), pick.load(), pick.load())
        finally:
            pickf.close()
    except Exception:
        os.unlink(PICKFILE)
    else:
        hosts, htab, msgs = loaded
    for h in hosts.keys():
        hosts[h].fixup()
for h in drophosts:
    if h in hosts:  # was: hosts.has_key(h):
        del hosts[h]
now = time.time()
startsec = int(now) % interval
if visual:
    import curses
    initcurses()
    display()
    stdscr.nodelay(1)   # getch() in the main loop must not block
log("Starting %s" % VER)
atexit.register(on_exit)
ilist = []      # everything the main loop passes to select()
# IPv4 heartbeat socket.  SO_REUSEADDR is set before bind(), as for the IPv6
# socket below: the option has to be in place at bind time to affect that
# bind (it used to be set afterwards here).
sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
sock.bind(("", hb_port))
ilist.append(sock)
# IPv6 heartbeat socket on the same port
sock6 = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM)
sock6.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
sock6.bind(("", hb_port))
ilist.append(sock6)
# TCP server for the status page; requests are handled one at a time from the
# main loop, so only its descriptor is watched
serv = HtmlServer((hbd_host, hbd_port), HtmlHandler)
ilist.append(serv.fileno())
if not forground:
    # Detach: the parent exits, the child carries on as the daemon.
    pid = os.fork()
    if pid > 0:
        if verbose:
            print("daemoinizing... pid = %d" % pid)
        sys.exit(0)
    verbose = False
    # Point descriptors 0-2 at the null device instead of just closing them.
    # Closed descriptors are handed out again by the next open()/socket(),
    # so a file or connection opened later could end up as "stdout" or
    # "stderr".  sys.stdin is therefore left alone; it now reads end-of-file.
    devnull = os.open(os.devnull, os.O_RDWR)
    for fd in (0, 1, 2):
        os.dup2(devnull, fd)
    if devnull > 2:
        os.close(devnull)
    sys.stdout = NullDevice()
    sys.stderr = NullDevice()
    os.chdir("/")
    os.setsid()
    # 022 instead of 0: with umask 0 the state file written by pickleit()
    # was created world-writable, although it is unpickled on startup.
    os.umask(0o022)
running = True      # cleared by handler() and by the /r URL to end the loop
sig = 0             # last signal seen; SIGHUP requests a restart
signal.signal(signal.SIGTERM, handler)
signal.signal(signal.SIGHUP, handler)
next = int(now)+15  # 15 seconds time to settle after (re-)start
sleep = next - now
while running:
    # Keyboard commands of the curses display: c(lear), q(uit), d(ebug),
    # v(erbose).
    if visual:
        c = stdscr.getch()
        if c == ord('q'):
            break
        if c == ord('c'):
            msgs = []
            display()
        elif c == ord('d'):
            DEBUG = not DEBUG
        elif c == ord('v'):
            verbose = not verbose
    if DEBUG:
        sys.stderr.write("about to sleep = %s\n" % (sleep))

    # Wait for a datagram, an HTTP connection or the next periodic check.
    try:
        sr = select.select(ilist, [], [], sleep)
        now = time.time()
    except KeyboardInterrupt:
        sys.exit(0)
    except select.error as value:
        if value[0] == 4:   # interrupted system call
            # the display is rebuilt after an interrupted wait
            if visual:
                exitcurses()
                initcurses()
                display()
        else:
            print("%s %s" % (select.error, value))
        continue
    except:
        sys.exit(1)

    for fh in sr[0]:
        if fh in [sock, sock6]:
            readsock(fh)
        elif fh == serv.fileno():
            serv.handle_request()
        else:
            print("what happend just now")

    # Periodic work, at most once per second.
    if now >= next:
        next = now+1
        checkoverdue()
        if visual:
            stdscr.move(1, 0)
            stdscr.clrtoeol()
            displaytime()
    sleep = next-now
    if sleep < 0:
        sys.stderr.write("sleep is negaitive! %s next = %s\n" % (sleep, next))
        sleep = 0
    if DEBUG:
        sys.stderr.write("sleep = %s next = %s\n" % (sleep, next))

# The loop has ended; a SIGHUP (or the /r URL) means "restart".
if sig == signal.SIGHUP:
    if DEBUG:
        sys.stderr.write("signal 1 exit\n")
    saveandrestart()