Files
heartbeat/hbd
T
2013-10-25 13:10:15 -04:00

855 lines
17 KiB
Python
Executable File

#!/usr/bin/env python
# $Id: hbd,v 1.38 2013/07/14 02:25:05 andreas Exp $
# Wait for heartbeat messages and act on them (or their absence)
#
VER = 1.38
import time, os, string, sys, socket, atexit, select, SocketServer, getopt, signal, cPickle, smtplib, traceback, urllib
from subprocess import Popen, STDOUT, PIPE
# Integer stand-ins for the boolean names (presumably for interpreters
# that predate the built-in True/False -- TODO confirm still needed).
False = 0
True = 1
# Default log file; the config file's "logfile" entry overrides it.
LOGFILE = "/home/andreas/public_html/messages/andreas"
# File that pickleit() rewrites with hosts/htab/msgs and that is loaded at startup.
PICKFILE = "/var/tmp/hbd.pick"
# Recipients of the notification mails sent by email().
AEMAIL = ["andreas@wrede.ca"]
# Name used in the subject line of notification mails.
NAME = "heatbeat"
# SMTP server that email() connects to.
SMTPSERVER = "localhost"
# Short host name -> Host instance.
hosts = {}
# Address -> short host name.
htab = {}
# Formatted log lines, oldest first.
msgs = []
# Index handed to the next Host created (see Host.__init__).
num = 0
upcount = 0
# Default UDP port for incoming heartbeats (config: hb_port).
PORT = 50003
# Default TCP port and bind host of the HTML status server (config: hbd_port, hbd_host).
TPORT = 50004
THOST = ""
DEBUG = False
verbose = False
# Defaults for the config entries "interval" and "grace" (seconds).
INTERVAL = 10
GRACE = 2
# Non-zero when the curses display is active (option -d).
visual = 0
os.environ['TZ'] = 'EST5EDT'
# curses handles; they stay None until initcurses()/initwin() have run.
stdscr = None
win = None
msgw = None
msgwB = None
# Height of the message window; recomputed by initwin().
msgwHeight = 10
def handler(signum, frame):
    """Signal handler: record the signal and ask the main loop to stop.

    Only the first signal counts; once ``up`` is 0 later signals are
    ignored.
    """
    global up, sig
    if up != 0:
        sig = signum
        if verbose:
            print("signal: %s up: %d" % (sig, up))
        up = 0
def shortname(name):
    """Return the part of *name* before the first '.' (the bare host name).

    A name without a dot is returned unchanged; an empty name yields "".
    """
    # str.split replaces the deprecated string.split(); one split suffices
    # because only the first component is used.
    return name.split('.', 1)[0]
class NullDevice:
    """Write-only sink that discards whatever is written to it."""

    def write(self, s):
        """Accept *s* and throw it away."""
        return None
class Host:
    """State record for one monitored host.

    Attributes set by __init__:
      name       short host name (see shortname())
      addr       address the host was last seen from
      num        row index used by the curses display
      lastbeat   time of the most recent heartbeat
      state      one of Host.up / Host.down / Host.overdue
      statetime  time at which the current state was entered
      interval   heartbeat interval announced by the host (0 = unknown)
      doesack    ack counter reported by the host, -1 if it reported none
      cmds       commands queued for delivery with the next reply
    """
    # The three values ``state`` can take.
    up = "up"
    down = "down"
    overdue = "overdue"

    def __init__(self, name, addr):
        """Create a host that is considered up as of now."""
        global num
        self.name = shortname(name)
        self.addr = addr
        self.num = num
        self.lastbeat = time.time()
        self.upcount = 0
        self.state = Host.up
        self.statetime = self.lastbeat
        self.interval = 0
        self.doesack = -1
        self.cmds = []
        num += 1

    def getstate(self):
        """Return the current state string."""
        return self.state

    def dispstate(self):
        """Return the state as an HTML fragment.

        down/overdue are shown in bold; the ack counter is appended in
        parentheses when the host reported one.
        """
        if self.state in (Host.down, Host.overdue):
            state = "<b>%s</b>" % self.state
        else:
            state = "%s" % self.state
        if self.doesack != -1:
            return "%s(%s)" % (state, self.doesack)
        return state

    def newstate(self, state, when=0):
        """Enter *state* and return the number of seconds spent in the previous one.

        *when* back-dates the transition by that many seconds.
        """
        self.state = state
        now = time.time()-when
        s = now-self.statetime
        self.statetime = now
        if visual:
            displaystatetime(self.name)
        return s
def email(s, msg):
    """Mail *msg* to the AEMAIL recipients.

    The subject is "Info from <NAME>: <s>".

    Returns "OK", or "Fail" when the server refused the recipients.
    Any other error while sending is printed and followed by
    saveandrestart().
    """
    ret = "OK"
    toaddrs = AEMAIL
    fromaddr = "aew.heartbeat@wrede.ca"
    subj = "Info from %s: %s" % (NAME, s)
    date = time.strftime("%a, %d %b %Y %H:%M:%S %z", time.localtime())
    # List every recipient in the To: header, not only the first one.
    body = "To: %s\nFrom: %s\nSubject: %s\nDate: %s\n\n%s" % (", ".join(toaddrs), fromaddr, subj, date, msg)
    # Stays None when the connection itself fails, so the cleanup below
    # never touches an unbound name.
    server = None
    try:
        server = smtplib.SMTP(SMTPSERVER)
        if DEBUG:
            server.set_debuglevel(1)
        server.sendmail(fromaddr, toaddrs, body)
    except smtplib.SMTPRecipientsRefused as errs:
        log("cannot send email: %s\n" % (errs))
        ret = "Fail"
    except Exception:
        # Exception (not a bare except) lets SystemExit and
        # KeyboardInterrupt pass through untouched.
        print("smtp error: "+traceback.format_exc())
        saveandrestart()
    if server is not None:
        try:
            server.quit()
        except Exception:
            # Best effort: the message has already been handled.
            pass
    return ret
#
# nsupdate: set the DNS A record for a fqdn
# return: None if ok, else error text
def nsupdate(hostname, newip):
    """Point the A record of <hostname>.dy.wapanafa.org at *newip*.

    Feeds an update script to /usr/bin/nsupdate; the script also replaces
    the TXT record with a "Created:" timestamp in UTC.

    Returns None on success, otherwise an error text: either a short
    description of why nsupdate could not be started, or its full output.
    """
    D = {}
    D['domain'] = 'dy.wapanafa.org'
    D['fqdn'] = '%s.dy.wapanafa.org' % hostname
    D['dnsttl'] = '5'
    D['newip'] = newip
    D['ts'] = time.strftime('%Y-%m-%d.%H:%M:%S', time.gmtime())
    nsup = """update delete %(fqdn)s A
update add %(fqdn)s %(dnsttl)s A %(newip)s
update delete %(fqdn)s TXT
update add %(fqdn)s %(dnsttl)s TXT "Created: %(ts)s"
send
answer
""" % D
    cmd = ["/usr/bin/nsupdate", "-k", "/etc/dhcpc/K%(domain)s.+157+00000." % D, "-v"]
    try:
        p = Popen(cmd, shell=False, bufsize=1, stdin=PIPE, stdout=PIPE, stderr=STDOUT)
    except OSError as e:
        return "nsupdate: execution failed: %s" % e
    except Exception:
        return "nsupdate: some error occured"
    # stderr is merged into stdout, so only the first element carries data.
    output = p.communicate(nsup)[0]
    if 'status: NOERROR' in output:
        return None
    return output
#
#
def dur(sec):
    """Format a duration in seconds as "H:MM:SS", "M:SS" or "0:SS".

    Fractions are truncated. Negative durations (for example after the
    clock was set back) are shown as "0:00".
    """
    sec = max(int(sec), 0)
    # divmod keeps the arithmetic integral on every Python version.
    m, s = divmod(sec, 60)
    h, m = divmod(m, 60)
    if h > 0:
        return "%d:%02d:%02d" % (h, m, s)
    if m > 0:
        return "%d:%02d" % (m, s)
    return "0:%02d" % s
#
#
def addhost(name, addr):
    """Register host *name* at *addr*, or move a known host to *addr*.

    A known host gets its address updated (and logged); a new host is
    created, after which all hosts are renumbered in name order so the
    display rows stay sorted.
    """
    sname = shortname(name)
    if sname in hosts:
        # The old address may already be missing from htab (for example
        # after a restore), so remove it without insisting it is there.
        htab.pop(hosts[sname].addr, None)
        hosts[sname].addr = addr
        if visual:
            displayaddr(sname)
        htab[addr] = sname
        m = "%s, changed address to %s" % (sname, addr)
        log(m)
    else:
        # NOTE(review): nesting reconstructed from a de-indented copy; the
        # renumbering and redraw are assumed to belong to this branch only.
        hosts[sname] = Host(sname, addr)
        for x, n in enumerate(sorted(hosts)):
            hosts[n].num = x
        htab[addr] = sname
        if visual:
            display()
#
def on_exit():
    """Exit hook: leave curses mode if it is active, then close the log file."""
    if visual:
        exitcurses()
    logf.close()
    print("exit")
def initlog(logfile):
    """Open *logfile* for appending and return the file object."""
    handle = open(logfile, "a")
    return handle
#
#
def initwin():
    """(Re)create the curses windows: host table, message frame, message area.

    The host table gets one row per htab entry plus its border; the
    message area takes the rest of the screen below it.
    """
    global win, msgw, msgwB, msgwHeight
    rows, cols = stdscr.getmaxyx()
    left, top = 0, 2
    height = len(htab)+2
    if DEBUG:
        log("initwin called with %d" % height)
    # Host table.
    win = curses.newwin(height, cols, top, left)
    win.border(0, 0, 0, 0, 0, 0, curses.ACS_LTEE, curses.ACS_RTEE)
    # Frame around the message area.
    msgwB = curses.newwin(0, 0, height+1, left)
    msgwB.border(0, 0, 0, 0, curses.ACS_LTEE, curses.ACS_RTEE)
    # Scrolling message area inside that frame.
    msgwHeight = rows-height-3
    msgw = curses.newwin(msgwHeight, cols-2, height+2, left+1)
    msgw.setscrreg(0, msgwHeight-1)
    msgw.scrollok(1)
    stdscr.addstr(0, 0, "hbd Version %s" % VER, curses.A_BOLD)
    stdscr.refresh()
    msgwB.refresh()
#
def checkoverdue():
    """Mark every up host as overdue once its heartbeat is late.

    A host is late when more than interval+grace seconds have passed since
    its last beat. Watched hosts also trigger a mail.
    """
    for h in list(hosts):
        host = hosts[h]
        if host.state == Host.down:
            continue
        timeout = host.interval+grace
        if host.state != Host.up or now-host.lastbeat <= timeout:
            continue
        m = "%s is overdue" % h
        if h in watchhosts:
            email("overdue", m)
        host.newstate(Host.overdue, grace)
        log(m)
#
#
def displaytime():
    """Redraw the clock and the state column of the host table.

    In verbose mode a host that is not down shows the time since its last
    beat instead of its state; overdue hosts are drawn bold.
    """
    rows, cols = stdscr.getmaxyx()
    clock = time.strftime("%H:%M:%S", time.localtime(now))
    stdscr.addstr(0, cols-8, clock, curses.A_BOLD)
    for h in list(hosts):
        host = hosts[h]
        if verbose and host.state != Host.down:
            d = dur(now-host.lastbeat)
        else:
            d = host.getstate()
        attr = 0
        if host.state == Host.overdue:
            attr = curses.A_BOLD
        win.addstr(host.num+1, 25, "%8s" % d, attr)
    win.refresh()
    stdscr.refresh()
#
#
def displaystatetime(h, refresh=1):
    """Draw the time of host *h*'s last state change in its table row.

    The window is refreshed unless *refresh* is false.
    """
    host = hosts[h]
    stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(host.statetime))
    win.addstr(host.num+1, 60, "%-17s" % stamp)
    if refresh:
        win.refresh()
#
#
def displayaddr(h, refresh=1):
    """Draw the address of host *h* in its table row.

    The window is refreshed unless *refresh* is false.
    """
    host = hosts[h]
    win.addstr(host.num+1, 35, "%-16s" % host.addr)
    if refresh:
        win.refresh()
#
#
def displaybody():
    """Draw name, address and state-change time for every host, then refresh once."""
    for h in list(hosts):
        host = hosts[h]
        win.addstr(host.num+1, 1, "%-25s" % (h))
        if host.addr is not None:
            displayaddr(h, 0)
        if host.statetime is not None:
            displaystatetime(h, 0)
    win.refresh()
#
#
def displaymsgs():
    """Fill the message window with the most recent log lines, one per row."""
    tail = msgs[len(msgs)-msgwHeight:]
    for y, m in enumerate(tail):
        msgw.addstr(y, 0, m)
    msgw.refresh()
#
#
def display():
    """Rebuild the whole curses screen; does nothing unless the display is active."""
    if not visual:
        return
    initwin()
    displaytime()
    displaybody()
    displaymsgs()
def log(m, service="heartbeat"):
    """Record message *m*: in memory, in the log file, on screen and in the pickle.

    With logfmt "msg" the file line is "<epoch>|<service>|<m>"; otherwise
    it is the timestamped text line that is also kept in ``msgs``.
    """
    stamp = time.strftime("%b %d %H:%M:%S", time.localtime(time.time()))
    msg = stamp+": "+m+"\n"
    # NOTE(review): msgs is never trimmed and is pickled in full on every
    # call -- consider capping its length.
    msgs.append(msg)
    if logfmt == "msg":
        line = "%d|%s|%s\n" % (now, service, m)
    else:
        line = msg
    logf.write(line)
    logf.flush()
    if msgw is not None:
        msgw.addstr(msg)
        msgw.clrtoeol()
        msgw.refresh()
    pickleit()
#
#
def fromaddr(name, addr, boot, interval, acks):
    """Process one heartbeat from host *name* seen at address *addr*.

    Registers unknown hosts, follows address changes (optionally updating
    DNS and mailing watchers), stamps the beat time and moves a host that
    was not up back to the up state. *boot* is not used here.
    """
    global htab
    if not name in hosts: # was: hosts.has_key(name):
        addhost(name, addr)
    host = hosts[name]
    host.doesack = acks
    if host.addr != addr:
        # Re-key the address table before announcing the change.
        if host.addr in htab: # was: htab.has_key(host.addr):
            del htab[host.addr]
        host.addr = addr
        htab[addr] = name
        m = "%s changed address to %s" % (host.name, addr)
        if name in dyndnshosts:
            # nsupdate() returns None on success, an error text otherwise.
            err = nsupdate(name, addr)
            if err:
                m += ", DNS failed: %s" % err
            else:
                m += ", DNS updated."
        log(m)
        if name in watchhosts:
            email("address change", m)
    host.lastbeat = now
    # A beat without a positive interval does not bring a host back up.
    if host.getstate() != Host.up and interval > 0:
        lasts = host.state
        d = host.newstate(Host.up)
        m = "%s, back after being %s for %s" % (host.name, lasts, dur(d))
        log(m)
        if name in watchhosts:
            email("back", name)
        # NOTE(review): nesting reconstructed from a de-indented copy; this
        # counter may instead belong at function level (once per beat) -- confirm.
        host.upcount += 1
#
#
def readsock():
    """Read one heartbeat datagram, act on it and send the reply.

    The payload is a ';'-separated list of "key=value" pairs (a bare key
    counts as value "0"). Recognised keys: boot, shutdown, interval, name,
    msg, service, time, acks. The reply is "ACK", or the oldest command
    queued for the host.

    The datagram is untrusted input: unparsable numbers are ignored and
    never raise.
    """
    data, addr = sock.recvfrom(1024)
    boot = 0
    shutdown = 0
    name = "unknown"
    service = "unknown"
    msg = None
    interval = 0
    deltaT = 0.0
    acks = -1
    for pair in data.split(';'):
        # Split on the first '=' only, so a value may itself contain '='.
        fields = pair.split("=", 1)
        key = fields[0]
        if len(fields) == 2:
            val = fields[1]
        else:
            val = "0"
        if key == 'boot':
            boot += 1
        elif key == 'shutdown':
            shutdown += 1
        elif key == 'interval':
            try:
                interval = int(val)
            except ValueError:
                # A malformed interval must not take the daemon down.
                pass
        elif key == 'name':
            name = shortname(val)
        elif key == 'msg':
            msg = val
        elif key == 'service':
            service = val
        elif key == 'time':
            try:
                deltaT = now-float(val)
            except ValueError:
                pass
        elif key == 'acks':
            try:
                acks = int(val)
            except ValueError:
                acks = -1
    if boot:
        # NOTE(review): this shows "(-1)" only when no ack counter was sent,
        # while Host.dispstate() shows the counter when it is present --
        # possibly inverted; left unchanged pending confirmation.
        if acks == -1:
            a = "(%s)" % acks
        else:
            a = ""
        m = "%s booted, deltaT %0.2g sec %s" % (name, deltaT, a)
        log(m)
        if name in watchhosts:
            email("booted", m)
    if msg:
        m = "%s msg: %s" % (name, msg)
        log(m, service=service)
        if name in watchhosts:
            email("msg", m)
    fromaddr(name, addr[0], boot, interval, acks)
    if shutdown:
        m = "%s shutdown" % name
        log(m)
        if name in watchhosts:
            email("shutdown", m)
        try:
            hosts[name].newstate(Host.down)
        except Exception:
            # Best effort, as before: a display or lookup problem must not
            # prevent the reply.
            pass
    if interval > 0:
        try:
            hosts[name].interval = interval
        except Exception:
            pass
    rmsg = "ACK"
    if hosts[name].cmds:
        # Deliver the oldest queued command in place of the plain ACK.
        rmsg = hosts[name].cmds[0]
        email("%s cmd exec" % name, "command '%s' initiated" % hosts[name].cmds[0])
        del hosts[name].cmds[0]
        log("%s command initiated" % name)
    try:
        sock.sendto(rmsg, addr)
    except Exception:
        # The reply is best effort; the sender repeats its heartbeat anyway.
        pass
#
#
#
def initcurses():
    """Enter curses mode: no echo, unbuffered keys, keypad translation on."""
    global stdscr
    stdscr = curses.initscr()
    curses.noecho()
    curses.cbreak()
    stdscr.keypad(1)
    if not DEBUG:
        return
    sys.stderr.write("curses init done: %s\n" % stdscr)
def exitcurses():
    """Undo initcurses(): restore the terminal modes and leave curses mode."""
    curses.nocbreak()
    stdscr.keypad(0)
    curses.echo()
    curses.endwin()
#
#
class HtmlHandler(SocketServer.BaseRequestHandler):
    """Minimal HTTP handler for the status server.

    URLs:
      /                 status page (host table plus the last 30 log lines)
      /c?h=HOST&c=CMD   queue CMD (URL-encoded) for HOST
      /d?h=HOST         drop HOST from the table
    Anything else gets a 404 page.

    NOTE(review): /c and /d change state and queue remote commands without
    any authentication -- restrict who can reach this port.
    """
    # NOTE(review): SocketServer reads allow_reuse_address from the server
    # class, not from the handler; kept for compatibility only.
    allow_reuse_address = True

    def _esc(self, text):
        """Return *text* with HTML metacharacters escaped."""
        text = str(text)
        text = text.replace("&", "&amp;")
        text = text.replace("<", "&lt;")
        text = text.replace(">", "&gt;")
        return text.replace('"', "&quot;")

    def buildhead(self, title="Heartbeat", refresh=None):
        """Return the lines of the page head up to and including <body>.

        *refresh*, when given, is the auto-refresh period in seconds.
        """
        res = []
        res.append('<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">')
        res.append("<html>")
        res.append("<head>")
        res.append('<title>%s</title>' % (title))
        if refresh:
            res.append("<meta http-equiv = Refresh content = %d>\n" % refresh)
        res.append("</head>")
        res.append('<body BGCOLOR = "#FFFFFF" LINK = "#008000" VLINK = "#008000" BACKGROUND = "/~andreas/images/tile.marble.gif">')
        return res

    def buildpage(self):
        """Return the lines of the status page."""
        res = self.buildhead(refresh=60)
        res.append("<H2>Heartbeat status</h2><h4> %s (%s)</H4>" % (time.strftime("%H:%M:%S", time.localtime(now)), os.environ.get('TZ', 'CET-1CDT')))
        res.append("<table>")
        res.append("<tr><th>Host</th><th>State</th><th>IP Addr</th><th>Last change</th></tr>\n")
        for h in sorted(hosts):
            # Host names come from the network, so they are escaped;
            # dispstate() deliberately returns HTML.
            res.append("<tr><td>%-24s</td><td>%-7s</td><td>%-16s</td><td>%-17s</td></tr>\n" % (self._esc(h), hosts[h].dispstate(), hosts[h].addr, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(hosts[h].statetime))))
        res.append("</table>")
        res.append("<h4>Log of Events</h4>")
        for m in msgs[-30:]:
            # Log lines can contain text supplied by remote hosts.
            res.append("%s<BR>" % self._esc(m))
        res.append("</body></html>")
        return res

    def handle(self):
        """Read one HTTP request and send the matching page."""
        stamp = time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime(now))
        headers = [
            "Date: %s" % stamp,
            "Server: hbd",
            "Last-Modified: %s" % stamp,
            "Accept-Ranges: bytes",
            "Connection: close",
            "Content-Type: text/html; charset = ISO-8859-1",
        ]
        uri = '/unknown'
        f = self.request.makefile()
        try:
            # Read up to the blank line that ends the request headers.
            while 1:
                line = f.readline().strip()
                if not line:
                    break
                r = line.split()
                # A request line without a target keeps the default URI.
                if r[0] == "GET" and len(r) > 1:
                    uri = r[1]
        finally:
            f.close()
        upar = uri.split("?")
        # Query parameters by name; the first occurrence of a name wins.
        params = {}
        if len(upar) > 1:
            for item in upar[1].split("&"):
                key, sep, val = item.partition("=")
                if sep and key not in params:
                    params[key] = val
        uname = params.get("h", "")
        code = 200
        cause = "OK"
        if uri == "/":
            res = self.buildpage()
        elif upar[0] == "/c": # command on host /c?h=melschserver&c=sudo%20ls
            ucmd = params.get("c", "")
            res = self.buildhead()
            if ucmd != "" and uname != "" and uname in hosts:
                hosts[uname].cmds.append(urllib.unquote(ucmd))
                res.append("2Done")
            else:
                # Missing parameter or unknown host: say so instead of
                # failing on an unset variable.
                code = 400
                cause = "Bad Request"
                res.append("Failed")
        elif upar[0] == "/d": # drop host /d?h=melschserver
            res = self.buildhead()
            if uname != "" and uname in hosts:
                del hosts[uname]
                log("%s dropped" % uname)
                res.append("Done")
            else:
                code = 400
                cause = "Bad Request"
                res.append("Failed")
        else:
            code = 404
            cause = "Not Found"
            res = []
            res.append('<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">')
            res.append('<html><head>')
            res.append('<title>%s %s</title>' % (code, cause))
            res.append('</head><body>')
            res.append('<h1>%s</h1>' % (cause))
            # The URI is echoed back, so it is escaped.
            res.append('<p>The requested URL %s was not found on this server.</p>' % self._esc(uri))
            res.append('<hr>')
            res.append('<address>hbd (Unix) Server at %s Port %s</address>' % (hbd_host, hbd_port))
            res.append('</body></html>')
        head = ["HTTP/1.0 %s %s" % (code, cause)] + headers
        try:
            # sendall() keeps writing until everything is out; send() may
            # stop after a partial write.
            self.request.sendall("\r\n".join(head) + "\r\n\r\n" + "\n".join(res))
        except socket.error:
            # The client went away; nothing useful can be done.
            pass
def saveandrestart():
    """Close the sockets, log the restart and re-exec the program.

    The new process is started with the options collected in ``cmdargs``.
    State survives the restart because log() pickles it.
    """
    sock.close()
    # serv.shutdown() is deliberately not called: it waits for a running
    # serve_forever() loop, and this program only uses handle_request(),
    # so the call would block forever.
    serv.server_close()
    log("restarting")
    os.execv(sys.argv[0], [sys.argv[0]]+cmdargs)
def pickleit():
    """Write hosts, htab and msgs (in that order) to PICKFILE."""
    pickf = open(PICKFILE, 'w')
    try:
        pick = cPickle.Pickler(pickf)
        pick.dump(hosts)
        pick.dump(htab)
        pick.dump(msgs)
    finally:
        # Close the file even when a dump fails.
        pickf.close()
#
# Main
#
# Command line handling. cmdargs collects the options that must be passed
# on when the program re-executes itself (see saveandrestart()).
helpflag = False
forground = False
optlist = []
args = []
# expanduser() also works when HOME is missing from the environment.
home = os.path.expanduser("~")
cmdargs = []
configfile = "%s/.hbrc" % home
try:
    # -h takes no argument; only -c does.
    optlist, args = getopt.getopt(sys.argv[1:], 'c:dfhv')
except getopt.GetoptError:
    helpflag = True
for o, a in optlist:
    if o == '-c':
        configfile = a
        cmdargs += [o, a]
    elif o == '-d':
        visual = True
        cmdargs += [o]
    elif o == '-f':
        forground = True
        cmdargs += [o]
    elif o == '-h':
        helpflag = True
    elif o == '-v':
        verbose = True
        cmdargs += [o]
if helpflag:
    print("hbc HeartBeatDaemon")
    print("usage: hbd [-dfhv] [-c configfile]")
    print("")
    print(" -c configfile")
    print(" -d display")
    print(" -f run in foreground")
    print(" -h this help")
    print(" -v verbose")
    print("")
    print(""" config file can contain
logfile = /var/log/heartbeat.log
logfmt = [text|msg]
hb_port = 50003
interval = 20
hbd_port = 50004
hbd_host = www.domain.com
grace = 2
""")
    sys.exit(1)
# The curses display needs the terminal, so it implies foreground.
if visual:
    forground = True
#
# set defaults
hb_port = PORT
hbd_host = THOST
hbd_port = TPORT
logfile = LOGFILE
logfmt = "text"
interval = INTERVAL
grace = GRACE
watchhosts = []
dyndnshosts = []
drophosts = []
try:
    f = open(configfile, "r")
    if verbose:
        print("notice: using config file %s" % configfile)
except (IOError, OSError):
    print("warning: running without conifig file: %s" % configfile)
    f = None
if f:
    try:
        for l in f:
            # Remove only the line terminator, so an unterminated last
            # line keeps its final character.
            line = l.rstrip("\n")
            if verbose:
                print(" %s" % line)
            # Split on the first '=' so values may contain '='; lines
            # without '=' (blank lines, comments) are skipped.
            parts = line.split('=', 1)
            if len(parts) != 2:
                continue
            # Accept both "key=value" and "key = value".
            key = parts[0].strip()
            val = parts[1].strip()
            # NOTE(review): eval() runs arbitrary expressions from the
            # config file. That is acceptable only while the file is
            # controlled by the user running hbd; int() or
            # ast.literal_eval() would be safer.
            if key == 'interval':
                interval = eval(val)
            elif key == 'grace':
                grace = eval(val)
            elif key == 'hbd_port':
                hbd_port = eval(val)
            elif key == 'hbd_host':
                hbd_host = val
            elif key == 'hb_port':
                hb_port = eval(val)
            elif key == 'logfile':
                logfile = val
            elif key == 'logfmt':
                logfmt = val
            elif key == 'watchhosts':
                watchhosts = eval(val)
            elif key == 'dyndnshosts':
                dyndnshosts = eval(val)
            elif key == 'drophosts':
                drophosts = eval(val)
    finally:
        f.close()
if len(args) != 0:
    print("error: args")
    sys.exit(1)
if verbose:
    print("notice: logging to %s" % logfile)
logf = initlog(logfile)
# Restore the state saved by pickleit(), if there is any.
if os.path.exists(PICKFILE):
    # NOTE(review): unpickling executes whatever the file describes, and
    # PICKFILE is in a directory other users can write to -- consider a
    # private state directory.
    pickf = open(PICKFILE, 'r')
    try:
        pick = cPickle.Unpickler(pickf)
        saved_hosts = pick.load()
        saved_htab = pick.load()
        saved_msgs = pick.load()
    except Exception:
        # Unreadable or truncated: discard the file and start empty.
        pickf.close()
        os.unlink(PICKFILE)
    else:
        pickf.close()
        # Assign only after all three loaded, so the tables never mix
        # restored and fresh state.
        hosts, htab, msgs = saved_hosts, saved_htab, saved_msgs
for h in drophosts:
    if h in hosts: # was: hosts.has_key(h):
        del hosts[h]
now = time.time()
startsec = int(now) % interval
# Start-up: display, sockets, optional daemonization, signal handlers.
if visual:
    import curses
    initcurses()
    display()
    stdscr.nodelay(1)
log("Starting %s" % VER)
atexit.register(on_exit)
# Everything the main loop waits on with select().
ilist = []
sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
# SO_REUSEADDR only affects a later bind(), so it is set first.
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
sock.bind(("", hb_port))
ilist.append(sock)
# The server class, not the handler, decides about address reuse; without
# it a restart can fail while old connections linger in TIME_WAIT.
SocketServer.TCPServer.allow_reuse_address = True
serv = SocketServer.TCPServer((hbd_host, hbd_port), HtmlHandler)
ilist.append(serv.fileno())
if not forground:
    pid = os.fork()
    if pid > 0:
        # Parent: report the child's pid and leave.
        if verbose:
            print("daemoinizing... pid = %d" % pid)
        sys.exit(0)
    # Child: detach from the terminal and silence the standard streams.
    # NOTE(review): descriptors 0-2 are closed but not reopened on
    # /dev/null, so later opens can be assigned those numbers.
    verbose = False
    os.close(0)
    os.close(1)
    os.close(2)
    sys.stdin.close()
    sys.stdout = NullDevice()
    sys.stderr = NullDevice()
    os.chdir("/")
    os.setsid()
    os.umask(0)
up = 1
sig = 0
signal.signal(signal.SIGTERM, handler)
signal.signal(signal.SIGHUP, handler)
next = int(now)+15 # 15 seconds time to settle after (re-)start
sleep = next - now
# Main loop: runs until a signal handler clears `up` or 'q' is pressed.
# Each pass polls the keyboard (display mode only), waits up to `sleep`
# seconds for socket activity, serves what arrived and, once `next` has
# been reached, checks for overdue hosts.
# NOTE(review): nesting reconstructed from a de-indented copy -- confirm.
while up:
    if visual:
        # Keys: c = clear messages, q = quit, d = toggle DEBUG, v = toggle verbose.
        c = stdscr.getch()
        if c == ord('c'):
            msgs = []
            display()
        elif c == ord('q'):
            break # Exit the while()
        elif c == ord('d'):
            DEBUG = not DEBUG
        elif c == ord('v'):
            verbose = not verbose
        # elif c == ord('p'):
        # PrintDocument()
        # elif c == ord('x'):
        # x = y = 0
    try:
        sr = select.select(ilist, [], [], sleep)
        now = time.time()
    except KeyboardInterrupt:
        sys.exit(0)
    except select.error, value:
        if value[0] != 4: # interrupted system call
            print select.error, value
            #raise os.error, value
            continue
        # Interrupted by a signal: rebuild the screen and re-test `up`,
        # which the signal handler may have cleared.
        if visual:
            exitcurses()
            initcurses()
            display()
        continue
    for fh in sr[0]:
        # ilist holds the UDP socket object and the TCP server's descriptor.
        if fh == sock:
            readsock()
        if fh == serv.fileno():
            serv.handle_request()
    if now >= next:
        # Housekeeping, at most once per second.
        next = now+1
        checkoverdue()
        if visual:
            stdscr.move(1, 0)
            stdscr.clrtoeol()
            displaytime()
    sleep = next-now
    if sleep < 0:
        sys.stderr.write("sleep is negaitive! %s next = %s\n" % (sleep, next))
        sleep = 0
    if DEBUG:
        sys.stderr.write("sleep = %s next = %s\n" % (sleep, next))
# A SIGHUP ends the loop and asks for a restart with the same options.
if sig == signal.SIGHUP:
    saveandrestart()