#!/usr/bin/env python
# $Id: hbd,v 1.38 2013/07/14 02:25:05 andreas Exp $
# Wait for heartbeat messages and act on them (or their absence)
#
VER = 1.38

import time, os, string, sys, socket, atexit, select, SocketServer, getopt, signal, cPickle, smtplib, traceback, urllib

from subprocess import Popen, STDOUT, PIPE


# Boolean names defined as plain integers (presumably a shim for interpreters
# that lack the built-in constants -- TODO confirm it is still needed).
False = 0
True = 1
# Default log file; copied to `logfile` below and overridable in the config file.
LOGFILE = "/home/andreas/public_html/messages/andreas"
# File that pickleit() dumps hosts/htab/msgs to and start-up restores them from.
PICKFILE = "/var/tmp/hbd.pick"
# Recipients of the notification mails sent by email().
AEMAIL = ["andreas@wrede.ca"]
# Inserted into the mail subject: "Info from <NAME>: ...".
# NOTE(review): possibly meant to read "heartbeat" -- confirm before changing.
NAME = "heatbeat"
# SMTP server that email() connects to.
SMTPSERVER = "localhost"

# short host name -> Host instance
hosts = {}
# IP address -> short host name
htab = {}

# Formatted log lines; shown in the curses message window and on the HTML page.
msgs = []

# Index given to the next Host that is created (Host.__init__ increments it).
num = 0
upcount = 0
# Default UDP port for incoming heartbeats (default of `hb_port`).
PORT = 50003
# Default TCP port of the HTML status server (default of `hbd_port`).
TPORT = 50004
# Default bind address of the HTML status server (default of `hbd_host`).
THOST = ""
# Extra diagnostics; toggled with the 'd' key in the curses display.
DEBUG = False
# Set by -v; toggled with the 'v' key in the curses display.
verbose = False

# Default of `interval`.
INTERVAL = 10
# Default of `grace`: seconds added to a host's interval before it is overdue.
GRACE = 2

# Non-zero when the curses display is active (set by -d).
visual = 0
# Timezone used for all local time formatting in this process.
os.environ['TZ'] = 'EST5EDT'

# curses objects; they stay None until initcurses()/initwin() create them.
stdscr = None
win = None
msgw = None
msgwB = None
# Height of the message window; recomputed by initwin().
msgwHeight = 10

def handler(signum, frame):
	"""Signal handler: record the signal number and stop the main loop.

	Only the first signal counts; once `up` has been cleared any further
	signal is ignored.
	"""
	global up, sig
	if up != 0:
		sig = signum
		if verbose:
			print("signal: %s up: %d" % (sig, up))
		up = 0


def shortname(name):
	"""Return the part of *name* in front of the first '.'.

	A name without a dot is returned unchanged.  The str method is used
	instead of the deprecated string-module function.
	"""
	return name.split('.', 1)[0]


class NullDevice:
	"""Stand-in for sys.stdout / sys.stderr that throws all output away."""

	def write(self, s):
		"""Accept *s* and discard it."""
		return None


class Host:
	"""Book-keeping record for one monitored host."""

	# the values `state` can take
	up = "up"
	down = "down"
	overdue = "overdue"

	def __init__(self, name, addr):
		"""Create a host that is considered up as of now.

		name -- host name; only the part before the first '.' is kept
		addr -- address the host was first seen at
		"""
		global num
		created = time.time()
		self.name = shortname(name)
		self.addr = addr
		# display row; the module-wide counter is advanced for the next host
		self.num = num
		num += 1
		self.lastbeat = created
		self.statetime = created
		self.state = "up"
		self.upcount = 0
		self.interval = 0
		# -1 until a heartbeat reports an acks value
		self.doesack = -1
		# commands waiting to be handed out with the next reply
		self.cmds = []

	def getstate(self):
		"""Return the current state string."""
		return self.state

	def dispstate(self):
		"""Return the state as HTML: bold when down or overdue, with the
		acks value appended in parentheses when one is known."""
		text = "%s" % self.state
		if self.state in ("down", "overdue"):
			text = "<b>%s</b>" % self.state
		if self.doesack == -1:
			return text
		return "%s(%s)" % (text, self.doesack)

	def newstate(self, state, when=0):
		"""Switch to *state*, dated *when* seconds in the past.

		Returns the number of seconds spent in the previous state.
		"""
		stamp = time.time() - when
		elapsed = stamp - self.statetime
		self.state = state
		self.statetime = stamp
		if visual:
			displaystatetime(self.name)
		return elapsed


def email(s, msg):
	"""Mail a notification to every address in AEMAIL.

	s   -- short event tag, becomes part of the Subject line
	msg -- text of the message body

	Returns "OK" when the message was handed to the SMTP server and "Fail"
	otherwise.  If all recipients are refused the failure is only logged;
	any other SMTP problem is printed and the daemon restarts itself
	through saveandrestart().
	"""
	ret = "OK"
	toaddrs = AEMAIL
	fromaddr = "aew.heartbeat@wrede.ca"
	subj = "Info from %s: %s" % (NAME, s)
	date = time.strftime("%a, %d %b %Y %H:%M:%S %z", time.localtime())
	# Name every recipient in the To: header (it used to show only the
	# first one, and raised IndexError for an empty list).
	body = "To: %s\nFrom: %s\nSubject: %s\nDate: %s\n\n%s" % (", ".join(toaddrs), fromaddr, subj, date, msg)
	# stays None when the connection itself fails, so quit() is skipped
	server = None
	try:
		server = smtplib.SMTP(SMTPSERVER)
		if DEBUG:
			server.set_debuglevel(1)
		server.sendmail(fromaddr, toaddrs, body)
	except smtplib.SMTPRecipientsRefused:
		errs = sys.exc_info()[1]
		log("cannot send email: %s\n" % (errs))
		ret = "Fail"
	except Exception:
		ret = "Fail"
		print("smtp error: "+traceback.format_exc())
		saveandrestart()
	if server is not None:
		try:
			server.quit()
		except Exception:
			# best effort: the connection may already be gone
			pass
	return ret


#
# nsupdate:  set the DNS A record for a fqdn
#	return: None if ok, else error text
def nsupdate(hostname, newip):
	"""Replace the A and TXT records of <hostname>.dy.wapanafa.org.

	The update script is piped into /usr/bin/nsupdate.  Returns None when
	the tool's output contains 'status: NOERROR'; otherwise returns an
	error text or the tool's combined stdout/stderr output.
	"""
	fields = {
		'domain': 'dy.wapanafa.org',
		'fqdn': '%s.dy.wapanafa.org' % hostname,
		'dnsttl': '5',
		'newip': newip,
		'ts': time.strftime('%Y-%m-%d.%H:%M:%S', time.gmtime()),
	}
	template = (
		'update delete %(fqdn)s A\n'
		'update add %(fqdn)s %(dnsttl)s A %(newip)s\n'
		'update delete %(fqdn)s TXT\n'
		'update add %(fqdn)s %(dnsttl)s TXT "Created: %(ts)s"\n'
		'send\n'
		'answer\n'
		'\n'
	)
	script = template % fields
	keyfile = "/etc/dhcpc/K%(domain)s.+157+00000." % fields
	argv = ["/usr/bin/nsupdate", "-k", keyfile, "-v"]
	try:
		proc = Popen(argv, shell=False, bufsize=1, stdin=PIPE, stdout=PIPE, stderr=STDOUT)
	except OSError:
		return "nsupdate: execution failed: %s" % sys.exc_info()[1]
	except:
		return "nsupdate: some error occured"

	# stderr is merged into stdout, so only the first element carries data
	output = proc.communicate(script)[0]
	if 'status: NOERROR' in output:
		return None
	return output


#
#
def dur(sec):
	"""Format a duration given in seconds as "H:MM:SS", "M:SS" or "0:SS".

	sec -- number of seconds; any fraction is dropped

	divmod() is used so that the result does not depend on how the "/"
	operator divides integers.
	"""
	hours, rest = divmod(int(sec), 3600)
	minutes, seconds = divmod(rest, 60)
	if hours > 0:
		return "%d:%02d:%02d" % (hours, minutes, seconds)
	if minutes > 0:
		return "%d:%02d" % (minutes, seconds)
	return "0:%02d" % seconds


#
#
def addhost(name, addr):
	"""Register a host, or record the new address of a known one.

	name -- host name; only the part before the first '.' is used as key
	addr -- address the host is now seen at

	A known host gets its address replaced and the change is logged.  A
	new host is created and all hosts are renumbered in name order so the
	display rows stay sorted.
	"""
	sname = shortname(name)
	if sname in hosts:
		host = hosts[sname]
		# The old address may be missing from htab; fromaddr() guards the
		# same delete, so do not let a KeyError escape here either.
		htab.pop(host.addr, None)
		host.addr = addr
		if visual:
			displayaddr(sname)
		htab[addr] = sname
		log("%s, changed address to %s" % (sname, addr))
	else:
		hosts[sname] = Host(sname, addr)
		for row, key in enumerate(sorted(hosts)):
			hosts[key].num = row
		htab[addr] = sname
		if visual:
			display()


#
def on_exit():
	"""Exit hook: restore the terminal if curses is active, close the log
	file and print a final "exit"."""
	if visual:
		exitcurses()
	logf.close()
	print("exit")


def initlog(logfile):
	"""Open *logfile* for appending and return the file object."""
	handle = open(logfile, "a")
	return handle


#
#
def initwin():
	"""(Re)create the curses windows: the host table, the bordered frame
	of the message area and the scrolling message window inside it.

	Also recomputes msgwHeight from the screen size and draws the version
	banner on the top line.
	"""
	global win, msgw, msgwB, msgwHeight

	rows, cols = stdscr.getmaxyx()
	left = 0
	top = 2
	# one row per entry in htab plus the two border rows
	tableh = len(htab) + 2
	if DEBUG:
		log("initwin called with %d" % tableh)

	win = curses.newwin(tableh, cols, top, left)
	win.border(0, 0, 0, 0, 0, 0, curses.ACS_LTEE, curses.ACS_RTEE)

	msgwB = curses.newwin(0, 0, tableh + 1, left)
	msgwB.border(0, 0, 0, 0, curses.ACS_LTEE, curses.ACS_RTEE)

	msgwHeight = rows - tableh - 3
	msgw = curses.newwin(msgwHeight, cols - 2, tableh + 2, left + 1)
	msgw.setscrreg(0, msgwHeight - 1)
	msgw.scrollok(1)

	stdscr.addstr(0, 0, "hbd Version %s" % VER, curses.A_BOLD)
	stdscr.refresh()
	msgwB.refresh()


#
def checkoverdue():
	"""Mark every host that is up but has been silent for longer than its
	interval plus the grace period as overdue.

	The event is logged, and mailed when the host is in watchhosts.
	"""
	for name, host in hosts.items():
		if host.state == Host.up and now - host.lastbeat > host.interval + grace:
			text = "%s is overdue" % name
			if name in watchhosts:
				email("overdue", text)
			host.newstate(Host.overdue, grace)
			log(text)


#
#
def displaytime():
	"""Redraw the clock in the top right corner and the state column of
	every host row.

	In verbose mode the column shows the time since the last heartbeat
	for hosts that are not down; overdue hosts are drawn bold.
	"""
	rows, cols = stdscr.getmaxyx()
	clock = time.strftime("%H:%M:%S", time.localtime(now))
	stdscr.addstr(0, cols-8, clock, curses.A_BOLD)

	for host in hosts.values():
		if verbose and host.state != Host.down:
			cell = dur(now-host.lastbeat)
		else:
			cell = host.getstate()
		if host.state == Host.overdue:
			attr = curses.A_BOLD
		else:
			attr = 0
		win.addstr(host.num+1, 25, "%8s" % cell, attr)
	win.refresh()
	stdscr.refresh()


#
#
def displaystatetime(h, refresh=1):
	"""Write the time of host *h*'s last state change into its table row;
	repaint the window unless *refresh* is false."""
	host = hosts[h]
	stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(host.statetime))
	win.addstr(host.num+1, 60, "%-17s" % stamp)
	if refresh:
		win.refresh()


#
#
def displayaddr(h, refresh=1):
	"""Write the address of host *h* into its table row; repaint the
	window unless *refresh* is false."""
	host = hosts[h]
	win.addstr(host.num+1, 35, "%-16s" % host.addr)
	if refresh:
		win.refresh()


#
#
def displaybody():
	"""Fill in name, address and state-change time for every host row,
	then repaint the table window once."""
	for name, host in hosts.items():
		win.addstr(host.num+1, 1, "%-25s" % (name))
		if host.addr is not None:
			displayaddr(name, 0)
		if host.statetime is not None:
			displaystatetime(name, 0)
	win.refresh()


#
#
def displaymsgs():
	"""Paint the newest log lines, one per row, into the message window."""
	tail = msgs[len(msgs)-msgwHeight:]
	for row, text in enumerate(tail):
		msgw.addstr(row, 0, text)
	msgw.refresh()


#
#
def display():
	"""Rebuild the complete curses screen; does nothing without -d."""
	if not visual:
		return
	initwin()
	displaytime()
	displaybody()
	displaymsgs()


def log(m, service="heartbeat"):
	"""Record the event text *m*.

	The time-stamped line is appended to msgs, written to the log file
	(as "<now>|<service>|<m>" when logfmt is "msg", otherwise as the
	time-stamped line), shown in the curses message window if there is
	one, and the state is pickled afterwards.
	"""
	stamp = time.strftime("%b %d %H:%M:%S", time.localtime(time.time()))
	msg = stamp+": "+m+"\n"
	msgs.append(msg)

	if logfmt == "msg":
		record = "%d|%s|%s\n" % (now, service, m)
	else:
		record = msg
	logf.write(record)
	logf.flush()

	if msgw is not None:
		msgw.addstr(msg)
		msgw.clrtoeol()
		msgw.refresh()
	pickleit()


#
#
def fromaddr(name, addr, boot, interval, acks):
	"""Account for one heartbeat from host *name* seen at *addr*.

	name     -- short host name
	addr     -- sender IP address
	boot     -- accepted but not used here
	interval -- interval announced in the packet; 0 if none was given
	acks     -- acks value from the packet, -1 when absent

	Unknown hosts are registered.  An address change is logged (and
	pushed to DNS for hosts in dyndnshosts); a host that is not up and
	announces an interval is switched back to up.  watchhosts members
	get a mail for either event.
	"""
	if name not in hosts:
		addhost(name, addr)
	host = hosts[name]
	host.doesack = acks

	if host.addr != addr:
		if host.addr in htab:
			del htab[host.addr]
		host.addr = addr
		htab[addr] = name
		parts = ["%s changed address to %s" % (host.name, addr)]
		if name in dyndnshosts:
			err = nsupdate(name, addr)
			if err:
				parts.append(", DNS failed: %s" % err)
			else:
				parts.append(", DNS updated.")
		text = "".join(parts)
		log(text)
		if name in watchhosts:
			email("address change", text)

	host.lastbeat = now
	if interval > 0 and host.getstate() != Host.up:
		previous = host.state
		elapsed = host.newstate(Host.up)
		log("%s, back after being %s for %s" % (host.name, previous, dur(elapsed)))
		if name in watchhosts:
			email("back", name)
	host.upcount += 1


#
#
def _parsebeat(data):
	"""Decode one heartbeat datagram of the form "key=value;key=value...".

	Returns a dict with the keys boot, shutdown, name, service, msg,
	interval, deltaT and acks.  Fields that are missing or cannot be
	parsed keep their defaults, so a malformed packet can never raise.
	"""
	beat = {
		'boot': 0,
		'shutdown': 0,
		'name': "unknown",
		'service': "unknown",
		'msg': None,
		'interval': 0,
		'deltaT': 0.0,
		'acks': -1,
	}
	for pair in data.split(';'):
		# split only once so that a value may itself contain '='
		fields = pair.split('=', 1)
		key = fields[0]
		if len(fields) == 2:
			val = fields[1]
		else:
			val = "0"
		if key == 'boot':
			beat['boot'] += 1
		elif key == 'shutdown':
			beat['shutdown'] += 1
		elif key == 'interval':
			# the value comes off the network: ignore garbage instead of
			# letting a ValueError take the daemon down
			try:
				beat['interval'] = int(val)
			except ValueError:
				pass
		elif key == 'name':
			beat['name'] = shortname(val)
		elif key in ('msg', 'service'):
			beat[key] = val
		elif key == 'time':
			try:
				beat['deltaT'] = now-float(val)
			except ValueError:
				pass
		elif key == 'acks':
			try:
				beat['acks'] = int(val)
			except ValueError:
				beat['acks'] = -1
	return beat


def readsock():
	"""Receive one heartbeat datagram, act on it and send the reply.

	Boot, message and shutdown events are logged (and mailed for hosts in
	watchhosts), the host record is updated through fromaddr(), and the
	sender is answered with "ACK" or with the next command queued for it.
	"""
	data, addr = sock.recvfrom(1024)
	beat = _parsebeat(data)
	name = beat['name']
	interval = beat['interval']
	acks = beat['acks']

	if beat['boot']:
		# Show the acks value only when the sender supplied one, the same
		# way Host.dispstate() does (the test used to be the other way
		# round and printed "(-1)").
		if acks != -1:
			a = "(%s)" % acks
		else:
			a = ""
		m = "%s booted, deltaT %0.2g sec %s" % (name, beat['deltaT'], a)
		log(m)
		if name in watchhosts:
			email("booted", m)
	if beat['msg']:
		m = "%s msg: %s" % (name, beat['msg'])
		log(m, service=beat['service'])
		if name in watchhosts:
			email("msg", m)

	fromaddr(name, addr[0], beat['boot'], interval, acks)
	# fromaddr() has registered the host if it was unknown
	host = hosts[name]

	if beat['shutdown']:
		m = "%s shutdown" % name
		log(m)
		if name in watchhosts:
			email("shutdown", m)
		try:
			host.newstate(Host.down)
		except Exception:
			# best effort: a failing screen update must not lose the packet
			pass
	if interval > 0:
		host.interval = interval

	rmsg = "ACK"
	if host.cmds:
		rmsg = host.cmds[0]
		email("%s cmd exec" % name, "command '%s' initiated" % rmsg)
		# dequeue only after the mail, as before
		del host.cmds[0]
		log("%s command initiated" % name)
	try:
		sock.sendto(rmsg, addr)
	except socket.error:
		# the reply is best effort; the sender simply gets no answer
		pass


#
#
#
def initcurses():
	"""Start curses and store the main screen in the global stdscr.

	Echo is switched off, cbreak mode and keypad translation are switched
	on.
	"""
	global stdscr
	stdscr = curses.initscr()
	curses.noecho()
	curses.cbreak()
	stdscr.keypad(1)
	if not DEBUG:
		return
	sys.stderr.write("curses init done: %s\n" % stdscr)


def exitcurses():
	"""Undo initcurses(): switch cbreak and keypad off, echo back on, and
	end curses."""
	curses.nocbreak()
	stdscr.keypad(0)
	curses.echo()
	# must come last: hands the terminal back
	curses.endwin()


#
#
def _htmlesc(text):
	"""Return str(text) with &, <, > and " replaced by HTML entities."""
	text = str(text)
	for raw, entity in (("&", "&amp;"), ("<", "&lt;"), (">", "&gt;"), ('"', "&quot;")):
		text = text.replace(raw, entity)
	return text


class HtmlHandler(SocketServer.BaseRequestHandler):
	"""Answer one HTTP request on the status port.

	GET /                    status page: host table plus the last log lines
	GET /c?h=HOST&c=COMMAND  queue COMMAND for HOST (sent with its next reply)
	GET /d?h=HOST            drop HOST from the table

	Anything else is answered with 404; /c and /d requests with missing
	parameters get 400 and with an unknown host 404.

	SECURITY: /c and /d change state and are not authenticated; expose the
	port to trusted networks only.
	"""
	# NOTE(review): SocketServer reads allow_reuse_address from the *server*
	# class; on a request handler it appears to have no effect.  Kept so the
	# attribute stays available to anything that reads it.
	allow_reuse_address = True

	def buildhead(self, title="Heartbeat", refresh=None):
		"""Return the opening lines of a page as a list of strings.

		title   -- text of the <title> element
		refresh -- if set, seconds for a meta refresh tag
		"""
		res = [
			'<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">',
			"<html>",
			"<head>",
			'<title>%s</title>' % (title,),
		]
		if refresh:
			res.append("<meta http-equiv = Refresh content = %d>\n" % refresh)
		res.append("</head>")
		res.append('<body BGCOLOR = "#FFFFFF" LINK = "#008000" VLINK = "#008000" BACKGROUND = "/~andreas/images/tile.marble.gif">')
		return res

	def buildpage(self):
		"""Return the status page (host table and last 30 log lines) as a
		list of strings."""
		res = self.buildhead(refresh=60)
		res.append("<H2>Heartbeat status</h2><h4> %s (%s)</H4>" % (time.strftime("%H:%M:%S", time.localtime(now)), os.environ.get('TZ', 'CET-1CDT')))
		res.append("<table>")
		res.append("<tr><th>Host</th><th>State</th><th>IP Addr</th><th>Last change</th></tr>\n")
		for h in sorted(hosts):
			host = hosts[h]
			changed = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(host.statetime))
			# Name and address originate from network packets and are
			# escaped; dispstate() returns markup on purpose.
			res.append("<tr><td>%-24s</td><td>%-7s</td><td>%-16s</td><td>%-17s</td></tr>\n" % (_htmlesc(h), host.dispstate(), _htmlesc(host.addr), changed))
		res.append("</table>")
		res.append("<h4>Log of Events</h4>")
		for m in msgs[-30:]:
			res.append("%s<BR>" % _htmlesc(m))
		res.append("</body></html>")
		return res

	def _errorpage(self, code, cause, detail):
		"""Return a small error page; *detail* must already be HTML-safe."""
		return [
			'<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">',
			'<html><head>',
			'<title>%s %s</title>' % (code, cause),
			'</head><body>',
			'<h1>%s</h1>' % (cause),
			'<p>%s</p>' % detail,
			'<hr>',
			'<address>hbd (Unix) Server at %s Port %s</address>' % (hbd_host, hbd_port),
			'</body></html>',
		]

	def handle(self):
		"""Read the request, build the matching page and send the reply."""
		stamp = time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime(now))
		headers = [
			"Date: %s" % stamp,
			"Server: hbd",
			"Last-Modified: %s" % stamp,
			"Accept-Ranges: bytes",
			"Connection: close",
			"Content-Type: text/html; charset=ISO-8859-1",
		]

		# Read the request line and headers up to the first empty line.
		uri = '/unknown'
		f = self.request.makefile()
		while 1:
			line = f.readline().strip()
			if not line:
				break
			r = line.split()
			if r[0] == "GET" and len(r) > 1:
				uri = r[1]

		# Split off the query string and collect its key=value pairs.
		upar = uri.split("?", 1)
		path = upar[0]
		params = {}
		if len(upar) == 2:
			for arg in upar[1].split("&"):
				pair = arg.split("=", 1)
				if len(pair) == 2:
					params[pair[0]] = pair[1]

		code = 200
		cause = "OK"
		if uri == "/":
			res = self.buildpage()

		elif path == "/c":	# command on host /c?h=melschserver&c=sudo%20ls
			uname = params.get("h", "")
			ucmd = params.get("c", "")
			if uname == "" or ucmd == "":
				code, cause = 400, "Bad Request"
				res = self._errorpage(code, cause, "usage: /c?h=HOST&amp;c=COMMAND")
			elif uname not in hosts:
				code, cause = 404, "Not Found"
				res = self._errorpage(code, cause, "Unknown host %s." % _htmlesc(uname))
			else:
				hosts[uname].cmds.append(urllib.unquote(ucmd))
				res = self.buildhead()
				res.append("2Done")

		elif path == "/d":	# drop host  /d?h=melschserver
			uname = params.get("h", "")
			if uname == "":
				code, cause = 400, "Bad Request"
				res = self._errorpage(code, cause, "usage: /d?h=HOST")
			elif uname not in hosts:
				code, cause = 404, "Not Found"
				res = self._errorpage(code, cause, "Unknown host %s." % _htmlesc(uname))
			else:
				del hosts[uname]
				log("%s dropped" % uname)
				res = self.buildhead()
				res.append("Done")

		else:
			code, cause = 404, "Not Found"
			res = self._errorpage(code, cause, 'The requested URL %s was not found on this server.' % _htmlesc(uri))

		# Status line, headers, blank line, body -- sent in one piece so a
		# short write cannot truncate the reply.
		out = ["HTTP/1.0 %s %s" % (code, cause)]
		out.extend(headers)
		response = "\r\n".join(out) + "\r\n\r\n" + "\n".join(res)
		try:
			self.request.sendall(response)
		except socket.error:
			# the client went away; nothing left to do
			pass


def saveandrestart():
	"""Close both sockets and replace this process with a fresh copy of
	the daemon, started with the options collected in cmdargs.

	The state survives the restart because log() pickles it.
	"""
	sock.close()
	# serv.shutdown() is deliberately not called: it waits for a running
	# serve_forever() loop to stop, and this program only ever calls
	# handle_request(), so the call would never return.
	serv.server_close()
	log("restarting")
	# NOTE(review): argv[0] is used as given; if it is a relative path and
	# the daemon has changed directory, the exec may fail -- confirm.
	os.execv(sys.argv[0], [sys.argv[0]]+cmdargs)


def pickleit():
	"""Dump hosts, htab and msgs (in that order) to PICKFILE.

	The file is closed even when a dump fails, so no handle leaks.
	"""
	pickf = open(PICKFILE, 'w')
	try:
		pick = cPickle.Pickler(pickf)
		pick.dump(hosts)
		pick.dump(htab)
		pick.dump(msgs)
	finally:
		pickf.close()


#
# Main
#

helpflag = False
forground = False
optlist = []
args = []
# expanduser() also works when $HOME is not set in the environment
home = os.path.expanduser("~")
# options that saveandrestart() passes on to the new process
cmdargs = []
configfile = "%s/.hbrc" % home

try:
	# -c takes an argument; -d, -f, -h and -v are plain flags
	optlist, args = getopt.getopt(sys.argv[1:], 'c:dfhv')
except getopt.GetoptError:
	# unknown option or missing argument: show the usage text
	helpflag = True

for o, a in optlist:
	if o == '-c':
		configfile = a
		cmdargs += [o, a]
	elif o == '-d':
		visual = True
		cmdargs += [o]
	elif o == '-f':
		forground = True
		cmdargs += [o]
	elif o == '-h':
		helpflag = True
	elif o == '-v':
		verbose = True
		cmdargs += [o]


if helpflag:
	print("hbc HeartBeatDaemon")
	print("usage: hbd [-dfhv] [-c configfile]")
	print("")
	print("	-c configfile")
	print("	-d display")
	print("	-f run in foreground")
	print("	-h this help")
	print("	-v verbose")
	print("")
	print(""" config file can contain
logfile = /var/log/heartbeat.log
logfmt = [text|msg]
hb_port = 50003
interval = 20
hbd_port = 50004
hbd_host = www.domain.com
grace = 2
""")

	sys.exit(1)

# the curses display needs the terminal, so it implies foreground mode
if visual:
	forground = True
#
# set defaults

hb_port = PORT
hbd_host = THOST
hbd_port = TPORT
logfile = LOGFILE
logfmt = "text"
interval = INTERVAL
grace = GRACE
watchhosts = []
dyndnshosts = []
drophosts = []

try:
	f = open(configfile, "r")
except IOError:
	print("warning: running without conifig file: %s" % configfile)
	f = None
else:
	if verbose:
		print("notice: using config file %s" % configfile)

if f:
	# One "key = value" setting per line.  Blanks around key and value are
	# ignored, so both "key=value" and the "key = value" form shown in the
	# usage text work.  Lines without '=' and unknown keys are skipped.
	#
	# SECURITY: numeric and list values are passed to eval(), which runs
	# arbitrary Python; the config file must be writable by the daemon's
	# owner only.
	try:
		for l in f:
			l = l.rstrip("\r\n")
			if verbose:
				print("  %s" % l)
			r = l.split('=', 1)
			if len(r) != 2:
				continue
			key = r[0].strip()
			val = r[1].strip()
			if key == 'interval':
				interval = eval(val)
			elif key == 'grace':
				grace = eval(val)
			elif key == 'hbd_port':
				hbd_port = eval(val)
			elif key == 'hbd_host':
				hbd_host = val
			elif key == 'hb_port':
				hb_port = eval(val)
			elif key == 'logfile':
				logfile = val
			elif key == 'logfmt':
				logfmt = val
			elif key == 'watchhosts':
				watchhosts = eval(val)
			elif key == 'dyndnshosts':
				dyndnshosts = eval(val)
			elif key == 'drophosts':
				drophosts = eval(val)
	finally:
		f.close()

# the daemon takes options only, no positional arguments
if len(args) != 0:
	print("error: args")
	sys.exit(1)


if verbose:
	print("notice: logging to %s" % logfile)
logf = initlog(logfile)

if os.path.exists(PICKFILE):
	# SECURITY: unpickling can execute code stored in the file, and /var/tmp
	# is normally writable by every local user -- consider moving PICKFILE
	# to a directory only the daemon's owner can write.
	pickf = open(PICKFILE, 'r')
	try:
		try:
			pick = cPickle.Unpickler(pickf)
			# same order as pickleit() wrote them: hosts, htab, msgs
			saved = (pick.load(), pick.load(), pick.load())
		finally:
			pickf.close()
	except Exception:
		# unreadable or truncated state: discard it and start empty
		os.unlink(PICKFILE)
	else:
		# commit only a complete load so the three tables stay consistent
		hosts, htab, msgs = saved
	for h in drophosts:
		if h in hosts:		# was: hosts.has_key(h):
			del hosts[h]


# `now` is the shared "current time"; the main loop refreshes it after
# every select() call.
now = time.time()
startsec = int(now) % interval

if visual:
	import curses
	initcurses()
	display()
	# getch() in the main loop must not block
	stdscr.nodelay(1)

log("Starting %s" % VER)
atexit.register(on_exit)

# objects handed to select() in the main loop
ilist = []

# UDP socket the heartbeats arrive on
sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)

sock.bind(("", hb_port))
# NOTE(review): the option is set only after bind(); if it was meant to
# influence this bind it comes too late -- confirm intent before moving it.
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
ilist.append(sock)

# TCP server for the HTML status page.  Address reuse has to be enabled on
# the server class (it is read when the server binds); without it a restart
# can fail to rebind the port while old connections linger.
SocketServer.TCPServer.allow_reuse_address = True
serv = SocketServer.TCPServer((hbd_host, hbd_port), HtmlHandler)
ilist.append(serv.fileno())

if not forground:
	# daemonize: the parent exits, the child carries on without a terminal
	pid = os.fork()
	if pid > 0:
		if verbose:
			print("daemoinizing... pid = %d" % pid)
		sys.exit(0)

	verbose = False
	os.close(0)
	os.close(1)
	os.close(2)
	sys.stdin.close()
	# anything still printed is silently dropped
	sys.stdout = NullDevice()
	sys.stderr = NullDevice()
	os.chdir("/")
	os.setsid()
	os.umask(0)

# `up` keeps the main loop running; handler() clears it and stores the
# signal number in `sig`.
up = 1
sig = 0
signal.signal(signal.SIGTERM, handler)
signal.signal(signal.SIGHUP, handler)

# Main loop: wait for heartbeats (UDP) and status requests (TCP), and run
# the overdue check once `next` has been reached.
# `next` is the time of the next overdue check, `sleep` the select() timeout
# that leads up to it.
next = int(now)+15   # 15  seconds time to settle after (re-)start
sleep = next - now
while up:
	if visual:
		# keyboard commands of the curses display (getch() does not block)
		c = stdscr.getch()
		if c == ord('c'):
			# clear the message list and redraw
			msgs = []
			display()
		elif c == ord('q'):
			break  # Exit the while()
		elif c == ord('d'):
			DEBUG = not DEBUG
		elif c == ord('v'):
			verbose = not verbose
#		elif c == ord('p'):
#			PrintDocument()
#		elif c == ord('x'):
#			x  =  y  =  0

	try:
		sr = select.select(ilist, [], [], sleep)
		# refresh the shared clock used by the helpers
		now = time.time()
	except KeyboardInterrupt:
		sys.exit(0)
	except select.error, value:
		if value[0] != 4:		# interrupted system call
			# any other select() error is printed and the loop carries on
			print select.error, value
			#raise os.error, value
			continue
		# select() was interrupted by a signal: rebuild the screen, then go
		# back to the loop condition (handler() may have cleared `up`)
		if visual:
			exitcurses()
			initcurses()
			display()
		continue
	for fh in sr[0]:
		if fh == sock:
			# a heartbeat datagram is waiting
			readsock()
		if fh == serv.fileno():
			# a status page request is waiting
			serv.handle_request()
	if now >= next:
		# check for overdue hosts, then schedule the next check in 1 second
		next = now+1
		checkoverdue()
		if visual:
			stdscr.move(1, 0)
			stdscr.clrtoeol()
			displaytime()

	# wait in select() only until the next check is due
	sleep = next-now
	if sleep < 0:
		sys.stderr.write("sleep is negaitive! %s next = %s\n" % (sleep, next))
		sleep = 0
	if DEBUG:
		sys.stderr.write("sleep = %s next = %s\n" % (sleep, next))


# The loop has ended: SIGHUP means "restart"; after SIGTERM or the 'q' key
# the script simply runs off its end.
if sig == signal.SIGHUP:
	saveandrestart()
