#!/usr/bin/env python
# $Id: hbd,v 1.38 2013/07/14 02:25:05 andreas Exp $
# Wait for heartbeat messages and act on them (or their absence)
#
# Daemon version; shown in the curses title line and in the "Starting" log entry.
VER = 1.52

import time
import os
import string
import sys
import socket
import atexit
import select
import SocketServer
import getopt
import signal
import cPickle
import smtplib
import traceback
import urllib
import httplib

from subprocess import Popen, STDOUT, PIPE


# Notification switches: email() returns immediately unless SEND_EMAIL is true,
# pushover() returns immediately unless SEND_PUSHOVER is true.
SEND_EMAIL=False
SEND_PUSHOVER=True

# Rebind False/True to plain integers.  This is only legal in Python 2,
# where they are ordinary names.
False = 0
True = 1
# Default log file; the "logfile" config key overrides it.
LOGFILE = "/home/andreas/public_html/messages/andreas"
# File that pickleit() writes hosts, htab and msgs to; reloaded at startup.
PICKFILE = "/var/tmp/hbd.pick"
# Recipient list used by email().
AEMAIL = ["andreas@wrede.ca"]
# Name inserted into the subject line of notification emails.
NAME = "heatbeat"
# SMTP server that email() connects to.
SMTPSERVER = "localhost"

# hosts: short host name -> Host instance
# htab:  address -> short host name
hosts = {}
htab = {}

# Formatted log lines, oldest first; appended to by log().
msgs = []

# num is the next display row number handed out by Host.__init__.
num = 0
upcount = 0
# Default UDP port for heartbeats (config key "hb_port").
PORT = 50003
# Default TCP port and bind host of the status web server
# (config keys "hbd_port" and "hbd_host").
TPORT = 50004
THOST = ""
DEBUG = False
verbose = False

# Defaults for the "interval" and "grace" config keys (seconds).
INTERVAL = 10
GRACE = 2

# Non-zero enables the curses display (set by the -d option).
visual = 0
os.environ['TZ'] = 'EST5EDT'

# curses objects, created by initcurses()/initwin() when the display is on.
stdscr = None
win = None
msgw = None
msgwB = None
# Height of the message pane; recomputed by initwin().
msgwHeight = 10

def handler(signum, frame):
	global running, sig
	sig = signum
	if not running:
		if verbose:
			print "NOT runing signal: %s running: %d" % (sig, running)
		return
#	signal.signal(sig, handler)
	if verbose:
		print "signal: %s running: %s frame: %s" % (sig, running, frame)
	running = False
#	sys.exit(0)


def shortname(name):
	"""Return the part of *name* before the first '.'.

	A name without a dot (or an empty string) is returned unchanged.
	"""
	# str method instead of the deprecated string.split() module function
	return name.split('.', 1)[0]


class NullDevice:
	"""File-like sink whose write() discards its argument.

	Installed as sys.stdout / sys.stderr when the daemon detaches.
	"""

	def write(self, s):
		# nothing is stored or forwarded
		return None


class Host:
	"""State record for one monitored host.

	Attributes set by __init__:
	  name       short host name (see shortname())
	  addr       address the last heartbeat came from
	  num        display row index
	  lastbeat   time.time() of the last heartbeat
	  upcount    number of heartbeats processed
	  state      one of Host.up, Host.down, Host.overdue
	  statetime  time at which `state` was entered
	  interval   heartbeat interval announced by the host (0 = unknown)
	  doesack    "acks" value from the host, -1 if it never sent one
	  cmds       commands queued for delivery with the next reply
	"""
	up = "up"
	down = "down"
	overdue = "overdue"

	def __init__(self, name, addr):
		global num
		self.name = shortname(name)
		self.addr = addr
		self.num = num
		self.lastbeat = time.time()
		self.upcount = 0
		self.state = Host.up
		self.statetime = self.lastbeat
		self.interval = 0
		self.doesack = -1
		self.cmds = []
		num += 1

	def fixup(self):
		"""Add attributes missing from instances restored from an older pickle."""
		if not hasattr(self, "cmds"):
			self.cmds = []

	def getstate(self):
		"""Return the current state string."""
		return self.state

	def dispstate(self):
		"""Return the state as an HTML fragment for the status page.

		"down" and "overdue" are wrapped in <b>; when the host has sent an
		"acks" value it is appended in parentheses.
		"""
		if self.state in (Host.down, Host.overdue):
			state = "<b>%s</b>" % self.state
		else:
			state = "%s" % self.state
		if self.doesack != -1:
			return "%s(%s)" % (state, self.doesack)
		return state

	def newstate(self, state, when=0):
		"""Enter *state* and return the number of seconds spent in the previous one.

		*when* is subtracted from the current time, so the change can be
		back-dated by that many seconds.
		"""
		self.state = state
		now = time.time()-when
		s = now-self.statetime
		self.statetime = now
		if visual:
			displaystatetime(self.name)
		return s


def email(s, msg):
	if not SEND_EMAIL:
		return
	ret = "OK"
	toaddrs = AEMAIL
	fromaddr = "aew.heartbeat@wrede.ca"
	subj = "Info from %s: %s" % (NAME, s)
	date = time.strftime("%a, %d %b %Y %H:%M:%S %z", time.localtime())
	body = "To: %s\nFrom: %s\nSubject: %s\nDate: %s\n\n%s" % (toaddrs[0], fromaddr, subj, date, msg)
	try:
		server = smtplib.SMTP(SMTPSERVER)
		if DEBUG:
			server.set_debuglevel(1)
		server.sendmail(fromaddr, toaddrs, body)
	except smtplib.SMTPRecipientsRefused, errs:
		log("cannot send email: %s\n" % (errs))
		ret = "Fail"
	except:
		print("smtp error: "+traceback.format_exc())
		saveandrestart()
	try:
		server.quit()
	except:
		pass
	return ret


def pushover(msg):
	"""Post *msg* to the Pushover message API; no-op unless SEND_PUSHOVER is true.

	Network and HTTP errors are not caught here and propagate to the caller.
	"""
	if not SEND_PUSHOVER:
		return
	# NOTE(review): the API token and user key are hard-coded in the source;
	# consider moving them to the config file.
	conn = httplib.HTTPSConnection("api.pushover.net:443")
	try:
		conn.request("POST", "/1/messages.json",
			urllib.urlencode({
			"token": "ac7NLX2rPjXFareeDgLpXNoDf4iFmf",
			"user": "uDhH33UjQQDYtNzJb1ThRiWb9ingGK",
			"message": msg,
		}), { "Content-type": "application/x-www-form-urlencoded" })
		conn.getresponse()
	finally:
		# always release the socket, also when the request fails
		conn.close()


# nsupdate:  set the DNS A record for a fqdn
#	return: None if ok, else error text
def nsupdate(hostname, newip):
	D = {}
	D['domain'] = 'dy.wapanafa.org'
	D['fqdn'] = '%s.dy.wapanafa.org' % hostname
	D['dnsttl'] = '5'
	D['newip'] = newip
	D['ts'] = time.strftime('%Y-%m-%d.%H:%M:%S', time.gmtime())
	nsup = """update delete %(fqdn)s A
update add %(fqdn)s %(dnsttl)s A %(newip)s
update delete %(fqdn)s TXT
update add %(fqdn)s %(dnsttl)s TXT "Created: %(ts)s"
send
answer

""" % D
#	log("DBG: nsup %s" % nsup)
	cmd = ["/usr/bin/nsupdate", "-k", "/etc/dhcpc/K%(domain)s.+157+00000." % D, "-v"]
#	log("DBG: cmd %s" % cmd)
	try:
		p = Popen(cmd, shell=False, bufsize=1, stdin=PIPE, stdout=PIPE, stderr=STDOUT)
	except OSError, e:
		return "nsupdate: execution failed: %s" % e
	except:
		return "nsupdate: some error occured"

	(output, err) = p.communicate(nsup)
	if output.find('status: NOERROR') >= 0:
		return None
	return output


#
def dur(sec):
	"""Format a duration in seconds as "h:mm:ss", "m:ss" or "0:ss".

	*sec* is truncated to an integer first.  The hour field appears only
	when it is non-zero.
	"""
	total = int(sec)
	# divmod keeps everything integral regardless of how `/` behaves
	h, rest = divmod(total, 3600)
	m, s = divmod(rest, 60)
	if h > 0:
		return "%d:%02d:%02d" % (h, m, s)
	if m > 0:
		return "%d:%02d" % (m, s)
	return "0:%02d" % s


#
def addhost(name, addr):
	"""Register host *name* at *addr*, or update the address of a known host.

	For a known host the old htab entry is replaced and the change is
	logged.  For a new host a Host object is created and all hosts are
	renumbered in sorted-name order so display rows stay alphabetical.
	"""
	sname = shortname(name)
	if sname in hosts:
		host = hosts[sname]
		# the old address may already be gone from htab; do not fail on that
		htab.pop(host.addr, None)
		host.addr = addr
		if visual:
			displayaddr(sname)
		htab[addr] = sname
		m = "%s, changed address to %s" % (sname, addr)
		log(m)
	else:
		hosts[sname] = Host(sname, addr)
		for row, hname in enumerate(sorted(hosts)):
			hosts[hname].num = row
		htab[addr] = sname
		if visual:
			display()


#
def on_exit():
	"""atexit hook: leave curses mode if it is active, then close the log file."""
	if visual:
		# restore the terminal before anything is printed
		exitcurses()
	if DEBUG:
		sys.stderr.write("on_exit\n")
	logf.close()
	print "exit"


def initlog(logfile):
	"""Open *logfile* in append mode and return the file object."""
	handle = open(logfile, "a")
	return handle


#
#
def initwin():
	"""(Re)create the curses windows: host table, message border, message pane.

	Rebinds the module globals win, msgwB, msgw and msgwHeight, draws the
	version title on stdscr and refreshes stdscr and the message border.
	"""
	global win, msgw, msgwB, msgwHeight

	maxY, maxX = stdscr.getmaxyx()

	begin_x = 0
	begin_y = 2
	# one row per htab entry plus the top and bottom border rows
	# NOTE(review): rows are later addressed through hosts[h].num, so this
	# assumes htab and hosts have the same number of entries -- confirm
	height = len(htab)+2
	if DEBUG:
		log("initwin called with %d" % height)
	win = curses.newwin(height, maxX, begin_y, begin_x)
	# the return value is stored in `a` but never used
	a = win.border(0, 0, 0, 0, 0, 0, curses.ACS_LTEE, curses.ACS_RTEE)

	# border window for the message area, placed directly below the host table
	msgwB = curses.newwin(0, 0, height+1, begin_x)
	msgwB.border(0, 0, 0, 0, curses.ACS_LTEE, curses.ACS_RTEE)

	# scrolling message pane, inset by one cell inside msgwB
	msgwHeight = maxY-height-3
	msgw = curses.newwin(msgwHeight, maxX-2, height+2, begin_x+1)
	msgw.setscrreg(0, msgwHeight-1)
	msgw.scrollok(1)

	stdscr.addstr(0, 0, "hbd Version %s" % VER, curses.A_BOLD)
	stdscr.refresh()
	msgwB.refresh()


#
def checkoverdue():
	"""Mark every "up" host whose last heartbeat is too old as overdue.

	A host is overdue when more than its announced interval plus the
	global grace period has passed since its last beat.  The change is
	logged, and watched hosts also trigger an email and a push message.
	"""
	for h, host in hosts.items():
		# hosts that are down or already overdue are left alone
		if host.state != Host.up:
			continue
		if now-host.lastbeat > host.interval+grace:
			m = "%s is overdue" % h
			if h in watchhosts:
				email("overdue", m)
				pushover(m)
			host.newstate(Host.overdue, grace)
			log(m)


#
#
def displaytime():
	"""Redraw the clock in the title line and the state column of every host.

	In verbose mode, hosts that are not down show the time since their
	last heartbeat instead of the state name.  Overdue hosts are bold.
	"""
	maxY, maxX = stdscr.getmaxyx()
	clock = time.strftime("%H:%M:%S", time.localtime(now))
	stdscr.addstr(0, maxX-8, clock, curses.A_BOLD)

	for host in hosts.values():
		if verbose and host.state != Host.down:
			text = dur(now-host.lastbeat)
		else:
			text = host.getstate()
		if host.state == Host.overdue:
			attr = curses.A_BOLD
		else:
			attr = 0
		win.addstr(host.num+1, 25, "%8s" % text, attr)
	win.refresh()
	stdscr.refresh()


#
#
def displaystatetime(h, refresh=1):
	"""Write the time of host *h*'s last state change into its table row.

	The window is refreshed afterwards unless *refresh* is false.
	"""
	host = hosts[h]
	stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(host.statetime))
	win.addstr(host.num+1, 60, "%-17s" % stamp)
	if refresh:
		win.refresh()


#
#
def displayaddr(h, refresh=1):
	"""Write the address of host *h* into its table row.

	The window is refreshed afterwards unless *refresh* is false.
	"""
	host = hosts[h]
	row = host.num+1
	win.addstr(row, 35, "%-16s" % host.addr)
	if refresh:
		win.refresh()


#
#
def displaybody():
	"""Draw name, address and state-change time for every host, then refresh once."""
	for h, host in hosts.items():
		win.addstr(host.num+1, 1, "%-25s" % (h))
		# the per-column helpers are told not to refresh; one refresh follows below
		if host.addr is not None:
			displayaddr(h, 0)
		if host.statetime is not None:
			displaystatetime(h, 0)
	win.refresh()


#
#
def displaymsgs():
	"""Redraw the message pane with the most recent msgwHeight log lines."""
	# Clamp the start index at 0: with fewer messages than rows a negative
	# start would select only the tail of the list instead of all of it.
	first = max(0, len(msgs)-msgwHeight)
	for y, m in enumerate(msgs[first:]):
		msgw.addstr(y, 0, m)
	msgw.refresh()


#
#
def display():
	"""Rebuild the whole curses screen; does nothing when the display is off."""
	if not visual:
		return
	initwin()
	displaytime()
	displaybody()
	displaymsgs()


def log(m, service="heartbeat"):
	"""Record message *m*.

	The timestamped line is appended to msgs and written to the log file
	(as "epoch|service|message" when logfmt is "msg", otherwise as the
	timestamped line itself).  With the curses display active it is also
	added to the message pane.  Finally the state is pickled.
	"""
	stamp = time.strftime("%b %d %H:%M:%S", time.localtime(time.time()))
	msg = stamp+": "+m+"\n"
	msgs.append(msg)

	if logfmt != "msg":
		line = msg
	else:
		line = "%d|%s|%s\n" % (now, service, m)
	logf.write(line)
	logf.flush()

	if msgw is not None:
		# the cursor position is read here but not used afterwards
		row, col = msgw.getyx()
		msgw.addstr(msg)
		msgw.clrtoeol()
		msgw.refresh()
	pickleit()


#
#
def fromaddr(name, addr, boot, interval, acks):
	"""Account for a heartbeat from host *name* received from *addr*.

	Unknown hosts are registered.  An address change updates htab, runs
	nsupdate() for hosts in dyndnshosts, and is logged (watched hosts also
	notify).  A host that was not "up" and reports a positive interval is
	switched back to "up".  *boot* is accepted but not used here.
	"""
	if name not in hosts:
		addhost(name, addr)
	host = hosts[name]
	host.doesack = acks

	if host.addr != addr:
		htab.pop(host.addr, None)
		host.addr = addr
		htab[addr] = name
		m = "%s changed address to %s" % (host.name, addr)
		if name in dyndnshosts:
			err = nsupdate(name, addr)
			if not err:
				m += ", DNS updated."
			else:
				m += ", DNS failed: %s" % err
		log(m)
		if name in watchhosts:
			email("address change", m)
			pushover(m)

	host.lastbeat = now
	if interval > 0 and host.getstate() != Host.up:
		previous = host.state
		elapsed = host.newstate(Host.up)
		log("%s, back after being %s for %s" % (host.name, previous, dur(elapsed)))
		if name in watchhosts:
			email("back", name)
			pushover("%s is back" % name)
	host.upcount += 1


#
#
def _parsebeat(data, recvtime):
	"""Parse one heartbeat datagram ("key=value;key=value;...") into a dict.

	Returned keys: boot, shutdown (occurrence counts), name, service, msg,
	interval, deltaT (recvtime minus the sender's "time" value), acks.
	Fields that are absent or malformed keep their defaults; the datagram
	comes from the network, so bad input must never raise.
	"""
	beat = {
		'boot': 0,
		'shutdown': 0,
		'name': "unknown",
		'service': "unknown",
		'msg': None,
		'interval': 0,
		'deltaT': 0.0,
		'acks': -1,
	}
	for pair in data.split(';'):
		# split on the first '=' only so values may themselves contain '='
		kv = pair.split('=', 1)
		key = kv[0]
		if len(kv) == 2:
			val = kv[1]
		else:
			val = "0"
		if key == 'boot':
			beat['boot'] += 1
		elif key == 'shutdown':
			beat['shutdown'] += 1
		elif key == 'interval':
			try:
				beat['interval'] = int(val)
			except ValueError:
				pass
		elif key == 'name':
			beat['name'] = shortname(val)
		elif key == 'msg':
			beat['msg'] = val
		elif key == 'service':
			beat['service'] = val
		elif key == 'time':
			try:
				beat['deltaT'] = recvtime-float(val)
			except ValueError:
				pass
		elif key == 'acks':
			try:
				beat['acks'] = int(val)
			except ValueError:
				beat['acks'] = -1
	return beat


def readsock():
	"""Receive one heartbeat datagram, update host state and send the reply.

	Boot, message and shutdown events are logged (watched hosts also get
	an email and a push message).  The reply is "ACK", or the oldest
	command queued for the host, which is then removed from the queue.
	"""
	data, addr = sock.recvfrom(1024)
	beat = _parsebeat(data, now)
	name = beat['name']
	interval = beat['interval']
	acks = beat['acks']

	if beat['boot']:
		# show the ack value only when the host actually sent one,
		# matching Host.dispstate()
		if acks != -1:
			a = "(%s)" % acks
		else:
			a = ""
		m = "%s booted, deltaT %0.2g sec %s" % (name, beat['deltaT'], a)
		log(m)
		if name in watchhosts:
			email("booted", m)
			pushover(m)
	if beat['msg']:
		m = "%s msg: %s" % (name, beat['msg'])
		log(m, service=beat['service'])
		if name in watchhosts:
			email("msg", m)
			pushover(m)

	# registers the host if needed, so hosts[name] exists from here on
	fromaddr(name, addr[0], beat['boot'], interval, acks)
	host = hosts[name]
	if beat['shutdown']:
		m = "%s shutdown" % name
		log(m)
		if name in watchhosts:
			email("shutdown", m)
			pushover(m)
		try:
			host.newstate(Host.down)
		except Exception:
			# a display error must not stop heartbeat processing
			pass
	if interval > 0:
		host.interval = interval

	rmsg = "ACK"
	if host.cmds:
		rmsg = host.cmds[0]
		msg = "command '%s' initiated" % rmsg
		email("%s cmd exec" % name, msg)
		pushover(msg)
		# dequeue only after the notifications went out
		del host.cmds[0]
		log("%s command initiated" % name)
	try:
		sock.sendto(rmsg, addr)
	except socket.error:
		# the reply is best effort
		pass


#
#
#
def initcurses():
	"""Enter curses mode and store the root window in the global stdscr."""
	global stdscr
	stdscr = curses.initscr()
	# no echo of typed keys, unbuffered key input, keypad keys enabled
	curses.noecho()
	curses.cbreak()
	stdscr.keypad(1)
	if DEBUG:
		sys.stderr.write("curses init done: %s\n" % stdscr)


def exitcurses():
	"""Undo the settings made by initcurses() and leave curses mode."""
	curses.nocbreak()
	stdscr.keypad(0)
	curses.echo()
	curses.endwin()


class HtmlServer(SocketServer.TCPServer):
	"""TCP server for the status page; requests are handled by HtmlHandler."""
	# SocketServer option: allow rebinding the listen address right after a restart
	allow_reuse_address = True
#
#
class HtmlHandler(SocketServer.BaseRequestHandler):
	"""Serve the status page and the control URLs of the daemon.

	Routes (GET only; any other request ends up as 404):
	  /            status page with host table and the latest log lines
	  /c?h=H&c=C   queue command C (URL-quoted) for host H
	  /d?h=H       drop host H
	  /n?h=H       run nsupdate() for host H
	  /r           ask the main loop to restart the daemon

	NOTE(review): none of the routes is authenticated; anyone who can
	reach the port can queue commands, drop hosts or restart the daemon.
	"""

	def buildhead(self, title="Heartbeat", refresh=None):
		"""Return the opening lines of an HTML page as a list of strings.

		title    text of the <title> element
		refresh  auto-refresh period in seconds, or None for no refresh
		"""
		res=[]
		res.append('<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">')
		res.append("<html>")
		res.append("<head>")
		res.append('<title>%s</title>' % (title))
		if refresh:
			res.append("<meta http-equiv = Refresh content = %d>\n" % refresh)
		res.append("</head>")
		res.append('<body BGCOLOR = "#FFFFFF" LINK = "#008000" VLINK = "#008000" BACKGROUND = "/~andreas/images/tile.marble.gif">')
		return res

	def buildpage(self):
		"""Return the status page (host table plus the last 30 log lines)."""
		import cgi
		res=self.buildhead(refresh=60)
		res.append("<H2>Heartbeat status</h2><h4> %s (%s)</H4>" % (time.strftime("%H:%M:%S", time.localtime(now)), os.environ.get('TZ', 'CET-1CDT')))
		res.append("<table>")
		res.append("<tr><th>Host</th><th>State</th><th>IP Addr</th><th>Last change</th></tr>\n")
		for h in sorted(hosts):
			host = hosts[h]
			# names and addresses originate from network packets: escape them.
			# dispstate() returns markup on purpose and is inserted as is.
			res.append("<tr><td>%-24s</td><td>%-7s</td><td>%-16s</td><td>%-17s</td></tr>\n" % (cgi.escape(h), host.dispstate(), cgi.escape(str(host.addr)), time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(host.statetime))))
		res.append("</table>")
		res.append("<h4>Log of Events</h4>")
		# msgs[-30:] is also correct when fewer than 30 lines exist
		for m in msgs[-30:]:
			res.append("%s<BR>" % cgi.escape(m))
		res.append("</body></html>")
		return res

	def _queryvalue(self, uarg, index, prefix):
		"""Return the value of query argument number *index* if it starts with *prefix*, else ""."""
		if index < len(uarg) and uarg[index].startswith(prefix):
			return uarg[index][len(prefix):]
		return ""

	def handle(self):
		"""Read one HTTP request, act on its URI and send the response."""
		global sig, running
		import cgi
		stamp = time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime(now))
		headers = [
			"Date: %s" % stamp,
			"Server: hbd",
			"Last-Modified: %s" % stamp,
			"Accept-Ranges: bytes",
			"Connection: close",
			"Content-Type: text/html; charset = ISO-8859-1",
		]

		# read the request up to the blank line; only the GET line matters
		uri = '/unknown'
		f = self.request.makefile()
		while 1:
			line = f.readline().strip()
			if not line:
				break
			r = line.split()
			if r[0] == "GET" and len(r) > 1:
				uri = r[1]
		upar = uri.split("?")
		if len(upar) == 1:
			uarg = []
		else:
			uarg = upar[1].split("&")

		code = 200
		cause = "OK"
		if uri == "/":
			res = self.buildpage()

		elif upar[0] == "/c":	# command on host /c?h=melschserver&c=sudo%20ls
			uname = self._queryvalue(uarg, 0, "h=")
			ucmd = self._queryvalue(uarg, 1, "c=")
			res = self.buildhead()
			if ucmd != "" and uname != "" and uname in hosts:
				hosts[uname].cmds.append(urllib.unquote(ucmd))
				res.append("2Done")
			else:
				res.append("command not queued: unknown host or empty command")

		elif upar[0] == "/d":	# drop host  /d?h=melschserver
			uname = self._queryvalue(uarg, 0, "h=")
			res = self.buildhead()
			if uname != "" and uname in hosts:
				del hosts[uname]
				log("%s dropped" % uname)
				res.append("Done")
			else:
				res.append("name %s not found" % cgi.escape(uname))

		elif upar[0] == "/n":   # register name
			uname = self._queryvalue(uarg, 0, "h=")
			res = self.buildhead()
			if uname != "" and uname in hosts:
				err = nsupdate(uname, hosts[uname].addr)
				ll = "nsupdate request: %s" % err
			else:
				ll = "name %s not found" % uname
			res.append(cgi.escape(ll))
			log(ll)

		elif upar[0] == "/r":   # restart
			res = self.buildhead()
			res.append("restart request")
			# the main loop sees running == False and restarts on SIGHUP
			sig = signal.SIGHUP
			running = False
			log("restart request")

		else:
			code = 404
			cause = "Not Found"
			res = []
			res.append('<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">')
			res.append('<html><head>')
			res.append('<title>%s %s</title>' % (code, cause))
			res.append('</head><body>')
			res.append('<h1>%s</h1>' % (cause))
			# the URI is client supplied: escape it before echoing it back
			res.append('<p>The requested URL %s was not found on this server.</p>' % cgi.escape(uri))
			res.append('<hr>')
			res.append('<address>hbd (Unix) Server at %s Port %s</address>' % (hbd_host, hbd_port))
			res.append('</body></html>')

		out = ["HTTP/1.0 %s %s\r\n" % (code, cause)]
		for h in headers:
			out.append("%s\r\n" % h)
		out.append("\r\n")
		out.append("\n".join(res))
		try:
			# sendall: a plain send() may transmit only part of the page
			self.request.sendall("".join(out))
		except socket.error:
			# the client went away; nothing useful can be done
			pass


def saveandrestart():
	"""Close both sockets, log the restart and re-execute the daemon.

	The state is saved as a side effect of log(), which calls pickleit().
	os.execv() replaces the current process, so this does not return
	unless the exec itself fails.
	"""
	sock.close()
#	serv.shutdown()	#N.B. dont shutdown() as we don't use serv_forever
	serv.server_close()
	log("restarting")
	# cmdargs holds the options collected while parsing the command line
	# NOTE(review): sys.argv[0] is used as the executable path; the daemon
	# chdir()s to "/" when it detaches, so a relative path would no longer
	# resolve -- confirm how the daemon is started
	os.execv(sys.argv[0], [sys.argv[0]]+cmdargs)


def pickleit():
	"""Write hosts, htab and msgs, in that order, to PICKFILE.

	The startup code reads the three objects back in the same order.
	"""
	pickf = open(PICKFILE, 'w')
	try:
		pick = cPickle.Pickler(pickf)
		pick.dump(hosts)
		pick.dump(htab)
		pick.dump(msgs)
	finally:
		# close even when a dump fails so the descriptor is not leaked
		pickf.close()


#
# Main
#

# Command line parsing.  Options that must survive a restart are collected
# in cmdargs, which saveandrestart() passes to the re-executed daemon.
helpflag = False
forground = False
optlist = []
args = []
# fall back to the password database when HOME is not set
home = os.environ.get('HOME') or os.path.expanduser('~')
cmdargs = []
configfile = "%s/.hbrc" % home

try:
	# -h is a plain flag; only -c takes an argument
	optlist, args = getopt.getopt(sys.argv[1:], 'c:dfhvx')
except getopt.GetoptError:
	helpflag = True

for o, a in optlist:
	if o == '-c':
		configfile = a
		cmdargs += [o, a]
	elif o == '-d':
		visual = True
		cmdargs += [o]
	elif o == '-f':
		forground = True
		cmdargs += [o]
	elif o == '-h':
		helpflag = True
	elif o == '-v':
		verbose = True
		cmdargs += [o]
	elif o == '-x':
		# NOTE(review): -x is not added to cmdargs, so debug mode is lost
		# on restart -- confirm this is intended
		DEBUG = True

# Print the usage text and exit with status 1; reached for -h and for any
# option error.
if helpflag:
	print "hbc HeartBeatDaemon"
	print "usage: hbd [-dfhvx] [-c configfile]"
	print
	print "	-c configfile"
	print "	-d display"
	print "	-f run in foreground"
	print "	-h this help"
	print "	-v verbose"
	print "	-x debug"
	print
	print """ config file can contain
logfile = /var/log/heartbeat.log
logfmt = [text|msg]
hb_port = 50003
interval = 20
hbd_port = 50004
hbd_host = www.domain.com
grace = 2
"""

	sys.exit(1)

# the curses display needs the terminal, so it implies foreground mode
if visual:
	forground = True
#
# set defaults

hb_port = PORT
hbd_host = THOST
hbd_port = TPORT
logfile = LOGFILE
logfmt = "text"
interval = INTERVAL
grace = GRACE
watchhosts = []
dyndnshosts = []
drophosts = []

try:
	f = open(configfile, "r")
	if verbose:
		print "notice: using config file %s" % configfile
except:
	print "warning: running without conifig file: %s" % configfile
	f = None

if f:
	while 1:
		l = f.readline()
		if len(l) == 0:
			break
		if verbose:
			print "  %s" % l[:-1]
		r = l[:-1].split('=')
		if r[0] == 'interval':
			interval = eval(r[1])
		elif r[0] == 'grace':
			grace = eval(r[1])
		elif r[0] == 'hbd_port':
			hbd_port = eval(r[1])
		elif r[0] == 'hbd_host':
			hbd_host = r[1]
		elif r[0] == 'hb_port':
			hb_port = eval(r[1])
		elif r[0] == 'logfile':
			logfile = r[1]
		elif r[0] == 'logfmt':
			logfmt = r[1]
		elif r[0] == 'watchhosts':
			watchhosts = eval(r[1])
		elif r[0] == 'dyndnshosts':
			dyndnshosts = eval(r[1])
		elif r[0] == 'drophosts':
			drophosts = eval(r[1])
	f.close()

# positional arguments are not supported
if len(args) != 0:
	print "error: args"
	sys.exit(1)


if verbose:
	print "notice: logging to %s" % logfile
# log file handle used by log() and closed by on_exit()
logf = initlog(logfile)

# Restore the state saved by pickleit() during a previous run.
if os.path.exists(PICKFILE):
	# NOTE(review): unpickling executes whatever the file describes, and
	# PICKFILE lives in a world-writable directory; consider a private
	# location for it.
	pickf = open(PICKFILE, 'r')
	try:
		try:
			pick = cPickle.Unpickler(pickf)
			# same order as written by pickleit(): hosts, htab, msgs
			loaded = (pick.load(), pick.load(), pick.load())
		finally:
			pickf.close()
		# assign only after all three loaded, so the state is never half old
		hosts, htab, msgs = loaded
	except Exception:
		# unreadable or truncated state file: discard it and start fresh
		os.unlink(PICKFILE)
	for h in hosts.keys():
		hosts[h].fixup()
	for h in drophosts:
		if h in hosts:		# was: hosts.has_key(h):
			del hosts[h]


# `now` is the shared notion of "current time"; the main loop updates it
# after every select() call.
now = time.time()
# NOTE(review): raises ZeroDivisionError when interval is 0; startsec is
# not referenced in the rest of the visible code
startsec = int(now) % interval

if visual:
	# curses is imported only when the display is requested
	import curses
	initcurses()
	display()
	# make getch() in the main loop non-blocking
	stdscr.nodelay(1)

log("Starting %s" % VER)
atexit.register(on_exit)

# objects passed to select(): the UDP socket and the web server's descriptor
ilist = []

# UDP socket that receives the heartbeats
sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)

sock.bind(("", hb_port))
# NOTE(review): the option is set after bind(), so it cannot influence
# this bind -- confirm whether it is needed at all
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
ilist.append(sock)

# TCP server for the status page; requests are served from the main loop
serv = HtmlServer((hbd_host, hbd_port), HtmlHandler)
ilist.append(serv.fileno())

# Detach from the terminal unless running in the foreground.
if not forground:
	pid = os.fork()
	if pid > 0:
		# parent: report the child's pid and exit
		if verbose:
			print "daemoinizing... pid = %d" % pid
		sys.exit(0)

	# child: no terminal output from here on
	verbose = False
	# NOTE(review): descriptors 0-2 are closed but not reopened (e.g. on
	# /dev/null), so later open()/socket() calls may be given these numbers
	os.close(0)
	os.close(1)
	os.close(2)
	sys.stdin.close()
	# print statements and stderr writes now go to a sink
	sys.stdout = NullDevice()
	sys.stderr = NullDevice()
	os.chdir("/")
	os.setsid()
	os.umask(0)

# Main loop state: `running` is cleared by handler() or by the /r web
# request; `sig` records why.
running = True
sig = 0
signal.signal(signal.SIGTERM, handler)
signal.signal(signal.SIGHUP, handler)

# `next` is the time of the next overdue check; `sleep` is the select() timeout
next = int(now)+15   # 15  seconds time to settle after (re-)start
sleep = next - now
# Main loop: handle keyboard input (display mode), wait for a heartbeat or a
# web request until the next check is due, then look for overdue hosts.
while running:
	if visual:
		# keyboard commands: c = clear messages, q = quit,
		# d = toggle debug, v = toggle verbose
		c = stdscr.getch()
		if c == ord('c'):
			msgs = []
			display()
		elif c == ord('q'):
			break  # Exit the while()
		elif c == ord('d'):
			DEBUG = not DEBUG
		elif c == ord('v'):
			verbose = not verbose
#		elif c == ord('p'):
#			PrintDocument()
#		elif c == ord('x'):
#			x  =  y  =  0

	if DEBUG:
		sys.stderr.write("about to sleep = %s\n" % (sleep))
	try:
		sr = select.select(ilist, [], [], sleep)
		now = time.time()
	except KeyboardInterrupt:
		sys.exit(0)
	except select.error, value:
		# errno 4 is EINTR; any other select error is printed and the
		# loop retries with the unchanged timeout
		if value[0] != 4:		# interrupted system call
			print select.error, value
			#raise os.error, value
			continue
		# select() was interrupted by a signal: rebuild the curses screen
		# (presumably to pick up a terminal resize -- confirm)
		if visual:
			exitcurses()
			initcurses()
			display()
		continue
	except:
		sys.exit(1)
	# dispatch whatever became readable
	for fh in sr[0]:
		if fh == sock:
			readsock()
		if fh == serv.fileno():
			serv.handle_request()
	if now >= next:
		# once per second: check for overdue hosts and update the clock
		next = now+1
		checkoverdue()
		if visual:
			stdscr.move(1, 0)
			stdscr.clrtoeol()
			displaytime()

	# wait only until the next check is due
	sleep = next-now
	if sleep < 0:
		sys.stderr.write("sleep is negaitive! %s next = %s\n" % (sleep, next))
		sleep = 0
	if DEBUG:
		sys.stderr.write("sleep = %s next = %s\n" % (sleep, next))


# The loop has ended.  SIGHUP (or the /r web request, which sets sig to
# SIGHUP) means restart; otherwise the script simply ends.
if sig == signal.SIGHUP:
	if DEBUG:
		sys.stderr.write("signal 1 exit\n")
	saveandrestart()
