#!/usr/bin/env python
# $Id: hbc,v 1.9 2012/03/29 02:08:36 andreas Exp $
"""hbc -- HeartBeatClient.

Sends a small UDP "HTB" datagram to every configured heartbeat server once
per `interval` seconds and reacts to the datagrams that come back:

  ACK  round-trip bookkeeping
  UPD  replace this script with the supplied code and re-exec
  CMD  run a shell command and send its output back

The script targets Python 2.7: it relies on the `md5` module, on
`str.decode('base64')` and on byte-string sockets.

NOTE(review): datagrams are neither authenticated nor source-checked in this
file, yet UPD/CMD give the sender code execution on this host -- confirm that
the deployment restricts who can reach the client's sockets.
"""

import sys
import time
import socket
import os
import signal
import getopt
import string
import select
import errno
import traceback
import md5
import shutil
import zlib
import subprocess
import syslog

PORT = 50003            # default UDP port of the heartbeat server
INTERVAL = 10           # default seconds between heartbeats
PIDFILE = '/tmp/hbc.pid'
VER = 4                 # protocol version sent in every message
MAXRECV = 32767         # largest datagram read from a socket

running = True          # cleared to make process() return
dorestart = False       # set after a successful self-update
warned1 = False         # True once a socket send error has been logged


def log(msg):
    """Write a timestamped diagnostic line to stderr.

    log() is called throughout this script but no definition exists in the
    file, so every call raised NameError.  This minimal implementation keeps
    those call sites working; after daemonize() stderr points at /dev/null.
    """
    sys.stderr.write("%s hbc: %s\n"
                     % (time.strftime('%Y-%m-%d %H:%M:%S'), msg))


class NullDevice:
    """File-like object that discards everything written to it."""

    def write(self, s):
        pass


class Conn:
    """One UDP socket towards a single heartbeat server address."""

    def __init__(self, conId, addr, port, af):
        self.conId = conId
        self.addr = addr
        self.port = port
        self.af = af
        self.ackcount = 0       # number of ACKs received
        self.lastack = 0        # time() the last ACK was received
        self.lastacksent = 0.0  # 'time' field carried by the last ACK
        self.send = 0           # number of datagrams sent without error
        self.lastsend = 0       # time() the last message was sent
        self.rtts = [0]         # last round-trip times in ms (at most 10)
        self.sock = socket.socket(af, socket.SOCK_DGRAM)
        self.sock.setsockopt(
            socket.SOL_SOCKET, socket.SO_REUSEADDR,
            self.sock.getsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR) | 1)

    def sendto(self, msg, ID='HTB'):
        """Stamp *msg* (a dict, modified in place) and send it compressed.

        The default ID 'HTB' marks a heartbeat.  A socket error is logged
        only once per process (global `warned1`) and the message is dropped.
        """
        global warned1
        msg['name'] = shortname(iam)
        msg['id'] = self.conId
        msg['ver'] = VER
        msg['time'] = time.time()
        m = dicttos(ID, msg, True)  # always compress
        if verbose:
            log("conn.send('%s', (%s:%s) %s)"
                % (msg, self.addr, self.port, len(m)))
        try:
            self.sock.sendto(m, (self.addr, self.port))
        except socket.error as e:
            if not warned1:
                log("socket error: %s %s:%s" % (e, self.addr, self.port))
                warned1 = True
            return
        self.send += 1
        self.lastsend = time.time()

    def ack(self, msgDict):
        """Record an ACK and the round-trip time since the last send."""
        self.lastack = time.time()
        self.lastacksent = float(msgDict.get('time', '0'))
        rtt = (self.lastack - self.lastsend) * 1000.0
        if verbose:
            log("ack RTT: %0.1f ms" % rtt)
        self.rtts.append(rtt)
        if len(self.rtts) > 10:
            del self.rtts[0]
        self.ackcount += 1

    def close(self):
        """Close the socket; safe to call more than once."""
        if self.sock:
            self.sock.close()
            self.sock = None


def shortname(name):
    """Return the part of *name* before the first dot."""
    return name.split('.')[0]


def dicttos(ID, d, compress=False):
    """Serialize dict *d* as 'ID:k=v;k=v...'.

    Floats are written with five decimals.  With *compress* the payload is
    zlib-compressed and the ID is prefixed with '!'.
    """
    s = []
    for k in d:
        if isinstance(d[k], float):
            s.append("%s=%0.5f" % (k, d[k]))
        else:
            s.append("%s=%s" % (k, d[k]))
    pk = ";".join(s)
    if compress:
        zpk = zlib.compress(pk, 6)
        ID = "!" + ID
    else:
        zpk = pk
    return ID + ":" + zpk


def stodict(msg):
    """Parse a datagram produced by dicttos() into a dict.

    The message ID is stored under key 'ID'.  Returns None for a malformed
    datagram: no ':' separator, an empty ID, or a '!' (compressed) payload
    that does not decompress.
    """
    d = {}
    r0 = msg.split(':', 1)
    if len(r0) == 1 or not r0[0]:
        return None
    if r0[0].startswith('!'):  # compressed
        try:
            pk = zlib.decompress(msg[len(r0[0]) + 1:])
        except zlib.error:
            return None
        d['ID'] = r0[0][1:]
    else:
        pk = r0[1]
        d['ID'] = r0[0]
    for item in pk.split(';'):
        vr = item.split('=', 1)
        k = vr[0].strip()
        if len(vr) == 1:
            d[k] = None
            continue
        v = vr[1].strip()
        if v and v[0].isdigit():
            # SECURITY NOTE(review): eval() runs on data received from the
            # network.  Kept for wire compatibility; values that fail to
            # evaluate stay strings.
            try:
                v = eval(v)
            except Exception:
                pass
        d[k] = v
    return d


def syslogtrace(note):
    """Log the current exception traceback via log() and syslog."""
    logm = '%s hbc died: \n%s' % (note, traceback.format_exc())
    log(logm)
    for l in logm.split('\n'):
        syslog.syslog(syslog.LOG_ERR, ' tb: %s' % l)
    if verbose:
        print(logm)


conId = 1  # id given to the next Conn that is created


def createConnections(hosts):
    """Create one Conn in the global `conns` per resolved address of *hosts*.

    A host that does not resolve is skipped so that the remaining hosts are
    still tried.  An address family other than IPv4/IPv6 terminates the
    program with exit status 1.
    """
    global conId
    for host in hosts:
        if verbose:
            log("createConnections for %s" % host)
        try:
            rs = socket.getaddrinfo(host, hb_port, 0, 0, socket.SOL_UDP)
        except socket.gaierror:
            logm = '%s hbc died: \n%s' % ('createConnections',
                                          traceback.format_exc())
            if verbose:
                log(logm)
            continue
        for r in rs:
            if verbose:
                log("address %s" % str(r))
            # r[0] is the address family reported by getaddrinfo()
            if r[0] == socket.AF_INET6:
                af = socket.AF_INET6
            elif r[0] == socket.AF_INET:
                af = socket.AF_INET
            else:
                print("dont know this net type: %s" % r[0])
                sys.exit(1)
            addr = r[4][0]
            conns[conId] = Conn(conId, addr, hb_port, af)
            conId += 1


def doexec(conn, data):
    """Run *data* through the shell and send the outcome back on *conn*.

    SECURITY NOTE(review): *data* comes straight from a received datagram
    and is executed with shell=True.
    """
    try:
        ro = subprocess.check_output(data, stderr=subprocess.STDOUT,
                                     shell=True)
        fail = "OK"
    except subprocess.CalledProcessError as e:
        ro = str(e)
        fail = "CalledProcessError"
    except Exception as e:
        syslogtrace('System')
        ro = ""  # was None, which broke the concatenation below
        fail = "cmd failed: %s" % e
    msg = {'service': 'command', 'msg': fail + " " + ro}
    conns[conn].sendto(msg)


def doupdate(conn, msgDict):
    """Install the code carried by an UPD message and report the result.

    Returns None on success, otherwise a string describing the failure.
    """
    fail = None
    try:
        code = msgDict['code'].decode('base64')
        csum = msgDict['csum']
    except Exception:
        fail = "csum/code missing"
    if not fail:
        fail = doupdateone(code, csum)
    msg = {'service': 'update', 'msg': fail if fail else "OK"}
    conns[conn].sendto(msg)
    if not fail:
        log('hc updates, fs = %s' % (len(code)))
    return fail


def doupdateone(code, csum):
    """Overwrite this script (sys.argv[0]) with *code* if its MD5 is *csum*.

    The current file is first copied to '<script>.sav'.  Returns None on
    success, otherwise an error string.
    """
    m = md5.new()
    m.update(code)
    if m.hexdigest() != csum:
        return "checksum error"
    fn = sys.argv[0]
    ofn = "%s.sav" % fn
    try:
        shutil.copy2(fn, ofn)
    except Exception as e:
        return "cannot make backup copy: %s" % e
    try:
        with open(fn, "w") as fh:
            fh.write(code)
    except Exception as e:
        return "cannot write new code: %s" % e
    return None


def restart():
    """Re-exec this script with the options remembered in `cmdargs`."""
    if verbose:
        print("restart: execv %s %s"
              % (sys.argv[0], [sys.argv[0]] + cmdargs))
    syslog.syslog(syslog.LOG_ERR, 'restart %s' % (sys.argv[0]))
    err = "fallthrough"
    try:
        os.execv(sys.argv[0], [sys.argv[0]] + cmdargs)
    except Exception as e:
        err = e
    # only reached when execv() did not replace the process
    print("should not be here: %s" % str(err))
    log('restart failed: %s' % err)


def _dispatch(conn, data, addr):
    """Handle one received datagram; return True if a restart is required."""
    if verbose:
        print("sock.recvfrom: %s (%s) %s" % (addr, len(data), data[:4]))
    msgDict = stodict(data)
    if verbose:
        print("sock.recvfrom: %s (%s) %s"
              % (addr, len(data), str(msgDict)[:80]))
    if msgDict is None:
        # a malformed datagram must not take the daemon down
        log("ignoring malformed datagram from %s" % (addr,))
        return False
    if msgDict['ID'] == "ACK":
        conns[conn].ack(msgDict)
    elif msgDict['ID'] == "UPD":
        if doupdate(conn, msgDict) is None:
            if verbose:
                print("process: restart after update")
            return True
    elif msgDict['ID'] == "CMD":
        cmd = msgDict.get('cmd')
        if cmd is None:
            log("ignoring CMD without cmd from %s" % (addr,))
        else:
            doexec(conn, cmd)
    else:
        doexec(conn, data)  # deprecated until no more VER - hbc
    return False


def process():
    """Main loop: answer incoming datagrams, send a heartbeat per interval.

    Returns when the global `running` becomes False; sets the global
    `dorestart` when an update was installed.
    """
    global running, dorestart
    ifiles = {}   # fileno -> socket
    conIds = {}   # fileno -> connection id
    for conn in conns:
        sock = conns[conn].sock
        ifiles[sock.fileno()] = sock
        conIds[sock.fileno()] = conn
    nextReport = time.time()
    while running:
        # serve incoming datagrams until the next heartbeat is due
        while time.time() < nextReport:
            timeout = nextReport - time.time()
            if timeout <= 0:
                break
            try:
                r = select.select(list(ifiles), [], [], timeout)
            except KeyboardInterrupt:
                running = False
                break
            except SystemExit:
                log('daemon exit, running was %s' % running)
                running = False
                break
            except Exception:
                if running:
                    syslogtrace('select')
                running = False
                break
            for rfh in r[0]:
                data, addr = ifiles[rfh].recvfrom(MAXRECV)
                if _dispatch(conIds[rfh], data, addr):
                    dorestart = True
                    break
            if dorestart:
                running = False
                break
        if not running:
            break
        for conn in conns:
            msg = {'acks': conns[conn].ackcount,
                   'rtt': conns[conn].rtts[-1]}
            conns[conn].sendto(msg)
            # N.B. Linux (e.g. Raspberry Pi 3) drops the second packet
            # unless the sends are spaced out
            time.sleep(0.1)
        if nextReport + interval >= time.time():
            nextReport += interval
        else:
            # we fell behind; restart the schedule from now
            nextReport = time.time() + interval
    if verbose:
        log("process: done running")


def cleanup():
    """Send a shutdown message on every connection and close all sockets."""
    global running
    if verbose:
        log('cleanup')
    running = False
    for conn in conns:
        msg = {'shutdown': 1, 'acks': conns[conn].ackcount}
        conns[conn].sendto(msg)
        conns[conn].close()
    time.sleep(1)
    closeall()


def closeall():
    """Close every connection in the global `conns`."""
    if verbose:
        syslog.syslog(syslog.LOG_ERR, 'closecall')
    for conn in conns:
        conns[conn].close()


def daemonize(working_dir="/", stdin='/dev/zero', stdout='/dev/null',
              stderr='/dev/null'):
    """
    Does the UNIX double-fork magic, see Stevens' "Advanced Programming in
    the UNIX Environment" for details (ISBN 0201563177)
    http://www.yendor.com/programming/unix/apue/proc/fork2.c
    """
    try:
        # first fork
        pid = os.fork()
        if pid > 0:
            # exit from first parent
            os._exit(0)
    except OSError as e:
        sys.stderr.write("fork #1 failed: %d (%s)\n" % (e.errno, e.strerror))
        os._exit(1)

    # decouple from parent environment
    os.chdir(working_dir)
    os.setsid()
    os.umask(0)

    # second fork
    try:
        pid = os.fork()
        if pid > 0:
            # exit from second parent
            os._exit(0)
    except OSError as e:
        sys.stderr.write("fork #2 failed: %d (%s)\n" % (e.errno, e.strerror))
        sys.exit(1)

    # redirects standard file descriptors
    sys.stdout.flush()
    sys.stderr.flush()
    si = open(stdin, 'r')
    so = open(stdout, 'a+')
    se = open(stderr, 'a+', 0)
    os.dup2(si.fileno(), sys.stdin.fileno())
    os.dup2(so.fileno(), sys.stdout.fileno())
    os.dup2(se.fileno(), sys.stderr.fileno())


# ---------------------------------------------------------------------------
# command line
msgonly = False
helpflag = False
verbose = False
fdaemon = False
daemonized = False
optlist = []
args = []
msgboot = {}
# HOME may be unset when started from an init script; fall back to '/'
home = os.environ.get('HOME', '/')
configfile = os.path.join(home, '.hbrc')
cmdargs = []  # options replayed by restart()

try:
    optlist, args = getopt.getopt(sys.argv[1:], 'bc:dhm:v')
except getopt.GetoptError:
    helpflag = True

for o, a in optlist:
    if o == '-b':
        msgboot['boot'] = 1
    elif o == '-c':
        configfile = a
        cmdargs += [o, a]
    elif o == '-d':
        fdaemon = True
        cmdargs += [o]
    elif o == '-h':
        helpflag = True
    elif o == '-m':
        msgboot['service'] = "service"
        msgboot['msg'] = a
        msgonly = True
    elif o == '-v':
        verbose = True
        cmdargs += [o]
cmdargs += args
if verbose:
    print("cmdargs for restart are %s" % cmdargs)

if helpflag:
    print("hbc HeartBeatClient")
    print("usage: hbc [-bdhv] [-c configfile] [-m msg][host1 [..]]")
    print("")
    print(" -b indicate machine boot")
    print(" -c configfile")
    print(" -d daemonize")
    print(" -h this help")
    print(" -m send a message")
    print(" -v verbose")
    print("")
    print("""
config file can contain
  hb_hosts=('host1', 'host2', ..._
  hb_port=50003
  interval=20
  logfile=...
  logfmt={|test|msg}
  grace=SECONDS
  reportstrict={True|False}
""")
    sys.exit(1)

#
# set defaults
hb_port = PORT
interval = INTERVAL
hb_hosts = []
iam = socket.gethostname()

try:
    f = open(configfile, "r")
    if verbose:
        print("notice: using config file %s" % configfile)
except (IOError, OSError):
    if verbose:
        print("warning: running without config file: %s" % configfile)
    f = None

if f:
    with f:
        for l in f:
            # split on the first '=' only and keep the whole value, even when
            # the last line has no trailing newline
            r = l.rstrip('\r\n').split('=', 1)
            if len(r) != 2:
                continue
            key = r[0].strip()
            # NOTE(review): values are passed to eval(); the config file
            # must be trusted.
            if key == 'hb_hosts':
                hb_hosts = eval(r[1])
                if verbose:
                    print("notice: cfg hb_hosts: %s" % str(hb_hosts))
            elif key == 'interval':
                interval = eval(r[1])
            elif key == 'hb_port':
                hb_port = eval(r[1])
            elif key == 'name':
                iam = eval(r[1])
                if verbose:
                    print("name set to %s" % iam)

if len(args) != 0:
    hb_hosts = args  # hosts on the command line override the config file
if len(hb_hosts) == 0:
    print("no hb server specified")
    sys.exit(1)

#
if verbose:
    print("notice: hb_hosts: %s" % str(hb_hosts))
    print("notice: hb_port: %s" % hb_port)
    print("notice: interval: %s" % interval)
    print("notice: iam: %s" % iam)
    print("notice: msgonly: %s" % msgonly)
    print("notice: msgboot: %s" % msgboot)

if not msgonly:
    msgboot['interval'] = interval

conns = {}  # connection id -> Conn
while True:
    if verbose:
        log("create connections")
    createConnections(hb_hosts)
    if len(conns) != 0:
        break
    if verbose:
        log("no connections yet, sleep a bit")
    time.sleep(2)
if verbose:
    log("%s connections created" % (len(conns)))

if len(msgboot) > 0:
    if verbose:
        print("on boot")
    msgboot['acks'] = 0
    for conn in conns:
        conns[conn].sendto(msgboot)

if msgonly:
    if verbose:
        print("msgboot done msgonly=%s" % msgonly)
    closeall()
    sys.exit(0)

#
syslog.openlog('hbc', syslog.LOG_PID, syslog.LOG_DAEMON)
if fdaemon:
    print("daemoinizing.")
    daemonize()
    daemonized = True

syslog.syslog(syslog.LOG_ERR,
              'starting heartbeat to %s' % ','.join(hb_hosts))
running = True
try:
    process()
except BaseException as e:
    # catch everything, as the original bare except did, but bind the
    # exception so the verbose message below can print it
    syslogtrace('process')
    if verbose:
        print("err: process exit: %s" % e)

if verbose:
    log("main: cleanup")
cleanup()
if dorestart:
    restart()