add a watchlist with hostnames that should trigger an email
This commit is contained in:
@@ -1,13 +1,16 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
# $Id: hbd,v 1.30 2012/06/09 15:21:07 andreas Exp $
|
# $Id: hbd,v 1.31 2012/06/16 14:18:26 andreas Exp $
|
||||||
# Wait for heartbeat messages and act on them (or their absence)
|
# Wait for heartbeat messages and act on them (or their absence)
|
||||||
#
|
#
|
||||||
import time, os, string, sys, socket, atexit, select, SocketServer, getopt, signal, cPickle
|
import time, os, string, sys, socket, atexit, select, SocketServer, getopt, signal, cPickle, smtplib, traceback
|
||||||
|
|
||||||
False=0
|
False=0
|
||||||
True=1
|
True=1
|
||||||
LOGFILE="/home/andreas/public_html/messages/andreas"
|
LOGFILE="/home/andreas/public_html/messages/andreas"
|
||||||
PICKFILE="/tmp/hbd.pick"
|
PICKFILE="/tmp/hbd.pick"
|
||||||
|
# Settings for the watch-list notification mails sent by email().
AEMAIL=["andreas@wrede.ca"]   # recipient addresses (envelope and To: header)
NAME="heartbeat"              # sender name shown in the Subject ("Info from <NAME>: ...")
SMTPSERVER="localhost"        # host name handed to smtplib.SMTP()
|
||||||
|
|
||||||
hosts={}
|
hosts={}
|
||||||
htab={}
|
htab={}
|
||||||
@@ -49,8 +52,8 @@ def shortname(name):
|
|||||||
return r[0]
|
return r[0]
|
||||||
|
|
||||||
class NullDevice:
|
class NullDevice:
|
||||||
def write(self, s):
|
def write(self, s):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
class Host:
|
class Host:
|
||||||
up="up"
|
up="up"
|
||||||
@@ -93,6 +96,30 @@ class Host:
|
|||||||
displaystatetime(self.name)
|
displaystatetime(self.name)
|
||||||
return s
|
return s
|
||||||
|
|
||||||
|
def email(s, msg):
    """Mail a notification to every address in AEMAIL.

    s   -- short event tag; placed in the Subject after NAME
    msg -- text used as the message body

    Returns "OK" when the message was handed to SMTPSERVER, or "Fail" when
    the server refused the recipients (that case is written to the log).
    Any other error while connecting or sending prints the traceback and
    calls saveandrestart().
    """
    ret = "OK"
    toaddrs = AEMAIL
    # Named 'sender' so the module-level fromaddr() function is not shadowed.
    sender = "aew.heartbeat@wrede.ca"
    subj = "Info from %s: %s" % (NAME, s)
    date = time.strftime("%a, %d %b %Y %H:%M:%S %z", time.localtime())
    # List every recipient in the To: header, not only the first one, so the
    # header agrees with the envelope recipients passed to sendmail().
    body = "To: %s\nFrom: %s\nSubject: %s\nDate: %s\n\n%s" % (
        ", ".join(toaddrs), sender, subj, date, msg)

    # Pre-set so the cleanup below can tell whether a connection was made.
    server = None
    try:
        server = smtplib.SMTP(SMTPSERVER)
        if DEBUG:
            server.set_debuglevel(1)
        server.sendmail(sender, toaddrs, body)
    except smtplib.SMTPRecipientsRefused as errs:
        log("cannot send email: %s\n" % (errs))
        ret = "Fail"
    except Exception:
        # Exception, not a bare except: KeyboardInterrupt and SystemExit must
        # not be converted into a restart of the daemon.
        print("smtp error: " + traceback.format_exc())
        saveandrestart()

    if server is not None:
        try:
            server.quit()
        except Exception:
            # Best effort: the message has already been sent or reported.
            pass
    return ret
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
@@ -119,7 +146,10 @@ def addhost(name, addr):
|
|||||||
if visual:
|
if visual:
|
||||||
displayaddr(sname)
|
displayaddr(sname)
|
||||||
htab[addr]=sname
|
htab[addr]=sname
|
||||||
log("%s, changed address to %s" % (sname, addr))
|
m="%s, changed address to %s" % (sname, addr)
|
||||||
|
log(m)
|
||||||
|
if name in watchhosts:
|
||||||
|
email("address change", m)
|
||||||
else:
|
else:
|
||||||
hosts[sname]=Host(sname, addr)
|
hosts[sname]=Host(sname, addr)
|
||||||
s=hosts.keys()
|
s=hosts.keys()
|
||||||
@@ -184,7 +214,10 @@ def checkoverdue():
|
|||||||
gr=5*grace
|
gr=5*grace
|
||||||
timeout=hosts[h].interval+gr
|
timeout=hosts[h].interval+gr
|
||||||
if hosts[h].state == Host.up and now-hosts[h].lastbeat > timeout:
|
if hosts[h].state == Host.up and now-hosts[h].lastbeat > timeout:
|
||||||
log("%s is overdue" % h)
|
m="%s is overdue" % h
|
||||||
|
log(m)
|
||||||
|
if h in watchhosts:
|
||||||
|
email("overdue", m)
|
||||||
hosts[h].newstate(Host.overdue, gr)
|
hosts[h].newstate(Host.overdue, gr)
|
||||||
|
|
||||||
#
|
#
|
||||||
@@ -291,7 +324,10 @@ def fromaddr(name, addr, boot, interval, acks):
|
|||||||
if host.getstate() != Host.up and interval > 0:
|
if host.getstate() != Host.up and interval > 0:
|
||||||
lasts=host.state
|
lasts=host.state
|
||||||
d=host.newstate(Host.up)
|
d=host.newstate(Host.up)
|
||||||
log("%s, back after being %s for %s" % (host.name, lasts, dur(d)))
|
m="%s, back after being %s for %s" % (host.name, lasts, dur(d))
|
||||||
|
log(m)
|
||||||
|
if name in watchhosts:
|
||||||
|
email("back", name)
|
||||||
host.upcount+=1
|
host.upcount+=1
|
||||||
|
|
||||||
#
|
#
|
||||||
@@ -342,12 +378,21 @@ def readsock():
|
|||||||
a="(%s)" % acks
|
a="(%s)" % acks
|
||||||
else:
|
else:
|
||||||
a=""
|
a=""
|
||||||
log("%s booted, deltaT %0.2g sec %s" % (name, deltaT,a))
|
m="%s booted, deltaT %0.2g sec %s" % (name, deltaT,a)
|
||||||
|
log(m)
|
||||||
|
if name in watchhosts:
|
||||||
|
email("booted", m)
|
||||||
if msg:
|
if msg:
|
||||||
log("%s msg: %s" % (name, msg), service=service)
|
m="%s msg: %s" % (name, msg)
|
||||||
|
log(m, service=service)
|
||||||
|
if name in watchhosts:
|
||||||
|
email("msg", m)
|
||||||
fromaddr(name, addr[0], boot, interval, acks)
|
fromaddr(name, addr[0], boot, interval, acks)
|
||||||
if shutdown:
|
if shutdown:
|
||||||
log("%s shutdown" % name)
|
m="%s shutdown" % name
|
||||||
|
log(m)
|
||||||
|
if name in watchhosts:
|
||||||
|
email("shutdown", m)
|
||||||
try:
|
try:
|
||||||
hosts[name].newstate(Host.down)
|
hosts[name].newstate(Host.down)
|
||||||
except:
|
except:
|
||||||
@@ -442,6 +487,17 @@ class HtmlHandler(SocketServer.BaseRequestHandler):
|
|||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
def saveandrestart():
    """Save the daemon state to PICKFILE and re-execute the program.

    Closes sock and serv.socket, pickles hosts, htab and msgs (in that
    order) into PICKFILE, then replaces the running process through
    os.execv() using sys.argv[0] and cmdargs.
    """
    sock.close()
    serv.socket.close()

    # NOTE(review): PICKFILE is a fixed path; presumably it is read back with
    # cPickle at startup -- confirm other users cannot plant that file.
    pickf = open(PICKFILE, 'w')
    try:
        pick = cPickle.Pickler(pickf)
        pick.dump(hosts)
        pick.dump(htab)
        pick.dump(msgs)
    finally:
        # Close (and flush) the state file even when one of the dumps fails.
        pickf.close()

    os.execv(sys.argv[0], [sys.argv[0]] + cmdargs)
|
||||||
|
|
||||||
#
|
#
|
||||||
# Main
|
# Main
|
||||||
@@ -459,7 +515,7 @@ configfile="%s/.hbrc" % home
|
|||||||
try:
|
try:
|
||||||
optlist, args = getopt.getopt(sys.argv[1:], 'c:dfh:v')
|
optlist, args = getopt.getopt(sys.argv[1:], 'c:dfh:v')
|
||||||
except:
|
except:
|
||||||
helpflag=True
|
helpflag=True
|
||||||
|
|
||||||
for o,a in optlist:
|
for o,a in optlist:
|
||||||
if o == '-c':
|
if o == '-c':
|
||||||
@@ -513,6 +569,7 @@ logfmt="text"
|
|||||||
interval=INTERVAL
|
interval=INTERVAL
|
||||||
grace=GRACE
|
grace=GRACE
|
||||||
reportstrict=False
|
reportstrict=False
|
||||||
|
watchhosts=[]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
f=open(configfile,"r")
|
f=open(configfile,"r")
|
||||||
@@ -544,6 +601,8 @@ if f:
|
|||||||
logfmt=r[1]
|
logfmt=r[1]
|
||||||
elif r[0] == 'reportstrict':
|
elif r[0] == 'reportstrict':
|
||||||
reportstrict=r[1] in ["True","true","TRUE","1"]
|
reportstrict=r[1] in ["True","true","TRUE","1"]
|
||||||
|
elif r[0] == 'watchhosts':
|
||||||
|
watchhosts=eval(r[1])
|
||||||
f.close()
|
f.close()
|
||||||
|
|
||||||
if len(args) != 0:
|
if len(args) != 0:
|
||||||
@@ -581,7 +640,7 @@ ilist=[]
|
|||||||
|
|
||||||
sock=socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
|
sock=socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
|
||||||
sock.setsockopt(socket.SOL_SOCKET,socket.SO_REUSEADDR, \
|
sock.setsockopt(socket.SOL_SOCKET,socket.SO_REUSEADDR, \
|
||||||
sock.getsockopt(socket.SOL_SOCKET,socket.SO_REUSEADDR) | 1)
|
sock.getsockopt(socket.SOL_SOCKET,socket.SO_REUSEADDR) | 1)
|
||||||
|
|
||||||
sock.bind(("",hb_port))
|
sock.bind(("",hb_port))
|
||||||
ilist.append(sock)
|
ilist.append(sock)
|
||||||
@@ -659,14 +718,6 @@ while up:
|
|||||||
|
|
||||||
|
|
||||||
if sig == signal.SIGHUP:
|
if sig == signal.SIGHUP:
|
||||||
sock.close()
|
saveandrestart()
|
||||||
serv.socket.close()
|
|
||||||
pickf=open(PICKFILE, 'w')
|
|
||||||
pick=cPickle.Pickler(pickf)
|
|
||||||
pick.dump(hosts)
|
|
||||||
pick.dump(htab)
|
|
||||||
pick.dump(msgs)
|
|
||||||
pickf.close()
|
|
||||||
|
|
||||||
os.execv(sys.argv[0],[sys.argv[0]]+cmdargs)
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user