always pickle the current state whenever a log message is written

drop strict reporting - it's useless
This commit is contained in:
andreas
2012-09-22 19:17:53 +00:00
parent 1df065cf9b
commit e65d9df061
+17 -22
View File
@@ -1,5 +1,5 @@
#!/usr/bin/env python #!/usr/bin/env python
# $Id: hbd,v 1.33 2012/09/22 17:51:45 andreas Exp $ # $Id: hbd,v 1.34 2012/09/22 19:17:53 andreas Exp $
# Wait for heartbeat messages and act on them (or their absence) # Wait for heartbeat messages and act on them (or their absence)
# #
import time, os, string, sys, socket, atexit, select, SocketServer, getopt, signal, cPickle, smtplib, traceback import time, os, string, sys, socket, atexit, select, SocketServer, getopt, signal, cPickle, smtplib, traceback
@@ -7,7 +7,7 @@ import time, os, string, sys, socket, atexit, select, SocketServer, getopt, sign
False=0 False=0
True=1 True=1
LOGFILE="/home/andreas/public_html/messages/andreas" LOGFILE="/home/andreas/public_html/messages/andreas"
PICKFILE="/tmp/hbd.pick" PICKFILE="/var/tmp/hbd.pick"
AEMAIL=["andreas@wrede.ca"] AEMAIL=["andreas@wrede.ca"]
NAME="heatbeat" NAME="heatbeat"
SMTPSERVER="localhost" SMTPSERVER="localhost"
@@ -208,22 +208,17 @@ def checkoverdue():
for h in hosts.keys(): for h in hosts.keys():
if hosts[h].state == Host.down: if hosts[h].state == Host.down:
continue continue
if reportstrict: timeout=hosts[h].interval+grace
gr=grace
else:
gr=5*grace
timeout=hosts[h].interval+gr
if hosts[h].state == Host.up and now-hosts[h].lastbeat > timeout: if hosts[h].state == Host.up and now-hosts[h].lastbeat > timeout:
m="%s is overdue" % h m="%s is overdue" % h
log(m)
if h in watchhosts: if h in watchhosts:
email("overdue", m) email("overdue", m)
hosts[h].newstate(Host.overdue, gr) hosts[h].newstate(Host.overdue, grace)
log(m)
# #
# #
# #
#
def displaytime(): def displaytime():
maxY,maxX=stdscr.getmaxyx() maxY,maxX=stdscr.getmaxyx()
stdscr.addstr(0,maxX-8, time.strftime("%H:%M:%S", time.localtime(now)), curses.A_BOLD) stdscr.addstr(0,maxX-8, time.strftime("%H:%M:%S", time.localtime(now)), curses.A_BOLD)
@@ -303,6 +298,7 @@ def log(m, service="heartbeat"):
msgw.addstr(msg) msgw.addstr(msg)
msgw.clrtoeol() msgw.clrtoeol()
msgw.refresh() msgw.refresh()
pickleit()
# #
# #
@@ -493,6 +489,10 @@ class HtmlHandler(SocketServer.BaseRequestHandler):
def saveandrestart(): def saveandrestart():
sock.close() sock.close()
serv.socket.close() serv.socket.close()
log("restarting")
os.execv(sys.argv[0],[sys.argv[0]]+cmdargs)
def pickleit():
pickf=open(PICKFILE, 'w') pickf=open(PICKFILE, 'w')
pick=cPickle.Pickler(pickf) pick=cPickle.Pickler(pickf)
pick.dump(hosts) pick.dump(hosts)
@@ -500,7 +500,6 @@ def saveandrestart():
pick.dump(msgs) pick.dump(msgs)
pickf.close() pickf.close()
os.execv(sys.argv[0],[sys.argv[0]]+cmdargs)
# #
# Main # Main
@@ -508,7 +507,6 @@ def saveandrestart():
helpflag=False helpflag=False
forground=False forground=False
restart=None
optlist=[] optlist=[]
args=[] args=[]
home=os.environ['HOME'] home=os.environ['HOME']
@@ -554,7 +552,7 @@ hb_port=50003
interval=20 interval=20
hbd_port=50004 hbd_port=50004
hbd_host=www.domain.com hbd_host=www.domain.com
grace=1 grace=2
""" """
sys.exit(1) sys.exit(1)
@@ -571,7 +569,6 @@ logfile=LOGFILE
logfmt="text" logfmt="text"
interval=INTERVAL interval=INTERVAL
grace=GRACE grace=GRACE
reportstrict=False
watchhosts=[] watchhosts=[]
drophosts=[] drophosts=[]
@@ -603,8 +600,6 @@ if f:
logfile=r[1] logfile=r[1]
elif r[0] == 'logfmt': elif r[0] == 'logfmt':
logfmt=r[1] logfmt=r[1]
elif r[0] == 'reportstrict':
reportstrict=r[1] in ["True","true","TRUE","1"]
elif r[0] == 'watchhosts': elif r[0] == 'watchhosts':
watchhosts=eval(r[1]) watchhosts=eval(r[1])
elif r[0] == 'drophosts': elif r[0] == 'drophosts':
@@ -622,11 +617,13 @@ logf=initlog(logfile)
if os.path.exists(PICKFILE): if os.path.exists(PICKFILE):
pickf=open(PICKFILE, 'r') pickf=open(PICKFILE, 'r')
pick=cPickle.Unpickler(pickf) pick=cPickle.Unpickler(pickf)
try:
hosts=pick.load() hosts=pick.load()
htab=pick.load() htab=pick.load()
msgs=pick.load() msgs=pick.load()
pickf.close() pickf.close()
# os.unlink(PICKFILE) except:
os.unlink(PICKFILE)
for h in drophosts: for h in drophosts:
if hosts.has_key(h): if hosts.has_key(h):
del hosts[h] del hosts[h]
@@ -641,9 +638,7 @@ if visual:
display() display()
stdscr.nodelay(1) stdscr.nodelay(1)
if verbose: log("Starting")
if restart: log("Restarting")
else: log("Starting")
atexit.register(on_exit) atexit.register(on_exit)
ilist=[] ilist=[]
@@ -658,7 +653,7 @@ ilist.append(sock)
serv=SocketServer.TCPServer((hbd_host,hbd_port),HtmlHandler) serv=SocketServer.TCPServer((hbd_host,hbd_port),HtmlHandler)
ilist.append(serv.fileno()) ilist.append(serv.fileno())
if not forground and not restart: if not forground:
pid=os.fork() pid=os.fork()
if pid > 0: if pid > 0:
if verbose: print "daemoinizing... pid=%d" % pid if verbose: print "daemoinizing... pid=%d" % pid
@@ -680,7 +675,7 @@ sig=0
signal.signal(signal.SIGTERM, handler) signal.signal(signal.SIGTERM, handler)
signal.signal(signal.SIGHUP, handler) signal.signal(signal.SIGHUP, handler)
next=int(now)+1 next=int(now)+15 # 15 seconds time to settle after (re-)start
sleep=next - now sleep=next - now
while up: while up:
if visual: if visual: