always pickle current state when a log msg is set
drop strict reporting - it's useless
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
#!/usr/bin/env python
|
||||
# $Id: hbd,v 1.33 2012/09/22 17:51:45 andreas Exp $
|
||||
# $Id: hbd,v 1.34 2012/09/22 19:17:53 andreas Exp $
|
||||
# Wait for heartbeat messages and act on them (or their absence)
|
||||
#
|
||||
import time, os, string, sys, socket, atexit, select, SocketServer, getopt, signal, cPickle, smtplib, traceback
|
||||
@@ -7,7 +7,7 @@ import time, os, string, sys, socket, atexit, select, SocketServer, getopt, sign
|
||||
False=0
|
||||
True=1
|
||||
LOGFILE="/home/andreas/public_html/messages/andreas"
|
||||
PICKFILE="/tmp/hbd.pick"
|
||||
PICKFILE="/var/tmp/hbd.pick"
|
||||
AEMAIL=["andreas@wrede.ca"]
|
||||
NAME="heatbeat"
|
||||
SMTPSERVER="localhost"
|
||||
@@ -208,22 +208,17 @@ def checkoverdue():
|
||||
for h in hosts.keys():
|
||||
if hosts[h].state == Host.down:
|
||||
continue
|
||||
if reportstrict:
|
||||
gr=grace
|
||||
else:
|
||||
gr=5*grace
|
||||
timeout=hosts[h].interval+gr
|
||||
timeout=hosts[h].interval+grace
|
||||
if hosts[h].state == Host.up and now-hosts[h].lastbeat > timeout:
|
||||
m="%s is overdue" % h
|
||||
log(m)
|
||||
if h in watchhosts:
|
||||
email("overdue", m)
|
||||
hosts[h].newstate(Host.overdue, gr)
|
||||
hosts[h].newstate(Host.overdue, grace)
|
||||
log(m)
|
||||
|
||||
#
|
||||
#
|
||||
#
|
||||
#
|
||||
def displaytime():
|
||||
maxY,maxX=stdscr.getmaxyx()
|
||||
stdscr.addstr(0,maxX-8, time.strftime("%H:%M:%S", time.localtime(now)), curses.A_BOLD)
|
||||
@@ -303,6 +298,7 @@ def log(m, service="heartbeat"):
|
||||
msgw.addstr(msg)
|
||||
msgw.clrtoeol()
|
||||
msgw.refresh()
|
||||
pickleit()
|
||||
|
||||
#
|
||||
#
|
||||
@@ -493,6 +489,10 @@ class HtmlHandler(SocketServer.BaseRequestHandler):
|
||||
def saveandrestart():
|
||||
sock.close()
|
||||
serv.socket.close()
|
||||
log("restarting")
|
||||
os.execv(sys.argv[0],[sys.argv[0]]+cmdargs)
|
||||
|
||||
def pickleit():
|
||||
pickf=open(PICKFILE, 'w')
|
||||
pick=cPickle.Pickler(pickf)
|
||||
pick.dump(hosts)
|
||||
@@ -500,7 +500,6 @@ def saveandrestart():
|
||||
pick.dump(msgs)
|
||||
pickf.close()
|
||||
|
||||
os.execv(sys.argv[0],[sys.argv[0]]+cmdargs)
|
||||
|
||||
#
|
||||
# Main
|
||||
@@ -508,7 +507,6 @@ def saveandrestart():
|
||||
|
||||
helpflag=False
|
||||
forground=False
|
||||
restart=None
|
||||
optlist=[]
|
||||
args=[]
|
||||
home=os.environ['HOME']
|
||||
@@ -554,7 +552,7 @@ hb_port=50003
|
||||
interval=20
|
||||
hbd_port=50004
|
||||
hbd_host=www.domain.com
|
||||
grace=1
|
||||
grace=2
|
||||
"""
|
||||
|
||||
sys.exit(1)
|
||||
@@ -571,7 +569,6 @@ logfile=LOGFILE
|
||||
logfmt="text"
|
||||
interval=INTERVAL
|
||||
grace=GRACE
|
||||
reportstrict=False
|
||||
watchhosts=[]
|
||||
drophosts=[]
|
||||
|
||||
@@ -603,8 +600,6 @@ if f:
|
||||
logfile=r[1]
|
||||
elif r[0] == 'logfmt':
|
||||
logfmt=r[1]
|
||||
elif r[0] == 'reportstrict':
|
||||
reportstrict=r[1] in ["True","true","TRUE","1"]
|
||||
elif r[0] == 'watchhosts':
|
||||
watchhosts=eval(r[1])
|
||||
elif r[0] == 'drophosts':
|
||||
@@ -622,11 +617,13 @@ logf=initlog(logfile)
|
||||
if os.path.exists(PICKFILE):
|
||||
pickf=open(PICKFILE, 'r')
|
||||
pick=cPickle.Unpickler(pickf)
|
||||
try:
|
||||
hosts=pick.load()
|
||||
htab=pick.load()
|
||||
msgs=pick.load()
|
||||
pickf.close()
|
||||
# os.unlink(PICKFILE)
|
||||
except:
|
||||
os.unlink(PICKFILE)
|
||||
for h in drophosts:
|
||||
if hosts.has_key(h):
|
||||
del hosts[h]
|
||||
@@ -641,9 +638,7 @@ if visual:
|
||||
display()
|
||||
stdscr.nodelay(1)
|
||||
|
||||
if verbose:
|
||||
if restart: log("Restarting")
|
||||
else: log("Starting")
|
||||
log("Starting")
|
||||
atexit.register(on_exit)
|
||||
|
||||
ilist=[]
|
||||
@@ -658,7 +653,7 @@ ilist.append(sock)
|
||||
serv=SocketServer.TCPServer((hbd_host,hbd_port),HtmlHandler)
|
||||
ilist.append(serv.fileno())
|
||||
|
||||
if not forground and not restart:
|
||||
if not forground:
|
||||
pid=os.fork()
|
||||
if pid > 0:
|
||||
if verbose: print "daemoinizing... pid=%d" % pid
|
||||
@@ -680,7 +675,7 @@ sig=0
|
||||
signal.signal(signal.SIGTERM, handler)
|
||||
signal.signal(signal.SIGHUP, handler)
|
||||
|
||||
next=int(now)+1
|
||||
next=int(now)+15 # 15 seconds time to settle after (re-)start
|
||||
sleep=next - now
|
||||
while up:
|
||||
if visual:
|
||||
|
||||
Reference in New Issue
Block a user