refactor monitor, add threshold testing
This commit is contained in:
+49
-33
@@ -1,50 +1,66 @@
|
||||
"""monitor helper and thread for heartbeat daemon."""
|
||||
"""Monitor helper for heartbeat daemon.
|
||||
|
||||
This module provides monitoring tasks for the heartbeat daemon.
|
||||
The primary reachability monitoring is now event-driven (timers set/reset
|
||||
on HTB arrival in udp.py) rather than periodic polling.
|
||||
|
||||
This module can be extended for additional monitoring tasks.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
import asyncio
|
||||
import time
|
||||
from . import notify as notify_mod
|
||||
|
||||
# Seconds in seven days. checkoverdue() moves an OVERDUE connection to UNKNOWN
# once its last heartbeat is older than this.
DROPOVERDUE = 7 * 24 * 3600
|
||||
# Module-level alias for notify.eventlog. Nothing in the visible code uses it;
# presumably kept for importers of this module -- confirm before removing.
eventlog = notify_mod.eventlog
|
||||
|
||||
|
||||
def checkoverdue(
    config: dict,
    hbdclass,
    log: callable,
    pushmsg: callable,
    msg_to_websockets: callable,
):
    """Flag connections whose last heartbeat is older than allowed.

    Walks a snapshot of the host names in ``hbdclass.Host.hosts`` and, for
    every connection that is not DOWN:

    * UP and silent for longer than the host's ``interval`` plus the
      configured ``grace`` (default 10): switched to OVERDUE.
    * OVERDUE and silent for longer than ``DROPOVERDUE``: switched to
      UNKNOWN.

    If at least one connection of a host became OVERDUE during this pass,
    the host is logged and its state info is sent to the websockets; hosts
    listed in ``config["watchhosts"]`` additionally get a push message.

    Args:
        config: Settings dict; reads the optional keys ``grace`` and
            ``watchhosts``.
        hbdclass: Object exposing ``Host.hosts`` and the ``Connection``
            state constants (DOWN, UP, OVERDUE, UNKNOWN).
        log: Called as ``log(hostname, message)``.
        pushmsg: Called as ``pushmsg(message)`` for watched hosts only.
        msg_to_websockets: Called as ``msg_to_websockets("host", stateinfo)``.
    """
    checked_at = time.time()
    states = hbdclass.Connection

    # Iterate over a copy of the names so the host table may change meanwhile.
    for name in list(hbdclass.Host.hosts.keys()):
        host = hbdclass.Host.hosts[name]
        newly_overdue = []

        for link in host.connections.values():
            if link.state == states.DOWN:
                continue

            grace = config.get("grace", 10)
            limit = host.interval + grace

            if link.state == states.UP and (checked_at - link.lastbeat) > limit:
                link.newstate(states.OVERDUE, checked_at, grace)
                newly_overdue.append(link.afam)

            # State and lastbeat are read again on purpose: newstate() above
            # may just have changed them.
            if link.state == states.OVERDUE and (checked_at - link.lastbeat) > DROPOVERDUE:
                link.newstate(states.UNKNOWN, link.lastbeat)

        if not newly_overdue:
            continue

        families = " and ".join(newly_overdue)
        if name in config.get("watchhosts", []):
            pushmsg("%s %s overdue" % (name, families))
        log(name, "%s overdue" % families)
        msg_to_websockets("host", hbdclass.Host.hosts[name].stateinfo())
|
||||
async def cleanup_connections(hbdclass):
    """Cancel the overdue timer of every known connection.

    Meant to be awaited during shutdown so that pending overdue timers do
    not fire afterwards. Connections without a ``cancel_overdue_timer``
    attribute are left untouched.

    Args:
        hbdclass: Object exposing ``Host.hosts``, a mapping of host objects
            that each carry a ``connections`` mapping.
    """
    # Snapshot the hosts so the table may be modified while we walk it.
    for host in list(hbdclass.Host.hosts.values()):
        for link in host.connections.values():
            if not hasattr(link, 'cancel_overdue_timer'):
                continue
            link.cancel_overdue_timer()
|
||||
|
||||
|
||||
async def start(
    config: dict,
    hbdclass: callable,
    log=None,
    pushmsg=None,
    msg_to_websockets=None,
):
    """Run the monitor loop until the task is cancelled or an error occurs.

    Each cycle waits 15 seconds, runs ``checkoverdue`` with the arguments
    given here, waits ``logger_interval`` (300) seconds, and then writes one
    debug line with the number of hosts and the number of connections that
    are currently UP and OVERDUE.

    Args:
        config: Settings dict, passed through to ``checkoverdue``.
        hbdclass: Object exposing ``Host.hosts`` and the ``Connection``
            state constants.
        log: Passed through to ``checkoverdue``.
        pushmsg: Passed through to ``checkoverdue``.
        msg_to_websockets: Passed through to ``checkoverdue``.
    """
    import logging

    logger = logging.getLogger(__name__)
    logger_interval = 300  # seconds between status lines (5 minutes)

    def count_in_state(wanted):
        # Number of connections, across all hosts, whose state equals `wanted`.
        return sum(
            1
            for host in hbdclass.Host.hosts.values()
            for link in host.connections.values()
            if link.state == wanted
        )

    # NOTE(review): this text came from a rendered diff, so the 15-second
    # overdue check and the 5-minute status log may be the before and after
    # versions of the same loop -- confirm both are meant to run.
    while True:
        await asyncio.sleep(15)
        checkoverdue(config, hbdclass, log, pushmsg, msg_to_websockets)
        await asyncio.sleep(logger_interval)

        total_hosts = len(hbdclass.Host.hosts)
        up_count = count_in_state(hbdclass.Connection.UP)
        overdue_count = count_in_state(hbdclass.Connection.OVERDUE)

        logger.debug(
            f"Monitor status: {total_hosts} hosts, {up_count} UP, {overdue_count} OVERDUE"
        )
|
||||
|
||||
Reference in New Issue
Block a user