refactor monitor, add threshold testing

2026-03-31 12:22:03 -04:00
parent ad7178ebcb
commit dd23d9d163
15 changed files with 488 additions and 101 deletions
@@ -1,50 +1,66 @@
-"""monitor helper and thread for heartbeat daemon."""
+"""Monitor helper for heartbeat daemon.
+
+This module provides monitoring tasks for the heartbeat daemon.
+The primary reachability monitoring is now event-driven (timers set/reset 
+on HTB arrival in udp.py) rather than periodic polling.
+
+This module can be extended for additional monitoring tasks.
+"""

 from __future__ import annotations
 import asyncio
 import time
+from . import notify as notify_mod

 DROPOVERDUE = 7 * 24 * 3600
+eventlog = notify_mod.eventlog


-def checkoverdue(
-    config: dict,
-    hbdclass,
-    log: callable,
-    pushmsg: callable,
-    msg_to_websockets: callable,
-):
-    now = time.time()
-    for h in list(hbdclass.Host.hosts.keys()):
-        pmsg = []
-        for c in hbdclass.Host.hosts[h].connections:
-            conn = hbdclass.Host.hosts[h].connections[c]
-            if conn.state == hbdclass.Connection.DOWN:
-                continue
-            timeout = hbdclass.Host.hosts[h].interval + config.get("grace", 10)
-            if conn.state == hbdclass.Connection.UP and (now - conn.lastbeat) > timeout:
-                conn.newstate(hbdclass.Connection.OVERDUE, now, config.get("grace", 10))
-                pmsg.append(conn.afam)
-            if (
-                conn.state == hbdclass.Connection.OVERDUE
-                and (now - conn.lastbeat) > DROPOVERDUE
-            ):
-                conn.newstate(hbdclass.Connection.UNKNOWN, conn.lastbeat)
-        if pmsg != []:
-            if h in config.get("watchhosts", []):
-                pushmsg("%s %s overdue" % (h, " and ".join(pmsg)))
-            log(h, "%s overdue" % " and ".join(pmsg))
-            msg_to_websockets("host", hbdclass.Host.hosts[h].stateinfo())
+async def cleanup_connections(hbdclass):
+    """Clean up connection timers on shutdown.
+    
+    Cancels all active overdue timers to prevent callbacks after shutdown.
+    """
+    for hostname, host in list(hbdclass.Host.hosts.items()):
+        for conn_type, conn in host.connections.items():
+            if hasattr(conn, 'cancel_overdue_timer'):
+                conn.cancel_overdue_timer()


 async def start(
    config: dict,
    hbdclass: callable,
-    log=None,
    pushmsg=None,
    msg_to_websockets=None,
 ):
-    """start a monitor loop that checks for overdue hosts every minute"""
+    """Start monitor background tasks.
+    
+    Note: Reachability monitoring is now timer-based and happens in udp.py
+    when HTB messages arrive. This function can be used for additional
+    monitoring tasks.
+    
+    Currently runs a simple status logger every 5 minutes.
+    """
+    import logging
+    logger = logging.getLogger(__name__)
+    logger_interval = 300  # Log status every 5 minutes
+    
    while True:
-        await asyncio.sleep(15)  # 15 seconds between checks
-        checkoverdue(config, hbdclass, log, pushmsg, msg_to_websockets)
+        await asyncio.sleep(logger_interval)
+        
+        # Log monitoring status
+        total_hosts = len(hbdclass.Host.hosts)
+        up_count = sum(
+            1 for h in hbdclass.Host.hosts.values()
+            for c in h.connections.values()
+            if c.state == hbdclass.Connection.UP
+        )
+        overdue_count = sum(
+            1 for h in hbdclass.Host.hosts.values()
+            for c in h.connections.values()
+            if c.state == hbdclass.Connection.OVERDUE
+        )
+        
+        logger.debug(
+            f"Monitor status: {total_hosts} hosts, {up_count} UP, {overdue_count} OVERDUE"
+        )