refactor monitor, add threshold testing

This commit is contained in:
Andreas Wrede
2026-03-31 12:22:03 -04:00
parent ad7178ebcb
commit dd23d9d163
15 changed files with 488 additions and 101 deletions
+49 -33
View File
@@ -1,50 +1,66 @@
"""monitor helper and thread for heartbeat daemon."""
"""Monitor helper for heartbeat daemon.
This module provides monitoring tasks for the heartbeat daemon.
The primary reachability monitoring is now event-driven (timers set/reset
on HTB arrival in udp.py) rather than periodic polling.
This module can be extended for additional monitoring tasks.
"""
from __future__ import annotations
import asyncio
import time
from . import notify as notify_mod
# Seconds (7 days) without a beat after which checkoverdue() moves an
# OVERDUE connection to UNKNOWN.
DROPOVERDUE = 7 * 24 * 3600
# Module-level alias for notify_mod.eventlog.
eventlog = notify_mod.eventlog
def checkoverdue(
    config: dict,
    hbdclass,
    log: callable,
    pushmsg: callable,
    msg_to_websockets: callable,
):
    """Flag connections whose heartbeat is overdue and report affected hosts.

    Every connection of every host in ``hbdclass.Host.hosts`` that is not
    DOWN is compared against the current time:

    * an UP connection whose last beat is older than the host's ``interval``
      plus the configured grace period is moved to OVERDUE;
    * an OVERDUE connection whose last beat is older than ``DROPOVERDUE``
      is moved to UNKNOWN.

    A host with at least one connection newly marked OVERDUE is logged and
    its ``stateinfo()`` is sent to the websockets; if the host is listed in
    ``config["watchhosts"]`` a push message is sent as well.

    Args:
        config: Daemon configuration. Reads ``grace`` (seconds, default 10)
            and ``watchhosts`` (default empty).
        hbdclass: Object exposing ``Host.hosts`` and the ``Connection``
            state constants (UP, DOWN, OVERDUE, UNKNOWN).
        log: Called as ``log(hostname, message)``.
        pushmsg: Called as ``pushmsg(message)`` for watched hosts only.
        msg_to_websockets: Called as ``msg_to_websockets("host", stateinfo)``.
    """
    now = time.time()
    grace = config.get("grace", 10)
    hosts = hbdclass.Host.hosts
    states = hbdclass.Connection

    # Walk a snapshot of the host names so the host table may change while
    # this pass is running without breaking the iteration.
    for name in list(hosts):
        host = hosts[name]
        newly_overdue = []

        for conn in host.connections.values():
            if conn.state == states.DOWN:
                continue

            timeout = host.interval + grace
            if conn.state == states.UP and (now - conn.lastbeat) > timeout:
                conn.newstate(states.OVERDUE, now, grace)
                newly_overdue.append(conn.afam)

            # A separate `if` (not `elif`): a connection just moved to
            # OVERDUE above is evaluated here too if newstate() updated
            # its state.
            if (
                conn.state == states.OVERDUE
                and (now - conn.lastbeat) > DROPOVERDUE
            ):
                conn.newstate(states.UNKNOWN, conn.lastbeat)

        if not newly_overdue:
            continue

        families = " and ".join(newly_overdue)
        if name in config.get("watchhosts", []):
            pushmsg("%s %s overdue" % (name, families))
        log(name, "%s overdue" % families)
        msg_to_websockets("host", host.stateinfo())
async def cleanup_connections(hbdclass):
    """Clean up connection timers on shutdown.

    Calls ``cancel_overdue_timer()`` on every connection of every host in
    ``hbdclass.Host.hosts`` that provides such a method, so that no overdue
    callback fires after shutdown. Connections without the method are
    skipped.

    Args:
        hbdclass: Object exposing ``Host.hosts``, a mapping of host name to
            host objects that each carry a ``connections`` mapping.
    """
    # Iterate over a snapshot of the hosts so the host table may change
    # while timers are being cancelled.
    for host in list(hbdclass.Host.hosts.values()):
        for conn in host.connections.values():
            cancel = getattr(conn, "cancel_overdue_timer", None)
            if cancel is not None:
                cancel()
async def start(
    config: dict,
    hbdclass: callable,
    log=None,
    pushmsg=None,
    msg_to_websockets=None,
):
    """Start monitor background tasks.

    Note: Reachability monitoring is now timer-based and happens in udp.py
    when HTB messages arrive. This function can be used for additional
    monitoring tasks.

    Currently runs a simple status logger every 5 minutes: after each
    interval it counts the hosts in ``hbdclass.Host.hosts`` and the
    connections in the UP and OVERDUE states and emits one debug log line.
    The loop never returns on its own; it runs until the task is cancelled.

    Args:
        config: Daemon configuration (not used by the status logger).
        hbdclass: Object exposing ``Host.hosts`` and the ``Connection``
            state constants.
        log: Unused; kept so existing callers can pass it.
        pushmsg: Unused; kept so existing callers can pass it.
        msg_to_websockets: Unused; kept so existing callers can pass it.
    """
    import logging

    logger = logging.getLogger(__name__)
    logger_interval = 300  # Log status every 5 minutes

    while True:
        await asyncio.sleep(logger_interval)

        # Log monitoring status. There is no await between reading the
        # host table and logging, so the counts describe one snapshot.
        total_hosts = len(hbdclass.Host.hosts)
        up_count = 0
        overdue_count = 0
        for host in hbdclass.Host.hosts.values():
            for conn in host.connections.values():
                if conn.state == hbdclass.Connection.UP:
                    up_count += 1
                elif conn.state == hbdclass.Connection.OVERDUE:
                    overdue_count += 1

        logger.debug(
            f"Monitor status: {total_hosts} hosts, {up_count} UP, {overdue_count} OVERDUE"
        )