save state to pickle file, restart timers on restart

This commit is contained in:
2026-04-06 17:24:59 -04:00
parent 57c4b86430
commit 832a8b0bda
6 changed files with 195 additions and 91 deletions
+101 -47
View File
@@ -61,6 +61,102 @@ def dicttos(ID, d):
return opk
DROPOVERDUE = 7 * 24 * 3600 # seconds before an overdue host becomes UNKNOWN
def _make_timer_callbacks(uname, host, watchhosts, ctx):
"""Return (on_overdue, on_unknown) async callbacks for connection timer logic.
Captured values are bound at call time so callbacks are safe to use in loops.
"""
msg_to_websockets = ctx.get("msg_to_websockets")
threshold_checker = ctx.get("threshold_checker")
cfg = ctx.get("config", {})
async def on_unknown(connection):
connection.newstate(connection.__class__.UNKNOWN, connection.lastbeat)
if msg_to_websockets:
msg_to_websockets("host", host.stateinfo())
async def on_overdue(connection):
import time
if connection.getstate() != connection.__class__.UP:
return
now = time.time()
connection.newstate(connection.__class__.OVERDUE, now, cfg.get("grace", 2))
msg = f"{connection.afam} overdue"
eventlog(uname, "CRITICAL" if uname in watchhosts else "WARNING", msg)
if uname in watchhosts:
notify_mod.pushmsg_for_host(uname, f"{uname} {msg}")
if threshold_checker:
threshold_checker.check_value(
host_name=uname,
metric_path="rtt",
value=float("inf"),
alert_states=host.alert_states,
)
if msg_to_websockets:
msg_to_websockets("host", host.stateinfo())
connection.reset_overdue_timer(DROPOVERDUE, on_unknown)
return on_overdue, on_unknown
def restore_connection_timers(hbdclass, ctx):
"""Restore overdue timers for all loaded connections after a pickle restore.
For UP connections, the remaining time until overdue is calculated from
lastbeat so that clients that vanished during hbd's downtime are detected.
For OVERDUE connections, the UNKNOWN drop timer is restored.
"""
import time
now = time.time()
cfg = ctx.get("config", {})
grace = cfg.get("grace", 2)
from . import config as config_mod
watchhosts = config_mod.get_watchhosts(cfg)
restored = 0
for uname, host in list(hbdclass.Host.hosts.items()):
interval = host.interval
for afam, conn in list(host.connections.items()):
state = conn.getstate()
if state == hbdclass.Connection.DOWN:
continue
on_overdue, on_unknown = _make_timer_callbacks(uname, host, watchhosts, ctx)
if state == hbdclass.Connection.UP and interval > 0:
elapsed = now - conn.lastbeat
remaining = max(1.0, (interval + grace) - elapsed)
conn.reset_overdue_timer(remaining, on_overdue)
logger.debug(
"Restored UP timer %s/%s: %.0fs remaining (elapsed %.0fs)",
uname, afam, remaining, elapsed,
)
restored += 1
elif state == hbdclass.Connection.OVERDUE:
elapsed_overdue = now - conn.statetime
remaining = DROPOVERDUE - elapsed_overdue
if remaining <= 1:
# Already past the drop window — mark UNKNOWN immediately
conn.newstate(hbdclass.Connection.UNKNOWN, conn.lastbeat)
logger.info(
"Marking %s/%s UNKNOWN (overdue %.1f days)",
uname, afam, elapsed_overdue / 86400,
)
else:
conn.reset_overdue_timer(remaining, on_unknown)
logger.debug(
"Restored OVERDUE timer %s/%s: %.0fs remaining",
uname, afam, remaining,
)
restored += 1
logger.info("Restored timers for %d connection(s)", restored)
def handle_datagram(msg: dict, addr, transport, ctx: dict):
"""Handle a parsed datagram message.
@@ -138,8 +234,9 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
# Handle plugin data message
plugin_name = msg.get("plugin")
if plugin_name:
# Extract all fields except ID and plugin name
plugin_data = {k: v for k, v in msg.items() if k not in ["ID", "plugin"]}
# Extract plugin fields, dropping protocol metadata fields
plugin_data = {k: v for k, v in msg.items()
if k not in ("ID", "plugin", "id", "name")}
# Store plugin data with timestamp
host.add_plugin_data(plugin_name, plugin_data, timestamp=now)
if DEBUG > 1:
@@ -229,51 +326,8 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
# Reset overdue timer on every heartbeat
if interval > 0 and conn.getstate() != hbdcls.Connection.DOWN:
grace = cfg.get("grace", 2)
timeout_seconds = (interval + grace) if interval > 0 else 30
# Create callback for timer expiration
async def on_overdue(connection):
"""Called when connection timer expires (no heartbeat received)."""
import time
now = time.time()
# Only mark as overdue if still in UP state (not already marked)
if connection.getstate() == hbdcls.Connection.UP:
connection.newstate(hbdcls.Connection.OVERDUE, now, cfg.get("grace", 2))
msg = f"{connection.afam} overdue"
eventlog(uname, "CRITICAL" if uname in watchhosts else "WARNING", msg)
if uname in watchhosts:
notify_mod.pushmsg_for_host(uname, f"{uname} {msg}")
# Check RTT thresholds with infinite RTT for overdue hosts
threshold_checker = ctx.get("threshold_checker")
if threshold_checker:
metric_path = "rtt"
threshold_checker.check_value(
host_name=uname,
metric_path=metric_path,
value=float('inf'),
alert_states=host.alert_states
)
# Notify websockets
if msg_to_websockets:
msg_to_websockets("host", host.stateinfo())
# Set a longer timer for marking as UNKNOWN (7 days)
DROPOVERDUE = 7 * 24 * 3600
async def on_unknown(connection):
"""Mark connection as unknown after extended absence."""
connection.newstate(hbdcls.Connection.UNKNOWN, connection.lastbeat)
if msg_to_websockets:
msg_to_websockets("host", host.stateinfo())
connection.reset_overdue_timer(DROPOVERDUE, on_unknown)
# Reset the timer
timeout_seconds = interval + grace
on_overdue, _ = _make_timer_callbacks(uname, host, watchhosts, ctx)
conn.reset_overdue_timer(timeout_seconds, on_overdue)
# Check RTT thresholds using the threshold checker