save state to pickle file, restart timers on restart

2026-04-06 17:24:59 -04:00
parent 57c4b86430
commit 832a8b0bda
6 changed files with 195 additions and 91 deletions
@@ -61,6 +61,102 @@ def dicttos(ID, d):
    return opk


+DROPOVERDUE = 7 * 24 * 3600  # seconds (7 days) an OVERDUE connection waits before being marked UNKNOWN
+
+
+def _make_timer_callbacks(uname, host, watchhosts, ctx):
+    """Return ``(on_overdue, on_unknown)`` async timer callbacks for one host.
+
+    ``on_overdue`` moves an UP connection to OVERDUE, reports it and arms the
+    DROPOVERDUE timer with ``on_unknown``, which marks the connection UNKNOWN.
+    All inputs are captured here, so building callbacks in a loop is safe.
+    """
+    import time
+    settings = ctx.get("config", {})
+    checker = ctx.get("threshold_checker")
+    broadcast = ctx.get("msg_to_websockets")
+
+    def _publish_host():  # push the host's current state to websockets, if wired up
+        if broadcast:
+            broadcast("host", host.stateinfo())
+
+    async def on_unknown(connection):
+        connection.newstate(connection.__class__.UNKNOWN, connection.lastbeat)
+        _publish_host()
+
+    async def on_overdue(connection):
+        states = connection.__class__
+        if connection.getstate() == states.UP:  # only an UP connection goes OVERDUE
+            connection.newstate(states.OVERDUE, time.time(), settings.get("grace", 2))
+            text = f"{connection.afam} overdue"
+            watched = uname in watchhosts
+            eventlog(uname, "CRITICAL" if watched else "WARNING", text)
+            if watched:
+                notify_mod.pushmsg_for_host(uname, f"{uname} {text}")
+            if checker:
+                checker.check_value(host_name=uname, metric_path="rtt",
+                                    value=float("inf"), alert_states=host.alert_states)
+            _publish_host()
+            connection.reset_overdue_timer(DROPOVERDUE, on_unknown)
+
+    return on_overdue, on_unknown
+
+
+def restore_connection_timers(hbdclass, ctx):
+    """Re-arm per-connection timers after state was loaded from the pickle file.
+
+    UP connections get an overdue timer for what is left of ``interval + grace``
+    counted from ``lastbeat`` (never less than 1s), so clients that vanished
+    while hbd was down are detected.  OVERDUE connections get the rest of their
+    DROPOVERDUE window counted from ``statetime``, or become UNKNOWN at once if
+    it has run out.  DOWN connections are left alone.
+
+    Args:
+        hbdclass: object exposing ``Host.hosts`` and the ``Connection`` class.
+        ctx: context dict; ``ctx["config"]`` provides ``grace`` and watch hosts.
+    """
+    import time
+    from . import config as config_mod
+
+    now = time.time()
+    cfg = ctx.get("config", {})
+    grace = cfg.get("grace", 2)
+    watchhosts = config_mod.get_watchhosts(cfg)
+
+    restored = 0
+    for uname, host in list(hbdclass.Host.hosts.items()):
+        interval = host.interval
+        # The callbacks depend only on the host: build them once, not per connection.
+        on_overdue, on_unknown = _make_timer_callbacks(uname, host, watchhosts, ctx)
+        for afam, conn in list(host.connections.items()):
+            state = conn.getstate()
+            if state == hbdclass.Connection.DOWN:
+                continue
+            if state == hbdclass.Connection.UP and interval > 0:
+                # Clock may have been set back while down: never use a negative elapsed.
+                elapsed = max(0.0, now - conn.lastbeat)
+                remaining = max(1.0, (interval + grace) - elapsed)
+                conn.reset_overdue_timer(remaining, on_overdue)
+                logger.debug("Restored UP timer %s/%s: %.0fs remaining (elapsed %.0fs)",
+                             uname, afam, remaining, elapsed)
+                restored += 1
+            elif state == hbdclass.Connection.OVERDUE:
+                elapsed_overdue = max(0.0, now - conn.statetime)
+                remaining = DROPOVERDUE - elapsed_overdue
+                if remaining <= 1:
+                    # Drop window already over: no timer is armed, so not counted.
+                    conn.newstate(hbdclass.Connection.UNKNOWN, conn.lastbeat)
+                    logger.info("Marking %s/%s UNKNOWN (overdue %.1f days)",
+                                uname, afam, elapsed_overdue / 86400)
+                else:
+                    conn.reset_overdue_timer(remaining, on_unknown)
+                    logger.debug("Restored OVERDUE timer %s/%s: %.0fs remaining",
+                                 uname, afam, remaining)
+                    restored += 1
+
+    logger.info("Restored timers for %d connection(s)", restored)
+
+
 def handle_datagram(msg: dict, addr, transport, ctx: dict):
    """Handle a parsed datagram message.

@@ -138,8 +234,9 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
        # Handle plugin data message
        plugin_name = msg.get("plugin")
        if plugin_name:
-            # Extract all fields except ID and plugin name
-            plugin_data = {k: v for k, v in msg.items() if k not in ["ID", "plugin"]}
+            # Extract plugin fields, dropping protocol metadata fields
+            plugin_data = {k: v for k, v in msg.items()
+                           if k not in ("ID", "plugin", "id", "name")}
            # Store plugin data with timestamp
            host.add_plugin_data(plugin_name, plugin_data, timestamp=now)
            if DEBUG > 1:
@@ -229,51 +326,8 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
    # Reset overdue timer on every heartbeat
    if interval > 0 and conn.getstate() != hbdcls.Connection.DOWN:
        grace = cfg.get("grace", 2)
-        timeout_seconds = (interval + grace) if interval > 0 else 30
-        
-        # Create callback for timer expiration
-        async def on_overdue(connection):
-            """Called when connection timer expires (no heartbeat received)."""
-            import time
-            now = time.time()
-            
-            # Only mark as overdue if still in UP state (not already marked)
-            if connection.getstate() == hbdcls.Connection.UP:
-                connection.newstate(hbdcls.Connection.OVERDUE, now, cfg.get("grace", 2))
-                
-                msg = f"{connection.afam} overdue"
-                eventlog(uname, "CRITICAL" if uname in watchhosts else "WARNING", msg)
-                
-                if uname in watchhosts:
-                    notify_mod.pushmsg_for_host(uname, f"{uname} {msg}")
-                
-                # Check RTT thresholds with infinite RTT for overdue hosts
-                threshold_checker = ctx.get("threshold_checker")
-                if threshold_checker:
-                    metric_path = "rtt"
-                    threshold_checker.check_value(
-                        host_name=uname,
-                        metric_path=metric_path,
-                        value=float('inf'),
-                        alert_states=host.alert_states
-                    )
-                
-                # Notify websockets
-                if msg_to_websockets:
-                    msg_to_websockets("host", host.stateinfo())
-                
-                # Set a longer timer for marking as UNKNOWN (7 days)
-                DROPOVERDUE = 7 * 24 * 3600
-                
-                async def on_unknown(connection):
-                    """Mark connection as unknown after extended absence."""
-                    connection.newstate(hbdcls.Connection.UNKNOWN, connection.lastbeat)
-                    if msg_to_websockets:
-                        msg_to_websockets("host", host.stateinfo())
-                
-                connection.reset_overdue_timer(DROPOVERDUE, on_unknown)
-        
-        # Reset the timer
+        timeout_seconds = interval + grace
+        on_overdue, _ = _make_timer_callbacks(uname, host, watchhosts, ctx)
        conn.reset_overdue_timer(timeout_seconds, on_overdue)
    
    # Check RTT thresholds using the threshold checker