fix: restore connectivity alerts for overdue/unknown/down hosts on startup

restore_connection_timers now calls _set_connectivity_alert(host, afam, "CRITICAL") for DOWN, OVERDUE, and UNKNOWN connections, ensuring the alerts are present even if hbd was shut down before the transition callbacks recorded them.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-06 14:40:04 -04:00
parent cf6e19704f
commit 3a0c48e32b
1 changed file with 11 additions and 0 deletions
@@ -266,10 +266,15 @@ def restore_connection_timers(hbdclass, ctx):
        for afam, conn in list(host.connections.items()):
            state = conn.getstate()
            if state == hbdclass.Connection.DOWN:
                _set_connectivity_alert(host, afam, "CRITICAL")
                continue
            on_overdue, on_unknown = _make_timer_callbacks(uname, host, ctx)
            if state == hbdclass.Connection.UNKNOWN:
                _set_connectivity_alert(host, afam, "CRITICAL")
                continue
            if state == hbdclass.Connection.UP and interval > 0:
                elapsed = now - conn.lastbeat
                # Give hosts one full (interval + grace) of extra time on startup
@@ -300,6 +305,10 @@ def restore_connection_timers(hbdclass, ctx):
                        "Restored OVERDUE timer %s/%s: %.0fs remaining",
                        uname, afam, remaining,
                    )
                # Ensure the connectivity alert is set — it may be missing if
                # hbd was shut down before the on_overdue callback had a chance
                # to record it.
                _set_connectivity_alert(host, afam, "CRITICAL")
                restored += 1
    logger.info("Restored timers for %d connection(s)", restored)
@@ -470,6 +479,7 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
    boot = msg.get("boot", 0)
    if boot:
        # hbc was started with a -b flag
        eventlog(uname, "INFO", "booted")
        if host.watched:
            asyncio.create_task(notify_mod.send_notification(
@@ -480,6 +490,7 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
        eventlog(uname, "INFO", message, service=service)
    if conn.getstate() != hbdcls.Connection.UP:
        # Transition to UP and log/notify if appropriate
        lasts = conn.state
        d = conn.newstate(hbdcls.Connection.UP, now)
        # On reboot, pre-boot plugin data and derived alerts are stale.