fix: restore connectivity alerts for overdue/unknown/down hosts on startup

restore_connection_timers now calls
_set_connectivity_alert(host, afam, "CRITICAL") for connections restored in
the DOWN, OVERDUE, or UNKNOWN state, ensuring the alerts are present even if
hbd was shut down before the state-transition callbacks recorded them.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Andreas Wrede
2026-06-06 14:40:04 -04:00
parent cf6e19704f
commit 3a0c48e32b
+11
View File
@@ -266,10 +266,15 @@ def restore_connection_timers(hbdclass, ctx):
for afam, conn in list(host.connections.items()): for afam, conn in list(host.connections.items()):
state = conn.getstate() state = conn.getstate()
if state == hbdclass.Connection.DOWN: if state == hbdclass.Connection.DOWN:
_set_connectivity_alert(host, afam, "CRITICAL")
continue continue
on_overdue, on_unknown = _make_timer_callbacks(uname, host, ctx) on_overdue, on_unknown = _make_timer_callbacks(uname, host, ctx)
if state == hbdclass.Connection.UNKNOWN:
_set_connectivity_alert(host, afam, "CRITICAL")
continue
if state == hbdclass.Connection.UP and interval > 0: if state == hbdclass.Connection.UP and interval > 0:
elapsed = now - conn.lastbeat elapsed = now - conn.lastbeat
# Give hosts one full (interval + grace) of extra time on startup # Give hosts one full (interval + grace) of extra time on startup
@@ -300,6 +305,10 @@ def restore_connection_timers(hbdclass, ctx):
"Restored OVERDUE timer %s/%s: %.0fs remaining", "Restored OVERDUE timer %s/%s: %.0fs remaining",
uname, afam, remaining, uname, afam, remaining,
) )
# Ensure the connectivity alert is set — it may be missing if
# hbd was shut down before the on_overdue callback had a chance
# to record it.
_set_connectivity_alert(host, afam, "CRITICAL")
restored += 1 restored += 1
logger.info("Restored timers for %d connection(s)", restored) logger.info("Restored timers for %d connection(s)", restored)
@@ -470,6 +479,7 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
boot = msg.get("boot", 0) boot = msg.get("boot", 0)
if boot: if boot:
# hbc was started with the -b flag
eventlog(uname, "INFO", "booted") eventlog(uname, "INFO", "booted")
if host.watched: if host.watched:
asyncio.create_task(notify_mod.send_notification( asyncio.create_task(notify_mod.send_notification(
@@ -480,6 +490,7 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
eventlog(uname, "INFO", message, service=service) eventlog(uname, "INFO", message, service=service)
if conn.getstate() != hbdcls.Connection.UP: if conn.getstate() != hbdcls.Connection.UP:
# Transition to UP and log/notify if appropriate
lasts = conn.state lasts = conn.state
d = conn.newstate(hbdcls.Connection.UP, now) d = conn.newstate(hbdcls.Connection.UP, now)
# On reboot, pre-boot plugin data and derived alerts are stale. # On reboot, pre-boot plugin data and derived alerts are stale.