fix: restore connectivity alerts for overdue/unknown/down hosts on startup
restore_connection_timers now calls _set_connectivity_alert("CRITICAL")
for DOWN, OVERDUE, and UNKNOWN connections, ensuring alerts are present
even if hbd was shut down before the transition callbacks recorded them.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -266,10 +266,15 @@ def restore_connection_timers(hbdclass, ctx):
|
|||||||
for afam, conn in list(host.connections.items()):
|
for afam, conn in list(host.connections.items()):
|
||||||
state = conn.getstate()
|
state = conn.getstate()
|
||||||
if state == hbdclass.Connection.DOWN:
|
if state == hbdclass.Connection.DOWN:
|
||||||
|
_set_connectivity_alert(host, afam, "CRITICAL")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
on_overdue, on_unknown = _make_timer_callbacks(uname, host, ctx)
|
on_overdue, on_unknown = _make_timer_callbacks(uname, host, ctx)
|
||||||
|
|
||||||
|
if state == hbdclass.Connection.UNKNOWN:
|
||||||
|
_set_connectivity_alert(host, afam, "CRITICAL")
|
||||||
|
continue
|
||||||
|
|
||||||
if state == hbdclass.Connection.UP and interval > 0:
|
if state == hbdclass.Connection.UP and interval > 0:
|
||||||
elapsed = now - conn.lastbeat
|
elapsed = now - conn.lastbeat
|
||||||
# Give hosts one full (interval + grace) of extra time on startup
|
# Give hosts one full (interval + grace) of extra time on startup
|
||||||
@@ -300,6 +305,10 @@ def restore_connection_timers(hbdclass, ctx):
|
|||||||
"Restored OVERDUE timer %s/%s: %.0fs remaining",
|
"Restored OVERDUE timer %s/%s: %.0fs remaining",
|
||||||
uname, afam, remaining,
|
uname, afam, remaining,
|
||||||
)
|
)
|
||||||
|
# Ensure the connectivity alert is set — it may be missing if
|
||||||
|
# hbd was shut down before the on_overdue callback had a chance
|
||||||
|
# to record it.
|
||||||
|
_set_connectivity_alert(host, afam, "CRITICAL")
|
||||||
restored += 1
|
restored += 1
|
||||||
|
|
||||||
logger.info("Restored timers for %d connection(s)", restored)
|
logger.info("Restored timers for %d connection(s)", restored)
|
||||||
@@ -470,6 +479,7 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
|||||||
boot = msg.get("boot", 0)
|
boot = msg.get("boot", 0)
|
||||||
|
|
||||||
if boot:
|
if boot:
|
||||||
|
# hbc was started with the -b flag
|
||||||
eventlog(uname, "INFO", "booted")
|
eventlog(uname, "INFO", "booted")
|
||||||
if host.watched:
|
if host.watched:
|
||||||
asyncio.create_task(notify_mod.send_notification(
|
asyncio.create_task(notify_mod.send_notification(
|
||||||
@@ -480,6 +490,7 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
|||||||
eventlog(uname, "INFO", message, service=service)
|
eventlog(uname, "INFO", message, service=service)
|
||||||
|
|
||||||
if conn.getstate() != hbdcls.Connection.UP:
|
if conn.getstate() != hbdcls.Connection.UP:
|
||||||
|
# Transition to UP and log/notify if appropriate
|
||||||
lasts = conn.state
|
lasts = conn.state
|
||||||
d = conn.newstate(hbdcls.Connection.UP, now)
|
d = conn.newstate(hbdcls.Connection.UP, now)
|
||||||
# On reboot, pre-boot plugin data and derived alerts are stale.
|
# On reboot, pre-boot plugin data and derived alerts are stale.
|
||||||
|
|||||||
Reference in New Issue
Block a user