fix non-alerting of overdue hosts

This commit is contained in:
Andreas Wrede
2026-04-12 18:44:36 -04:00
parent 2d8166d04a
commit 6bc8de192e
+24
View File
@@ -171,6 +171,24 @@ def dicttos(ID, d):
DROPOVERDUE = 7 * 24 * 3600 # seconds (7 days) before an overdue host becomes UNKNOWN
def _set_connectivity_alert(host, afam, level_name):
    """Record or drop the connectivity alert for one address family of a host.

    The alert is stored in ``host.alert_states`` under the key
    ``"connectivity.<afam>"``.

    Args:
        host: Object exposing a dict-like ``alert_states`` mapping.
        afam: Address-family label interpolated into the alert key.
        level_name: Name of an ``AlertLevel`` member; callers use
            "CRITICAL", "WARNING" or "OK".

    "OK" deletes the entry instead of storing an OK state, so recovered
    hosts don't clutter the Alerts Dashboard.

    NOTE: a ``level_name`` that is not an attribute of ``AlertLevel``
    resolves to ``AlertLevel.OK`` and therefore also clears the entry.
    """
    # Imported at call time, not at module import time.
    from .threshold import AlertLevel, AlertState

    key = f"connectivity.{afam}"
    alerts = host.alert_states
    severity = getattr(AlertLevel, level_name, AlertLevel.OK)

    if severity != AlertLevel.OK:
        # Create the tracking entry on first use, then hand the new level
        # and its name to the state object.
        if key not in alerts:
            alerts[key] = AlertState(key)
        alerts[key].update(severity, level_name)
    else:
        # Missing key is fine: clearing an alert that was never set is a no-op.
        alerts.pop(key, None)
def _make_timer_callbacks(uname, host, ctx): def _make_timer_callbacks(uname, host, ctx):
"""Return (on_overdue, on_unknown) async callbacks for connection timer logic. """Return (on_overdue, on_unknown) async callbacks for connection timer logic.
@@ -182,6 +200,7 @@ def _make_timer_callbacks(uname, host, ctx):
async def on_unknown(connection): async def on_unknown(connection):
connection.newstate(connection.__class__.UNKNOWN, connection.lastbeat) connection.newstate(connection.__class__.UNKNOWN, connection.lastbeat)
# Keep connectivity alert active when host transitions to unknown
if msg_to_websockets: if msg_to_websockets:
msg_to_websockets("host", host.stateinfo()) msg_to_websockets("host", host.stateinfo())
@@ -196,6 +215,8 @@ def _make_timer_callbacks(uname, host, ctx):
uname, uname,
notify_mod.Notification(title=f"[CRITICAL] {uname}", body=msg, level="CRITICAL"), notify_mod.Notification(title=f"[CRITICAL] {uname}", body=msg, level="CRITICAL"),
) )
# Track in alert_states so the Alerts Dashboard shows this
_set_connectivity_alert(host, connection.afam, "CRITICAL")
if threshold_checker: if threshold_checker:
threshold_checker.check_value( threshold_checker.check_value(
host_name=uname, host_name=uname,
@@ -410,6 +431,8 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
if conn.getstate() != hbdcls.Connection.UP: if conn.getstate() != hbdcls.Connection.UP:
lasts = conn.state lasts = conn.state
d = conn.newstate(hbdcls.Connection.UP, now) d = conn.newstate(hbdcls.Connection.UP, now)
# Clear connectivity alert now that the host is back up
_set_connectivity_alert(host, conn.afam, "OK")
# Don't log/notify RECOVER for a brand-new host seen for the first time — # Don't log/notify RECOVER for a brand-new host seen for the first time —
# it was never down, it just hasn't been seen before. # it was never down, it just hasn't been seen before.
if not newh: if not newh:
@@ -436,6 +459,7 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
notify_mod.Notification(title=f"[INFO] {uname}", body=m, level="INFO"), notify_mod.Notification(title=f"[INFO] {uname}", body=m, level="INFO"),
) )
conn.newstate(hbdcls.Connection.DOWN, now) conn.newstate(hbdcls.Connection.DOWN, now)
_set_connectivity_alert(host, conn.afam, "CRITICAL")
if interval > 0: if interval > 0:
host.interval = interval host.interval = interval