fix non-alerting of overdue hosts

2026-04-12 18:44:36 -04:00
parent 2d8166d04a
commit 6bc8de192e
1 changed files with 24 additions and 0 deletions
@@ -171,6 +171,24 @@ def dicttos(ID, d):
 DROPOVERDUE = 7 * 24 * 3600  # seconds before an overdue host becomes UNKNOWN
 def _set_connectivity_alert(host, afam, level_name):
    """Record or clear the connectivity alert for one host/address-family pair.

    The alert is stored in ``host.alert_states`` under the key
    ``connectivity.<afam>``.

    host       -- object exposing an ``alert_states`` mapping
    afam       -- address-family label interpolated into the key
    level_name -- attribute name looked up on AlertLevel ("CRITICAL",
                  "WARNING", or "OK"); a name AlertLevel does not define
                  resolves to AlertLevel.OK

    A level of OK removes the entry (a missing entry is tolerated) instead
    of storing an OK state, so that recovered hosts don't clutter the
    Alerts Dashboard.  Any other level creates the AlertState on first use
    and then calls its update(level, level_name).
    """
    # Imported at call time rather than at module load, as the original does.
    from .threshold import AlertLevel, AlertState

    alert_key = f"connectivity.{afam}"
    severity = getattr(AlertLevel, level_name, AlertLevel.OK)

    if severity == AlertLevel.OK:
        # Nothing to track any more: drop the entry if there is one.
        host.alert_states.pop(alert_key, None)
    else:
        states = host.alert_states
        if alert_key not in states:
            states[alert_key] = AlertState(alert_key)
        states[alert_key].update(severity, level_name)
 def _make_timer_callbacks(uname, host, ctx):
    """Return (on_overdue, on_unknown) async callbacks for connection timer logic.
@@ -182,6 +200,7 @@ def _make_timer_callbacks(uname, host, ctx):
    async def on_unknown(connection):
        connection.newstate(connection.__class__.UNKNOWN, connection.lastbeat)
        # No alert_states change here on purpose: the connectivity alert stays
        # active when the host transitions to UNKNOWN.
        if msg_to_websockets:
            msg_to_websockets("host", host.stateinfo())
@@ -196,6 +215,8 @@ def _make_timer_callbacks(uname, host, ctx):
            uname,
            notify_mod.Notification(title=f"[CRITICAL] {uname}", body=msg, level="CRITICAL"),
        )
        # Track in alert_states so the Alerts Dashboard shows this
        _set_connectivity_alert(host, connection.afam, "CRITICAL")
        if threshold_checker:
            threshold_checker.check_value(
                host_name=uname,
@@ -410,6 +431,8 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
    if conn.getstate() != hbdcls.Connection.UP:
        lasts = conn.state
        d = conn.newstate(hbdcls.Connection.UP, now)
        # Clear connectivity alert now that the host is back up
        _set_connectivity_alert(host, conn.afam, "OK")
        # Don't log/notify RECOVER for a brand-new host seen for the first time —
        # it was never down, it just hasn't been seen before.
        if not newh:
@@ -436,6 +459,7 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
            notify_mod.Notification(title=f"[INFO] {uname}", body=m, level="INFO"),
        )
        conn.newstate(hbdcls.Connection.DOWN, now)
        _set_connectivity_alert(host, conn.afam, "CRITICAL")
    if interval > 0:
        host.interval = interval