def _set_connectivity_alert(host, afam, level_name):
    """Record or remove the connectivity alert for one address family of a host.

    The alert is stored in ``host.alert_states`` under the key
    ``"connectivity.<afam>"``.

    Args:
        host: Object exposing a dict-like ``alert_states`` attribute.
        afam: Address-family identifier, interpolated into the key.
        level_name: Name of an ``AlertLevel`` attribute; callers pass
            "CRITICAL", "WARNING", or "OK".

    A level that resolves to ``AlertLevel.OK`` deletes the entry (if any), so
    recovered hosts don't clutter the Alerts Dashboard.  A name that is not an
    attribute of ``AlertLevel`` also resolves to ``AlertLevel.OK`` and
    therefore clears the entry as well.  Any other level creates the entry on
    first use and then calls ``update(level, level_name)`` on it.
    """
    from .threshold import AlertLevel, AlertState

    key = f"connectivity.{afam}"
    # Unknown names deliberately fall back to OK (see docstring).
    severity = getattr(AlertLevel, level_name, AlertLevel.OK)
    states = host.alert_states

    if severity == AlertLevel.OK:
        # Recovered: drop the entry; a missing key is not an error.
        states.pop(key, None)
    else:
        if key in states:
            entry = states[key]
        else:
            # First alert for this host/address-family: create its state.
            entry = states[key] = AlertState(key)
        entry.update(severity, level_name)
@@ -182,6 +200,7 @@ def _make_timer_callbacks(uname, host, ctx): async def on_unknown(connection): connection.newstate(connection.__class__.UNKNOWN, connection.lastbeat) + # Keep connectivity alert active when host transitions to unknown if msg_to_websockets: msg_to_websockets("host", host.stateinfo()) @@ -196,6 +215,8 @@ def _make_timer_callbacks(uname, host, ctx): uname, notify_mod.Notification(title=f"[CRITICAL] {uname}", body=msg, level="CRITICAL"), ) + # Track in alert_states so the Alerts Dashboard shows this + _set_connectivity_alert(host, connection.afam, "CRITICAL") if threshold_checker: threshold_checker.check_value( host_name=uname, @@ -410,6 +431,8 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict): if conn.getstate() != hbdcls.Connection.UP: lasts = conn.state d = conn.newstate(hbdcls.Connection.UP, now) + # Clear connectivity alert now that the host is back up + _set_connectivity_alert(host, conn.afam, "OK") # Don't log/notify RECOVER for a brand-new host seen for the first time — # it was never down, it just hasn't been seen before. if not newh: @@ -436,6 +459,7 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict): notify_mod.Notification(title=f"[INFO] {uname}", body=m, level="INFO"), ) conn.newstate(hbdcls.Connection.DOWN, now) + _set_connectivity_alert(host, conn.afam, "CRITICAL") if interval > 0: host.interval = interval