Refactor notifications; add SMS and Matrix as channels

2026-04-12 11:04:00 -04:00
parent 7f049a4e26
commit 0199ca4693
11 changed files with 887 additions and 864 deletions
@@ -171,7 +171,7 @@ def dicttos(ID, d):
 DROPOVERDUE = 7 * 24 * 3600  # seconds before an overdue host becomes UNKNOWN


-def _make_timer_callbacks(uname, host, watchhosts, ctx):
+def _make_timer_callbacks(uname, host, ctx):
    """Return (on_overdue, on_unknown) async callbacks for connection timer logic.

    Captured values are bound at call time so callbacks are safe to use in loops.
@@ -191,9 +191,11 @@ def _make_timer_callbacks(uname, host, watchhosts, ctx):
        now = time.time()
        connection.newstate(connection.__class__.OVERDUE, now, cfg.get("grace", 2))
        msg = f"{connection.afam} overdue"
-        eventlog(uname, "CRITICAL" if uname in watchhosts else "WARNING", msg)
-        if uname in watchhosts:
-            notify_mod.pushmsg_for_host(uname, f"{uname} {msg}")
+        eventlog(uname, "CRITICAL", msg)
+        notify_mod.send_notification(
+            uname,
+            notify_mod.Notification(title=f"[CRITICAL] {uname}", body=msg, level="CRITICAL"),
+        )
        if threshold_checker:
            threshold_checker.check_value(
                host_name=uname,
@@ -218,8 +220,6 @@ def restore_connection_timers(hbdclass, ctx):
    now = time.time()
    cfg = ctx.get("config", {})
    grace = cfg.get("grace", 2)
-    from . import config as config_mod
-    watchhosts = config_mod.get_watchhosts(cfg)

    restored = 0
    for uname, host in list(hbdclass.Host.hosts.items()):
@@ -229,7 +229,7 @@ def restore_connection_timers(hbdclass, ctx):
            if state == hbdclass.Connection.DOWN:
                continue

-            on_overdue, on_unknown = _make_timer_callbacks(uname, host, watchhosts, ctx)
+            on_overdue, on_unknown = _make_timer_callbacks(uname, host, ctx)

            if state == hbdclass.Connection.UP and interval > 0:
                elapsed = now - conn.lastbeat
@@ -322,9 +322,6 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
        host = hbdcls.Host.hosts[uname]
        newh = False
    
-    # Get watchhosts once for use throughout message handling
-    watchhosts = config_mod.get_watchhosts(cfg)
-
    cid = msg.get("id", 0)
    try:
        rtt = float(msg.get("rtt"))
@@ -390,8 +387,10 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):

    if res:
        eventlog(uname, "WARNING", res)
-        if uname in watchhosts:
-            notify_mod.pushmsg_for_host(uname, "%s %s" % (host.name, res))
+        notify_mod.send_notification(
+            uname,
+            notify_mod.Notification(title=f"[WARNING] {uname}", body=res, level="WARNING"),
+        )

    interval = int(msg.get("interval", 0) or 0)
    shutdown = msg.get("shutdown", 0)
@@ -401,13 +400,12 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):

    if boot:
        eventlog(uname, "INFO", "booted")
-        if uname in watchhosts:
-            m = "%s booted" % (host.name)
-            notify_mod.pushmsg_for_host(uname, m)
+        notify_mod.send_notification(
+            uname,
+            notify_mod.Notification(title=f"[INFO] {uname}", body=f"{host.name} booted", level="INFO"),
+        )
    if message:
        eventlog(uname, "INFO", "msg: %s" % message, service=service)
-        if uname in watchhosts:
-            notify_mod.pushmsg_for_host(uname, message)

    if conn.getstate() != hbdcls.Connection.UP:
        lasts = conn.state
@@ -420,8 +418,10 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
            else:
                m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d))
            eventlog(uname, "RECOVER", m)
-            if uname in watchhosts:
-                notify_mod.pushmsg_for_host(uname, "%s %s is back" % (uname, conn.afam))
+            notify_mod.send_notification(
+                uname,
+                notify_mod.Notification(title=f"[RECOVER] {uname}", body=m, level="RECOVER"),
+            )

    if boot or newh:
        host.upcount = host.doesack
@@ -429,20 +429,23 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
        host.upcount += 1

    if shutdown:
-        eventlog(uname, "INFO", "%s shutdown" % conn.afam)
-        if uname in watchhosts:
-            notify_mod.pushmsg_for_host(uname, "%s %s shutdown" % (uname, conn.afam))
+        m = "%s shutdown" % conn.afam
+        eventlog(uname, "INFO", m)
+        notify_mod.send_notification(
+            uname,
+            notify_mod.Notification(title=f"[INFO] {uname}", body=m, level="INFO"),
+        )
        conn.newstate(hbdcls.Connection.DOWN, now)

    if interval > 0:
        host.interval = interval
-    
+
    # Timer-based reachability monitoring
    # Reset overdue timer on every heartbeat
    if interval > 0 and conn.getstate() != hbdcls.Connection.DOWN:
        grace = cfg.get("grace", 2)
        timeout_seconds = interval + grace
-        on_overdue, _ = _make_timer_callbacks(uname, host, watchhosts, ctx)
+        on_overdue, _ = _make_timer_callbacks(uname, host, ctx)
        conn.reset_overdue_timer(timeout_seconds, on_overdue)
    
    # Check RTT thresholds using the threshold checker