re-factor notifications, add sms and matrix as channels

This commit is contained in:
Andreas Wrede
2026-04-12 11:04:00 -04:00
parent 7f049a4e26
commit 0199ca4693
11 changed files with 887 additions and 864 deletions
+27 -24
View File
@@ -171,7 +171,7 @@ def dicttos(ID, d):
DROPOVERDUE = 7 * 24 * 3600 # seconds before an overdue host becomes UNKNOWN
def _make_timer_callbacks(uname, host, watchhosts, ctx):
def _make_timer_callbacks(uname, host, ctx):
"""Return (on_overdue, on_unknown) async callbacks for connection timer logic.
Captured values are bound at call time so callbacks are safe to use in loops.
@@ -191,9 +191,11 @@ def _make_timer_callbacks(uname, host, watchhosts, ctx):
now = time.time()
connection.newstate(connection.__class__.OVERDUE, now, cfg.get("grace", 2))
msg = f"{connection.afam} overdue"
eventlog(uname, "CRITICAL" if uname in watchhosts else "WARNING", msg)
if uname in watchhosts:
notify_mod.pushmsg_for_host(uname, f"{uname} {msg}")
eventlog(uname, "CRITICAL", msg)
notify_mod.send_notification(
uname,
notify_mod.Notification(title=f"[CRITICAL] {uname}", body=msg, level="CRITICAL"),
)
if threshold_checker:
threshold_checker.check_value(
host_name=uname,
@@ -218,8 +220,6 @@ def restore_connection_timers(hbdclass, ctx):
now = time.time()
cfg = ctx.get("config", {})
grace = cfg.get("grace", 2)
from . import config as config_mod
watchhosts = config_mod.get_watchhosts(cfg)
restored = 0
for uname, host in list(hbdclass.Host.hosts.items()):
@@ -229,7 +229,7 @@ def restore_connection_timers(hbdclass, ctx):
if state == hbdclass.Connection.DOWN:
continue
on_overdue, on_unknown = _make_timer_callbacks(uname, host, watchhosts, ctx)
on_overdue, on_unknown = _make_timer_callbacks(uname, host, ctx)
if state == hbdclass.Connection.UP and interval > 0:
elapsed = now - conn.lastbeat
@@ -322,9 +322,6 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
host = hbdcls.Host.hosts[uname]
newh = False
# Get watchhosts once for use throughout message handling
watchhosts = config_mod.get_watchhosts(cfg)
cid = msg.get("id", 0)
try:
rtt = float(msg.get("rtt"))
@@ -390,8 +387,10 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
if res:
eventlog(uname, "WARNING", res)
if uname in watchhosts:
notify_mod.pushmsg_for_host(uname, "%s %s" % (host.name, res))
notify_mod.send_notification(
uname,
notify_mod.Notification(title=f"[WARNING] {uname}", body=res, level="WARNING"),
)
interval = int(msg.get("interval", 0) or 0)
shutdown = msg.get("shutdown", 0)
@@ -401,13 +400,12 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
if boot:
eventlog(uname, "INFO", "booted")
if uname in watchhosts:
m = "%s booted" % (host.name)
notify_mod.pushmsg_for_host(uname, m)
notify_mod.send_notification(
uname,
notify_mod.Notification(title=f"[INFO] {uname}", body=f"{host.name} booted", level="INFO"),
)
if message:
eventlog(uname, "INFO", "msg: %s" % message, service=service)
if uname in watchhosts:
notify_mod.pushmsg_for_host(uname, message)
if conn.getstate() != hbdcls.Connection.UP:
lasts = conn.state
@@ -420,8 +418,10 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
else:
m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d))
eventlog(uname, "RECOVER", m)
if uname in watchhosts:
notify_mod.pushmsg_for_host(uname, "%s %s is back" % (uname, conn.afam))
notify_mod.send_notification(
uname,
notify_mod.Notification(title=f"[RECOVER] {uname}", body=m, level="RECOVER"),
)
if boot or newh:
host.upcount = host.doesack
@@ -429,20 +429,23 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
host.upcount += 1
if shutdown:
eventlog(uname, "INFO", "%s shutdown" % conn.afam)
if uname in watchhosts:
notify_mod.pushmsg_for_host(uname, "%s %s shutdown" % (uname, conn.afam))
m = "%s shutdown" % conn.afam
eventlog(uname, "INFO", m)
notify_mod.send_notification(
uname,
notify_mod.Notification(title=f"[INFO] {uname}", body=m, level="INFO"),
)
conn.newstate(hbdcls.Connection.DOWN, now)
if interval > 0:
host.interval = interval
# Timer-based reachability monitoring
# Reset overdue timer on every heartbeat
if interval > 0 and conn.getstate() != hbdcls.Connection.DOWN:
grace = cfg.get("grace", 2)
timeout_seconds = interval + grace
on_overdue, _ = _make_timer_callbacks(uname, host, watchhosts, ctx)
on_overdue, _ = _make_timer_callbacks(uname, host, ctx)
conn.reset_overdue_timer(timeout_seconds, on_overdue)
# Check RTT thresholds using the threshold checker