re-factor notifications, add sms and matrix as channels
This commit is contained in:
+27
-24
@@ -171,7 +171,7 @@ def dicttos(ID, d):
|
||||
DROPOVERDUE = 7 * 24 * 3600 # seconds before an overdue host becomes UNKNOWN
|
||||
|
||||
|
||||
def _make_timer_callbacks(uname, host, watchhosts, ctx):
|
||||
def _make_timer_callbacks(uname, host, ctx):
|
||||
"""Return (on_overdue, on_unknown) async callbacks for connection timer logic.
|
||||
|
||||
Captured values are bound at call time so callbacks are safe to use in loops.
|
||||
@@ -191,9 +191,11 @@ def _make_timer_callbacks(uname, host, watchhosts, ctx):
|
||||
now = time.time()
|
||||
connection.newstate(connection.__class__.OVERDUE, now, cfg.get("grace", 2))
|
||||
msg = f"{connection.afam} overdue"
|
||||
eventlog(uname, "CRITICAL" if uname in watchhosts else "WARNING", msg)
|
||||
if uname in watchhosts:
|
||||
notify_mod.pushmsg_for_host(uname, f"{uname} {msg}")
|
||||
eventlog(uname, "CRITICAL", msg)
|
||||
notify_mod.send_notification(
|
||||
uname,
|
||||
notify_mod.Notification(title=f"[CRITICAL] {uname}", body=msg, level="CRITICAL"),
|
||||
)
|
||||
if threshold_checker:
|
||||
threshold_checker.check_value(
|
||||
host_name=uname,
|
||||
@@ -218,8 +220,6 @@ def restore_connection_timers(hbdclass, ctx):
|
||||
now = time.time()
|
||||
cfg = ctx.get("config", {})
|
||||
grace = cfg.get("grace", 2)
|
||||
from . import config as config_mod
|
||||
watchhosts = config_mod.get_watchhosts(cfg)
|
||||
|
||||
restored = 0
|
||||
for uname, host in list(hbdclass.Host.hosts.items()):
|
||||
@@ -229,7 +229,7 @@ def restore_connection_timers(hbdclass, ctx):
|
||||
if state == hbdclass.Connection.DOWN:
|
||||
continue
|
||||
|
||||
on_overdue, on_unknown = _make_timer_callbacks(uname, host, watchhosts, ctx)
|
||||
on_overdue, on_unknown = _make_timer_callbacks(uname, host, ctx)
|
||||
|
||||
if state == hbdclass.Connection.UP and interval > 0:
|
||||
elapsed = now - conn.lastbeat
|
||||
@@ -322,9 +322,6 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
host = hbdcls.Host.hosts[uname]
|
||||
newh = False
|
||||
|
||||
# Get watchhosts once for use throughout message handling
|
||||
watchhosts = config_mod.get_watchhosts(cfg)
|
||||
|
||||
cid = msg.get("id", 0)
|
||||
try:
|
||||
rtt = float(msg.get("rtt"))
|
||||
@@ -390,8 +387,10 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
|
||||
if res:
|
||||
eventlog(uname, "WARNING", res)
|
||||
if uname in watchhosts:
|
||||
notify_mod.pushmsg_for_host(uname, "%s %s" % (host.name, res))
|
||||
notify_mod.send_notification(
|
||||
uname,
|
||||
notify_mod.Notification(title=f"[WARNING] {uname}", body=res, level="WARNING"),
|
||||
)
|
||||
|
||||
interval = int(msg.get("interval", 0) or 0)
|
||||
shutdown = msg.get("shutdown", 0)
|
||||
@@ -401,13 +400,12 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
|
||||
if boot:
|
||||
eventlog(uname, "INFO", "booted")
|
||||
if uname in watchhosts:
|
||||
m = "%s booted" % (host.name)
|
||||
notify_mod.pushmsg_for_host(uname, m)
|
||||
notify_mod.send_notification(
|
||||
uname,
|
||||
notify_mod.Notification(title=f"[INFO] {uname}", body=f"{host.name} booted", level="INFO"),
|
||||
)
|
||||
if message:
|
||||
eventlog(uname, "INFO", "msg: %s" % message, service=service)
|
||||
if uname in watchhosts:
|
||||
notify_mod.pushmsg_for_host(uname, message)
|
||||
|
||||
if conn.getstate() != hbdcls.Connection.UP:
|
||||
lasts = conn.state
|
||||
@@ -420,8 +418,10 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
else:
|
||||
m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d))
|
||||
eventlog(uname, "RECOVER", m)
|
||||
if uname in watchhosts:
|
||||
notify_mod.pushmsg_for_host(uname, "%s %s is back" % (uname, conn.afam))
|
||||
notify_mod.send_notification(
|
||||
uname,
|
||||
notify_mod.Notification(title=f"[RECOVER] {uname}", body=m, level="RECOVER"),
|
||||
)
|
||||
|
||||
if boot or newh:
|
||||
host.upcount = host.doesack
|
||||
@@ -429,20 +429,23 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
host.upcount += 1
|
||||
|
||||
if shutdown:
|
||||
eventlog(uname, "INFO", "%s shutdown" % conn.afam)
|
||||
if uname in watchhosts:
|
||||
notify_mod.pushmsg_for_host(uname, "%s %s shutdown" % (uname, conn.afam))
|
||||
m = "%s shutdown" % conn.afam
|
||||
eventlog(uname, "INFO", m)
|
||||
notify_mod.send_notification(
|
||||
uname,
|
||||
notify_mod.Notification(title=f"[INFO] {uname}", body=m, level="INFO"),
|
||||
)
|
||||
conn.newstate(hbdcls.Connection.DOWN, now)
|
||||
|
||||
if interval > 0:
|
||||
host.interval = interval
|
||||
|
||||
|
||||
# Timer-based reachability monitoring
|
||||
# Reset overdue timer on every heartbeat
|
||||
if interval > 0 and conn.getstate() != hbdcls.Connection.DOWN:
|
||||
grace = cfg.get("grace", 2)
|
||||
timeout_seconds = interval + grace
|
||||
on_overdue, _ = _make_timer_callbacks(uname, host, watchhosts, ctx)
|
||||
on_overdue, _ = _make_timer_callbacks(uname, host, ctx)
|
||||
conn.reset_overdue_timer(timeout_seconds, on_overdue)
|
||||
|
||||
# Check RTT thresholds using the threshold checker
|
||||
|
||||
Reference in New Issue
Block a user