Fix rtt, including bug in time compute

This commit is contained in:
Andreas Wrede
2026-04-01 19:41:53 -04:00
parent 090d341244
commit 460d2be9e9
13 changed files with 1366 additions and 372 deletions
+33 -24
View File
@@ -68,7 +68,6 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
- config: dict of configuration
- hbdclass: module providing Host/Connection classes
- log: callable(loghost, message)
- pushmsg: callable(message)
- msg_to_websockets: callable(typ, data)
- msg_journal: MessageJournal instance for logging all messages
- DEBUG, verbose
@@ -91,7 +90,6 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
cfg = ctx.get("config", {})
hbdcls = ctx.get("hbdclass")
log = ctx.get("log")
pushmsg = ctx.get("pushmsg")
msg_to_websockets = ctx.get("msg_to_websockets")
DEBUG = ctx.get("DEBUG", 0)
verbose = ctx.get("verbose", False)
@@ -100,18 +98,24 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
ip = addr[0] if isinstance(addr, (list, tuple)) else addr
name = msg.get("name", "unknown")
from ..common.utils import shortname
from . import config as config_mod
uname = shortname(name)
if uname not in hbdcls.Host.hosts:
host = hbdcls.Host(uname)
host.dyn = uname in cfg.get("dyndnshosts", [])
# Use new config function to check dyndns
dyndnshosts = config_mod.get_dyndnshosts(cfg)
host.dyn = uname in dyndnshosts
if verbose:
print(("XX: New host, num now %s" % (len(hbdcls.Host.hosts))))
newh = True
else:
host = hbdcls.Host.hosts[uname]
newh = False
# Get watchhosts once for use throughout message handling
watchhosts = config_mod.get_watchhosts(cfg)
cid = msg.get("id", 0)
try:
@@ -181,9 +185,8 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
if res:
eventlog(uname, "WARNING", res)
if uname in cfg.get("watchhosts", []):
if pushmsg:
pushmsg("%s %s" % (host.name, res))
if uname in watchhosts:
notify_mod.pushmsg_for_host(uname, "%s %s" % (host.name, res))
interval = int(msg.get("interval", 0) or 0)
shutdown = msg.get("shutdown", 0)
@@ -193,15 +196,13 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
if boot:
eventlog(uname, "INFO", "booted")
if uname in cfg.get("watchhosts", []):
if uname in watchhosts:
m = "%s booted" % (host.name)
if pushmsg:
pushmsg(m)
notify_mod.pushmsg_for_host(uname, m)
if message:
eventlog(uname, "INFO", "msg: %s" % message, service=service)
if uname in cfg.get("watchhosts", []):
if pushmsg:
pushmsg(message)
if uname in watchhosts:
notify_mod.pushmsg_for_host(uname, message)
if conn.getstate() != hbdcls.Connection.UP:
lasts = conn.state
@@ -211,9 +212,8 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
else:
m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d))
eventlog(uname, "RECOVER", m)
if uname in cfg.get("watchhosts", []):
if pushmsg:
pushmsg("%s %s is back" % (uname, conn.afam))
if uname in watchhosts:
notify_mod.pushmsg_for_host(uname, "%s %s is back" % (uname, conn.afam))
if boot or newh:
host.upcount = host.doesack
@@ -222,9 +222,8 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
if shutdown:
eventlog(uname, "INFO", "%s shutdown" % conn.afam)
if uname in cfg.get("watchhosts", []):
if pushmsg:
pushmsg("%s %s shutdown" % (uname, conn.afam))
if uname in watchhosts:
notify_mod.pushmsg_for_host(uname, "%s %s shutdown" % (uname, conn.afam))
conn.newstate(hbdcls.Connection.DOWN, now)
if interval > 0:
@@ -247,11 +246,21 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
connection.newstate(hbdcls.Connection.OVERDUE, now, cfg.get("grace", 2))
msg = f"{connection.afam} overdue"
eventlog(uname, "CRITICAL" if uname in cfg.get("watchhosts", []) else "WARNING", msg)
eventlog(uname, "CRITICAL" if uname in watchhosts else "WARNING", msg)
if uname in cfg.get("watchhosts", []):
if pushmsg:
pushmsg(f"{uname} {msg}")
if uname in watchhosts:
notify_mod.pushmsg_for_host(uname, f"{uname} {msg}")
# Check RTT thresholds with infinite RTT for overdue hosts
threshold_checker = ctx.get("threshold_checker")
if threshold_checker:
metric_path = "rtt"
threshold_checker.check_value(
host_name=uname,
metric_path=metric_path,
value=float('inf'),
alert_states=host.alert_states
)
# Notify websockets
if msg_to_websockets:
@@ -274,8 +283,8 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
# Check RTT thresholds using the threshold checker
threshold_checker = ctx.get("threshold_checker")
if threshold_checker and rtt and rtt > 0:
# Metric path for RTT is "rtt.<hostname>"
metric_path = f"rtt.{uname}"
# Metric path for RTT is simply "rtt"
metric_path = "rtt"
# Check against configured thresholds (handles alerts, notifications, etc.)
threshold_checker.check_value(