Fix RTT handling, including a bug in the time computation
This commit is contained in:
+33
-24
@@ -68,7 +68,6 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
- config: dict of configuration
|
||||
- hbdclass: module providing Host/Connection classes
|
||||
- log: callable(loghost, message)
|
||||
- pushmsg: callable(message)
|
||||
- msg_to_websockets: callable(typ, data)
|
||||
- msg_journal: MessageJournal instance for logging all messages
|
||||
- DEBUG, verbose
|
||||
@@ -91,7 +90,6 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
cfg = ctx.get("config", {})
|
||||
hbdcls = ctx.get("hbdclass")
|
||||
log = ctx.get("log")
|
||||
pushmsg = ctx.get("pushmsg")
|
||||
msg_to_websockets = ctx.get("msg_to_websockets")
|
||||
DEBUG = ctx.get("DEBUG", 0)
|
||||
verbose = ctx.get("verbose", False)
|
||||
@@ -100,18 +98,24 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
ip = addr[0] if isinstance(addr, (list, tuple)) else addr
|
||||
name = msg.get("name", "unknown")
|
||||
from ..common.utils import shortname
|
||||
from . import config as config_mod
|
||||
|
||||
uname = shortname(name)
|
||||
|
||||
if uname not in hbdcls.Host.hosts:
|
||||
host = hbdcls.Host(uname)
|
||||
host.dyn = uname in cfg.get("dyndnshosts", [])
|
||||
# Use new config function to check dyndns
|
||||
dyndnshosts = config_mod.get_dyndnshosts(cfg)
|
||||
host.dyn = uname in dyndnshosts
|
||||
if verbose:
|
||||
print(("XX: New host, num now %s" % (len(hbdcls.Host.hosts))))
|
||||
newh = True
|
||||
else:
|
||||
host = hbdcls.Host.hosts[uname]
|
||||
newh = False
|
||||
|
||||
# Get watchhosts once for use throughout message handling
|
||||
watchhosts = config_mod.get_watchhosts(cfg)
|
||||
|
||||
cid = msg.get("id", 0)
|
||||
try:
|
||||
@@ -181,9 +185,8 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
|
||||
if res:
|
||||
eventlog(uname, "WARNING", res)
|
||||
if uname in cfg.get("watchhosts", []):
|
||||
if pushmsg:
|
||||
pushmsg("%s %s" % (host.name, res))
|
||||
if uname in watchhosts:
|
||||
notify_mod.pushmsg_for_host(uname, "%s %s" % (host.name, res))
|
||||
|
||||
interval = int(msg.get("interval", 0) or 0)
|
||||
shutdown = msg.get("shutdown", 0)
|
||||
@@ -193,15 +196,13 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
|
||||
if boot:
|
||||
eventlog(uname, "INFO", "booted")
|
||||
if uname in cfg.get("watchhosts", []):
|
||||
if uname in watchhosts:
|
||||
m = "%s booted" % (host.name)
|
||||
if pushmsg:
|
||||
pushmsg(m)
|
||||
notify_mod.pushmsg_for_host(uname, m)
|
||||
if message:
|
||||
eventlog(uname, "INFO", "msg: %s" % message, service=service)
|
||||
if uname in cfg.get("watchhosts", []):
|
||||
if pushmsg:
|
||||
pushmsg(message)
|
||||
if uname in watchhosts:
|
||||
notify_mod.pushmsg_for_host(uname, message)
|
||||
|
||||
if conn.getstate() != hbdcls.Connection.UP:
|
||||
lasts = conn.state
|
||||
@@ -211,9 +212,8 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
else:
|
||||
m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d))
|
||||
eventlog(uname, "RECOVER", m)
|
||||
if uname in cfg.get("watchhosts", []):
|
||||
if pushmsg:
|
||||
pushmsg("%s %s is back" % (uname, conn.afam))
|
||||
if uname in watchhosts:
|
||||
notify_mod.pushmsg_for_host(uname, "%s %s is back" % (uname, conn.afam))
|
||||
|
||||
if boot or newh:
|
||||
host.upcount = host.doesack
|
||||
@@ -222,9 +222,8 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
|
||||
if shutdown:
|
||||
eventlog(uname, "INFO", "%s shutdown" % conn.afam)
|
||||
if uname in cfg.get("watchhosts", []):
|
||||
if pushmsg:
|
||||
pushmsg("%s %s shutdown" % (uname, conn.afam))
|
||||
if uname in watchhosts:
|
||||
notify_mod.pushmsg_for_host(uname, "%s %s shutdown" % (uname, conn.afam))
|
||||
conn.newstate(hbdcls.Connection.DOWN, now)
|
||||
|
||||
if interval > 0:
|
||||
@@ -247,11 +246,21 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
connection.newstate(hbdcls.Connection.OVERDUE, now, cfg.get("grace", 2))
|
||||
|
||||
msg = f"{connection.afam} overdue"
|
||||
eventlog(uname, "CRITICAL" if uname in cfg.get("watchhosts", []) else "WARNING", msg)
|
||||
eventlog(uname, "CRITICAL" if uname in watchhosts else "WARNING", msg)
|
||||
|
||||
if uname in cfg.get("watchhosts", []):
|
||||
if pushmsg:
|
||||
pushmsg(f"{uname} {msg}")
|
||||
if uname in watchhosts:
|
||||
notify_mod.pushmsg_for_host(uname, f"{uname} {msg}")
|
||||
|
||||
# Check RTT thresholds with infinite RTT for overdue hosts
|
||||
threshold_checker = ctx.get("threshold_checker")
|
||||
if threshold_checker:
|
||||
metric_path = "rtt"
|
||||
threshold_checker.check_value(
|
||||
host_name=uname,
|
||||
metric_path=metric_path,
|
||||
value=float('inf'),
|
||||
alert_states=host.alert_states
|
||||
)
|
||||
|
||||
# Notify websockets
|
||||
if msg_to_websockets:
|
||||
@@ -274,8 +283,8 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
# Check RTT thresholds using the threshold checker
|
||||
threshold_checker = ctx.get("threshold_checker")
|
||||
if threshold_checker and rtt and rtt > 0:
|
||||
# Metric path for RTT is "rtt.<hostname>"
|
||||
metric_path = f"rtt.{uname}"
|
||||
# Metric path for RTT is simply "rtt"
|
||||
metric_path = "rtt"
|
||||
|
||||
# Check against configured thresholds (handles alerts, notifications, etc.)
|
||||
threshold_checker.check_value(
|
||||
|
||||
Reference in New Issue
Block a user