refactor monitor, add threshold testing

This commit is contained in:
Andreas Wrede
2026-03-31 12:22:03 -04:00
parent ad7178ebcb
commit dd23d9d163
15 changed files with 488 additions and 101 deletions
+69 -13
View File
@@ -6,8 +6,10 @@ import logging
from ..common.proto import stodict, oldmtodict
from ..common.utils import dur
from . import notify as notify_mod
logger = logging.getLogger(__name__)
eventlog = notify_mod.eventlog
class EchoServerProtocol(asyncio.DatagramProtocol):
@@ -170,8 +172,7 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
return
if res:
if log:
log(uname, res)
eventlog(uname, "WARNING", res)
if uname in cfg.get("watchhosts", []):
if pushmsg:
pushmsg("%s %s" % (host.name, res))
@@ -183,15 +184,13 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
boot = msg.get("boot", 0)
if boot:
if log:
log(uname, "booted")
eventlog(uname, "INFO", "booted")
if uname in cfg.get("watchhosts", []):
m = "%s booted" % (host.name)
if pushmsg:
pushmsg(m)
if message:
if log:
log(uname, "msg: %s" % message, service=service)
eventlog(uname, "INFO", "msg: %s" % message, service=service)
if uname in cfg.get("watchhosts", []):
if pushmsg:
pushmsg(message)
@@ -199,9 +198,11 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
if conn.getstate() != hbdcls.Connection.UP:
lasts = conn.state
d = conn.newstate(hbdcls.Connection.UP, now)
m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d))
if log:
log(uname, m)
if d == 0 or lasts == "unknown":
m = "%s is up" % (conn.afam)
else:
m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d))
eventlog(uname, "RECOVER", m)
if uname in cfg.get("watchhosts", []):
if pushmsg:
pushmsg("%s %s is back" % (uname, conn.afam))
@@ -212,8 +213,7 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
host.upcount += 1
if shutdown:
if log:
log(uname, "%s shutdown" % conn.afam)
eventlog(uname, "INFO", "%s shutdown" % conn.afam)
if uname in cfg.get("watchhosts", []):
if pushmsg:
pushmsg("%s %s shutdown" % (uname, conn.afam))
@@ -221,6 +221,61 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
if interval > 0:
host.interval = interval
# Timer-based reachability monitoring
# Reset overdue timer on every heartbeat
if interval > 0 and conn.getstate() != hbdcls.Connection.DOWN:
grace = cfg.get("grace", 2)
timeout_seconds = (interval + grace) if interval > 0 else 30
# Create callback for timer expiration
# Overdue-timer callback defined inside handle_datagram and handed to
# conn.reset_overdue_timer() right after this definition.
# `uname`, `cfg`, `host`, `pushmsg`, `msg_to_websockets`, `eventlog` and
# `hbdcls` are not parameters; they are resolved outside this function
# (presumably handle_datagram's locals / module globals -- confirm).
# NOTE(review): indentation is not preserved in this view, so which of the
# statements after the `if ... == UP` test are actually guarded by it cannot
# be confirmed from here -- check against the real file before editing.
async def on_overdue(connection):
"""Called when connection timer expires (no heartbeat received)."""
# Function-local import; `now` and `msg` assigned below are locals of this
# callback and do not rebind the `now` / `msg` used by handle_datagram.
import time
now = time.time()
# Only mark as overdue if still in UP state (not already marked)
if connection.getstate() == hbdcls.Connection.UP:
# Third argument is the configured "grace" value (default 2); how newstate()
# uses it is not visible here.
connection.newstate(hbdcls.Connection.OVERDUE, now, cfg.get("grace", 2))
msg = f"{connection.afam} overdue"
# Severity is "CRITICAL" for hosts listed in cfg["watchhosts"], else "WARNING".
eventlog(uname, "CRITICAL" if uname in cfg.get("watchhosts", []) else "WARNING", msg)
# Push notifications go out only for watched hosts, and only if a pushmsg
# callable is configured.
if uname in cfg.get("watchhosts", []):
if pushmsg:
pushmsg(f"{uname} {msg}")
# Notify websockets
if msg_to_websockets:
msg_to_websockets("host", host.stateinfo())
# Set a longer timer for marking as UNKNOWN (7 days)
# 7 * 24 * 3600 = 604800; presumably seconds, matching `timeout_seconds`
# passed to reset_overdue_timer by the caller -- confirm.
DROPOVERDUE = 7 * 24 * 3600
# Second-stage callback: its `connection` parameter shadows the outer one.
async def on_unknown(connection):
"""Mark connection as unknown after extended absence."""
# Passes connection.lastbeat (not the current time) as the timestamp argument.
connection.newstate(hbdcls.Connection.UNKNOWN, connection.lastbeat)
if msg_to_websockets:
msg_to_websockets("host", host.stateinfo())
# Re-arm the same per-connection timer, now with on_unknown as the callback.
connection.reset_overdue_timer(DROPOVERDUE, on_unknown)
# Reset the timer
conn.reset_overdue_timer(timeout_seconds, on_overdue)
# Check RTT thresholds using the threshold checker
threshold_checker = ctx.get("threshold_checker")
if threshold_checker and rtt and rtt > 0:
# Metric path for RTT is "rtt.<hostname>"
metric_path = f"rtt.{uname}"
# Check against configured thresholds (handles alerts, notifications, etc.)
threshold_checker.check_value(
host_name=uname,
metric_path=metric_path,
value=rtt,
alert_states=host.alert_states
)
# send ACK back
rmsg = {"time": __import__("time").time()}
@@ -266,5 +321,6 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
if msg_to_websockets:
try:
msg_to_websockets("host", host.stateinfo())
except Exception:
pass
except Exception as e:
if DEBUG > 0:
print(("cannot send websocket message: %s" % e))