refactor monitor, add threshold rtesting
This commit is contained in:
+69
-13
@@ -6,8 +6,10 @@ import logging
|
||||
|
||||
from ..common.proto import stodict, oldmtodict
|
||||
from ..common.utils import dur
|
||||
from . import notify as notify_mod
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
eventlog = notify_mod.eventlog
|
||||
|
||||
|
||||
class EchoServerProtocol(asyncio.DatagramProtocol):
|
||||
@@ -170,8 +172,7 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
return
|
||||
|
||||
if res:
|
||||
if log:
|
||||
log(uname, res)
|
||||
eventlog(uname, "WARNING", res)
|
||||
if uname in cfg.get("watchhosts", []):
|
||||
if pushmsg:
|
||||
pushmsg("%s %s" % (host.name, res))
|
||||
@@ -183,15 +184,13 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
boot = msg.get("boot", 0)
|
||||
|
||||
if boot:
|
||||
if log:
|
||||
log(uname, "booted")
|
||||
eventlog(uname, "INFO", "booted")
|
||||
if uname in cfg.get("watchhosts", []):
|
||||
m = "%s booted" % (host.name)
|
||||
if pushmsg:
|
||||
pushmsg(m)
|
||||
if message:
|
||||
if log:
|
||||
log(uname, "msg: %s" % message, service=service)
|
||||
eventlog(uname, "INFO", "msg: %s" % message, service=service)
|
||||
if uname in cfg.get("watchhosts", []):
|
||||
if pushmsg:
|
||||
pushmsg(message)
|
||||
@@ -199,9 +198,11 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
if conn.getstate() != hbdcls.Connection.UP:
|
||||
lasts = conn.state
|
||||
d = conn.newstate(hbdcls.Connection.UP, now)
|
||||
m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d))
|
||||
if log:
|
||||
log(uname, m)
|
||||
if d == 0 or lasts == "unknown":
|
||||
m = "%s is up" % (conn.afam)
|
||||
else:
|
||||
m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d))
|
||||
eventlog(uname, "RECOVER", m)
|
||||
if uname in cfg.get("watchhosts", []):
|
||||
if pushmsg:
|
||||
pushmsg("%s %s is back" % (uname, conn.afam))
|
||||
@@ -212,8 +213,7 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
host.upcount += 1
|
||||
|
||||
if shutdown:
|
||||
if log:
|
||||
log(uname, "%s shutdown" % conn.afam)
|
||||
eventlog(uname, "INFO", "%s shutdown" % conn.afam)
|
||||
if uname in cfg.get("watchhosts", []):
|
||||
if pushmsg:
|
||||
pushmsg("%s %s shutdown" % (uname, conn.afam))
|
||||
@@ -221,6 +221,61 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
|
||||
if interval > 0:
|
||||
host.interval = interval
|
||||
|
||||
# Timer-based reachability monitoring
|
||||
# Reset overdue timer on every heartbeat
|
||||
if interval > 0 and conn.getstate() != hbdcls.Connection.DOWN:
|
||||
grace = cfg.get("grace", 2)
|
||||
timeout_seconds = (interval + grace) if interval > 0 else 30
|
||||
|
||||
# Create callback for timer expiration
|
||||
async def on_overdue(connection):
|
||||
"""Called when connection timer expires (no heartbeat received)."""
|
||||
import time
|
||||
now = time.time()
|
||||
|
||||
# Only mark as overdue if still in UP state (not already marked)
|
||||
if connection.getstate() == hbdcls.Connection.UP:
|
||||
connection.newstate(hbdcls.Connection.OVERDUE, now, cfg.get("grace", 2))
|
||||
|
||||
msg = f"{connection.afam} overdue"
|
||||
eventlog(uname, "CRITICAL" if uname in cfg.get("watchhosts", []) else "WARNING", msg)
|
||||
|
||||
if uname in cfg.get("watchhosts", []):
|
||||
if pushmsg:
|
||||
pushmsg(f"{uname} {msg}")
|
||||
|
||||
# Notify websockets
|
||||
if msg_to_websockets:
|
||||
msg_to_websockets("host", host.stateinfo())
|
||||
|
||||
# Set a longer timer for marking as UNKNOWN (7 days)
|
||||
DROPOVERDUE = 7 * 24 * 3600
|
||||
|
||||
async def on_unknown(connection):
|
||||
"""Mark connection as unknown after extended absence."""
|
||||
connection.newstate(hbdcls.Connection.UNKNOWN, connection.lastbeat)
|
||||
if msg_to_websockets:
|
||||
msg_to_websockets("host", host.stateinfo())
|
||||
|
||||
connection.reset_overdue_timer(DROPOVERDUE, on_unknown)
|
||||
|
||||
# Reset the timer
|
||||
conn.reset_overdue_timer(timeout_seconds, on_overdue)
|
||||
|
||||
# Check RTT thresholds using the threshold checker
|
||||
threshold_checker = ctx.get("threshold_checker")
|
||||
if threshold_checker and rtt and rtt > 0:
|
||||
# Metric path for RTT is "rtt.<hostname>"
|
||||
metric_path = f"rtt.{uname}"
|
||||
|
||||
# Check against configured thresholds (handles alerts, notifications, etc.)
|
||||
threshold_checker.check_value(
|
||||
host_name=uname,
|
||||
metric_path=metric_path,
|
||||
value=rtt,
|
||||
alert_states=host.alert_states
|
||||
)
|
||||
|
||||
# send ACK back
|
||||
rmsg = {"time": __import__("time").time()}
|
||||
@@ -266,5 +321,6 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
if msg_to_websockets:
|
||||
try:
|
||||
msg_to_websockets("host", host.stateinfo())
|
||||
except Exception:
|
||||
pass
|
||||
except Exception as e:
|
||||
if DEBUG > 0:
|
||||
print(("cannot send websocket message: %s" % e))
|
||||
|
||||
Reference in New Issue
Block a user