refactor monitor, add threshold rtesting

This commit is contained in:
Andreas Wrede
2026-03-31 12:22:03 -04:00
parent ad7178ebcb
commit dd23d9d163
15 changed files with 488 additions and 101 deletions
+14 -1
View File
@@ -64,4 +64,17 @@ thresholds:
percent: percent:
warning: 8.0 warning: 8.0
critical: 90.0 critical: 90.0
nagios_runner:
load_status:
warning: WARNING
operator: = "="
critical: CRITICAL
operator: = "="
UPS_status_code:
warning: 1
critical: 2
operator: ">="
rtt:
y:
warning: 0.1
critical: 10.0
BIN
View File
Binary file not shown.
+1 -1
View File
@@ -9,7 +9,7 @@
"type": "debugpy", "type": "debugpy",
"request": "launch", "request": "launch",
"module": "hbd.server.cli", "module": "hbd.server.cli",
"args": ["-c", "/home/andreas/git/heartbeat/.hb.yaml", "-f", "-v", "-x", "-x", "-x"], "args": ["-c", "/home/andreas/git/heartbeat/.hb.yaml", "-f", "-v", "-x", "-x", "-x", "-x"],
"cwd": "${workspaceFolder}", "cwd": "${workspaceFolder}",
"env": { "env": {
"PYTHONPATH": "${workspaceFolder}" "PYTHONPATH": "${workspaceFolder}"
+44
View File
@@ -150,6 +150,18 @@ Heartbeat includes a sophisticated threshold alerting system that monitors plugi
```yaml ```yaml
thresholds: thresholds:
# RTT (Round-Trip Time) thresholds for heartbeat monitoring
# These are checked on every HTB message arrival
rtt:
webserver01:
warning: 100.0 # Warn when RTT > 100ms
critical: 500.0 # Critical when RTT > 500ms
database01:
warning: 50.0
critical: 200.0
# Plugin metric thresholds
cpu_monitor: cpu_monitor:
cpu_percent: cpu_percent:
warning: 80.0 # Warn when CPU > 80% warning: 80.0 # Warn when CPU > 80%
@@ -177,6 +189,38 @@ thresholds:
threshold_renotify_interval: 3600 # Re-notify every hour for ongoing alerts threshold_renotify_interval: 3600 # Re-notify every hour for ongoing alerts
``` ```
### RTT Monitoring
Heartbeat monitors network latency (Round-Trip Time) for each host's heartbeat messages. RTT thresholds are **fully integrated with the threshold alerting system**:
- **Per-host configuration**: Set different thresholds for each monitored host
- **Real-time checking**: Thresholds evaluated on every HTB message arrival
- **Alert state tracking**: RTT alerts use the same state management as plugin metrics
- **Hysteresis support**: Configurable hysteresis prevents rapid state transitions
- **Alerts dashboard**: RTT alerts visible on the `/alerts` web page alongside plugin alerts
- **Smart notifications**: Only triggers on state changes (OK → WARNING → CRITICAL)
- **Re-notification**: Periodic reminders for ongoing RTT issues
- **Event & journal logging**: All RTT events logged for audit trail
**Configuration format:**
```yaml
thresholds:
rtt:
<hostname>:
warning: <milliseconds> # Warn when RTT > this value
critical: <milliseconds> # Critical when RTT > this value
hysteresis: 0.1 # Optional: 10% hysteresis (default)
```
**Example alerts:**
```
WARNING: webserver01 - rtt.webserver01 = 125.3
CRITICAL: database01 - rtt.database01 = 520.1
RECOVERED: webserver01 - rtt.webserver01 = 45.2 (WARNING -> OK)
```
RTT alerts appear on the Alerts dashboard and can be filtered by severity level. The `metric_path` format is `rtt.<hostname>`, making it easy to distinguish from plugin metrics.
### Alert Behavior ### Alert Behavior
1. **State Changes**: Notifications sent when crossing thresholds 1. **State Changes**: Notifications sent when crossing thresholds
+2 -3
View File
@@ -43,9 +43,8 @@ def main(argv=None):
config["verbose"] = True config["verbose"] = True
if args.pushsrv: if args.pushsrv:
config["pushsrv"] = args.pushsrv config["pushsrv"] = args.pushsrv
if args.debug: if args.debug > 0:
config.setdefault("debug", 0) config["debug"] = args.debug
config["debug"] += args.debug
run_server(config) run_server(config)
+12
View File
@@ -0,0 +1,12 @@
msgs = [] # in-memory list of recent messages for new websocket clients; also logged to file via notify.eventlog
class Data:
def __init__(self, config):
self.config = config
self.data = {}
def update(self, new_data):
self.data.update(new_data)
def get(self, key, default=None):
return self.data.get(key, default)
+135 -5
View File
@@ -42,6 +42,11 @@ class Connection:
self.statetime = self.lastbeat self.statetime = self.lastbeat
self.deltastatetime = "computed" self.deltastatetime = "computed"
self.state = Connection.UNKNOWN self.state = Connection.UNKNOWN
# Timer-based reachability monitoring
self.overdue_timer = None
self.overdue_callback = None
self.timeout_duration = None
if host: if host:
Connection.htab[addr] = self.host.name Connection.htab[addr] = self.host.name
@@ -49,6 +54,27 @@ class Connection:
log(self.host.name, "dns update %s" % self.addr) log(self.host.name, "dns update %s" % self.addr)
Host.dnsQ.put((self.host.name, self.addr)) Host.dnsQ.put((self.host.name, self.addr))
def __getstate__(self):
"""Prepare Connection for pickling by excluding non-serializable timer objects."""
state = self.__dict__.copy()
# Remove asyncio timer objects that can't be pickled
# These will be recreated when the next HTB arrives after unpickling
state['overdue_timer'] = None
state['overdue_callback'] = None
state['timeout_duration'] = None
return state
def __setstate__(self, state):
"""Restore Connection from pickle, reinitializing timer fields."""
self.__dict__.update(state)
# Ensure timer fields are initialized (they'll be recreated when HTB arrives)
if not hasattr(self, 'overdue_timer'):
self.overdue_timer = None
if not hasattr(self, 'overdue_callback'):
self.overdue_callback = None
if not hasattr(self, 'timeout_duration'):
self.timeout_duration = None
def registerDns(self): def registerDns(self):
Host.dnsQ.put((self.host.name, self.addr)) Host.dnsQ.put((self.host.name, self.addr))
@@ -123,7 +149,18 @@ class Connection:
return d return d
def jsons(self): def jsons(self):
return json.dumps(self.__dict__) """Serialize connection to JSON, excluding non-serializable timer objects."""
data = {}
for key, value in self.__dict__.items():
# Skip timer-related fields that can't be serialized
if key in ['overdue_timer', 'overdue_callback', 'timeout_duration']:
continue
# Handle host backpointer by converting to name
if key == 'host':
data[key] = value.name if value else None
else:
data[key] = value
return json.dumps(data)
# set new state, return number of secs in previous state # set new state, return number of secs in previous state
def newstate(self, state, now, when=0): def newstate(self, state, now, when=0):
@@ -151,10 +188,87 @@ class Connection:
except Exception: except Exception:
pass pass
self.addr = addr self.addr = addr
Connection.htab[addr] = self.host.name Connection.htab[addr] = self.host.nameconnection_count
if self.host.isDynDns(): if self.host.isDynDns():
Host.dnsQ.put((self.host.name, self.addr)) Host.dnsQ.put((self.host.name, self.addr))
return r return r
def reset_overdue_timer(self, timeout_seconds, callback):
"""Reset the overdue timer for this connection.
Cancels any existing timer and sets a new one that will mark
the connection as overdue if no heartbeat arrives before timeout.
Args:
timeout_seconds: Seconds before marking as overdue
callback: Async function to call when timer expires
"""
import asyncio
# Cancel existing timer if any
if self.overdue_timer and not self.overdue_timer.cancelled():
self.overdue_timer.cancel()
# Store parameters for later reference
self.timeout_duration = timeout_seconds
self.overdue_callback = callback
# Create new timer
async def timer_expired():
await callback(self)
try:
loop = asyncio.get_event_loop()
self.overdue_timer = loop.call_later(timeout_seconds,
lambda: asyncio.create_task(timer_expired()))
except RuntimeError:
# No event loop running yet
pass
def cancel_overdue_timer(self):
"""Cancel the overdue timer if it exists and clear all timer references."""
if self.overdue_timer:
try:
if not self.overdue_timer.cancelled():
self.overdue_timer.cancel()
except Exception:
pass
# Clear all timer-related references
self.overdue_timer = None
self.overdue_callback = None
self.timeout_duration = None
def get_avg_rtt(self):
"""Get average RTT from recent samples."""
valid_rtts = [r for r in self.rtts if r > 0]
if valid_rtts:
return sum(valid_rtts) / len(valid_rtts)
return 0
def get_current_rtt(self):
"""Get most recent RTT value."""
return self.rtts[-1] if self.rtts else 0
def check_rtt_threshold(self, warning_threshold=None, critical_threshold=None):
"""Check if RTT exceeds thresholds.
Args:
warning_threshold: RTT in ms for warning level
critical_threshold: RTT in ms for critical level
Returns:
Tuple of (level, rtt_value) where level is None, 'WARNING', or 'CRITICAL'
"""
rtt = self.get_current_rtt()
if rtt <= 0:
return (None, rtt)
if critical_threshold and rtt > critical_threshold:
return ('CRITICAL', rtt)
elif warning_threshold and rtt > warning_threshold:
return ('WARNING', rtt)
return (None, rtt)
# #
@@ -224,14 +338,30 @@ class Host:
def stateinfo(self): def stateinfo(self):
ddict = {} ddict = {}
for d in self.__dict__: for d in self.__dict__:
if d in ["alert_states", "plugin_data"]:
continue
if d == "connections": if d == "connections":
cl = [] cl = []
for c in ["IPv4", "IPv6"]: for c in ["IPv4", "IPv6"]:
if c not in self.connections: if c not in self.connections:
continue continue
# dirty ugly hack: fix conn to host backpointer # Create connection dict, excluding non-serializable timer objects
cld = copy.deepcopy(self.connections[c].__dict__) conn = self.connections[c]
cld["host"] = cld["host"].name cld = {}
for key, value in conn.__dict__.items():
# Skip timer-related fields that can't be serialized
if key in ['overdue_timer', 'overdue_callback', 'timeout_duration']:
continue
# Handle host backpointer by converting to name
if key == 'host':
cld[key] = value.name if value else None
else:
# Safe copy for serializable values
try:
cld[key] = copy.deepcopy(value)
except Exception:
# If deepcopy fails, use shallow copy
cld[key] = value
cl.append(cld) cl.append(cld)
ddict[d] = cl ddict[d] = cl
else: else:
+4 -4
View File
@@ -8,6 +8,7 @@ import os
import logging import logging
from aiohttp import web from aiohttp import web
import jinja2 import jinja2
from . import data
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -22,7 +23,6 @@ async def start(
port: int, port: int,
config, config,
hbdclass, hbdclass,
msgs_getter,
log=None, log=None,
email=None, email=None,
pushmsg=None, pushmsg=None,
@@ -52,7 +52,7 @@ async def start(
res.append('<body BGCOLOR = "#FFFFFF" LINK = "#008000" VLINK = "#008000">') res.append('<body BGCOLOR = "#FFFFFF" LINK = "#008000" VLINK = "#008000">')
res.append(f"<H2>Heartbeat status {VER}</h2>") res.append(f"<H2>Heartbeat status {VER}</h2>")
res += hbdclass.ubHost.buildhosttable() res += hbdclass.ubHost.buildhosttable()
res += hbdclass.ubHost.buildmsgtable(msgs_getter()) res += hbdclass.ubHost.buildmsgtable(data.msgs)
res.append( res.append(
"<p> %s (%s)</p>" "<p> %s (%s)</p>"
% ( % (
@@ -69,7 +69,7 @@ async def start(
return web.json_response(json.loads("[" + ",".join(lst) + "]")) return web.json_response(json.loads("[" + ",".join(lst) + "]"))
async def api_messages(request): async def api_messages(request):
lst = msgs_getter()[-30:] lst = data.msgs[-30:]
return web.json_response(lst) return web.json_response(lst)
async def cmd(request): async def cmd(request):
@@ -155,7 +155,7 @@ async def start(
hosts=[ hosts=[
hbdclass.Host.hosts[h].stateinfo() for h in sorted(hbdclass.Host.hosts) hbdclass.Host.hosts[h].stateinfo() for h in sorted(hbdclass.Host.hosts)
], ],
messages=msgs_getter()[-30:], messages=data.msgs[-30:],
) )
return web.Response(text=body, content_type="text/html") return web.Response(text=body, content_type="text/html")
+26 -18
View File
@@ -14,28 +14,35 @@ from . import hbdclass
from . import ws as ws_mod from . import ws as ws_mod
from . import notify as notify_mod from . import notify as notify_mod
from . import data
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
msg_to_websockets = ws_mod.broadcast msg_to_websockets = ws_mod.broadcast
eventlog = notify_mod.log eventlog = notify_mod.eventlog
lastfm = ["", "", ""]
# shared runtime collections and helpers # shared runtime collections and helpers
msgs = notify_mod.msgs
def cleanup_function(config): def cleanup_function(config, hbdclass):
"""This function will be executed upon program exit.""" """This function will be executed upon program exit."""
logger.info("Running cleanup function...") logger.info("Running cleanup function...")
import pickle import pickle
# Ensure all timer references are cleared before pickling
for hostname, host in list(hbdclass.Host.hosts.items()):
for conn_type, conn in host.connections.items():
if hasattr(conn, 'overdue_timer'):
conn.overdue_timer = None
if hasattr(conn, 'overdue_callback'):
conn.overdue_callback = None
if hasattr(conn, 'timeout_duration'):
conn.timeout_duration = None
pickfile = config.get("pickfile", "hbd.pickle") pickfile = config.get("pickfile", "hbd.pickle")
pickf = open(pickfile, "wb") pickf = open(pickfile, "wb")
pick = pickle.Pickler(pickf) pick = pickle.Pickler(pickf)
pick.dump(hbdclass.Host.hosts) pick.dump(hbdclass.Host.hosts)
pick.dump(msgs) pick.dump(data.msgs)
pick.dump(lastfm)
pickf.close() pickf.close()
logger.info("Cleanup complete.") logger.info("Cleanup complete.")
@@ -124,7 +131,6 @@ async def _run_async(config):
port=config.get("hbd_port", 50004), port=config.get("hbd_port", 50004),
config=config, config=config,
hbdclass=hbdclass, hbdclass=hbdclass,
msgs_getter=lambda: msgs,
log=eventlog, log=eventlog,
pushmsg=pushmsg, pushmsg=pushmsg,
msg_to_websockets=msg_to_websockets, msg_to_websockets=msg_to_websockets,
@@ -186,7 +192,7 @@ async def _run_async(config):
hbdclass.Host.hosts[h].stateinfo() hbdclass.Host.hosts[h].stateinfo()
for h in sorted(hbdclass.Host.hosts) for h in sorted(hbdclass.Host.hosts)
], ],
get_msgs=lambda: msgs, # get_msgs=lambda: msgs,
config=config, config=config,
) )
) )
@@ -200,7 +206,6 @@ async def _run_async(config):
monitor_mod.start( monitor_mod.start(
config=config, config=config,
hbdclass=hbdclass, hbdclass=hbdclass,
log=eventlog,
pushmsg=pushmsg, pushmsg=pushmsg,
msg_to_websockets=msg_to_websockets, msg_to_websockets=msg_to_websockets,
) )
@@ -216,6 +221,13 @@ async def _run_async(config):
except Exception as e: except Exception as e:
logger.exception("Error in main loop: %s", e) logger.exception("Error in main loop: %s", e)
finally: finally:
# Clean up connection timers
try:
logger.info("Cleaning up connection timers...")
await monitor_mod.cleanup_connections(hbdclass)
except Exception as e:
logger.warning("Error cleaning up connection timers: %s", e)
# Cancel all running tasks # Cancel all running tasks
logger.info("Cancelling tasks...") logger.info("Cancelling tasks...")
try: try:
@@ -279,7 +291,6 @@ async def _run_async(config):
def load_pickled_hosts(config, hbdclass): def load_pickled_hosts(config, hbdclass):
"""Load pickled hosts from file, if available.""" """Load pickled hosts from file, if available."""
global lastfm, msgs
import os import os
import pickle import pickle
@@ -294,11 +305,7 @@ def load_pickled_hosts(config, hbdclass):
pick = pickle.Unpickler(pickf) pick = pickle.Unpickler(pickf)
try: try:
hbdclass.Host.hosts = pick.load() hbdclass.Host.hosts = pick.load()
msgs = pick.load() data.msgs = pick.load()
try:
lastfm = pick.load()
except Exception:
lastfm = ["", "", ""]
pickf.close() pickf.close()
except Exception as e: except Exception as e:
logger.exception("load pickled failed: %s", e) logger.exception("load pickled failed: %s", e)
@@ -331,7 +338,7 @@ def run(config):
load_pickled_hosts(config, hbdclass) load_pickled_hosts(config, hbdclass)
notify_mod.initlog(logfile=config.get("logfile", "messages.log")) notify_mod.initlog(logfile=config.get("logfile", "messages.log"))
eventlog(None, f"hbd version {__version__} starting up") eventlog(None, "INFO", f"hbd version {__version__} starting up")
# Create and set the event loop manually # Create and set the event loop manually
loop = asyncio.new_event_loop() loop = asyncio.new_event_loop()
@@ -344,8 +351,9 @@ def run(config):
except Exception as e: except Exception as e:
logger.exception("Unhandled exception in main: %s", e) logger.exception("Unhandled exception in main: %s", e)
finally: finally:
cleanup_function(config) cleanup_function(config, hbdclass)
logger.info("hbd shutdown complete") logger.info("hbd shutdown complete")
eventlog(None, "INFO", f"hbd version {__version__} shutdown")
notify_mod.closelog() notify_mod.closelog()
# Explicitly close the loop # Explicitly close the loop
try: try:
+49 -33
View File
@@ -1,50 +1,66 @@
"""monitor helper and thread for heartbeat daemon.""" """Monitor helper for heartbeat daemon.
This module provides monitoring tasks for the heartbeat daemon.
The primary reachability monitoring is now event-driven (timers set/reset
on HTB arrival in udp.py) rather than periodic polling.
This module can be extended for additional monitoring tasks.
"""
from __future__ import annotations from __future__ import annotations
import asyncio import asyncio
import time import time
from . import notify as notify_mod
DROPOVERDUE = 7 * 24 * 3600 DROPOVERDUE = 7 * 24 * 3600
eventlog = notify_mod.eventlog
def checkoverdue( async def cleanup_connections(hbdclass):
config: dict, """Clean up connection timers on shutdown.
hbdclass,
log: callable, Cancels all active overdue timers to prevent callbacks after shutdown.
pushmsg: callable, """
msg_to_websockets: callable, for hostname, host in list(hbdclass.Host.hosts.items()):
): for conn_type, conn in host.connections.items():
now = time.time() if hasattr(conn, 'cancel_overdue_timer'):
for h in list(hbdclass.Host.hosts.keys()): conn.cancel_overdue_timer()
pmsg = []
for c in hbdclass.Host.hosts[h].connections:
conn = hbdclass.Host.hosts[h].connections[c]
if conn.state == hbdclass.Connection.DOWN:
continue
timeout = hbdclass.Host.hosts[h].interval + config.get("grace", 10)
if conn.state == hbdclass.Connection.UP and (now - conn.lastbeat) > timeout:
conn.newstate(hbdclass.Connection.OVERDUE, now, config.get("grace", 10))
pmsg.append(conn.afam)
if (
conn.state == hbdclass.Connection.OVERDUE
and (now - conn.lastbeat) > DROPOVERDUE
):
conn.newstate(hbdclass.Connection.UNKNOWN, conn.lastbeat)
if pmsg != []:
if h in config.get("watchhosts", []):
pushmsg("%s %s overdue" % (h, " and ".join(pmsg)))
log(h, "%s overdue" % " and ".join(pmsg))
msg_to_websockets("host", hbdclass.Host.hosts[h].stateinfo())
async def start( async def start(
config: dict, config: dict,
hbdclass: callable, hbdclass: callable,
log=None,
pushmsg=None, pushmsg=None,
msg_to_websockets=None, msg_to_websockets=None,
): ):
"""start a monitor loop that checks for overdue hosts every minute""" """Start monitor background tasks.
Note: Reachability monitoring is now timer-based and happens in udp.py
when HTB messages arrive. This function can be used for additional
monitoring tasks.
Currently runs a simple status logger every 5 minutes.
"""
import logging
logger = logging.getLogger(__name__)
logger_interval = 300 # Log status every 5 minutes
while True: while True:
await asyncio.sleep(15) # 15 seconds between checks await asyncio.sleep(logger_interval)
checkoverdue(config, hbdclass, log, pushmsg, msg_to_websockets)
# Log monitoring status
total_hosts = len(hbdclass.Host.hosts)
up_count = sum(
1 for h in hbdclass.Host.hosts.values()
for c in h.connections.values()
if c.state == hbdclass.Connection.UP
)
overdue_count = sum(
1 for h in hbdclass.Host.hosts.values()
for c in h.connections.values()
if c.state == hbdclass.Connection.OVERDUE
)
logger.debug(
f"Monitor status: {total_hosts} hosts, {up_count} UP, {overdue_count} OVERDUE"
)
+10 -6
View File
@@ -8,7 +8,9 @@ import subprocess
import smtplib import smtplib
import time import time
import sys import sys
from . import data
from . import ws as ws_mod from . import ws as ws_mod
from . import main as main_mod
DEFAULT_PUSHPROVIDERS = ["all", "pushover", "mattermost", "signal"] DEFAULT_PUSHPROVIDERS = ["all", "pushover", "mattermost", "signal"]
msg_to_websockets = ws_mod.broadcast msg_to_websockets = ws_mod.broadcast
@@ -17,19 +19,18 @@ msg_to_websockets = ws_mod.broadcast
_config = {} _config = {}
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
msgs = []
logf = None logf = None
def initlog(logfile): def initlog(logfile):
global logf global logf
try: try:
logf = open(logfile, "a+") logf = open(logfile, "a+")
return logf
except Exception as e: except Exception as e:
import sys import sys
print("cannot open logfile %s, using STDERR: %s" % (logfile, e)) print("cannot open logfile %s, using STDERR: %s" % (logfile, e))
return sys.stderr logf = sys.stderr
return logf
def closelog(): def closelog():
global logf global logf
@@ -39,10 +40,13 @@ def closelog():
except Exception: except Exception:
pass pass
def log(host, m, service=None): def eventlog(host, lvl, m, service=None):
ts = time.time() ts = time.time()
s = f"{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ts))} {host or ''} {m}" s = f"{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ts))} {lvl} "
msgs.append(s) if host:
s += f"{host} "
s += m
data.msgs.append(s)
logger.info(s) logger.info(s)
if logf: if logf:
try: try:
+109 -7
View File
@@ -16,7 +16,7 @@ from typing import Dict, Any, Optional, Tuple, Callable
from . import notify as notify_mod from . import notify as notify_mod
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
eventlog = notify_mod.log eventlog = notify_mod.eventlog
class AlertLevel(Enum): class AlertLevel(Enum):
"""Alert severity levels.""" """Alert severity levels."""
@@ -96,6 +96,8 @@ class AlertState:
"notification_count": self.notification_count, "notification_count": self.notification_count,
} }
def __str__(self):
return self.to_dict().__str__()
class ThresholdConfig: class ThresholdConfig:
"""Configuration for a single threshold check.""" """Configuration for a single threshold check."""
@@ -280,6 +282,11 @@ class ThresholdChecker:
def _parse_plugin_thresholds(self, plugin_name: str, thresholds: Dict[str, Any]): def _parse_plugin_thresholds(self, plugin_name: str, thresholds: Dict[str, Any]):
"""Parse thresholds for a specific plugin.""" """Parse thresholds for a specific plugin."""
# Special handling for RTT thresholds (per-host)
if plugin_name == "rtt":
self._parse_rtt_thresholds(thresholds)
return
for metric_name, threshold_config in thresholds.items(): for metric_name, threshold_config in thresholds.items():
if not isinstance(threshold_config, dict): if not isinstance(threshold_config, dict):
continue continue
@@ -353,6 +360,97 @@ class ThresholdChecker:
self.thresholds[metric_path] = threshold self.thresholds[metric_path] = threshold
def _parse_rtt_thresholds(self, rtt_thresholds: Dict[str, Any]):
"""Parse RTT thresholds (per-host network latency thresholds).
RTT thresholds are configured as:
thresholds:
rtt:
hostname1:
warning: 100.0 # ms
critical: 500.0 # ms
"""
for hostname, threshold_config in rtt_thresholds.items():
if not isinstance(threshold_config, dict):
continue
# Metric path is "rtt.<hostname>"
metric_path = f"rtt.{hostname}"
warning = threshold_config.get("warning")
critical = threshold_config.get("critical")
operator = threshold_config.get("operator", ">")
hysteresis = threshold_config.get("hysteresis", 0.1) # 10% default
enabled = threshold_config.get("enabled", True)
if warning is None and critical is None:
logger.warning("No RTT thresholds defined for %s, skipping", hostname)
continue
threshold = ThresholdConfig(
metric_path=metric_path,
warning=warning,
critical=critical,
operator=operator,
hysteresis=hysteresis,
enabled=enabled,
)
self.thresholds[metric_path] = threshold
logger.debug(
"Registered RTT threshold for %s: warn=%s ms, crit=%s ms",
hostname,
warning,
critical
)
def check_value(
self,
host_name: str,
metric_path: str,
value: float,
alert_states: Dict[str, AlertState],
) -> Optional[Tuple[AlertLevel, AlertLevel]]:
"""
Check a single value against configured threshold.
Args:
host_name: Name of the host
metric_path: Full metric path (e.g., "rtt.hostname")
value: The metric value to check
alert_states: Host's alert_states dictionary
Returns:
Tuple of (old_level, new_level) if state changed, None otherwise
"""
if metric_path not in self.thresholds:
return None
threshold = self.thresholds[metric_path]
# Get or create alert state
if metric_path not in alert_states:
alert_states[metric_path] = AlertState(metric_path)
alert_state = alert_states[metric_path]
# Evaluate threshold with hysteresis
new_level = threshold.evaluate_with_hysteresis(
value,
alert_state.level
)
# Update state and check for changes
old_level = alert_state.level
if alert_state.update(new_level, value):
self._trigger_notification(host_name, metric_path, old_level, new_level, value)
return (old_level, new_level)
elif new_level != AlertLevel.OK:
# Check if we should re-notify
self._check_renotify(host_name, alert_state, metric_path, value)
return None
def check_plugin_data( def check_plugin_data(
self, self,
host_name: str, host_name: str,
@@ -476,18 +574,22 @@ class ThresholdChecker:
"""Trigger a notification for an alert state change.""" """Trigger a notification for an alert state change."""
# Format message # Format message
if new_level == AlertLevel.OK: if new_level == AlertLevel.OK:
message = f"RECOVERED: {host_name} - {metric_path} = {value} ({old_level.name} -> OK)" lvl = "RECOVERED"
message = f"{metric_path} = {value} ({old_level.name} -> OK)"
elif new_level == AlertLevel.WARNING: elif new_level == AlertLevel.WARNING:
message = f"WARNING: {host_name} - {metric_path} = {value}" lvl = "WARNING"
message = f"{metric_path} = {value}"
elif new_level == AlertLevel.CRITICAL: elif new_level == AlertLevel.CRITICAL:
message = f"CRITICAL: {host_name} - {metric_path} = {value}" lvl = "CRITICAL"
message = f"{metric_path} = {value}"
else: else:
message = f"UNKNOWN: {host_name} - {metric_path} = {value}" lvl = "UNKNOWN"
message = f"{metric_path} = {value}"
# Send notification # Send notification
if self.notification_callback is not None: if self.notification_callback is not None:
try: try:
self.notification_callback(message) self.notification_callback(f"{lvl}: {host_name} - {message}")
logger.info("Notification sent: %s", message) logger.info("Notification sent: %s", message)
except Exception as e: except Exception as e:
logger.error("Failed to send notification: %s", e) logger.error("Failed to send notification: %s", e)
@@ -507,7 +609,7 @@ class ThresholdChecker:
except Exception as e: except Exception as e:
logger.debug(f"Failed to log threshold event to journal: {e}") logger.debug(f"Failed to log threshold event to journal: {e}")
# Log to eventlog as well # Log to eventlog as well
eventlog(host_name, message, service="threshold") eventlog(host_name, lvl, message, service="threshold")
def _check_renotify( def _check_renotify(
self, self,
+69 -13
View File
@@ -6,8 +6,10 @@ import logging
from ..common.proto import stodict, oldmtodict from ..common.proto import stodict, oldmtodict
from ..common.utils import dur from ..common.utils import dur
from . import notify as notify_mod
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
eventlog = notify_mod.eventlog
class EchoServerProtocol(asyncio.DatagramProtocol): class EchoServerProtocol(asyncio.DatagramProtocol):
@@ -170,8 +172,7 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
return return
if res: if res:
if log: eventlog(uname, "WARNING", res)
log(uname, res)
if uname in cfg.get("watchhosts", []): if uname in cfg.get("watchhosts", []):
if pushmsg: if pushmsg:
pushmsg("%s %s" % (host.name, res)) pushmsg("%s %s" % (host.name, res))
@@ -183,15 +184,13 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
boot = msg.get("boot", 0) boot = msg.get("boot", 0)
if boot: if boot:
if log: eventlog(uname, "INFO", "booted")
log(uname, "booted")
if uname in cfg.get("watchhosts", []): if uname in cfg.get("watchhosts", []):
m = "%s booted" % (host.name) m = "%s booted" % (host.name)
if pushmsg: if pushmsg:
pushmsg(m) pushmsg(m)
if message: if message:
if log: eventlog(uname, "INFO", "msg: %s" % message, service=service)
log(uname, "msg: %s" % message, service=service)
if uname in cfg.get("watchhosts", []): if uname in cfg.get("watchhosts", []):
if pushmsg: if pushmsg:
pushmsg(message) pushmsg(message)
@@ -199,9 +198,11 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
if conn.getstate() != hbdcls.Connection.UP: if conn.getstate() != hbdcls.Connection.UP:
lasts = conn.state lasts = conn.state
d = conn.newstate(hbdcls.Connection.UP, now) d = conn.newstate(hbdcls.Connection.UP, now)
m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d)) if d == 0 or lasts == "unknown":
if log: m = "%s is up" % (conn.afam)
log(uname, m) else:
m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d))
eventlog(uname, "RECOVER", m)
if uname in cfg.get("watchhosts", []): if uname in cfg.get("watchhosts", []):
if pushmsg: if pushmsg:
pushmsg("%s %s is back" % (uname, conn.afam)) pushmsg("%s %s is back" % (uname, conn.afam))
@@ -212,8 +213,7 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
host.upcount += 1 host.upcount += 1
if shutdown: if shutdown:
if log: eventlog(uname, "INFO", "%s shutdown" % conn.afam)
log(uname, "%s shutdown" % conn.afam)
if uname in cfg.get("watchhosts", []): if uname in cfg.get("watchhosts", []):
if pushmsg: if pushmsg:
pushmsg("%s %s shutdown" % (uname, conn.afam)) pushmsg("%s %s shutdown" % (uname, conn.afam))
@@ -221,6 +221,61 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
if interval > 0: if interval > 0:
host.interval = interval host.interval = interval
# Timer-based reachability monitoring
# Reset overdue timer on every heartbeat
if interval > 0 and conn.getstate() != hbdcls.Connection.DOWN:
grace = cfg.get("grace", 2)
timeout_seconds = (interval + grace) if interval > 0 else 30
# Create callback for timer expiration
async def on_overdue(connection):
"""Called when connection timer expires (no heartbeat received)."""
import time
now = time.time()
# Only mark as overdue if still in UP state (not already marked)
if connection.getstate() == hbdcls.Connection.UP:
connection.newstate(hbdcls.Connection.OVERDUE, now, cfg.get("grace", 2))
msg = f"{connection.afam} overdue"
eventlog(uname, "CRITICAL" if uname in cfg.get("watchhosts", []) else "WARNING", msg)
if uname in cfg.get("watchhosts", []):
if pushmsg:
pushmsg(f"{uname} {msg}")
# Notify websockets
if msg_to_websockets:
msg_to_websockets("host", host.stateinfo())
# Set a longer timer for marking as UNKNOWN (7 days)
DROPOVERDUE = 7 * 24 * 3600
async def on_unknown(connection):
"""Mark connection as unknown after extended absence."""
connection.newstate(hbdcls.Connection.UNKNOWN, connection.lastbeat)
if msg_to_websockets:
msg_to_websockets("host", host.stateinfo())
connection.reset_overdue_timer(DROPOVERDUE, on_unknown)
# Reset the timer
conn.reset_overdue_timer(timeout_seconds, on_overdue)
# Check RTT thresholds using the threshold checker
threshold_checker = ctx.get("threshold_checker")
if threshold_checker and rtt and rtt > 0:
# Metric path for RTT is "rtt.<hostname>"
metric_path = f"rtt.{uname}"
# Check against configured thresholds (handles alerts, notifications, etc.)
threshold_checker.check_value(
host_name=uname,
metric_path=metric_path,
value=rtt,
alert_states=host.alert_states
)
# send ACK back # send ACK back
rmsg = {"time": __import__("time").time()} rmsg = {"time": __import__("time").time()}
@@ -266,5 +321,6 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
if msg_to_websockets: if msg_to_websockets:
try: try:
msg_to_websockets("host", host.stateinfo()) msg_to_websockets("host", host.stateinfo())
except Exception: except Exception as e:
pass if DEBUG > 0:
print(("cannot send websocket message: %s" % e))
+13 -10
View File
@@ -8,6 +8,7 @@ import asyncio
import json import json
import logging import logging
from typing import Callable, Iterable, Optional from typing import Callable, Iterable, Optional
from . import data
import websockets import websockets
@@ -16,7 +17,7 @@ logger.setLevel(logging.INFO)
_connections = set() _connections = set()
_loop: Optional[asyncio.AbstractEventLoop] = None _loop: Optional[asyncio.AbstractEventLoop] = None
_get_hosts: Optional[Callable[[], Iterable]] = None _get_hosts: Optional[Callable[[], Iterable]] = None
_get_msgs: Optional[Callable[[], Iterable]] = None #_get_msgs: Optional[Callable[[], Iterable]] = None
_verbose = False _verbose = False
@@ -38,11 +39,11 @@ async def _handler(websocket, path=None):
except Exception as e: except Exception as e:
logger.error("Error sending initial hosts: %s", e, exc_info=True) logger.error("Error sending initial hosts: %s", e, exc_info=True)
# send recent messages # send recent messages
if _get_msgs: if data.msgs:
try: try:
msgs = list(_get_msgs())[-100:] # msgs = list(_get_msgs())[-100:]
logger.debug("Sending %d recent messages to new WebSocket client", len(msgs)) logger.debug("Sending %d recent messages to new WebSocket client", len(data.msgs))
for m in msgs: for m in data.msgs:
jmsg = json.dumps({"type": "message", "data": m}) jmsg = json.dumps({"type": "message", "data": m})
await websocket.send(jmsg) await websocket.send(jmsg)
except Exception as e: except Exception as e:
@@ -77,7 +78,7 @@ async def start(
wss_port: Optional[int] = None, wss_port: Optional[int] = None,
ssl_context=None, ssl_context=None,
get_hosts: Optional[Callable] = None, get_hosts: Optional[Callable] = None,
get_msgs: Optional[Callable] = None, # get_msgs: Optional[Callable] = None,
config: dict = {}, config: dict = {},
): ):
"""Start WebSocket servers and block until cancelled. """Start WebSocket servers and block until cancelled.
@@ -86,17 +87,19 @@ async def start(
If `wss_port` and `ssl_context` are provided, a WSS server will also be If `wss_port` and `ssl_context` are provided, a WSS server will also be
started. started.
""" """
global _loop, _get_hosts, _get_msgs, _verbose global _loop, _get_hosts, _verbose
_loop = asyncio.get_running_loop() _loop = asyncio.get_running_loop()
_get_hosts = get_hosts _get_hosts = get_hosts
_get_msgs = get_msgs
_verbose = config.get("verbose", False), _verbose = config.get("verbose", False),
_debug = config.get("debug", False), _debug = config.get("debug", 0),
servers = [] servers = []
# plain WebSocket # plain WebSocket
websockets_logger = logging.getLogger("websockets.server") websockets_logger = logging.getLogger("websockets.server")
websockets_logger.setLevel(logging.DEBUG if _debug > 2 else logging.INFO) #if _debug > 2:
# websockets_logger.setLevel(logging.DEBUG)
#else:
# websockets_logger.setLevel(logging.INFO)
# regular WebSocket # regular WebSocket
ws_server = websockets.serve(_handler, host, ws_port) # , subprotocols=["hbd"]) ws_server = websockets.serve(_handler, host, ws_port) # , subprotocols=["hbd"])
servers.append(ws_server) servers.append(ws_server)
View File