refactor monitor, add threshold testing

This commit is contained in:
Andreas Wrede
2026-03-31 12:22:03 -04:00
parent ad7178ebcb
commit dd23d9d163
15 changed files with 488 additions and 101 deletions
+109 -7
View File
@@ -16,7 +16,7 @@ from typing import Dict, Any, Optional, Tuple, Callable
from . import notify as notify_mod
logger = logging.getLogger(__name__)
eventlog = notify_mod.log
eventlog = notify_mod.eventlog
class AlertLevel(Enum):
"""Alert severity levels."""
@@ -96,6 +96,8 @@ class AlertState:
"notification_count": self.notification_count,
}
def __str__(self):
    """Return the string form of the dictionary produced by ``to_dict()``."""
    as_dict = self.to_dict()
    return str(as_dict)
class ThresholdConfig:
"""Configuration for a single threshold check."""
@@ -280,6 +282,11 @@ class ThresholdChecker:
def _parse_plugin_thresholds(self, plugin_name: str, thresholds: Dict[str, Any]):
"""Parse thresholds for a specific plugin."""
# Special handling for RTT thresholds (per-host)
if plugin_name == "rtt":
self._parse_rtt_thresholds(thresholds)
return
for metric_name, threshold_config in thresholds.items():
if not isinstance(threshold_config, dict):
continue
@@ -353,6 +360,97 @@ class ThresholdChecker:
self.thresholds[metric_path] = threshold
def _parse_rtt_thresholds(self, rtt_thresholds: Dict[str, Any]):
    """Register one threshold per host from the ``rtt`` configuration section.

    Expected configuration layout:

        thresholds:
          rtt:
            hostname1:
              warning: 100.0   # ms
              critical: 500.0  # ms

    Each accepted entry is stored in ``self.thresholds`` under the metric
    path ``"rtt.<hostname>"``. Entries whose value is not a dict are
    ignored silently; entries that define neither ``warning`` nor
    ``critical`` are skipped with a logged warning.

    Args:
        rtt_thresholds: Mapping of hostname to that host's threshold options.
    """
    for host, options in rtt_thresholds.items():
        # Only mapping-style entries can carry threshold options.
        if not isinstance(options, dict):
            continue

        warn_at = options.get("warning")
        crit_at = options.get("critical")

        # A host with no limits at all has nothing to check.
        if warn_at is None and crit_at is None:
            logger.warning("No RTT thresholds defined for %s, skipping", host)
            continue

        path = f"rtt.{host}"
        self.thresholds[path] = ThresholdConfig(
            metric_path=path,
            warning=warn_at,
            critical=crit_at,
            operator=options.get("operator", ">"),
            hysteresis=options.get("hysteresis", 0.1),  # 10% default
            enabled=options.get("enabled", True),
        )

        logger.debug(
            "Registered RTT threshold for %s: warn=%s ms, crit=%s ms",
            host,
            warn_at,
            crit_at,
        )
def check_value(
    self,
    host_name: str,
    metric_path: str,
    value: float,
    alert_states: Dict[str, AlertState],
) -> Optional[Tuple[AlertLevel, AlertLevel]]:
    """
    Evaluate one metric value against the threshold registered for its path.

    Args:
        host_name: Name of the host
        metric_path: Full metric path (e.g., "rtt.hostname")
        value: The metric value to check
        alert_states: Host's alert_states dictionary; an entry for
            ``metric_path`` is created here when missing

    Returns:
        Tuple of (old_level, new_level) when the alert state's ``update()``
        reports a change, None otherwise (including when no threshold is
        registered for ``metric_path``)
    """
    # Nothing to do for metrics without a registered threshold.
    try:
        threshold = self.thresholds[metric_path]
    except KeyError:
        return None

    # Reuse the tracked state for this metric, creating it on first sight.
    try:
        state = alert_states[metric_path]
    except KeyError:
        state = alert_states[metric_path] = AlertState(metric_path)

    # Evaluate against the level held before this sample (hysteresis input).
    previous = state.level
    current = threshold.evaluate_with_hysteresis(value, previous)

    if state.update(current, value):
        # Level changed: notify and report the transition to the caller.
        self._trigger_notification(host_name, metric_path, previous, current, value)
        return (previous, current)

    if current != AlertLevel.OK:
        # Still alerting at the same level: check if we should re-notify.
        self._check_renotify(host_name, state, metric_path, value)

    return None
def check_plugin_data(
self,
host_name: str,
@@ -476,18 +574,22 @@ class ThresholdChecker:
"""Trigger a notification for an alert state change."""
# Format message
if new_level == AlertLevel.OK:
message = f"RECOVERED: {host_name} - {metric_path} = {value} ({old_level.name} -> OK)"
lvl = "RECOVERED"
message = f"{metric_path} = {value} ({old_level.name} -> OK)"
elif new_level == AlertLevel.WARNING:
message = f"WARNING: {host_name} - {metric_path} = {value}"
lvl = "WARNING"
message = f"{metric_path} = {value}"
elif new_level == AlertLevel.CRITICAL:
message = f"CRITICAL: {host_name} - {metric_path} = {value}"
lvl = "CRITICAL"
message = f"{metric_path} = {value}"
else:
message = f"UNKNOWN: {host_name} - {metric_path} = {value}"
lvl = "UNKNOWN"
message = f"{metric_path} = {value}"
# Send notification
if self.notification_callback is not None:
try:
self.notification_callback(message)
self.notification_callback(f"{lvl}: {host_name} - {message}")
logger.info("Notification sent: %s", message)
except Exception as e:
logger.error("Failed to send notification: %s", e)
@@ -507,7 +609,7 @@ class ThresholdChecker:
except Exception as e:
logger.debug(f"Failed to log threshold event to journal: {e}")
# Log to eventlog as well
eventlog(host_name, message, service="threshold")
eventlog(host_name, lvl, message, service="threshold")
def _check_renotify(
self,