refactor monitor, add threshold testing
This commit is contained in:
+109
-7
@@ -16,7 +16,7 @@ from typing import Dict, Any, Optional, Tuple, Callable
|
||||
from . import notify as notify_mod
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
eventlog = notify_mod.log
|
||||
eventlog = notify_mod.eventlog
|
||||
|
||||
class AlertLevel(Enum):
|
||||
"""Alert severity levels."""
|
||||
@@ -96,6 +96,8 @@ class AlertState:
|
||||
"notification_count": self.notification_count,
|
||||
}
|
||||
|
||||
def __str__(self):
|
||||
return self.to_dict().__str__()
|
||||
|
||||
class ThresholdConfig:
|
||||
"""Configuration for a single threshold check."""
|
||||
@@ -280,6 +282,11 @@ class ThresholdChecker:
|
||||
|
||||
def _parse_plugin_thresholds(self, plugin_name: str, thresholds: Dict[str, Any]):
|
||||
"""Parse thresholds for a specific plugin."""
|
||||
# Special handling for RTT thresholds (per-host)
|
||||
if plugin_name == "rtt":
|
||||
self._parse_rtt_thresholds(thresholds)
|
||||
return
|
||||
|
||||
for metric_name, threshold_config in thresholds.items():
|
||||
if not isinstance(threshold_config, dict):
|
||||
continue
|
||||
@@ -353,6 +360,97 @@ class ThresholdChecker:
|
||||
|
||||
self.thresholds[metric_path] = threshold
|
||||
|
||||
def _parse_rtt_thresholds(self, rtt_thresholds: Dict[str, Any]):
    """Register per-host RTT (network latency) thresholds.

    Expected configuration layout:

        thresholds:
          rtt:
            hostname1:
              warning: 100.0   # ms
              critical: 500.0  # ms

    Besides "warning" and "critical", each host entry may carry
    "operator" (default ">"), "hysteresis" (default 0.1) and
    "enabled" (default True).

    Entries whose value is not a dict are ignored silently. Entries that
    define neither "warning" nor "critical" are skipped with a warning
    log. Every accepted entry is stored in ``self.thresholds`` under the
    key "rtt.<hostname>".

    Args:
        rtt_thresholds: Mapping of hostname to its threshold settings.
    """
    for host, cfg in rtt_thresholds.items():
        if not isinstance(cfg, dict):
            continue

        warn_at = cfg.get("warning")
        crit_at = cfg.get("critical")

        # An entry without any limit cannot raise an alert, so drop it.
        if warn_at is None and crit_at is None:
            logger.warning("No RTT thresholds defined for %s, skipping", host)
            continue

        # RTT metrics are addressed as "rtt.<hostname>".
        path = f"rtt.{host}"
        self.thresholds[path] = ThresholdConfig(
            metric_path=path,
            warning=warn_at,
            critical=crit_at,
            operator=cfg.get("operator", ">"),
            hysteresis=cfg.get("hysteresis", 0.1),  # 10% default
            enabled=cfg.get("enabled", True),
        )
        logger.debug(
            "Registered RTT threshold for %s: warn=%s ms, crit=%s ms",
            host,
            warn_at,
            crit_at,
        )
|
||||
|
||||
def check_value(
    self,
    host_name: str,
    metric_path: str,
    value: float,
    alert_states: Dict[str, AlertState],
) -> Optional[Tuple[AlertLevel, AlertLevel]]:
    """Evaluate one metric value against its configured threshold.

    If no threshold is registered for ``metric_path`` nothing happens.
    Otherwise the alert state for the metric is looked up in
    ``alert_states`` (and created there on first use), the new level is
    computed with hysteresis relative to the current level, and the
    state is updated. A level change triggers a notification; an
    unchanged non-OK level is handed to the re-notification check.

    Args:
        host_name: Name of the host the value belongs to.
        metric_path: Full metric path (e.g., "rtt.hostname").
        value: The metric value to check.
        alert_states: The host's alert-state dictionary, keyed by metric
            path; mutated in place when a new state is created.

    Returns:
        ``(old_level, new_level)`` when the alert level changed,
        otherwise ``None``.
    """
    if metric_path not in self.thresholds:
        return None
    rule = self.thresholds[metric_path]

    # Lazily create the per-metric alert state on first sight.
    if metric_path not in alert_states:
        alert_states[metric_path] = AlertState(metric_path)
    state = alert_states[metric_path]

    # Remember the level before the update so a transition can be reported.
    previous = state.level
    current = rule.evaluate_with_hysteresis(value, previous)

    changed = state.update(current, value)
    if changed:
        self._trigger_notification(host_name, metric_path, previous, current, value)
        return (previous, current)

    # Level unchanged: a still-active alert may be due for a reminder.
    if current != AlertLevel.OK:
        self._check_renotify(host_name, state, metric_path, value)

    return None
|
||||
|
||||
def check_plugin_data(
|
||||
self,
|
||||
host_name: str,
|
||||
@@ -476,18 +574,22 @@ class ThresholdChecker:
|
||||
"""Trigger a notification for an alert state change."""
|
||||
# Format message
|
||||
if new_level == AlertLevel.OK:
|
||||
message = f"RECOVERED: {host_name} - {metric_path} = {value} ({old_level.name} -> OK)"
|
||||
lvl = "RECOVERED"
|
||||
message = f"{metric_path} = {value} ({old_level.name} -> OK)"
|
||||
elif new_level == AlertLevel.WARNING:
|
||||
message = f"WARNING: {host_name} - {metric_path} = {value}"
|
||||
lvl = "WARNING"
|
||||
message = f"{metric_path} = {value}"
|
||||
elif new_level == AlertLevel.CRITICAL:
|
||||
message = f"CRITICAL: {host_name} - {metric_path} = {value}"
|
||||
lvl = "CRITICAL"
|
||||
message = f"{metric_path} = {value}"
|
||||
else:
|
||||
message = f"UNKNOWN: {host_name} - {metric_path} = {value}"
|
||||
lvl = "UNKNOWN"
|
||||
message = f"{metric_path} = {value}"
|
||||
|
||||
# Send notification
|
||||
if self.notification_callback is not None:
|
||||
try:
|
||||
self.notification_callback(message)
|
||||
self.notification_callback(f"{lvl}: {host_name} - {message}")
|
||||
logger.info("Notification sent: %s", message)
|
||||
except Exception as e:
|
||||
logger.error("Failed to send notification: %s", e)
|
||||
@@ -507,7 +609,7 @@ class ThresholdChecker:
|
||||
except Exception as e:
|
||||
logger.debug(f"Failed to log threshold event to journal: {e}")
|
||||
# Log to eventlog as well
|
||||
eventlog(host_name, message, service="threshold")
|
||||
eventlog(host_name, lvl, message, service="threshold")
|
||||
|
||||
def _check_renotify(
|
||||
self,
|
||||
|
||||
Reference in New Issue
Block a user