"""
Threshold checking and alerting for plugin metrics.

This module provides a flexible threshold checking system that:
- Evaluates plugin metrics against configured warning/critical thresholds
- Tracks alert states per host and metric
- Prevents alert flapping with hysteresis
- Triggers notifications on state changes, plus periodic reminders while an
  alert stays active
- Supports multiple comparison operators
"""

import asyncio
import logging
import math
import time
from enum import Enum
from typing import Dict, Any, Optional, Tuple, Callable

from . import notify as notify_mod

logger = logging.getLogger(__name__)
eventlog = notify_mod.eventlog

# Hysteresis fraction applied to thresholds parsed from configuration when the
# entry does not set one explicitly (10% of the threshold value).
_DEFAULT_CONFIG_HYSTERESIS = 0.1


class AlertLevel(Enum):
    """Alert severity levels.

    The numeric values are ordered by severity; the hysteresis logic compares
    them to decide whether a transition is a recovery.
    """

    OK = 0
    WARNING = 1
    CRITICAL = 2
    UNKNOWN = 3


class ComparisonOperator(Enum):
    """Supported comparison operators for threshold checks."""

    GT = ">"    # Greater than
    GTE = ">="  # Greater than or equal
    LT = "<"    # Less than
    LTE = "<="  # Less than or equal
    EQ = "=="   # Equal to
    NEQ = "!="  # Not equal to


class AlertState:
    """Represents the current alert state for a specific metric."""

    def __init__(self, metric_path: str):
        """
        Initialize alert state.

        Args:
            metric_path: Full path to metric (e.g., "cpu_monitor.cpu_percent")
        """
        self.metric_path = metric_path
        self.level = AlertLevel.OK
        self.since = time.time()
        self.last_value: Any = None
        self.last_check = time.time()
        self.notification_count = 0
        self.last_notification: Optional[float] = None

    def update(self, level: AlertLevel, value: Any) -> bool:
        """
        Update alert state.

        Args:
            level: New alert level
            value: Current metric value

        Returns:
            True if state changed (notification needed), False otherwise
        """
        now = time.time()
        self.last_check = now
        self.last_value = value

        if level == self.level:
            return False

        logger.info(
            "Alert state change for %s: %s -> %s (value: %s)",
            self.metric_path, self.level.name, level.name, value
        )
        self.level = level
        self.since = now
        # Restart reminder bookkeeping for the new level. Leaving a stale
        # last_notification from a previous alert episode would make the
        # next check fire a reminder immediately instead of waiting a full
        # renotify interval.
        self.notification_count = 0
        self.last_notification = None
        return True

    def to_dict(self) -> dict:
        """Convert alert state to dictionary for serialization."""
        return {
            "metric_path": self.metric_path,
            "level": self.level.name,
            "since": self.since,
            "last_value": self.last_value,
            "last_check": self.last_check,
            "notification_count": self.notification_count,
        }

    def __str__(self):
        return str(self.to_dict())


class ThresholdConfig:
    """Configuration for a single threshold check."""

    def __init__(
        self,
        metric_path: str,
        warning: Optional[float] = None,
        critical: Optional[float] = None,
        operator: str = ">",
        hysteresis: float = 0.0,
        enabled: bool = True,
    ):
        """
        Initialize threshold configuration.

        Args:
            metric_path: Full path to metric (e.g., "cpu_monitor.cpu_percent")
            warning: Warning threshold value
            critical: Critical threshold value
            operator: Comparison operator (>, >=, <, <=, ==, !=); an
                unrecognized operator falls back to ">" with a warning
            hysteresis: Fraction of the threshold (e.g. 0.1 for 10%) the
                value must recover past it before the alert level is lowered
            enabled: Whether this threshold is enabled
        """
        self.metric_path = metric_path
        self.warning = warning
        self.critical = critical
        self.enabled = enabled
        self.hysteresis = hysteresis

        # Parse operator
        try:
            self.operator = ComparisonOperator(operator)
        except ValueError:
            logger.warning(
                "Invalid operator '%s' for %s, using '>' as default",
                operator, metric_path
            )
            self.operator = ComparisonOperator.GT

    def evaluate(self, value: float) -> AlertLevel:
        """
        Evaluate a value against this threshold.

        Args:
            value: Metric value to check

        Returns:
            AlertLevel indicating the severity. A disabled threshold always
            yields OK; a value that cannot be converted to float, or that is
            NaN, yields UNKNOWN.
        """
        if not self.enabled:
            return AlertLevel.OK

        try:
            # Convert value to float for comparison
            value = float(value)
        except (TypeError, ValueError):
            logger.warning(
                "Cannot convert value %s to float for %s",
                value, self.metric_path
            )
            return AlertLevel.UNKNOWN

        # Every ordered comparison with NaN is False, so without this guard a
        # NaN reading would silently be reported as OK.
        if math.isnan(value):
            logger.warning(
                "Metric value for %s is NaN, cannot evaluate", self.metric_path
            )
            return AlertLevel.UNKNOWN

        # Check critical threshold first so it wins when both match
        if self.critical is not None and self._compare(value, self.critical):
            return AlertLevel.CRITICAL

        # Then check warning threshold
        if self.warning is not None and self._compare(value, self.warning):
            return AlertLevel.WARNING

        return AlertLevel.OK

    def evaluate_with_hysteresis(
        self,
        value: float,
        current_level: AlertLevel
    ) -> AlertLevel:
        """
        Evaluate with hysteresis to prevent flapping.

        Hysteresis is only applied when the alert would move to a lower
        severity, and only for the ordered operators (>, >=, <, <=).

        Args:
            value: Current metric value
            current_level: Current alert level

        Returns:
            New alert level considering hysteresis
        """
        new_level = self.evaluate(value)

        # If no hysteresis, return new level
        if self.hysteresis == 0.0:
            return new_level

        # Escalations and unchanged levels take effect immediately
        if new_level.value >= current_level.value:
            return new_level

        # Recovery: pick the threshold that put us in the current level
        if current_level == AlertLevel.CRITICAL and self.critical is not None:
            threshold = self.critical
        elif current_level == AlertLevel.WARNING and self.warning is not None:
            threshold = self.warning
        else:
            return new_level

        # evaluate() compared a float-coerced copy; coerce here as well so a
        # numeric string does not raise TypeError in the comparisons below.
        try:
            numeric_value = float(value)
        except (TypeError, ValueError):
            return new_level

        hysteresis_amount = abs(threshold * self.hysteresis)

        if self.operator in (ComparisonOperator.GT, ComparisonOperator.GTE):
            # For "greater than" thresholds, value must go below by hysteresis
            if numeric_value >= threshold - hysteresis_amount:
                # Not enough improvement, keep current level
                return current_level
        elif self.operator in (ComparisonOperator.LT, ComparisonOperator.LTE):
            # For "less than" thresholds, value must go above by hysteresis
            if numeric_value <= threshold + hysteresis_amount:
                # Not enough improvement, keep current level
                return current_level

        return new_level

    def _compare(self, value: float, threshold: float) -> bool:
        """Perform comparison based on operator."""
        op = self.operator
        if op == ComparisonOperator.GT:
            return value > threshold
        if op == ComparisonOperator.GTE:
            return value >= threshold
        if op == ComparisonOperator.LT:
            return value < threshold
        if op == ComparisonOperator.LTE:
            return value <= threshold
        if op == ComparisonOperator.EQ:
            return abs(value - threshold) < 1e-9  # Float comparison
        if op == ComparisonOperator.NEQ:
            return abs(value - threshold) >= 1e-9
        return False


class ThresholdChecker:
    """Main threshold checking and alerting system."""

    def __init__(
        self,
        config: Dict[str, Any],
        notification_callback: Optional[Callable] = None,
        renotify_interval: int = 3600,
        journal: Optional[Any] = None,
    ):
        """
        Initialize threshold checker.

        Args:
            config: Threshold configuration dictionary from YAML
            notification_callback: Function to call for notifications; it is
                called with a single formatted message string
            renotify_interval: Seconds between repeat notifications
                (default: 1 hour)
            journal: Optional MessageJournal instance for logging threshold
                events
        """
        self.thresholds: Dict[str, ThresholdConfig] = {}
        self.notification_callback = notification_callback
        self.renotify_interval = renotify_interval
        self.journal = journal
        # Strong references to in-flight journal tasks; the event loop only
        # keeps weak references, so an unreferenced task may be collected
        # before it finishes.
        self._journal_tasks: set = set()

        # Parse configuration
        self._parse_config(config)

        logger.info(
            "ThresholdChecker initialized with %d thresholds",
            len(self.thresholds)
        )

    def _parse_config(self, config: Dict[str, Any]):
        """Parse threshold configuration from YAML structure."""
        if not config or "thresholds" not in config:
            logger.info("No thresholds configured")
            return

        for plugin_name, plugin_thresholds in config["thresholds"].items():
            if not isinstance(plugin_thresholds, dict):
                continue
            self._parse_plugin_thresholds(plugin_name, plugin_thresholds)

    def _register_threshold(
        self,
        metric_path: str,
        threshold_config: Dict[str, Any],
    ) -> Optional[ThresholdConfig]:
        """Build a ThresholdConfig from one config entry and store it.

        Returns the registered ThresholdConfig, or None (registering nothing)
        when the entry defines neither a warning nor a critical value.
        """
        warning = threshold_config.get("warning")
        critical = threshold_config.get("critical")
        if warning is None and critical is None:
            return None

        threshold = ThresholdConfig(
            metric_path=metric_path,
            warning=warning,
            critical=critical,
            operator=threshold_config.get("operator", ">"),
            hysteresis=threshold_config.get(
                "hysteresis", _DEFAULT_CONFIG_HYSTERESIS
            ),
            enabled=threshold_config.get("enabled", True),
        )
        self.thresholds[metric_path] = threshold
        return threshold

    def _parse_plugin_thresholds(self, plugin_name: str, thresholds: Dict[str, Any]):
        """Parse thresholds for a specific plugin."""
        # Special handling for RTT thresholds (per-host)
        if plugin_name == "rtt":
            self._parse_rtt_thresholds(thresholds)
            return

        for metric_name, threshold_config in thresholds.items():
            if not isinstance(threshold_config, dict):
                continue

            # Handle nested metrics (e.g., partitions.<partition>.percent)
            if metric_name == "partitions":
                self._parse_partition_thresholds(plugin_name, threshold_config)
                continue

            metric_path = f"{plugin_name}.{metric_name}"
            threshold = self._register_threshold(metric_path, threshold_config)
            if threshold is None:
                logger.warning(
                    "No thresholds defined for %s, skipping", metric_path
                )
                continue

            logger.debug(
                "Registered threshold for %s: warn=%s, crit=%s, op=%s",
                metric_path, threshold.warning, threshold.critical,
                threshold_config.get("operator", ">")
            )

    def _parse_partition_thresholds(self, plugin_name: str, partitions: Dict[str, Any]):
        """Parse partition-specific thresholds for disk monitoring."""
        for partition, metrics in partitions.items():
            if not isinstance(metrics, dict):
                continue

            for metric_name, threshold_config in metrics.items():
                if not isinstance(threshold_config, dict):
                    continue

                # Create metric path like "disk_monitor./dev/sda1.percent";
                # entries without warning/critical are skipped silently.
                metric_path = f"{plugin_name}.{partition}.{metric_name}"
                self._register_threshold(metric_path, threshold_config)

    def _parse_rtt_thresholds(self, rtt_thresholds: Dict[str, Any]):
        """Parse RTT thresholds (per-host network latency thresholds).

        RTT thresholds are configured as:

            thresholds:
              rtt:
                hostname1:
                  warning: 100.0   # ms
                  critical: 500.0  # ms
        """
        for hostname, threshold_config in rtt_thresholds.items():
            if not isinstance(threshold_config, dict):
                continue

            # Metric path is "rtt.<hostname>"
            threshold = self._register_threshold(
                f"rtt.{hostname}", threshold_config
            )
            if threshold is None:
                logger.warning(
                    "No RTT thresholds defined for %s, skipping", hostname
                )
                continue

            logger.debug(
                "Registered RTT threshold for %s: warn=%s ms, crit=%s ms",
                hostname, threshold.warning, threshold.critical
            )

    def _check_metric(
        self,
        host_name: str,
        metric_path: str,
        value: Any,
        alert_states: Dict[str, AlertState],
    ) -> Optional[Tuple[AlertLevel, AlertLevel]]:
        """Evaluate one metric, update its state and send notifications.

        Returns (old_level, new_level) when the alert level changed, and None
        when it did not or when no threshold is configured for metric_path.
        """
        threshold = self.thresholds.get(metric_path)
        if threshold is None:
            return None

        # Get or create alert state
        if metric_path not in alert_states:
            alert_states[metric_path] = AlertState(metric_path)
        alert_state = alert_states[metric_path]

        # Capture the level before update() overwrites it
        old_level = alert_state.level
        new_level = threshold.evaluate_with_hysteresis(value, old_level)

        if alert_state.update(new_level, value):
            self._trigger_notification(
                host_name, metric_path, old_level, new_level, value
            )
            return (old_level, new_level)

        if new_level != AlertLevel.OK:
            # Level unchanged but still alerting: maybe send a reminder
            self._check_renotify(host_name, alert_state, metric_path, value)
        return None

    def check_value(
        self,
        host_name: str,
        metric_path: str,
        value: float,
        alert_states: Dict[str, AlertState],
    ) -> Optional[Tuple[AlertLevel, AlertLevel]]:
        """
        Check a single value against configured threshold.

        Args:
            host_name: Name of the host
            metric_path: Full metric path (e.g., "rtt.hostname")
            value: The metric value to check
            alert_states: Host's alert_states dictionary; a new AlertState is
                inserted for metric_path if none exists yet

        Returns:
            Tuple of (old_level, new_level) if state changed, None otherwise
        """
        return self._check_metric(host_name, metric_path, value, alert_states)

    def check_plugin_data(
        self,
        host_name: str,
        plugin_name: str,
        data: Dict[str, Any],
        alert_states: Dict[str, AlertState],
    ) -> list:
        """
        Check plugin data against configured thresholds.

        Args:
            host_name: Name of the host
            plugin_name: Name of the plugin
            data: Plugin data dictionary
            alert_states: Host's alert_states dictionary

        Returns:
            List of (metric_path, old_level, new_level, value) tuples for
            state changes
        """
        state_changes = []

        # Check flat metrics
        for metric_name, value in data.items():
            metric_path = f"{plugin_name}.{metric_name}"
            change = self._check_metric(
                host_name, metric_path, value, alert_states
            )
            if change is not None:
                old_level, new_level = change
                state_changes.append((metric_path, old_level, new_level, value))

        # Check nested metrics (e.g., partition data in disk_monitor)
        self._check_nested_metrics(
            host_name, plugin_name, data, alert_states, state_changes
        )

        return state_changes

    def _check_nested_metrics(
        self,
        host_name: str,
        plugin_name: str,
        data: Dict[str, Any],
        alert_states: Dict[str, AlertState],
        state_changes: list,
    ):
        """Check nested metrics like partition-specific thresholds.

        State changes are appended to state_changes in place.
        """
        # Only disk_monitor partition data is inspected here
        if plugin_name != "disk_monitor" or "partitions" not in data:
            return

        partitions = data["partitions"]
        if not isinstance(partitions, dict):
            return

        for partition, metrics in partitions.items():
            if not isinstance(metrics, dict):
                continue

            for metric_name, value in metrics.items():
                metric_path = f"{plugin_name}.{partition}.{metric_name}"
                change = self._check_metric(
                    host_name, metric_path, value, alert_states
                )
                if change is not None:
                    old_level, new_level = change
                    state_changes.append(
                        (metric_path, old_level, new_level, value)
                    )

    def _trigger_notification(
        self,
        host_name: str,
        metric_path: str,
        old_level: AlertLevel,
        new_level: AlertLevel,
        value: Any,
    ):
        """Trigger a notification for an alert state change.

        Sends the message through the notification callback (if any),
        schedules a journal entry (if a journal is configured) and writes to
        the event log. Callback and journal failures are logged, not raised.
        """
        # Format message
        if new_level == AlertLevel.OK:
            lvl = "RECOVERED"
            message = f"{metric_path} = {value} ({old_level.name} -> OK)"
        elif new_level == AlertLevel.WARNING:
            lvl = "WARNING"
            message = f"{metric_path} = {value}"
        elif new_level == AlertLevel.CRITICAL:
            lvl = "CRITICAL"
            message = f"{metric_path} = {value}"
        else:
            lvl = "UNKNOWN"
            message = f"{metric_path} = {value}"

        # Send notification
        if self.notification_callback is not None:
            try:
                self.notification_callback(f"{lvl}: {host_name} - {message}")
                logger.info("Notification sent: %s", message)
            except Exception as e:
                logger.error("Failed to send notification: %s", e)

        # Log to journal (best effort: never let it break alerting)
        if self.journal is not None:
            try:
                loop = asyncio.get_event_loop()
                task = loop.create_task(self.journal.log_threshold_event(
                    host_name=host_name,
                    metric_path=metric_path,
                    old_level=old_level.name,
                    new_level=new_level.name,
                    value=value,
                ))
                # Hold a reference until the task completes so it cannot be
                # garbage-collected mid-flight.
                self._journal_tasks.add(task)
                task.add_done_callback(self._journal_tasks.discard)
            except Exception as e:
                logger.debug("Failed to log threshold event to journal: %s", e)

        # Log to eventlog as well
        eventlog(host_name, lvl, message, service="threshold")

    def _check_renotify(
        self,
        host_name: str,
        alert_state: AlertState,
        metric_path: str,
        value: Any,
    ):
        """Check if we should send a repeat notification.

        The first call after a state change only starts the reminder clock;
        later calls send a reminder once renotify_interval seconds have
        passed since the last one.
        """
        if alert_state.level == AlertLevel.OK:
            return

        now = time.time()

        if alert_state.last_notification is None:
            # First notification already sent during state change
            alert_state.last_notification = now
            alert_state.notification_count = 1
            return

        if (now - alert_state.last_notification) < self.renotify_interval:
            return

        # Time to re-notify
        message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (ongoing for {int(now - alert_state.since)}s)"

        if self.notification_callback:
            try:
                self.notification_callback(message)
                # Only advance the clock when the reminder actually went out
                alert_state.last_notification = now
                alert_state.notification_count += 1
                logger.info("Re-notification sent: %s", message)
            except Exception as e:
                logger.error("Failed to send re-notification: %s", e)

    def get_active_alerts(self, alert_states: Dict[str, AlertState]) -> list:
        """
        Get all currently active (non-OK) alerts.

        Args:
            alert_states: Host's alert_states dictionary

        Returns:
            List of AlertState objects that are not OK
        """
        return [
            state for state in alert_states.values()
            if state.level != AlertLevel.OK
        ]

    def get_alert_summary(self, alert_states: Dict[str, AlertState]) -> Dict[str, int]:
        """
        Get summary counts of alert levels.

        Args:
            alert_states: Host's alert_states dictionary

        Returns:
            Dictionary with counts:
            {"ok": N, "warning": N, "critical": N, "unknown": N}
        """
        counts = {level: 0 for level in AlertLevel}
        for state in alert_states.values():
            if state.level in counts:
                counts[state.level] += 1
        return {level.name.lower(): count for level, count in counts.items()}