Move threshold to server, move eventlog to notify
This commit is contained in:
@@ -1,579 +0,0 @@
|
||||
"""
|
||||
Threshold checking and alerting for plugin metrics.
|
||||
|
||||
This module provides a flexible threshold checking system that:
|
||||
- Evaluates plugin metrics against configured warning/critical thresholds
|
||||
- Tracks alert states per host and metric
|
||||
- Prevents alert flapping with hysteresis
|
||||
- Triggers notifications only on state changes
|
||||
- Supports multiple comparison operators
|
||||
"""
|
||||
|
||||
import logging
|
||||
import time
|
||||
from enum import Enum
|
||||
from typing import Dict, Any, Optional, Tuple, Callable
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AlertLevel(Enum):
    """Severity assigned to a metric after a threshold check.

    The integer values are significant: ThresholdConfig.evaluate_with_hysteresis
    compares them numerically to decide whether a new level is lower than the
    current one, so members must not be renumbered.
    """

    # Metric is within its configured limits (or the threshold is disabled).
    OK = 0

    # Warning threshold matched.
    WARNING = 1

    # Critical threshold matched.
    CRITICAL = 2

    # The metric value could not be converted to a number.
    UNKNOWN = 3
|
||||
|
||||
|
||||
class ComparisonOperator(Enum):
    """Comparison symbols accepted in a threshold's ``operator`` setting.

    Each member's value is the literal symbol, so ``ComparisonOperator(">")``
    looks a member up directly from configuration text.
    """

    # value > threshold
    GT = ">"

    # value >= threshold
    GTE = ">="

    # value < threshold
    LT = "<"

    # value <= threshold
    LTE = "<="

    # value == threshold
    EQ = "=="

    # value != threshold
    NEQ = "!="
|
||||
|
||||
|
||||
class AlertState:
    """Mutable alert bookkeeping for one metric.

    Attributes:
        metric_path: Full path to the metric (e.g., "cpu_monitor.cpu_percent").
        level: Current AlertLevel; starts at OK.
        since: ``time.time()`` timestamp of the last level change.
        last_value: Most recent value passed to update(), or None.
        last_check: ``time.time()`` timestamp of the last update() call.
        notification_count: Counter reset to 0 on every level change.
        last_notification: Timestamp of the last repeat-notification
            bookkeeping, or None when none has been recorded for the
            current level.
    """

    def __init__(self, metric_path: str):
        """
        Initialize alert state.

        Args:
            metric_path: Full path to metric (e.g., "cpu_monitor.cpu_percent")
        """
        # One clock read so `since` and `last_check` start out identical.
        now = time.time()

        self.metric_path = metric_path
        self.level = AlertLevel.OK
        self.since = now
        self.last_value = None
        self.last_check = now
        self.notification_count = 0
        self.last_notification = None

    def update(self, level: AlertLevel, value: Any) -> bool:
        """
        Record a new check result.

        Args:
            level: New alert level
            value: Current metric value

        Returns:
            True if the level changed (notification needed), False otherwise
        """
        now = time.time()
        self.last_check = now
        self.last_value = value

        if level == self.level:
            return False

        logger.info(
            "Alert state change for %s: %s -> %s (value: %s)",
            self.metric_path,
            self.level.name,
            level.name,
            value,
        )
        self.level = level
        self.since = now
        # Restart repeat-notification bookkeeping for the new level. Leaving a
        # timestamp from a previous alert episode in place would make the next
        # elapsed-time check see an interval that already expired and fire a
        # reminder right after the state-change notification.
        self.notification_count = 0
        self.last_notification = None
        return True

    def to_dict(self) -> dict:
        """Convert alert state to dictionary for serialization."""
        return {
            "metric_path": self.metric_path,
            "level": self.level.name,
            "since": self.since,
            "last_value": self.last_value,
            "last_check": self.last_check,
            "notification_count": self.notification_count,
        }
|
||||
|
||||
|
||||
class ThresholdConfig:
    """Warning/critical limits and comparison rule for a single metric.

    Attributes:
        metric_path: Full path to the metric (e.g., "cpu_monitor.cpu_percent").
        warning: Warning threshold value, or None if not configured.
        critical: Critical threshold value, or None if not configured.
        operator: ComparisonOperator used for both thresholds.
        hysteresis: Fraction of the threshold a value must recover by before
            the alert level is lowered.
        enabled: When False, evaluation always yields AlertLevel.OK.
    """

    # Absolute tolerance used for the == and != comparisons on floats.
    _EQ_TOLERANCE = 1e-9

    # Comparison implementation for each supported operator.
    _COMPARATORS = {
        ComparisonOperator.GT: lambda value, limit: value > limit,
        ComparisonOperator.GTE: lambda value, limit: value >= limit,
        ComparisonOperator.LT: lambda value, limit: value < limit,
        ComparisonOperator.LTE: lambda value, limit: value <= limit,
        ComparisonOperator.EQ: lambda value, limit: abs(value - limit) < 1e-9,
        ComparisonOperator.NEQ: lambda value, limit: abs(value - limit) >= 1e-9,
    }

    def __init__(
        self,
        metric_path: str,
        warning: Optional[float] = None,
        critical: Optional[float] = None,
        operator: str = ">",
        hysteresis: float = 0.0,
        enabled: bool = True,
    ):
        """
        Initialize threshold configuration.

        Args:
            metric_path: Full path to metric (e.g., "cpu_monitor.cpu_percent")
            warning: Warning threshold value
            critical: Critical threshold value
            operator: Comparison operator (>, >=, <, <=, ==, !=); an
                unrecognized symbol is logged and replaced by ">"
            hysteresis: Hysteresis percentage to prevent flapping (0.0-1.0)
            enabled: Whether this threshold is enabled
        """
        self.metric_path = metric_path
        self.warning = warning
        self.critical = critical
        self.enabled = enabled
        self.hysteresis = hysteresis

        try:
            self.operator = ComparisonOperator(operator)
        except ValueError:
            logger.warning(
                "Invalid operator '%s' for %s, using '>' as default",
                operator,
                metric_path,
            )
            self.operator = ComparisonOperator.GT

    def evaluate(self, value: float) -> AlertLevel:
        """
        Evaluate a value against this threshold.

        Args:
            value: Metric value to check; anything ``float()`` accepts

        Returns:
            OK when the threshold is disabled or nothing matches, CRITICAL or
            WARNING when the respective threshold matches (critical is tested
            first), UNKNOWN when the value cannot be converted to float.
        """
        if not self.enabled:
            return AlertLevel.OK

        try:
            numeric = float(value)
        except (TypeError, ValueError):
            logger.warning("Cannot convert value %s to float for %s", value, self.metric_path)
            return AlertLevel.UNKNOWN

        if self.critical is not None and self._compare(numeric, self.critical):
            return AlertLevel.CRITICAL

        if self.warning is not None and self._compare(numeric, self.warning):
            return AlertLevel.WARNING

        return AlertLevel.OK

    def evaluate_with_hysteresis(
        self,
        value: float,
        current_level: AlertLevel
    ) -> AlertLevel:
        """
        Evaluate with hysteresis to prevent flapping.

        Hysteresis only damps recoveries: when the freshly evaluated level is
        lower than ``current_level``, the value must clear the threshold of
        the current level by ``abs(threshold * hysteresis)`` before the lower
        level is reported. It is applied for the >, >=, <, <= operators only.

        Args:
            value: Current metric value
            current_level: Current alert level

        Returns:
            New alert level considering hysteresis
        """
        new_level = self.evaluate(value)

        # A disabled threshold must not keep an old alert alive, and a missing
        # or zero hysteresis (including None from an empty config entry) means
        # there is no band to apply.
        if not self.enabled or not self.hysteresis:
            return new_level

        # Escalations and unchanged levels are reported immediately.
        if new_level.value >= current_level.value:
            return new_level

        if current_level == AlertLevel.CRITICAL and self.critical is not None:
            threshold = self.critical
        elif current_level == AlertLevel.WARNING and self.warning is not None:
            threshold = self.warning
        else:
            return new_level

        # evaluate() compared float(value); compare the same number here so a
        # numeric string does not raise TypeError. The conversion cannot fail:
        # a failed conversion yields UNKNOWN, which is never lower than
        # current_level and therefore returned above.
        numeric = float(value)
        margin = abs(threshold * self.hysteresis)

        if self.operator in (ComparisonOperator.GT, ComparisonOperator.GTE):
            # Value must drop below the threshold by the margin to recover.
            if numeric >= threshold - margin:
                return current_level
        elif self.operator in (ComparisonOperator.LT, ComparisonOperator.LTE):
            # Value must rise above the threshold by the margin to recover.
            if numeric <= threshold + margin:
                return current_level

        return new_level

    def _compare(self, value: float, threshold: float) -> bool:
        """Perform comparison based on operator; False for an unknown operator."""
        comparator = self._COMPARATORS.get(self.operator)
        if comparator is None:
            return False
        return comparator(value, threshold)
|
||||
|
||||
|
||||
class ThresholdChecker:
    """Main threshold checking and alerting system.

    Holds the configured ThresholdConfig objects keyed by metric path and
    evaluates plugin data against them. Alert state is not stored here: the
    caller passes in a per-host ``alert_states`` dictionary, which this class
    reads and mutates.
    """

    def __init__(
        self,
        config: Dict[str, Any],
        notification_callback: Optional[Callable] = None,
        renotify_interval: int = 3600,
        journal: Optional[Any] = None,
    ):
        """
        Initialize threshold checker.

        Args:
            config: Threshold configuration dictionary from YAML
            notification_callback: Function called with one message string
                for each notification
            renotify_interval: Seconds between repeat notifications (default: 1 hour)
            journal: Optional object whose ``log_threshold_event(...)``
                coroutine is scheduled for every state change
        """
        self.thresholds = {}  # {metric_path: ThresholdConfig}
        self.notification_callback = notification_callback
        self.renotify_interval = renotify_interval
        self.journal = journal

        # Strong references to in-flight journal tasks; the event loop itself
        # only keeps weak references, so an unreferenced task may be
        # garbage-collected before it finishes.
        self._journal_tasks = set()

        self._parse_config(config)

        logger.info("ThresholdChecker initialized with %d thresholds", len(self.thresholds))

    def _parse_config(self, config: Dict[str, Any]):
        """Parse threshold configuration from YAML structure."""
        if not config or "thresholds" not in config:
            logger.info("No thresholds configured")
            return

        thresholds_config = config["thresholds"]

        # An empty "thresholds:" key loads as None; treat any non-mapping the
        # same as a missing section instead of failing on .items().
        if not isinstance(thresholds_config, dict):
            logger.info("No thresholds configured")
            return

        for plugin_name, plugin_thresholds in thresholds_config.items():
            if not isinstance(plugin_thresholds, dict):
                continue

            self._parse_plugin_thresholds(plugin_name, plugin_thresholds)

    def _parse_plugin_thresholds(self, plugin_name: str, thresholds: Dict[str, Any]):
        """Parse thresholds for a specific plugin."""
        for metric_name, threshold_config in thresholds.items():
            if not isinstance(threshold_config, dict):
                continue

            # Handle nested metrics (e.g., partitions./.percent)
            if metric_name == "partitions":
                self._parse_partition_thresholds(plugin_name, threshold_config)
                continue

            self._register_threshold(f"{plugin_name}.{metric_name}", threshold_config)

    def _parse_partition_thresholds(self, plugin_name: str, partitions: Dict[str, Any]):
        """Parse partition-specific thresholds for disk monitoring."""
        for partition, metrics in partitions.items():
            if not isinstance(metrics, dict):
                continue

            for metric_name, threshold_config in metrics.items():
                if not isinstance(threshold_config, dict):
                    continue

                # Create metric path like "disk_monitor./dev/sda1.percent"
                self._register_threshold(
                    f"{plugin_name}.{partition}.{metric_name}",
                    threshold_config,
                )

    def _register_threshold(self, metric_path: str, threshold_config: Dict[str, Any]):
        """Build a ThresholdConfig from one config mapping and store it by path."""
        warning = threshold_config.get("warning")
        critical = threshold_config.get("critical")

        if warning is None and critical is None:
            logger.warning("No thresholds defined for %s, skipping", metric_path)
            return

        operator = threshold_config.get("operator", ">")

        self.thresholds[metric_path] = ThresholdConfig(
            metric_path=metric_path,
            warning=warning,
            critical=critical,
            operator=operator,
            hysteresis=threshold_config.get("hysteresis", 0.1),  # 10% default
            enabled=threshold_config.get("enabled", True),
        )
        logger.debug(
            "Registered threshold for %s: warn=%s, crit=%s, op=%s",
            metric_path,
            warning,
            critical,
            operator,
        )

    def check_plugin_data(
        self,
        host_name: str,
        plugin_name: str,
        data: Dict[str, Any],
        alert_states: Dict[str, AlertState],
    ) -> list:
        """
        Check plugin data against configured thresholds.

        Args:
            host_name: Name of the host
            plugin_name: Name of the plugin
            data: Plugin data dictionary
            alert_states: Host's alert_states dictionary; missing entries are
                created and existing ones are updated in place

        Returns:
            List of (metric_path, old_level, new_level, value) tuples for state changes
        """
        state_changes = []

        # A plugin that reported nothing usable has no metrics to evaluate.
        if not isinstance(data, dict):
            logger.debug("Ignoring non-dict data from plugin %s on %s", plugin_name, host_name)
            return state_changes

        # Check flat metrics
        for metric_name, value in data.items():
            self._check_metric(
                host_name,
                f"{plugin_name}.{metric_name}",
                value,
                alert_states,
                state_changes,
            )

        # Check nested metrics (e.g., partition data in disk_monitor)
        self._check_nested_metrics(
            host_name,
            plugin_name,
            data,
            alert_states,
            state_changes,
        )

        return state_changes

    def _check_nested_metrics(
        self,
        host_name: str,
        plugin_name: str,
        data: Dict[str, Any],
        alert_states: Dict[str, AlertState],
        state_changes: list,
    ):
        """Check nested metrics like partition-specific thresholds.

        Any plugin whose data carries a "partitions" mapping is checked; the
        configuration parser registers "partitions" thresholds regardless of
        plugin name, and paths without a registered threshold are skipped.
        """
        partitions = data.get("partitions") if isinstance(data, dict) else None
        if not isinstance(partitions, dict):
            return

        for partition, metrics in partitions.items():
            if not isinstance(metrics, dict):
                continue

            for metric_name, value in metrics.items():
                self._check_metric(
                    host_name,
                    f"{plugin_name}.{partition}.{metric_name}",
                    value,
                    alert_states,
                    state_changes,
                )

    def _check_metric(
        self,
        host_name: str,
        metric_path: str,
        value: Any,
        alert_states: Dict[str, AlertState],
        state_changes: list,
    ):
        """Evaluate one metric value, update its state and notify as needed."""
        threshold = self.thresholds.get(metric_path)
        if threshold is None:
            return

        # Get or create alert state
        alert_state = alert_states.get(metric_path)
        if alert_state is None:
            alert_state = AlertState(metric_path)
            alert_states[metric_path] = alert_state

        old_level = alert_state.level
        new_level = threshold.evaluate_with_hysteresis(value, old_level)

        if alert_state.update(new_level, value):
            # Forget the reminder timestamp of the previous level; otherwise
            # the next check would measure the renotify interval from an old
            # alert episode and send a reminder straight away.
            alert_state.last_notification = None
            state_changes.append((metric_path, old_level, new_level, value))
            self._trigger_notification(host_name, metric_path, old_level, new_level, value)
        elif new_level != AlertLevel.OK:
            # Level unchanged and still alerting: maybe send a reminder.
            self._check_renotify(host_name, alert_state, metric_path, value)

    def _trigger_notification(
        self,
        host_name: str,
        metric_path: str,
        old_level: AlertLevel,
        new_level: AlertLevel,
        value: Any,
    ):
        """Trigger a notification for an alert state change.

        Sends the formatted message through the notification callback (if
        set) and schedules a journal entry (if a journal is set). Failures in
        either are logged and never propagated to the caller.
        """
        # Format message
        if new_level == AlertLevel.OK:
            message = f"RECOVERED: {host_name} - {metric_path} = {value} ({old_level.name} -> OK)"
        else:
            if new_level in (AlertLevel.WARNING, AlertLevel.CRITICAL):
                label = new_level.name
            else:
                label = "UNKNOWN"
            message = f"{label}: {host_name} - {metric_path} = {value}"

        # Send notification
        if self.notification_callback is not None:
            try:
                self.notification_callback(message)
                logger.info("Notification sent: %s", message)
            except Exception as e:
                logger.error("Failed to send notification: %s", e)

        # Log to journal
        if self.journal is None:
            return

        import asyncio

        # The journal call is a coroutine and can only be scheduled on a
        # running loop. get_running_loop() raises RuntimeError when there is
        # none, unlike the deprecated implicit lookup of get_event_loop().
        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:
            logger.debug(
                "No running event loop; threshold event for %s not journaled",
                metric_path,
            )
            return

        try:
            task = loop.create_task(self.journal.log_threshold_event(
                host_name=host_name,
                metric_path=metric_path,
                old_level=old_level.name,
                new_level=new_level.name,
                value=value,
            ))
        except Exception as e:
            logger.debug("Failed to log threshold event to journal: %s", e)
            return

        self._journal_tasks.add(task)
        task.add_done_callback(self._on_journal_task_done)

    def _on_journal_task_done(self, task):
        """Release a finished journal task and log the error it raised, if any."""
        self._journal_tasks.discard(task)

        # exception() itself raises on a cancelled task, so check first.
        if task.cancelled():
            return

        error = task.exception()
        if error is not None:
            logger.debug("Failed to log threshold event to journal: %s", error)

    def _check_renotify(
        self,
        host_name: str,
        alert_state: AlertState,
        metric_path: str,
        value: Any,
    ):
        """Check if we should send a repeat notification."""
        if alert_state.level == AlertLevel.OK:
            return

        now = time.time()

        if alert_state.last_notification is None:
            # First notification already sent during state change; start the
            # reminder clock from this check.
            alert_state.last_notification = now
            alert_state.notification_count = 1
            return

        if (now - alert_state.last_notification) < self.renotify_interval:
            return

        if not self.notification_callback:
            return

        # Time to re-notify
        message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (ongoing for {int(now - alert_state.since)}s)"

        try:
            self.notification_callback(message)
            # Bookkeeping is only advanced after a successful send, so a
            # failing callback is retried on the next check.
            alert_state.last_notification = now
            alert_state.notification_count += 1
            logger.info("Re-notification sent: %s", message)
        except Exception as e:
            logger.error("Failed to send re-notification: %s", e)

    def get_active_alerts(self, alert_states: Dict[str, AlertState]) -> list:
        """
        Get all currently active (non-OK) alerts.

        Args:
            alert_states: Host's alert_states dictionary

        Returns:
            List of AlertState objects that are not OK
        """
        return [
            state for state in alert_states.values()
            if state.level != AlertLevel.OK
        ]

    def get_alert_summary(self, alert_states: Dict[str, AlertState]) -> Dict[str, int]:
        """
        Get summary counts of alert levels.

        Args:
            alert_states: Host's alert_states dictionary

        Returns:
            Dictionary with counts:
            {"ok": N, "warning": N, "critical": N, "unknown": N}
        """
        summary = {"ok": 0, "warning": 0, "critical": 0, "unknown": 0}
        key_for_level = {
            AlertLevel.OK: "ok",
            AlertLevel.WARNING: "warning",
            AlertLevel.CRITICAL: "critical",
            AlertLevel.UNKNOWN: "unknown",
        }

        for state in alert_states.values():
            key = key_for_level.get(state.level)
            if key is not None:
                summary[key] += 1

        return summary
|
||||
Reference in New Issue
Block a user