per-client threshold config

This commit is contained in:
Andreas Wrede
2026-04-01 15:22:42 -04:00
parent 079e84f729
commit 090d341244
7 changed files with 873 additions and 77 deletions
+330 -35
View File
@@ -55,6 +55,7 @@ class AlertState:
self.last_notification = None
self.threshold_value = None # The threshold value that triggered alert
self.operator = None # The comparison operator (>, <, >=, etc.)
self.formatted_message = None # Formatted display message for UI
def update(
self,
@@ -120,6 +121,8 @@ class AlertState:
result["threshold_value"] = self.threshold_value
if self.operator is not None:
result["operator"] = self.operator
if self.formatted_message is not None:
result["formatted_message"] = self.formatted_message
return result
@@ -285,7 +288,18 @@ class ThresholdChecker:
renotify_interval: Seconds between repeat notifications (default: 1 hour)
journal: Optional MessageJournal instance for logging threshold events
"""
self.thresholds = {} # {metric_path: ThresholdConfig}
# Named threshold configurations: {config_name: {metric_path: ThresholdConfig}}
self.threshold_configs = {}
# Single threshold set for backward compatibility: {metric_path: ThresholdConfig}
self.thresholds = {}
# Host to config name mapping: {host_name: config_name}
self.host_config_mapping = {}
# Default config name to use when no mapping exists
self.default_config = "default"
self.notification_callback = notification_callback
self.renotify_interval = renotify_interval
self.journal = journal
@@ -293,10 +307,84 @@ class ThresholdChecker:
# Parse configuration
self._parse_config(config)
logger.info("ThresholdChecker initialized with %d thresholds", len(self.thresholds))
total_thresholds = sum(len(cfg) for cfg in self.threshold_configs.values())
if total_thresholds == 0 and len(self.thresholds) > 0:
# Backward compatibility: using single threshold set
total_thresholds = len(self.thresholds)
logger.info("ThresholdChecker initialized with %d thresholds (legacy format)", total_thresholds)
else:
logger.info(
"ThresholdChecker initialized with %d named configurations (%d total thresholds)",
len(self.threshold_configs),
total_thresholds
)
def _parse_config(self, config: Dict[str, Any]):
    """Parse threshold configuration from YAML structure.

    Supports two formats:
    1. Legacy format with direct 'thresholds' section
    2. New format with 'threshold_configs' and 'host_threshold_mapping'

    When both sections are present, the 'threshold_configs' section wins and
    the legacy 'thresholds' section is ignored.

    Args:
        config: Parsed configuration mapping. May be None or empty, in
            which case nothing is parsed.
    """
    # A missing/empty configuration is not an error: the legacy parser
    # accepts it as well, so the dispatcher must not crash on `in None`.
    if not config:
        logger.info("No thresholds configured")
        return

    if "threshold_configs" in config:
        # New multi-config format
        self._parse_multi_config(config)
    elif "thresholds" in config:
        # Legacy single threshold configuration
        self._parse_legacy_config(config)
    else:
        logger.info("No thresholds configured")
def _parse_multi_config(self, config: Dict[str, Any]):
    """Parse multiple named threshold configurations.

    Reads three keys from ``config``:

    * ``threshold_configs``: mapping of config name to a mapping that holds
      a ``thresholds`` section (plugin name -> plugin thresholds).
    * ``host_threshold_mapping``: mapping of host name to config name.
    * ``default_threshold_config``: name of the config used for hosts that
      have no mapping; defaults to ``"default"``.

    Results are stored in ``self.threshold_configs``,
    ``self.host_config_mapping`` and ``self.default_config``.

    Args:
        config: Parsed configuration mapping containing the keys above.
    """
    threshold_configs = config.get("threshold_configs")

    if not threshold_configs:
        logger.info("No threshold configurations defined")
        return

    if not isinstance(threshold_configs, dict):
        logger.warning("'threshold_configs' must be a mapping, ignoring")
        return

    # Parse each named configuration
    for config_name, config_data in threshold_configs.items():
        if not isinstance(config_data, dict):
            logger.warning("Invalid threshold config '%s', skipping", config_name)
            continue

        # A YAML key with no value loads as None; treat that (and any other
        # non-mapping) like a missing section instead of failing on .items().
        thresholds_config = config_data.get("thresholds")
        if not isinstance(thresholds_config, dict):
            logger.warning("No thresholds in config '%s', skipping", config_name)
            continue

        logger.info("Parsing threshold configuration: %s", config_name)
        parsed = self.threshold_configs[config_name] = {}

        for plugin_name, plugin_thresholds in thresholds_config.items():
            if not isinstance(plugin_thresholds, dict):
                continue

            self._parse_plugin_thresholds(
                plugin_name,
                plugin_thresholds,
                target_dict=parsed
            )

    # Parse host to config mapping. An empty YAML key yields None, which
    # would break both len() below and later .get() lookups.
    host_mapping = config.get("host_threshold_mapping") or {}
    if not isinstance(host_mapping, dict):
        logger.warning("'host_threshold_mapping' must be a mapping, ignoring")
        host_mapping = {}
    self.host_config_mapping = host_mapping

    # Set default config (explicitly set, else first one alphabetically)
    self.default_config = config.get("default_threshold_config", "default")
    if self.default_config not in self.threshold_configs and self.threshold_configs:
        # key=str keeps the choice well-defined even if YAML produced
        # non-string config names.
        self.default_config = min(self.threshold_configs, key=str)
        logger.info("Using '%s' as default threshold config", self.default_config)

    logger.info(
        "Loaded %d threshold configurations with %d host mappings",
        len(self.threshold_configs),
        len(self.host_config_mapping)
    )
def _parse_legacy_config(self, config: Dict[str, Any]):
"""Parse legacy single threshold configuration for backward compatibility."""
if not config or "thresholds" not in config:
logger.info("No thresholds configured")
return
@@ -307,13 +395,27 @@ class ThresholdChecker:
if not isinstance(plugin_thresholds, dict):
continue
self._parse_plugin_thresholds(plugin_name, plugin_thresholds)
self._parse_plugin_thresholds(plugin_name, plugin_thresholds, target_dict=self.thresholds)
def _parse_plugin_thresholds(self, plugin_name: str, thresholds: Dict[str, Any]):
"""Parse thresholds for a specific plugin."""
def _parse_plugin_thresholds(
self,
plugin_name: str,
thresholds: Dict[str, Any],
target_dict: Optional[Dict[str, ThresholdConfig]] = None
):
"""Parse thresholds for a specific plugin.
Args:
plugin_name: Name of the plugin
thresholds: Threshold configuration dictionary
target_dict: Dictionary to store parsed thresholds (defaults to self.thresholds)
"""
if target_dict is None:
target_dict = self.thresholds
# Special handling for RTT thresholds (per-host)
if plugin_name == "rtt":
self._parse_rtt_thresholds(thresholds)
self._parse_rtt_thresholds(thresholds, target_dict)
return
for metric_name, threshold_config in thresholds.items():
@@ -322,7 +424,7 @@ class ThresholdChecker:
# Handle nested metrics (e.g., partitions./.percent)
if metric_name == "partitions":
self._parse_partition_thresholds(plugin_name, threshold_config)
self._parse_partition_thresholds(plugin_name, threshold_config, target_dict)
continue
metric_path = f"{plugin_name}.{metric_name}"
@@ -331,7 +433,7 @@ class ThresholdChecker:
warning = threshold_config.get("warning")
critical = threshold_config.get("critical")
operator = threshold_config.get("operator", ">")
display = threshold_config.get("display")
display = threshold_config.get("display", "(threshold: {op_symbol} {threshold_value})")
hysteresis = threshold_config.get("hysteresis", 0.1) # 10% default
enabled = threshold_config.get("enabled", True)
@@ -349,7 +451,7 @@ class ThresholdChecker:
display=display
)
self.thresholds[metric_path] = threshold
target_dict[metric_path] = threshold
logger.debug(
"Registered threshold for %s: warn=%s, crit=%s, op=%s",
metric_path,
@@ -358,8 +460,22 @@ class ThresholdChecker:
operator
)
def _parse_partition_thresholds(self, plugin_name: str, partitions: Dict[str, Any]):
"""Parse partition-specific thresholds for disk monitoring."""
def _parse_partition_thresholds(
self,
plugin_name: str,
partitions: Dict[str, Any],
target_dict: Optional[Dict[str, ThresholdConfig]] = None
):
"""Parse partition-specific thresholds for disk monitoring.
Args:
plugin_name: Name of the plugin
partitions: Partition threshold configuration
target_dict: Dictionary to store parsed thresholds
"""
if target_dict is None:
target_dict = self.thresholds
for partition, metrics in partitions.items():
if not isinstance(metrics, dict):
continue
@@ -390,9 +506,13 @@ class ThresholdChecker:
display=display
)
self.thresholds[metric_path] = threshold
target_dict[metric_path] = threshold
def _parse_rtt_thresholds(self, rtt_thresholds: Dict[str, Any]):
def _parse_rtt_thresholds(
self,
rtt_thresholds: Dict[str, Any],
target_dict: Optional[Dict[str, ThresholdConfig]] = None
):
"""Parse RTT thresholds (per-host network latency thresholds).
RTT thresholds are configured as:
@@ -401,7 +521,14 @@ class ThresholdChecker:
hostname1:
warning: 100.0 # ms
critical: 500.0 # ms
Args:
rtt_thresholds: RTT threshold configuration
target_dict: Dictionary to store parsed thresholds
"""
if target_dict is None:
target_dict = self.thresholds
for hostname, threshold_config in rtt_thresholds.items():
if not isinstance(threshold_config, dict):
continue
@@ -430,7 +557,7 @@ class ThresholdChecker:
display=display
)
self.thresholds[metric_path] = threshold
target_dict[metric_path] = threshold
logger.debug(
"Registered RTT threshold for %s: warn=%s ms, crit=%s ms",
hostname,
@@ -438,6 +565,37 @@ class ThresholdChecker:
critical
)
def get_thresholds_for_host(self, host_name: str) -> Dict[str, ThresholdConfig]:
    """Resolve the threshold set that applies to ``host_name``.

    Resolution order:
    1. No named configurations: the legacy ``self.thresholds`` set when it
       is non-empty, otherwise an empty dict.
    2. Named configurations: the config mapped to the host in
       ``self.host_config_mapping``, or ``self.default_config`` when the
       host has no mapping.
    3. Mapped config name unknown: a warning is logged and the default
       config is returned (empty dict if that is unknown too).

    Args:
        host_name: Name of the host

    Returns:
        Dictionary of thresholds for this host
    """
    named_sets = self.threshold_configs

    if not named_sets:
        # Legacy mode (single shared set) or nothing configured at all
        return self.thresholds if self.thresholds else {}

    # Multi-config mode: hosts without an explicit mapping use the default
    selected = self.host_config_mapping.get(host_name, self.default_config)
    if selected in named_sets:
        return named_sets[selected]

    logger.warning(
        "Threshold config '%s' not found for host '%s', using default '%s'",
        selected,
        host_name,
        self.default_config
    )
    return named_sets.get(self.default_config, {})
def check_value(
self,
host_name: str,
@@ -457,10 +615,13 @@ class ThresholdChecker:
Returns:
Tuple of (old_level, new_level) if state changed, None otherwise
"""
if metric_path not in self.thresholds:
# Get host-specific thresholds
thresholds = self.get_thresholds_for_host(host_name)
if metric_path not in thresholds:
return None
threshold = self.thresholds[metric_path]
threshold = thresholds[metric_path]
# Get or create alert state
if metric_path not in alert_states:
@@ -484,14 +645,17 @@ class ThresholdChecker:
# Update state and check for changes
old_level = alert_state.level
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
self._trigger_notification(host_name, metric_path, old_level, new_level, value, threshold)
# For check_value, we don't have full plugin data, pass None
lvl, message, formatted_msg = self._trigger_notification(host_name, metric_path, old_level, new_level, value, threshold, None)
# Update alert state with formatted message
alert_state.formatted_message = formatted_msg
self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value)
return (old_level, new_level)
elif new_level != AlertLevel.OK:
# Check if we should re-notify
self._check_renotify(host_name, alert_state, metric_path, value, threshold)
self._check_renotify(host_name, alert_state, metric_path, value, threshold, None)
return None
def check_plugin_data(
self,
host_name: str,
@@ -513,14 +677,17 @@ class ThresholdChecker:
"""
state_changes = []
# Get host-specific thresholds
thresholds = self.get_thresholds_for_host(host_name)
# Check flat metrics
for metric_name, value in data.items():
metric_path = f"{plugin_name}.{metric_name}"
if metric_path not in self.thresholds:
if metric_path not in thresholds:
continue
threshold = self.thresholds[metric_path]
threshold = thresholds[metric_path]
# Get or create alert state
if metric_path not in alert_states:
@@ -545,10 +712,13 @@ class ThresholdChecker:
old_level = alert_state.level
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
state_changes.append((metric_path, old_level, new_level, value))
self._trigger_notification(host_name, metric_path, old_level, new_level, value, threshold)
lvl, message, formatted_msg = self._trigger_notification(host_name, metric_path, old_level, new_level, value, threshold, data)
# Update alert state with formatted message
alert_state.formatted_message = formatted_msg
self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value)
elif new_level != AlertLevel.OK:
# Check if we should re-notify
self._check_renotify(host_name, alert_state, metric_path, value, threshold)
self._check_renotify(host_name, alert_state, metric_path, value, threshold, data)
# Check nested metrics (e.g., partition data in disk_monitor)
self._check_nested_metrics(
@@ -570,6 +740,9 @@ class ThresholdChecker:
state_changes: list,
):
"""Check nested metrics like partition-specific thresholds."""
# Get host-specific thresholds
thresholds = self.get_thresholds_for_host(host_name)
# Look for partition data in disk_monitor
if plugin_name == "disk_monitor" and "partitions" in data:
partitions = data["partitions"]
@@ -583,10 +756,10 @@ class ThresholdChecker:
for metric_name, value in metrics.items():
metric_path = f"{plugin_name}.{partition}.{metric_name}"
if metric_path not in self.thresholds:
if metric_path not in thresholds:
continue
threshold = self.thresholds[metric_path]
threshold = thresholds[metric_path]
if metric_path not in alert_states:
alert_states[metric_path] = AlertState(metric_path)
@@ -608,16 +781,20 @@ class ThresholdChecker:
old_level = alert_state.level
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
state_changes.append((metric_path, old_level, new_level, value))
self._trigger_notification(
lvl, message, formatted_msg = self._trigger_notification(
host_name,
metric_path,
old_level,
new_level,
value,
threshold
threshold,
data # Pass full plugin data for format string
)
# Update alert state with formatted message
alert_state.formatted_message = formatted_msg
self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value)
elif new_level != AlertLevel.OK:
self._check_renotify(host_name, alert_state, metric_path, value, threshold)
self._check_renotify(host_name, alert_state, metric_path, value, threshold, data)
def _trigger_notification(
self,
@@ -627,8 +804,19 @@ class ThresholdChecker:
new_level: AlertLevel,
value: Any,
threshold: ThresholdConfig,
plugin_data: Optional[Dict[str, Any]] = None,
):
"""Trigger a notification for an alert state change."""
"""Trigger a notification for an alert state change.
Args:
host_name: Name of the host
metric_path: Full metric path
old_level: Previous alert level
new_level: New alert level
value: Current metric value
threshold: Threshold configuration
plugin_data: Optional dictionary of all plugin data fields for format string
"""
# Determine which threshold was exceeded
threshold_value = None
if new_level == AlertLevel.CRITICAL and threshold.critical is not None:
@@ -646,20 +834,59 @@ class ThresholdChecker:
elif new_level == AlertLevel.WARNING:
lvl = "WARNING"
if threshold_value is not None:
message = f"{metric_path} = {value} (threshold: {op_symbol} {threshold_value})"
# Use display format string
threshold_info = self._format_display(
threshold.display,
value=value,
threshold_value=threshold_value,
op_symbol=op_symbol,
plugin_data=plugin_data
)
message = f"{metric_path} = {value} {threshold_info}"
else:
message = f"{metric_path} = {value}"
elif new_level == AlertLevel.CRITICAL:
lvl = "CRITICAL"
if threshold_value is not None:
message = f"{metric_path} = {value} (threshold: {op_symbol} {threshold_value})"
# Use display format string
threshold_info = self._format_display(
threshold.display,
value=value,
threshold_value=threshold_value,
op_symbol=op_symbol,
plugin_data=plugin_data
)
message = f"{metric_path} = {value} {threshold_info}"
else:
message = f"{metric_path} = {value}"
else:
lvl = "UNKNOWN"
message = f"{metric_path} = {value}"
# Send notification
# Return the formatted threshold info for storing in AlertState
formatted_threshold_msg = None
if threshold_value is not None and new_level != AlertLevel.OK:
formatted_threshold_msg = self._format_display(
threshold.display,
value=value,
threshold_value=threshold_value,
op_symbol=op_symbol,
plugin_data=plugin_data
)
return lvl, message, formatted_threshold_msg
def _send_notification(
self,
host_name: str,
lvl: str,
message: str,
metric_path: str,
old_level: AlertLevel,
new_level: AlertLevel,
value: Any,
):
"""Send notification and log to journal/eventlog."""
if self.notification_callback is not None:
try:
self.notification_callback(f"{lvl}: {host_name} - {message}")
@@ -684,6 +911,56 @@ class ThresholdChecker:
# Log to eventlog as well
eventlog(host_name, lvl, message, service="threshold")
def _format_display(
    self,
    display_format: str,
    value: Any,
    threshold_value: float,
    op_symbol: str,
    plugin_data: Optional[Dict[str, Any]] = None,
) -> str:
    """Format the display string using available data.

    The template may reference ``{value}``, ``{threshold_value}``,
    ``{op_symbol}`` and any key of ``plugin_data``. The three standard
    variables always take precedence over plugin data fields of the same
    name. Formatting never raises: on any failure the default text
    ``(threshold: <op_symbol> <threshold_value>)`` is returned.

    Args:
        display_format: Format string from threshold config. ``None`` means
            "no custom format" and yields the default text.
        value: Current metric value
        threshold_value: Threshold value that was exceeded
        op_symbol: Comparison operator symbol
        plugin_data: Optional dictionary of plugin data fields

    Returns:
        Formatted display string
    """
    fallback = f"(threshold: {op_symbol} {threshold_value})"

    # No template configured is a normal situation, not a formatting error.
    if display_format is None:
        return fallback

    # Plugin fields go in first so they cannot clobber the standard
    # variables (a plugin reporting its own "value" field must not change
    # what {value} means in the template).
    format_context = dict(plugin_data) if plugin_data else {}
    format_context.update(
        value=value,
        threshold_value=threshold_value,
        op_symbol=op_symbol,
    )

    try:
        # format_map, unlike format(**ctx), accepts non-string keys in the
        # mapping, so odd plugin data cannot break a valid template.
        return display_format.format_map(format_context)
    except KeyError as e:
        logger.warning(
            "Missing format variable in display string '%s': %s",
            display_format,
            e
        )
        # Fallback to default format
        return fallback
    except Exception as e:
        # Deliberately broad: templates come from configuration and a bad
        # one must never prevent the alert itself from being delivered.
        logger.error(
            "Error formatting display string '%s': %s",
            display_format,
            e
        )
        return fallback
def _check_renotify(
self,
host_name: str,
@@ -691,8 +968,18 @@ class ThresholdChecker:
metric_path: str,
value: Any,
threshold: ThresholdConfig,
plugin_data: Optional[Dict[str, Any]] = None,
):
"""Check if we should send a repeat notification."""
"""Check if we should send a repeat notification.
Args:
host_name: Name of the host
alert_state: Current alert state
metric_path: Full metric path
value: Current metric value
threshold: Threshold configuration
plugin_data: Optional dictionary of all plugin data fields
"""
if alert_state.level == AlertLevel.OK:
return
@@ -718,7 +1005,15 @@ class ThresholdChecker:
# Time to re-notify
if threshold_value is not None:
message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (threshold: {op_symbol} {threshold_value}, ongoing for {int(now - alert_state.since)}s)"
# Use display format string
threshold_info = self._format_display(
threshold.display,
value=value,
threshold_value=threshold_value,
op_symbol=op_symbol,
plugin_data=plugin_data
)
message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} {threshold_info}, ongoing for {int(now - alert_state.since)}s"
else:
message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (ongoing for {int(now - alert_state.since)}s)"