diff --git a/.hb.yaml b/.hb.yaml index 02591e2..ec54df5 100644 --- a/.hb.yaml +++ b/.hb.yaml @@ -50,43 +50,119 @@ journal_max_size: 104857600 # Max size (100MB default) journal_max_backups: 10 # Number of backups to keep thresholds: - cpu_monitor: - cpu_percent: - warning: 80.0 - critical: 90.0 - memory_monitor: - percent: - warning: 3.0 - critical: 95.0 - disk_monitor: - partitions: - /: - percent: - warning: 85.0 - critical: 90.0 - nagios_runner: - overall_status_code: - warning: 1 - critical: 2 - operator: ">=" - load_status: - warning: WARNING - critical: CRITICAL - operator: "==" - UPS_load: - warning: 70 - critical: 80 - operator: ">=" - UPS_status_code: - warning: 1 - critical: 2 - operator: ">=" - nextcloud_apps_status: - display: "{nextcloud_apps_output}" - warning: 1 - critical: 2 - operator: ">=" - rtt: - y: - warning: 30 - critical: 250.0 + default: + cpu_monitor: + cpu_percent: + warning: 80.0 + critical: 90.0 + memory_monitor: + percent: + warning: 3.0 + critical: 95.0 + disk_monitor: + partitions: + /: + percent: + warning: 85.0 + critical: 90.0 + rtt: + y: + warning: 30 + critical: 250.0 + + + freebsd_server: + cpu_monitor: + cpu_percent: + warning: 80.0 + critical: 90.0 + memory_monitor: + percent: + warning: 3.0 + critical: 95.0 + disk_monitor: + partitions: + /: + percent: + warning: 85.0 + critical: 90.0 + nagios_runner: + # overall_status_code: + # warning: 1 + # critical: 2 + # operator: ">=" + load_status: + warning: WARNING + critical: CRITICAL + operator: "==" + UPS_load: + display: "{ups_output}" + warning: 70 + critical: 80 + operator: ">=" + UPS_status_code: + display: "{ups_output}" + warning: 1 + critical: 2 + operator: ">=" + nextcloud_apps_status_code: + display: "{nextcloud_apps_output}" + warning: 1 + critical: 2 + operator: ">=" + rtt: + y: + warning: 30 + critical: 250.0 + + truenas_server: + cpu_monitor: + cpu_percent: + warning: 80.0 + critical: 90.0 + memory_monitor: + percent: + warning: 3.0 + critical: 95.0 + disk_monitor: + 
partitions: + /: + percent: + warning: 85.0 + critical: 90.0 + nagios_runner: + # overall_status_code: + # warning: 1 + # critical: 2 + # operator: ">=" + load_status: + warning: WARNING + critical: CRITICAL + operator: "==" + UPS_load: + display: "{ups_output}" + warning: 70 + critical: 80 + operator: ">=" + UPS_status_code: + display: "{ups_output}" + warning: 1 + critical: 2 + operator: ">=" + nextcloud_apps_status_code: + display: "{nextcloud_apps_output}" + warning: 1 + critical: 2 + operator: ">=" + rtt: + y: + warning: 30 + critical: 250.0 + + +host_threshold_mapping: + # Critical production servers + + wally: freebsd_server + eris: truenas_server + diff --git a/.hb.yaml.swp b/.hb.yaml.swp index 6dc3601..10da5a2 100644 Binary files a/.hb.yaml.swp and b/.hb.yaml.swp differ diff --git a/docs/THRESHOLD_ALERTING.md b/docs/THRESHOLD_ALERTING.md index 0ff0ef6..49b5be5 100644 --- a/docs/THRESHOLD_ALERTING.md +++ b/docs/THRESHOLD_ALERTING.md @@ -56,6 +56,7 @@ thresholds: critical: 90.0 operator: ">" hysteresis: 0.1 + display: "display format" enabled: true ``` @@ -82,6 +83,8 @@ Note: At least one of `warning` or `critical` must be specified. - Range: 0.0 to 1.0 - Prevents rapid state transitions when value hovers near threshold +- **display**: `str.format`-style template (not an f-string) for the threshold part of alert messages; may reference `{value}`, `{threshold_value}`, `{op_symbol}` and, when available, any field of the plugin data + - defaults to "(threshold: {op_symbol} {threshold_value})" - **enabled**: Whether this threshold is active (default: `true`) ### Comparison Operators @@ -740,3 +743,217 @@ Planned features: - [Message Journal Documentation](MESSAGE_JOURNAL.md) - Configuration examples: `hbd/config_thresholds_example.yaml` - Test suite: `test_threshold.py` + +## Multi-Threshold Configuration + +**New in version 2.0**: Support for multiple named threshold configurations with per-host mapping. 
+ +### Overview + +The multi-threshold feature allows you to: +- Define multiple sets of threshold configurations +- Map different hosts to different threshold sets +- Use different sensitivity levels for different environments +- Maintain a default configuration for unmapped hosts + +### Configuration Structure + +```yaml +# Optional: Set the default configuration name (defaults to "default") +default_threshold_config: "default" + +# Define multiple named threshold configurations +threshold_configs: + # Configuration name 1 + default: + thresholds: + # Standard threshold definitions + cpu_monitor: + cpu_percent: + warning: 80.0 + critical: 90.0 + + # Configuration name 2 + high_sensitivity: + thresholds: + cpu_monitor: + cpu_percent: + warning: 60.0 + critical: 75.0 + + # Configuration name 3 + low_sensitivity: + thresholds: + cpu_monitor: + cpu_percent: + warning: 90.0 + critical: 95.0 + +# Map specific hosts to specific configurations +host_threshold_mapping: + prod-web-01: high_sensitivity + prod-web-02: high_sensitivity + dev-server-01: low_sensitivity + # Unmapped hosts use default_threshold_config +``` + +### Use Cases + +#### 1. Environment-Based Thresholds + +Different thresholds for production vs. development: + +```yaml +threshold_configs: + production: + thresholds: + cpu_monitor: + cpu_percent: + warning: 70.0 # Alert earlier in production + critical: 85.0 + + development: + thresholds: + cpu_monitor: + cpu_percent: + warning: 90.0 # More relaxed for dev + critical: 98.0 + +host_threshold_mapping: + prod-web-01: production + prod-web-02: production + dev-web-01: development + dev-web-02: development +``` + +#### 2. 
Server Role-Based Thresholds + +Different thresholds based on server function: + +```yaml +threshold_configs: + webserver: + thresholds: + cpu_monitor: + cpu_percent: + warning: 80.0 + critical: 90.0 + + database: + thresholds: + cpu_monitor: + cpu_percent: + warning: 70.0 + critical: 85.0 + memory_monitor: + percent: + warning: 90.0 # Databases can use high memory + critical: 97.0 + disk_monitor: + partitions: + /var/lib/mysql: + percent: + warning: 75.0 + critical: 85.0 + + cache: + thresholds: + memory_monitor: + percent: + warning: 95.0 # Redis/Memcached can use very high memory + critical: 99.0 + +host_threshold_mapping: + web-01: webserver + web-02: webserver + db-01: database + db-02: database + redis-01: cache + memcached-01: cache +``` + +#### 3. Sensitivity Levels + +Different sensitivity for critical vs. non-critical systems: + +```yaml +threshold_configs: + critical: + thresholds: + disk_monitor: + partitions: + /: + percent: + warning: 70.0 # Very sensitive + critical: 80.0 + hysteresis: 0.15 + + standard: + thresholds: + disk_monitor: + partitions: + /: + percent: + warning: 85.0 + critical: 95.0 + hysteresis: 0.1 + + relaxed: + thresholds: + disk_monitor: + partitions: + /: + percent: + warning: 90.0 + critical: 98.0 + hysteresis: 0.05 + +host_threshold_mapping: + payment-gateway: critical + auth-server: critical + web-01: standard + web-02: standard + test-server: relaxed +``` + +### Backward Compatibility + +The legacy single threshold configuration is fully supported: + +```yaml +# Old format - still works +thresholds: + cpu_monitor: + cpu_percent: + warning: 80.0 + critical: 90.0 +``` + +This is equivalent to: + +```yaml +# New format +threshold_configs: + default: + thresholds: + cpu_monitor: + cpu_percent: + warning: 80.0 + critical: 90.0 +``` + +### Configuration Priority + +1. **Host-specific mapping**: If host is in `host_threshold_mapping`, use that config +2. **Default config**: Use `default_threshold_config` +3. 
**First alphabetically**: If the default config is not defined, use the first config name in alphabetical order +4. **Legacy fallback**: If `threshold_configs` is not present, the single `thresholds` section applies to all hosts + +### Example: Complete Multi-Threshold Setup + +See `hbd/config_multi_threshold_example.yaml` for a complete example with: +- 4 named configurations (default, high_sensitivity, low_sensitivity, database) +- Host-to-config mappings for production, development, and test systems +- Specialized database server thresholds +- Custom display messages with plugin data + diff --git a/hbd/config_multi_threshold_example.yaml b/hbd/config_multi_threshold_example.yaml new file mode 100644 index 0000000..c14839c --- /dev/null +++ b/hbd/config_multi_threshold_example.yaml @@ -0,0 +1,202 @@ +# ============================================================================== +# Heartbeat Daemon Multi-Threshold Configuration Example +# ============================================================================== +# This file demonstrates the new multi-threshold configuration feature that allows +# different threshold settings for different hosts/clients. 
+# +# Features: +# - Define multiple named threshold configurations +# - Map specific hosts to specific threshold configurations +# - Set a default configuration for unmapped hosts +# - Backward compatible with single threshold configuration +# ============================================================================== + +# Global threshold settings +threshold_renotify_interval: 3600 # Re-notify every hour for ongoing alerts (seconds) + +# Optional: Set default threshold config (defaults to "default" if not specified) +default_threshold_config: "default" + +# ---------------------------------------------------------------------------- +# Multiple Named Threshold Configurations +# ---------------------------------------------------------------------------- +# Define multiple threshold configurations with different sensitivity levels +threshold_configs: + + # Default configuration - moderate thresholds for most servers + default: + thresholds: + cpu_monitor: + cpu_percent: + warning: 80.0 + critical: 90.0 + operator: ">" + load_1min: + warning: 4.0 + critical: 8.0 + operator: ">" + + memory_monitor: + percent: + warning: 85.0 + critical: 95.0 + operator: ">" + + disk_monitor: + partitions: + /: + percent: + warning: 85.0 + critical: 95.0 + operator: ">" + + rtt: + # RTT thresholds per remote host + router: + warning: 50.0 # ms + critical: 200.0 + server1: + warning: 100.0 + critical: 500.0 + + # High sensitivity configuration - lower thresholds for critical systems + high_sensitivity: + thresholds: + cpu_monitor: + cpu_percent: + warning: 60.0 # Alert earlier + critical: 75.0 + operator: ">" + hysteresis: 0.15 # More hysteresis to reduce flapping + load_1min: + warning: 2.0 + critical: 4.0 + operator: ">" + + memory_monitor: + percent: + warning: 75.0 # Alert at lower memory usage + critical: 85.0 + operator: ">" + display: "(threshold: {op_symbol} {threshold_value}%, total: {total_gb} GB)" + + disk_monitor: + partitions: + /: + percent: + warning: 75.0 + 
critical: 85.0 + operator: ">" + /var: + percent: + warning: 80.0 + critical: 90.0 + operator: ">" + + rtt: + router: + warning: 30.0 + critical: 100.0 + server1: + warning: 50.0 + critical: 200.0 + + # Low sensitivity configuration - higher thresholds for development/test systems + low_sensitivity: + thresholds: + cpu_monitor: + cpu_percent: + warning: 90.0 # Only alert at very high usage + critical: 95.0 + operator: ">" + + memory_monitor: + percent: + warning: 90.0 + critical: 98.0 + operator: ">" + + disk_monitor: + partitions: + /: + percent: + warning: 90.0 + critical: 95.0 + operator: ">" + + rtt: + router: + warning: 100.0 + critical: 500.0 + + # Production database servers - specialized thresholds + database: + thresholds: + cpu_monitor: + cpu_percent: + warning: 70.0 + critical: 85.0 + operator: ">" + + memory_monitor: + percent: + warning: 90.0 # Databases can use high memory + critical: 97.0 + operator: ">" + display: "(threshold: {op_symbol} {threshold_value}%, total: {total_gb} GB, available: {available_gb} GB)" + + disk_monitor: + partitions: + /: + percent: + warning: 80.0 + critical: 90.0 + operator: ">" + /var/lib/mysql: # Database data partition + percent: + warning: 75.0 # Alert earlier for DB partition + critical: 85.0 + operator: ">" + + rtt: + router: + warning: 20.0 # Stricter latency requirements + critical: 50.0 + +# ---------------------------------------------------------------------------- +# Host to Threshold Configuration Mapping +# ---------------------------------------------------------------------------- +# Map specific hosts to specific threshold configurations +# Hosts not listed here will use the default_threshold_config +host_threshold_mapping: + # Critical production servers + prod-web-01: high_sensitivity + prod-web-02: high_sensitivity + prod-api-01: high_sensitivity + + # Database servers + prod-db-01: database + prod-db-02: database + prod-db-replica: database + + # Development and test systems + dev-server-01: 
low_sensitivity + dev-server-02: low_sensitivity + test-server-01: low_sensitivity + test-server-02: low_sensitivity + + # Everything else uses 'default' (no need to list explicitly) + +# ---------------------------------------------------------------------------- +# Backward Compatibility Example +# ---------------------------------------------------------------------------- +# The old single threshold format is still supported: +# Just use 'thresholds:' directly without 'threshold_configs:' +# +# thresholds: +# cpu_monitor: +# cpu_percent: +# warning: 80.0 +# critical: 90.0 +# +# This will apply the same thresholds to all hosts. diff --git a/hbd/server/templates/alerts.html b/hbd/server/templates/alerts.html index 34a6744..1209480 100644 --- a/hbd/server/templates/alerts.html +++ b/hbd/server/templates/alerts.html @@ -397,9 +397,11 @@ const level = alert.level.toLowerCase(); const duration = getDuration(alert.since); - // Format value with threshold info if available + // Use formatted message if available, otherwise build from individual fields let valueText = `Value: ${formatValue(alert.last_value)}`; - if (alert.threshold_value !== undefined && alert.threshold_value !== null && alert.operator) { + if (alert.formatted_message) { + valueText += ` ${alert.formatted_message}`; + } else if (alert.threshold_value !== undefined && alert.threshold_value !== null && alert.operator) { valueText += ` (threshold: ${alert.operator} ${formatValue(alert.threshold_value)})`; } diff --git a/hbd/server/threshold.py b/hbd/server/threshold.py index f89881a..2c88b72 100644 --- a/hbd/server/threshold.py +++ b/hbd/server/threshold.py @@ -55,6 +55,7 @@ class AlertState: self.last_notification = None self.threshold_value = None # The threshold value that triggered alert self.operator = None # The comparison operator (>, <, >=, etc.) 
+ self.formatted_message = None # Formatted display message for UI def update( self, @@ -120,6 +121,8 @@ class AlertState: result["threshold_value"] = self.threshold_value if self.operator is not None: result["operator"] = self.operator + if self.formatted_message is not None: + result["formatted_message"] = self.formatted_message return result @@ -285,7 +288,18 @@ class ThresholdChecker: renotify_interval: Seconds between repeat notifications (default: 1 hour) journal: Optional MessageJournal instance for logging threshold events """ - self.thresholds = {} # {metric_path: ThresholdConfig} + # Named threshold configurations: {config_name: {metric_path: ThresholdConfig}} + self.threshold_configs = {} + + # Single threshold set for backward compatibility: {metric_path: ThresholdConfig} + self.thresholds = {} + + # Host to config name mapping: {host_name: config_name} + self.host_config_mapping = {} + + # Default config name to use when no mapping exists + self.default_config = "default" + self.notification_callback = notification_callback self.renotify_interval = renotify_interval self.journal = journal @@ -293,10 +307,84 @@ class ThresholdChecker: # Parse configuration self._parse_config(config) - logger.info("ThresholdChecker initialized with %d thresholds", len(self.thresholds)) + total_thresholds = sum(len(cfg) for cfg in self.threshold_configs.values()) + if total_thresholds == 0 and len(self.thresholds) > 0: + # Backward compatibility: using single threshold set + total_thresholds = len(self.thresholds) + logger.info("ThresholdChecker initialized with %d thresholds (legacy format)", total_thresholds) + else: + logger.info( + "ThresholdChecker initialized with %d named configurations (%d total thresholds)", + len(self.threshold_configs), + total_thresholds + ) def _parse_config(self, config: Dict[str, Any]): - """Parse threshold configuration from YAML structure.""" + """Parse threshold configuration from YAML structure. + + Supports two formats: + 1. 
Legacy format with direct 'thresholds' section + 2. New format with 'threshold_configs' and 'host_threshold_mapping' + """ + # Check for new multi-config format + if "threshold_configs" in config: + self._parse_multi_config(config) + elif "thresholds" in config: + # Legacy single threshold configuration + self._parse_legacy_config(config) + else: + logger.info("No thresholds configured") + + def _parse_multi_config(self, config: Dict[str, Any]): + """Parse multiple named threshold configurations.""" + threshold_configs = config.get("threshold_configs", {}) + + if not threshold_configs: + logger.info("No threshold configurations defined") + return + + # Parse each named configuration + for config_name, config_data in threshold_configs.items(): + if not isinstance(config_data, dict): + logger.warning("Invalid threshold config '%s', skipping", config_name) + continue + + if "thresholds" not in config_data: + logger.warning("No thresholds in config '%s', skipping", config_name) + continue + + logger.info("Parsing threshold configuration: %s", config_name) + self.threshold_configs[config_name] = {} + + thresholds_config = config_data["thresholds"] + for plugin_name, plugin_thresholds in thresholds_config.items(): + if not isinstance(plugin_thresholds, dict): + continue + + self._parse_plugin_thresholds( + plugin_name, + plugin_thresholds, + target_dict=self.threshold_configs[config_name] + ) + + # Parse host to config mapping + self.host_config_mapping = config.get("host_threshold_mapping", {}) + + # Set default config (first one alphabetically or explicitly set) + self.default_config = config.get("default_threshold_config", "default") + if self.default_config not in self.threshold_configs and self.threshold_configs: + # Use first available config as default + self.default_config = sorted(self.threshold_configs.keys())[0] + logger.info("Using '%s' as default threshold config", self.default_config) + + logger.info( + "Loaded %d threshold configurations with %d host 
mappings", + len(self.threshold_configs), + len(self.host_config_mapping) + ) + + def _parse_legacy_config(self, config: Dict[str, Any]): + """Parse legacy single threshold configuration for backward compatibility.""" if not config or "thresholds" not in config: logger.info("No thresholds configured") return @@ -307,13 +395,27 @@ class ThresholdChecker: if not isinstance(plugin_thresholds, dict): continue - self._parse_plugin_thresholds(plugin_name, plugin_thresholds) + self._parse_plugin_thresholds(plugin_name, plugin_thresholds, target_dict=self.thresholds) - def _parse_plugin_thresholds(self, plugin_name: str, thresholds: Dict[str, Any]): - """Parse thresholds for a specific plugin.""" + def _parse_plugin_thresholds( + self, + plugin_name: str, + thresholds: Dict[str, Any], + target_dict: Optional[Dict[str, ThresholdConfig]] = None + ): + """Parse thresholds for a specific plugin. + + Args: + plugin_name: Name of the plugin + thresholds: Threshold configuration dictionary + target_dict: Dictionary to store parsed thresholds (defaults to self.thresholds) + """ + if target_dict is None: + target_dict = self.thresholds + # Special handling for RTT thresholds (per-host) if plugin_name == "rtt": - self._parse_rtt_thresholds(thresholds) + self._parse_rtt_thresholds(thresholds, target_dict) return for metric_name, threshold_config in thresholds.items(): @@ -322,7 +424,7 @@ class ThresholdChecker: # Handle nested metrics (e.g., partitions./.percent) if metric_name == "partitions": - self._parse_partition_thresholds(plugin_name, threshold_config) + self._parse_partition_thresholds(plugin_name, threshold_config, target_dict) continue metric_path = f"{plugin_name}.{metric_name}" @@ -331,7 +433,7 @@ class ThresholdChecker: warning = threshold_config.get("warning") critical = threshold_config.get("critical") operator = threshold_config.get("operator", ">") - display = threshold_config.get("display") + display = threshold_config.get("display", "(threshold: {op_symbol} 
{threshold_value})") hysteresis = threshold_config.get("hysteresis", 0.1) # 10% default enabled = threshold_config.get("enabled", True) @@ -349,7 +451,7 @@ class ThresholdChecker: display=display ) - self.thresholds[metric_path] = threshold + target_dict[metric_path] = threshold logger.debug( "Registered threshold for %s: warn=%s, crit=%s, op=%s", metric_path, @@ -358,8 +460,22 @@ class ThresholdChecker: operator ) - def _parse_partition_thresholds(self, plugin_name: str, partitions: Dict[str, Any]): - """Parse partition-specific thresholds for disk monitoring.""" + def _parse_partition_thresholds( + self, + plugin_name: str, + partitions: Dict[str, Any], + target_dict: Optional[Dict[str, ThresholdConfig]] = None + ): + """Parse partition-specific thresholds for disk monitoring. + + Args: + plugin_name: Name of the plugin + partitions: Partition threshold configuration + target_dict: Dictionary to store parsed thresholds + """ + if target_dict is None: + target_dict = self.thresholds + for partition, metrics in partitions.items(): if not isinstance(metrics, dict): continue @@ -390,9 +506,13 @@ class ThresholdChecker: display=display ) - self.thresholds[metric_path] = threshold + target_dict[metric_path] = threshold - def _parse_rtt_thresholds(self, rtt_thresholds: Dict[str, Any]): + def _parse_rtt_thresholds( + self, + rtt_thresholds: Dict[str, Any], + target_dict: Optional[Dict[str, ThresholdConfig]] = None + ): """Parse RTT thresholds (per-host network latency thresholds). 
RTT thresholds are configured as: @@ -401,7 +521,14 @@ class ThresholdChecker: hostname1: warning: 100.0 # ms critical: 500.0 # ms + + Args: + rtt_thresholds: RTT threshold configuration + target_dict: Dictionary to store parsed thresholds """ + if target_dict is None: + target_dict = self.thresholds + for hostname, threshold_config in rtt_thresholds.items(): if not isinstance(threshold_config, dict): continue @@ -430,7 +557,7 @@ class ThresholdChecker: display=display ) - self.thresholds[metric_path] = threshold + target_dict[metric_path] = threshold logger.debug( "Registered RTT threshold for %s: warn=%s ms, crit=%s ms", hostname, @@ -438,6 +565,37 @@ class ThresholdChecker: critical ) + def get_thresholds_for_host(self, host_name: str) -> Dict[str, ThresholdConfig]: + """Get the appropriate threshold configuration for a host. + + Args: + host_name: Name of the host + + Returns: + Dictionary of thresholds for this host + """ + # Legacy mode: single threshold set for all hosts + if self.thresholds and not self.threshold_configs: + return self.thresholds + + # Multi-config mode: look up host-specific configuration + if self.threshold_configs: + config_name = self.host_config_mapping.get(host_name, self.default_config) + + if config_name in self.threshold_configs: + return self.threshold_configs[config_name] + else: + logger.warning( + "Threshold config '%s' not found for host '%s', using default '%s'", + config_name, + host_name, + self.default_config + ) + return self.threshold_configs.get(self.default_config, {}) + + # No thresholds configured + return {} + def check_value( self, host_name: str, @@ -457,10 +615,13 @@ class ThresholdChecker: Returns: Tuple of (old_level, new_level) if state changed, None otherwise """ - if metric_path not in self.thresholds: + # Get host-specific thresholds + thresholds = self.get_thresholds_for_host(host_name) + + if metric_path not in thresholds: return None - threshold = self.thresholds[metric_path] + threshold = 
thresholds[metric_path] # Get or create alert state if metric_path not in alert_states: @@ -484,14 +645,17 @@ class ThresholdChecker: # Update state and check for changes old_level = alert_state.level if alert_state.update(new_level, value, threshold_value, threshold.operator.value): - self._trigger_notification(host_name, metric_path, old_level, new_level, value, threshold) + # For check_value, we don't have full plugin data, pass None + lvl, message, formatted_msg = self._trigger_notification(host_name, metric_path, old_level, new_level, value, threshold, None) + # Update alert state with formatted message + alert_state.formatted_message = formatted_msg + self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value) return (old_level, new_level) elif new_level != AlertLevel.OK: # Check if we should re-notify - self._check_renotify(host_name, alert_state, metric_path, value, threshold) + self._check_renotify(host_name, alert_state, metric_path, value, threshold, None) return None - def check_plugin_data( self, host_name: str, @@ -513,14 +677,17 @@ class ThresholdChecker: """ state_changes = [] + # Get host-specific thresholds + thresholds = self.get_thresholds_for_host(host_name) + # Check flat metrics for metric_name, value in data.items(): metric_path = f"{plugin_name}.{metric_name}" - if metric_path not in self.thresholds: + if metric_path not in thresholds: continue - threshold = self.thresholds[metric_path] + threshold = thresholds[metric_path] # Get or create alert state if metric_path not in alert_states: @@ -545,10 +712,13 @@ class ThresholdChecker: old_level = alert_state.level if alert_state.update(new_level, value, threshold_value, threshold.operator.value): state_changes.append((metric_path, old_level, new_level, value)) - self._trigger_notification(host_name, metric_path, old_level, new_level, value, threshold) + lvl, message, formatted_msg = self._trigger_notification(host_name, metric_path, old_level, new_level, value, 
threshold, data) + # Update alert state with formatted message + alert_state.formatted_message = formatted_msg + self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value) elif new_level != AlertLevel.OK: # Check if we should re-notify - self._check_renotify(host_name, alert_state, metric_path, value, threshold) + self._check_renotify(host_name, alert_state, metric_path, value, threshold, data) # Check nested metrics (e.g., partition data in disk_monitor) self._check_nested_metrics( @@ -570,6 +740,9 @@ class ThresholdChecker: state_changes: list, ): """Check nested metrics like partition-specific thresholds.""" + # Get host-specific thresholds + thresholds = self.get_thresholds_for_host(host_name) + # Look for partition data in disk_monitor if plugin_name == "disk_monitor" and "partitions" in data: partitions = data["partitions"] @@ -583,10 +756,10 @@ class ThresholdChecker: for metric_name, value in metrics.items(): metric_path = f"{plugin_name}.{partition}.{metric_name}" - if metric_path not in self.thresholds: + if metric_path not in thresholds: continue - threshold = self.thresholds[metric_path] + threshold = thresholds[metric_path] if metric_path not in alert_states: alert_states[metric_path] = AlertState(metric_path) @@ -608,16 +781,20 @@ class ThresholdChecker: old_level = alert_state.level if alert_state.update(new_level, value, threshold_value, threshold.operator.value): state_changes.append((metric_path, old_level, new_level, value)) - self._trigger_notification( + lvl, message, formatted_msg = self._trigger_notification( host_name, metric_path, old_level, new_level, value, - threshold + threshold, + data # Pass full plugin data for format string ) + # Update alert state with formatted message + alert_state.formatted_message = formatted_msg + self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value) elif new_level != AlertLevel.OK: - self._check_renotify(host_name, alert_state, metric_path, 
value, threshold) + self._check_renotify(host_name, alert_state, metric_path, value, threshold, data) def _trigger_notification( self, @@ -627,8 +804,19 @@ class ThresholdChecker: new_level: AlertLevel, value: Any, threshold: ThresholdConfig, + plugin_data: Optional[Dict[str, Any]] = None, ): - """Trigger a notification for an alert state change.""" + """Trigger a notification for an alert state change. + + Args: + host_name: Name of the host + metric_path: Full metric path + old_level: Previous alert level + new_level: New alert level + value: Current metric value + threshold: Threshold configuration + plugin_data: Optional dictionary of all plugin data fields for format string + """ # Determine which threshold was exceeded threshold_value = None if new_level == AlertLevel.CRITICAL and threshold.critical is not None: @@ -646,20 +834,59 @@ class ThresholdChecker: elif new_level == AlertLevel.WARNING: lvl = "WARNING" if threshold_value is not None: - message = f"{metric_path} = {value} (threshold: {op_symbol} {threshold_value})" + # Use display format string + threshold_info = self._format_display( + threshold.display, + value=value, + threshold_value=threshold_value, + op_symbol=op_symbol, + plugin_data=plugin_data + ) + message = f"{metric_path} = {value} {threshold_info}" else: message = f"{metric_path} = {value}" elif new_level == AlertLevel.CRITICAL: lvl = "CRITICAL" if threshold_value is not None: - message = f"{metric_path} = {value} (threshold: {op_symbol} {threshold_value})" + # Use display format string + threshold_info = self._format_display( + threshold.display, + value=value, + threshold_value=threshold_value, + op_symbol=op_symbol, + plugin_data=plugin_data + ) + message = f"{metric_path} = {value} {threshold_info}" else: message = f"{metric_path} = {value}" else: lvl = "UNKNOWN" message = f"{metric_path} = {value}" - # Send notification + # Return the formatted threshold info for storing in AlertState + formatted_threshold_msg = None + if 
threshold_value is not None and new_level != AlertLevel.OK: + formatted_threshold_msg = self._format_display( + threshold.display, + value=value, + threshold_value=threshold_value, + op_symbol=op_symbol, + plugin_data=plugin_data + ) + + return lvl, message, formatted_threshold_msg + + def _send_notification( + self, + host_name: str, + lvl: str, + message: str, + metric_path: str, + old_level: AlertLevel, + new_level: AlertLevel, + value: Any, + ): + """Send notification and log to journal/eventlog.""" if self.notification_callback is not None: try: self.notification_callback(f"{lvl}: {host_name} - {message}") @@ -684,6 +911,56 @@ class ThresholdChecker: # Log to eventlog as well eventlog(host_name, lvl, message, service="threshold") + def _format_display( + self, + display_format: str, + value: Any, + threshold_value: float, + op_symbol: str, + plugin_data: Optional[Dict[str, Any]] = None, + ) -> str: + """Format the display string using available data. + + Args: + display_format: Format string from threshold config + value: Current metric value + threshold_value: Threshold value that was exceeded + op_symbol: Comparison operator symbol + plugin_data: Optional dictionary of plugin data fields + + Returns: + Formatted display string + """ + # Build format context with standard variables + format_context = { + 'value': value, + 'threshold_value': threshold_value, + 'op_symbol': op_symbol, + } + + # Add all plugin data fields if available + if plugin_data: + format_context.update(plugin_data) + + try: + # Format the display string + return display_format.format(**format_context) + except KeyError as e: + logger.warning( + "Missing format variable in display string '%s': %s", + display_format, + e + ) + # Fallback to default format + return f"(threshold: {op_symbol} {threshold_value})" + except Exception as e: + logger.error( + "Error formatting display string '%s': %s", + display_format, + e + ) + return f"(threshold: {op_symbol} {threshold_value})" + def 
_check_renotify( self, host_name: str, @@ -691,8 +968,18 @@ class ThresholdChecker: metric_path: str, value: Any, threshold: ThresholdConfig, + plugin_data: Optional[Dict[str, Any]] = None, ): - """Check if we should send a repeat notification.""" + """Check if we should send a repeat notification. + + Args: + host_name: Name of the host + alert_state: Current alert state + metric_path: Full metric path + value: Current metric value + threshold: Threshold configuration + plugin_data: Optional dictionary of all plugin data fields + """ if alert_state.level == AlertLevel.OK: return @@ -718,7 +1005,15 @@ class ThresholdChecker: # Time to re-notify if threshold_value is not None: - message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (threshold: {op_symbol} {threshold_value}, ongoing for {int(now - alert_state.since)}s)" + # Use display format string + threshold_info = self._format_display( + threshold.display, + value=value, + threshold_value=threshold_value, + op_symbol=op_symbol, + plugin_data=plugin_data + ) + message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} {threshold_info}, ongoing for {int(now - alert_state.since)}s" else: message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (ongoing for {int(now - alert_state.since)}s)" diff --git a/nagios_bad.sh b/nagios_bad.sh new file mode 100755 index 0000000..2dc1341 --- /dev/null +++ b/nagios_bad.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +#echo "OK - all is well" +echo "WARNING - 12 apps require update: calendar->6.2.2 ,call_summary_bot->3.3.0 ,collectives->4.2.0 ,contacts->8.3.7 ,forum->0.36.0 ,mail->5.7.6 ,news->28.1.0 ,notes->4.13.1 ,notify_push->1.3.1 ,ownershiptransfer->1.4.0 ,richdocuments->9.0.5 ,spreed->22.0.10"