diff --git a/README.md b/README.md index 4c698bd..66fed85 100644 --- a/README.md +++ b/README.md @@ -181,7 +181,8 @@ thresholds: warning: 80.0 # Warn when CPU > 80% critical: 90.0 # Critical when CPU > 90% operator: ">" - hysteresis: 0.1 # 10% hysteresis to prevent flapping + hysteresis: 0.02 # 2% hysteresis to prevent flapping + display: "(threshold: {op_symbol} {threshold_value}%)" # optional memory_monitor: percent: @@ -274,7 +275,59 @@ All plugin metrics can be thresholded: - **Memory**: percent, available_mb, swap_percent - **Disk**: Per-partition percent, free_gb, free_mb - **Network**: errors_total, dropped packets, connection counts -- **Nagios**: exit_code mapping (0=OK, 1=WARNING, 2=CRITICAL) +- **Nagios**: Any field emitted by `nagios_runner` (status_code, exit_code, performance data, …) + +### Display Format Templates + +Each threshold entry accepts an optional `display` field — a Python format string shown in notifications and on the Alerts dashboard: + +```yaml +nagios_runner: + status_code: + warning: 1 + critical: 2 + operator: ">=" + display: "{check_name}: exit {value} (expected < {threshold_value})" +``` + +Available variables: + +| Variable | Description | +|---|---| +| `{value}` | Current metric value | +| `{threshold_value}` | Threshold that was crossed | +| `{op_symbol}` | Comparison operator (`>`, `<`, `>=`, …) | +| `{check_name}` | Prefix stripped by generic matching (see below); only set when a generic match was used | +| `{metric_name}` | Full field name within the plugin data | +| any plugin field | Any other field present in the plugin's data | + +### Generic Threshold Matching + +When a metric name has no exact threshold entry, the server progressively strips leading underscore-separated segments from the field name and retries the lookup. This lets a single generic entry cover an entire family of metrics. 
+ +The classic use case is `nagios_runner`, which names each metric after the command that produced it: + +``` +nagios_runner.check_disk_root_status_code → no exact match +nagios_runner.disk_root_status_code → no match +nagios_runner.root_status_code → no match +nagios_runner.status_code → matched ✓ +``` + +Configure the generic threshold once: + +```yaml +nagios_runner: + status_code: + warning: 1 + critical: 2 + operator: ">=" + display: "{check_name}: exit {value}" +``` + +The stripped prefix (`check_disk_root` in the example above) is available as `{check_name}` in the display template, so you can identify which check triggered the alert without writing a separate threshold entry per command. + +Exact matches always take priority. A generic entry only applies when no specific one is defined. On an exact match no prefix is stripped, so `{check_name}` is not set. ### Per-Host Threshold Profiles diff --git a/hbd/server/threshold.py b/hbd/server/threshold.py index 66c6619..b19b5b8 100644 --- a/hbd/server/threshold.py +++ b/hbd/server/threshold.py @@ -823,26 +823,33 @@ class ThresholdChecker: return None def _find_threshold( self, thresholds: Dict[str, "ThresholdConfig"], metric_path: str - ) -> Optional["ThresholdConfig"]: - """Return the threshold for *metric_path*, falling back to suffix matches. + ) -> Tuple[Optional["ThresholdConfig"], Optional[str]]: + """Return (threshold, check_name) for *metric_path*, falling back to suffix matches. - Allows generic thresholds like ``ping_monitor.rtt_avg`` to match - fully-qualified paths like ``ping_monitor.8_8_8_8_rtt_avg``. + Allows generic thresholds like ``nagios_runner.status_code`` to match + fully-qualified paths like ``nagios_runner.check_disk_root_status_code``. The exact match is always tried first; then successive leading underscore-delimited segments are stripped from the field name until a match is found or no segments remain. + + Returns: + (ThresholdConfig, None) for an exact match. 
+ (ThresholdConfig, "check_disk_root") for a suffix match — the second + element is the stripped prefix, available as ``{check_name}`` in + display format templates. + (None, None) when no threshold is found. """ if metric_path in thresholds: - return thresholds[metric_path] + return thresholds[metric_path], None plugin, sep, field = metric_path.partition(".") if not sep: - return None + return None, None parts = field.split("_") for i in range(1, len(parts)): candidate = plugin + "." + "_".join(parts[i:]) if candidate in thresholds: - return thresholds[candidate] - return None + return thresholds[candidate], "_".join(parts[:i]) + return None, None def check_plugin_data( self, @@ -871,23 +878,23 @@ class ThresholdChecker: # Check flat metrics for metric_name, value in data.items(): metric_path = f"{plugin_name}.{metric_name}" - - threshold = self._find_threshold(thresholds, metric_path) + + threshold, check_name = self._find_threshold(thresholds, metric_path) if threshold is None: continue - + # Get or create alert state if metric_path not in alert_states: alert_states[metric_path] = AlertState(metric_path) - + alert_state = alert_states[metric_path] - + # Evaluate threshold with hysteresis new_level = threshold.evaluate_with_hysteresis( value, alert_state.level ) - + # Determine which threshold was exceeded threshold_value = None if new_level == AlertLevel.CRITICAL and threshold.critical is not None: @@ -901,9 +908,9 @@ class ThresholdChecker: old_level = alert_state.level if alert_state.update(new_level, value, threshold_value, threshold.operator.value): state_changes.append((metric_path, old_level, new_level, value)) - self._apply_grace(host_name, alert_state, metric_path, old_level, new_level, value, threshold, data) + self._apply_grace(host_name, alert_state, metric_path, old_level, new_level, value, threshold, data, check_name=check_name, metric_name=metric_name) elif new_level != AlertLevel.OK: - self._check_pending_or_renotify(host_name, alert_state, 
metric_path, value, threshold, data) + self._check_pending_or_renotify(host_name, alert_state, metric_path, value, threshold, data, check_name=check_name, metric_name=metric_name) # Check nested metrics (e.g., partition data in disk_monitor) self._check_nested_metrics( @@ -981,6 +988,8 @@ class ThresholdChecker: value: Any, threshold: ThresholdConfig, plugin_data: Optional[Dict[str, Any]] = None, + check_name: Optional[str] = None, + metric_name: Optional[str] = None, ): """Trigger a notification for an alert state change. @@ -1019,7 +1028,9 @@ class ThresholdChecker: value=display_value, threshold_value=threshold_value, op_symbol=op_symbol, - plugin_data=plugin_data + plugin_data=plugin_data, + check_name=check_name, + metric_name=metric_name, ) message = f"{metric_path} = {display_value} {threshold_info}" else: @@ -1032,7 +1043,9 @@ class ThresholdChecker: value=display_value, threshold_value=threshold_value, op_symbol=op_symbol, - plugin_data=plugin_data + plugin_data=plugin_data, + check_name=check_name, + metric_name=metric_name, ) message = f"{metric_path} = {display_value} {threshold_info}" else: @@ -1040,7 +1053,7 @@ class ThresholdChecker: else: lvl = "UNKNOWN" message = f"{metric_path} = {display_value}" - + # Return the formatted threshold info for storing in AlertState formatted_threshold_msg = None if threshold_value is not None and new_level != AlertLevel.OK: @@ -1049,9 +1062,11 @@ class ThresholdChecker: value=display_value, threshold_value=threshold_value, op_symbol=op_symbol, - plugin_data=plugin_data + plugin_data=plugin_data, + check_name=check_name, + metric_name=metric_name, ) - + return lvl, message, formatted_threshold_msg def _send_notification( @@ -1102,16 +1117,22 @@ class ThresholdChecker: threshold_value: float, op_symbol: str, plugin_data: Optional[Dict[str, Any]] = None, + check_name: Optional[str] = None, + metric_name: Optional[str] = None, ) -> str: """Format the display string using available data. 
- - Args: - display_format: Format string from threshold config - value: Current metric value - threshold_value: Threshold value that was exceeded - op_symbol: Comparison operator symbol - plugin_data: Optional dictionary of plugin data fields - + + Available template variables: + {value} - current metric value + {threshold_value} - threshold that was exceeded + {op_symbol} - comparison operator (>, <, >=, <=, ==, !=) + {check_name} - prefix stripped for generic threshold match + (e.g. "check_disk_root" when metric + "check_disk_root_status_code" matched generic + threshold "status_code") + {metric_name} - field name within the plugin data dict + Any key from plugin_data is also available. + Returns: Formatted display string """ @@ -1121,7 +1142,13 @@ class ThresholdChecker: 'threshold_value': threshold_value, 'op_symbol': op_symbol, } - + + # Add generic-match context variables when available + if check_name is not None: + format_context['check_name'] = check_name + if metric_name is not None: + format_context['metric_name'] = metric_name + # Add all plugin data fields if available if plugin_data: format_context.update(plugin_data) @@ -1155,6 +1182,8 @@ class ThresholdChecker: value: Any, threshold: ThresholdConfig, plugin_data: Optional[Dict[str, Any]], + check_name: Optional[str] = None, + metric_name: Optional[str] = None, ) -> None: """Handle a state-change transition with grace-period logic. @@ -1167,7 +1196,8 @@ class ThresholdChecker: - Past grace: fires the RECOVER notification normally. 
""" lvl, message, formatted_msg = self._trigger_notification( - host_name, metric_path, old_level, new_level, value, threshold, plugin_data + host_name, metric_path, old_level, new_level, value, threshold, plugin_data, + check_name=check_name, metric_name=metric_name, ) alert_state.formatted_message = formatted_msg @@ -1203,6 +1233,8 @@ class ThresholdChecker: value: Any, threshold: ThresholdConfig, plugin_data: Optional[Dict[str, Any]], + check_name: Optional[str] = None, + metric_name: Optional[str] = None, ) -> None: """Called when alert level is unchanged and non-OK. @@ -1212,7 +1244,8 @@ class ThresholdChecker: if alert_state.pending_since is not None: if time.time() - alert_state.pending_since >= self.grace_seconds: lvl, message, formatted_msg = self._trigger_notification( - host_name, metric_path, AlertLevel.OK, alert_state.level, value, threshold, plugin_data + host_name, metric_path, AlertLevel.OK, alert_state.level, value, threshold, plugin_data, + check_name=check_name, metric_name=metric_name, ) alert_state.formatted_message = formatted_msg self._send_notification( @@ -1221,7 +1254,7 @@ class ThresholdChecker: alert_state.pending_since = None # else: still within grace window, do nothing else: - self._check_renotify(host_name, alert_state, metric_path, value, threshold, plugin_data) + self._check_renotify(host_name, alert_state, metric_path, value, threshold, plugin_data, check_name=check_name, metric_name=metric_name) def _check_renotify( self, @@ -1231,6 +1264,8 @@ class ThresholdChecker: value: Any, threshold: ThresholdConfig, plugin_data: Optional[Dict[str, Any]] = None, + check_name: Optional[str] = None, + metric_name: Optional[str] = None, ): """Check if we should send a repeat notification. 
@@ -1277,7 +1312,9 @@ class ThresholdChecker: value=value, threshold_value=threshold_value, op_symbol=op_symbol, - plugin_data=plugin_data + plugin_data=plugin_data, + check_name=check_name, + metric_name=metric_name, ) message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} {threshold_info}, ongoing for {int(now - alert_state.since)}s" else: @@ -1310,7 +1347,7 @@ class ThresholdChecker: if not host.alert_states: continue configured = self.get_thresholds_for_host(hostname) - stale = [mp for mp in host.alert_states if mp not in configured] + stale = [mp for mp in host.alert_states if self._find_threshold(configured, mp)[0] is None] for mp in stale: logger.info( "Purging stale alert state for %s / %s (no threshold configured)",