feat: generic threshold matching for nagios_runner with {check_name} display support

_find_threshold() now returns the stripped prefix ("check_name") alongside
the ThresholdConfig, enabling a single generic entry (e.g. nagios_runner.status_code)
to cover all per-command metrics (check_disk_root_status_code, check_load_status_code,
…). The prefix is threaded through to _format_display() as {check_name}, with
{metric_name} also available in display templates. purge_stale_alerts() updated
to use generic matching so it does not incorrectly drop alerts on generic-matched
metrics. README updated with Display Format Templates and Generic Threshold
Matching sections.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-05 10:48:17 -04:00
parent de778f680f
commit b1985d0eb2
2 changed files with 127 additions and 37 deletions
+55 -2
View File
@@ -181,7 +181,8 @@ thresholds:
warning: 80.0 # Warn when CPU > 80% warning: 80.0 # Warn when CPU > 80%
critical: 90.0 # Critical when CPU > 90% critical: 90.0 # Critical when CPU > 90%
operator: ">" operator: ">"
hysteresis: 0.1 # 10% hysteresis to prevent flapping hysteresis: 0.02 # 2% hysteresis to prevent flapping
display: "(threshold: {op_symbol} {threshold_value}%)" # optional
memory_monitor: memory_monitor:
percent: percent:
@@ -274,7 +275,59 @@ All plugin metrics can be thresholded:
- **Memory**: percent, available_mb, swap_percent - **Memory**: percent, available_mb, swap_percent
- **Disk**: Per-partition percent, free_gb, free_mb - **Disk**: Per-partition percent, free_gb, free_mb
- **Network**: errors_total, dropped packets, connection counts - **Network**: errors_total, dropped packets, connection counts
- **Nagios**: exit_code mapping (0=OK, 1=WARNING, 2=CRITICAL) - **Nagios**: Any field emitted by `nagios_runner` (status_code, exit_code, performance data, …)
### Display Format Templates
Each threshold entry accepts an optional `display` field — a Python format string shown in notifications and on the Alerts dashboard:
```yaml
nagios_runner:
status_code:
warning: 1
critical: 2
operator: ">="
display: "{check_name}: exit {value} (expected < {threshold_value})"
```
Available variables:
| Variable | Description |
|---|---|
| `{value}` | Current metric value |
| `{threshold_value}` | Threshold that was crossed |
| `{op_symbol}` | Comparison operator (`>`, `<`, `>=`, …) |
| `{check_name}` | Prefix stripped by generic matching (see below); only set when the threshold was matched generically, not on an exact match |
| `{metric_name}` | Full field name within the plugin data |
| any plugin field | Any other field present in the plugin's data |
### Generic Threshold Matching
When a metric name has no exact threshold entry, the server progressively strips leading underscore-separated segments from the field name and retries the lookup after each strip. This lets a single generic entry cover an entire family of metrics.
The classic use case is `nagios_runner`, which names each metric after the command that produced it:
```
nagios_runner.check_disk_root_status_code → no exact match
nagios_runner.disk_root_status_code → no match
nagios_runner.root_status_code → no match
nagios_runner.status_code → matched ✓
```
Configure the generic threshold once:
```yaml
nagios_runner:
status_code:
warning: 1
critical: 2
operator: ">="
display: "{check_name}: exit {value}"
```
The stripped prefix (`check_disk_root` in the example above) is available as `{check_name}` in the display template, so you can identify which check triggered the alert without writing a separate threshold entry per command.
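Exact matches always take priority; a generic entry only applies when no specific one is defined. If more than one generic entry could match, the one with the longest matching suffix (fewest segments stripped) is used.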
### Per-Host Threshold Profiles ### Per-Host Threshold Profiles
+72 -35
View File
@@ -823,26 +823,33 @@ class ThresholdChecker:
return None return None
def _find_threshold( def _find_threshold(
self, thresholds: Dict[str, "ThresholdConfig"], metric_path: str self, thresholds: Dict[str, "ThresholdConfig"], metric_path: str
) -> Optional["ThresholdConfig"]: ) -> Tuple[Optional["ThresholdConfig"], Optional[str]]:
"""Return the threshold for *metric_path*, falling back to suffix matches. """Return (threshold, check_name) for *metric_path*, falling back to suffix matches.
Allows generic thresholds like ``ping_monitor.rtt_avg`` to match Allows generic thresholds like ``nagios_runner.status_code`` to match
fully-qualified paths like ``ping_monitor.8_8_8_8_rtt_avg``. fully-qualified paths like ``nagios_runner.check_disk_root_status_code``.
The exact match is always tried first; then successive leading The exact match is always tried first; then successive leading
underscore-delimited segments are stripped from the field name until underscore-delimited segments are stripped from the field name until
a match is found or no segments remain. a match is found or no segments remain.
Returns:
(ThresholdConfig, None) for an exact match.
(ThresholdConfig, "check_disk_root") for a suffix match — the second
element is the stripped prefix, available as ``{check_name}`` in
display format templates.
(None, None) when no threshold is found.
""" """
if metric_path in thresholds: if metric_path in thresholds:
return thresholds[metric_path] return thresholds[metric_path], None
plugin, sep, field = metric_path.partition(".") plugin, sep, field = metric_path.partition(".")
if not sep: if not sep:
return None return None, None
parts = field.split("_") parts = field.split("_")
for i in range(1, len(parts)): for i in range(1, len(parts)):
candidate = plugin + "." + "_".join(parts[i:]) candidate = plugin + "." + "_".join(parts[i:])
if candidate in thresholds: if candidate in thresholds:
return thresholds[candidate] return thresholds[candidate], "_".join(parts[:i])
return None return None, None
def check_plugin_data( def check_plugin_data(
self, self,
@@ -871,23 +878,23 @@ class ThresholdChecker:
# Check flat metrics # Check flat metrics
for metric_name, value in data.items(): for metric_name, value in data.items():
metric_path = f"{plugin_name}.{metric_name}" metric_path = f"{plugin_name}.{metric_name}"
threshold = self._find_threshold(thresholds, metric_path) threshold, check_name = self._find_threshold(thresholds, metric_path)
if threshold is None: if threshold is None:
continue continue
# Get or create alert state # Get or create alert state
if metric_path not in alert_states: if metric_path not in alert_states:
alert_states[metric_path] = AlertState(metric_path) alert_states[metric_path] = AlertState(metric_path)
alert_state = alert_states[metric_path] alert_state = alert_states[metric_path]
# Evaluate threshold with hysteresis # Evaluate threshold with hysteresis
new_level = threshold.evaluate_with_hysteresis( new_level = threshold.evaluate_with_hysteresis(
value, value,
alert_state.level alert_state.level
) )
# Determine which threshold was exceeded # Determine which threshold was exceeded
threshold_value = None threshold_value = None
if new_level == AlertLevel.CRITICAL and threshold.critical is not None: if new_level == AlertLevel.CRITICAL and threshold.critical is not None:
@@ -901,9 +908,9 @@ class ThresholdChecker:
old_level = alert_state.level old_level = alert_state.level
if alert_state.update(new_level, value, threshold_value, threshold.operator.value): if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
state_changes.append((metric_path, old_level, new_level, value)) state_changes.append((metric_path, old_level, new_level, value))
self._apply_grace(host_name, alert_state, metric_path, old_level, new_level, value, threshold, data) self._apply_grace(host_name, alert_state, metric_path, old_level, new_level, value, threshold, data, check_name=check_name, metric_name=metric_name)
elif new_level != AlertLevel.OK: elif new_level != AlertLevel.OK:
self._check_pending_or_renotify(host_name, alert_state, metric_path, value, threshold, data) self._check_pending_or_renotify(host_name, alert_state, metric_path, value, threshold, data, check_name=check_name, metric_name=metric_name)
# Check nested metrics (e.g., partition data in disk_monitor) # Check nested metrics (e.g., partition data in disk_monitor)
self._check_nested_metrics( self._check_nested_metrics(
@@ -981,6 +988,8 @@ class ThresholdChecker:
value: Any, value: Any,
threshold: ThresholdConfig, threshold: ThresholdConfig,
plugin_data: Optional[Dict[str, Any]] = None, plugin_data: Optional[Dict[str, Any]] = None,
check_name: Optional[str] = None,
metric_name: Optional[str] = None,
): ):
"""Trigger a notification for an alert state change. """Trigger a notification for an alert state change.
@@ -1019,7 +1028,9 @@ class ThresholdChecker:
value=display_value, value=display_value,
threshold_value=threshold_value, threshold_value=threshold_value,
op_symbol=op_symbol, op_symbol=op_symbol,
plugin_data=plugin_data plugin_data=plugin_data,
check_name=check_name,
metric_name=metric_name,
) )
message = f"{metric_path} = {display_value} {threshold_info}" message = f"{metric_path} = {display_value} {threshold_info}"
else: else:
@@ -1032,7 +1043,9 @@ class ThresholdChecker:
value=display_value, value=display_value,
threshold_value=threshold_value, threshold_value=threshold_value,
op_symbol=op_symbol, op_symbol=op_symbol,
plugin_data=plugin_data plugin_data=plugin_data,
check_name=check_name,
metric_name=metric_name,
) )
message = f"{metric_path} = {display_value} {threshold_info}" message = f"{metric_path} = {display_value} {threshold_info}"
else: else:
@@ -1040,7 +1053,7 @@ class ThresholdChecker:
else: else:
lvl = "UNKNOWN" lvl = "UNKNOWN"
message = f"{metric_path} = {display_value}" message = f"{metric_path} = {display_value}"
# Return the formatted threshold info for storing in AlertState # Return the formatted threshold info for storing in AlertState
formatted_threshold_msg = None formatted_threshold_msg = None
if threshold_value is not None and new_level != AlertLevel.OK: if threshold_value is not None and new_level != AlertLevel.OK:
@@ -1049,9 +1062,11 @@ class ThresholdChecker:
value=display_value, value=display_value,
threshold_value=threshold_value, threshold_value=threshold_value,
op_symbol=op_symbol, op_symbol=op_symbol,
plugin_data=plugin_data plugin_data=plugin_data,
check_name=check_name,
metric_name=metric_name,
) )
return lvl, message, formatted_threshold_msg return lvl, message, formatted_threshold_msg
def _send_notification( def _send_notification(
@@ -1102,16 +1117,22 @@ class ThresholdChecker:
threshold_value: float, threshold_value: float,
op_symbol: str, op_symbol: str,
plugin_data: Optional[Dict[str, Any]] = None, plugin_data: Optional[Dict[str, Any]] = None,
check_name: Optional[str] = None,
metric_name: Optional[str] = None,
) -> str: ) -> str:
"""Format the display string using available data. """Format the display string using available data.
Args: Available template variables:
display_format: Format string from threshold config {value} - current metric value
value: Current metric value {threshold_value} - threshold that was exceeded
threshold_value: Threshold value that was exceeded {op_symbol} - comparison operator (>, <, >=, <=, ==, !=)
op_symbol: Comparison operator symbol {check_name} - prefix stripped for generic threshold match
plugin_data: Optional dictionary of plugin data fields (e.g. "check_disk_root" when metric
"check_disk_root_status_code" matched generic
threshold "status_code")
{metric_name} - field name within the plugin data dict
Any key from plugin_data is also available.
Returns: Returns:
Formatted display string Formatted display string
""" """
@@ -1121,7 +1142,13 @@ class ThresholdChecker:
'threshold_value': threshold_value, 'threshold_value': threshold_value,
'op_symbol': op_symbol, 'op_symbol': op_symbol,
} }
# Add generic-match context variables when available
if check_name is not None:
format_context['check_name'] = check_name
if metric_name is not None:
format_context['metric_name'] = metric_name
# Add all plugin data fields if available # Add all plugin data fields if available
if plugin_data: if plugin_data:
format_context.update(plugin_data) format_context.update(plugin_data)
@@ -1155,6 +1182,8 @@ class ThresholdChecker:
value: Any, value: Any,
threshold: ThresholdConfig, threshold: ThresholdConfig,
plugin_data: Optional[Dict[str, Any]], plugin_data: Optional[Dict[str, Any]],
check_name: Optional[str] = None,
metric_name: Optional[str] = None,
) -> None: ) -> None:
"""Handle a state-change transition with grace-period logic. """Handle a state-change transition with grace-period logic.
@@ -1167,7 +1196,8 @@ class ThresholdChecker:
- Past grace: fires the RECOVER notification normally. - Past grace: fires the RECOVER notification normally.
""" """
lvl, message, formatted_msg = self._trigger_notification( lvl, message, formatted_msg = self._trigger_notification(
host_name, metric_path, old_level, new_level, value, threshold, plugin_data host_name, metric_path, old_level, new_level, value, threshold, plugin_data,
check_name=check_name, metric_name=metric_name,
) )
alert_state.formatted_message = formatted_msg alert_state.formatted_message = formatted_msg
@@ -1203,6 +1233,8 @@ class ThresholdChecker:
value: Any, value: Any,
threshold: ThresholdConfig, threshold: ThresholdConfig,
plugin_data: Optional[Dict[str, Any]], plugin_data: Optional[Dict[str, Any]],
check_name: Optional[str] = None,
metric_name: Optional[str] = None,
) -> None: ) -> None:
"""Called when alert level is unchanged and non-OK. """Called when alert level is unchanged and non-OK.
@@ -1212,7 +1244,8 @@ class ThresholdChecker:
if alert_state.pending_since is not None: if alert_state.pending_since is not None:
if time.time() - alert_state.pending_since >= self.grace_seconds: if time.time() - alert_state.pending_since >= self.grace_seconds:
lvl, message, formatted_msg = self._trigger_notification( lvl, message, formatted_msg = self._trigger_notification(
host_name, metric_path, AlertLevel.OK, alert_state.level, value, threshold, plugin_data host_name, metric_path, AlertLevel.OK, alert_state.level, value, threshold, plugin_data,
check_name=check_name, metric_name=metric_name,
) )
alert_state.formatted_message = formatted_msg alert_state.formatted_message = formatted_msg
self._send_notification( self._send_notification(
@@ -1221,7 +1254,7 @@ class ThresholdChecker:
alert_state.pending_since = None alert_state.pending_since = None
# else: still within grace window, do nothing # else: still within grace window, do nothing
else: else:
self._check_renotify(host_name, alert_state, metric_path, value, threshold, plugin_data) self._check_renotify(host_name, alert_state, metric_path, value, threshold, plugin_data, check_name=check_name, metric_name=metric_name)
def _check_renotify( def _check_renotify(
self, self,
@@ -1231,6 +1264,8 @@ class ThresholdChecker:
value: Any, value: Any,
threshold: ThresholdConfig, threshold: ThresholdConfig,
plugin_data: Optional[Dict[str, Any]] = None, plugin_data: Optional[Dict[str, Any]] = None,
check_name: Optional[str] = None,
metric_name: Optional[str] = None,
): ):
"""Check if we should send a repeat notification. """Check if we should send a repeat notification.
@@ -1277,7 +1312,9 @@ class ThresholdChecker:
value=value, value=value,
threshold_value=threshold_value, threshold_value=threshold_value,
op_symbol=op_symbol, op_symbol=op_symbol,
plugin_data=plugin_data plugin_data=plugin_data,
check_name=check_name,
metric_name=metric_name,
) )
message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} {threshold_info}, ongoing for {int(now - alert_state.since)}s" message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} {threshold_info}, ongoing for {int(now - alert_state.since)}s"
else: else:
@@ -1310,7 +1347,7 @@ class ThresholdChecker:
if not host.alert_states: if not host.alert_states:
continue continue
configured = self.get_thresholds_for_host(hostname) configured = self.get_thresholds_for_host(hostname)
stale = [mp for mp in host.alert_states if mp not in configured] stale = [mp for mp in host.alert_states if self._find_threshold(configured, mp)[0] is None]
for mp in stale: for mp in stale:
logger.info( logger.info(
"Purging stale alert state for %s / %s (no threshold configured)", "Purging stale alert state for %s / %s (no threshold configured)",