feat: nagios operator for direct exit-code severity mapping

Add ComparisonOperator.NAGIOS ("nagios"), which maps Nagios exit codes
directly to alert levels (0=OK, 1=WARNING, 2=CRITICAL, anything else=UNKNOWN)
without requiring numeric warning/critical thresholds. Hysteresis is bypassed
for this operator because exit codes are discrete. The display template
defaults to "{check_name}: {output}", and _format_display() now accepts a
threshold_value of None.

Add nagios_runner.status_code as a built-in default threshold config so
nagios checks alert out of the box.

Also: fix alerts.html scrolling (override html,body); point the alert
hostname link at /plugins#<hostname>; remove overall_status,
overall_status_code and plugin_count from nagios_runner and hbc_mini.
plugins.html now computes the worst status from the individual check
status codes via the nagiosWorstStatus() helper.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-05 12:26:56 -04:00
parent d7b5c97a4e
commit a534c06b26
9 changed files with 100 additions and 114 deletions
+4 -6
View File
@@ -296,7 +296,7 @@ Available variables:
|---|---| |---|---|
| `{value}` | Current metric value | | `{value}` | Current metric value |
| `{threshold_value}` | Threshold that was crossed | | `{threshold_value}` | Threshold that was crossed |
| `{op_symbol}` | Comparison operator (`>`, `<`, `>=`, …) | | `{op_symbol}` | Comparison operator (`>`, `<`, `>=`, …); `"nagios"` for the nagios operator |
| `{check_name}` | Prefix stripped by generic matching (see below) | | `{check_name}` | Prefix stripped by generic matching (see below) |
| `{metric_name}` | Full field name within the plugin data | | `{metric_name}` | Full field name within the plugin data |
| `{output}` | For `nagios_runner` generic matches: the matched check's status text (alias for `{check_name}_output`) | | `{output}` | For `nagios_runner` generic matches: the matched check's status text (alias for `{check_name}_output`) |
@@ -316,15 +316,13 @@ nagios_runner.root_status_code → no match
nagios_runner.status_code → matched ✓ nagios_runner.status_code → matched ✓
``` ```
Configure the generic threshold once: Configure the generic threshold once using the `nagios` operator, which maps exit codes directly to alert severity without requiring numeric warning/critical values:
```yaml ```yaml
nagios_runner: nagios_runner:
status_code: status_code:
warning: 1 operator: "nagios" # 0=OK 1=WARNING 2=CRITICAL 3=UNKNOWN
critical: 2 display: "{check_name}: {output}"
operator: ">="
display: "{check_name}: exit {value}"
``` ```
The stripped prefix (`check_disk_root` in the example above) is available as `{check_name}` in the display template, so you can identify which check triggered the alert without writing a separate threshold entry per command. The stripped prefix (`check_disk_root` in the example above) is available as `{check_name}` in the display template, so you can identify which check triggered the alert without writing a separate threshold entry per command.
-5
View File
@@ -104,11 +104,6 @@ The `nagios_runner` plugin collects:
- `{name}_{metric}_min` - Minimum value (if present) - `{name}_{metric}_min` - Minimum value (if present)
- `{name}_{metric}_max` - Maximum value (if present) - `{name}_{metric}_max` - Maximum value (if present)
**Overall:**
- `overall_status` - Worst status from all commands
- `overall_status_code` - Worst status code
- `plugin_count` - Number of Nagios plugins executed
## Configuration Options ## Configuration Options
```yaml ```yaml
-27
View File
@@ -1110,33 +1110,6 @@ hosts:
db-02: db-02:
threshold_config: [tight_memory, db_disk] threshold_config: [tight_memory, db_disk]
``` ```
### Backward Compatibility
The legacy single threshold configuration is fully supported:
```yaml
# Old format - still works
thresholds:
cpu_monitor:
cpu_percent:
warning: 80.0
critical: 90.0
```
This is equivalent to:
```yaml
# New format
threshold_configs:
default:
thresholds:
cpu_monitor:
cpu_percent:
warning: 80.0
critical: 90.0
```
### Configuration Priority ### Configuration Priority
1. **Host `threshold_config` (list)**: Layer each named config's overrides left-to-right on top of the defaults 1. **Host `threshold_config` (list)**: Layer each named config's overrides left-to-right on top of the defaults
+6
View File
@@ -95,6 +95,12 @@ THRESHOLD_DEFAULTS = {
'warning': 200, 'warning': 200,
'critical': 250.0, 'critical': 250.0,
'count': 3 # Optional: number of consecutive breaches before alerting 'count': 3 # Optional: number of consecutive breaches before alerting
},
'nagios_runner': {
'status_code': {
'display': '{check_name} {output}',
'operator': "nagios"
}
} }
} }
} }
+1 -1
View File
@@ -437,7 +437,7 @@
<div class="alert-main"> <div class="alert-main">
<div class="alert-header"> <div class="alert-header">
<span class="alert-level ${level}">${alert.level}</span> <span class="alert-level ${level}">${alert.level}</span>
<a class="alert-hostname" href="/plugins/${alert.hostname}">${alert.hostname}</a> <a class="alert-hostname" href="/plugins#${alert.hostname}">${alert.hostname}</a>
</div> </div>
<div class="alert-metric">${alert.metric_path}</div> <div class="alert-metric">${alert.metric_path}</div>
<div class="alert-details"> <div class="alert-details">
+20 -8
View File
@@ -499,6 +499,17 @@
return pluginCache[hostname]?.[pluginName] ?? null; return pluginCache[hostname]?.[pluginName] ?? null;
} }
// Return worst nagios exit code (0-3) found in a nagios_runner data object.
function nagiosWorstStatus(data) {
let worst = 0;
for (const [k, v] of Object.entries(data || {})) {
if (k.endsWith('_status_code') && typeof v === 'number' && v > worst) {
worst = v;
}
}
return worst;
}
// ── Fetch helpers ─────────────────────────────────────────────────────── // ── Fetch helpers ───────────────────────────────────────────────────────
async function fetchPlugin(hostname, pluginName) { async function fetchPlugin(hostname, pluginName) {
@@ -600,13 +611,13 @@
? chips.join('') ? chips.join('')
: '<span class="glance-loading">—</span>'; : '<span class="glance-loading">—</span>';
// Nagios badge // Nagios badge — derive worst status from individual check codes
const nagios = getCache(hostname, 'nagios_runner'); const nagios = getCache(hostname, 'nagios_runner');
if (nagosBadge && nagios) { if (nagosBadge && nagios) {
const status = (nagios.data.overall_status || '—').toUpperCase(); const worst = nagiosWorstStatus(nagios.data);
const cls = status === 'OK' ? 'ok' const names = {0:'OK', 1:'WARNING', 2:'CRITICAL', 3:'UNKNOWN'};
: status === 'WARNING' ? 'warning' const status = names[worst] || '—';
: status === 'CRITICAL' ? 'critical' : ''; const cls = worst === 0 ? 'ok' : worst === 1 ? 'warning' : worst >= 2 ? 'critical' : '';
nagosBadge.className = `nagios-badge ${cls}`; nagosBadge.className = `nagios-badge ${cls}`;
nagosBadge.textContent = status; nagosBadge.textContent = status;
} }
@@ -715,9 +726,10 @@
break; break;
} }
case 'nagios_runner': { case 'nagios_runner': {
const status = (d.overall_status || '?').toUpperCase(); const worst = nagiosWorstStatus(d);
const count = d.plugin_count; const names = {0:'OK', 1:'WARNING', 2:'CRITICAL', 3:'UNKNOWN'};
text = status + (count != null ? `${count} checks` : ''); const codes = Object.keys(d).filter(k => k.endsWith('_status_code'));
text = (names[worst] || '?') + (codes.length ? `${codes.length} checks` : '');
break; break;
} }
case 'filesystem_info': { case 'filesystem_info': {
+68 -60
View File
@@ -30,12 +30,13 @@ class AlertLevel(Enum):
class ComparisonOperator(Enum): class ComparisonOperator(Enum):
"""Supported comparison operators for threshold checks.""" """Supported comparison operators for threshold checks."""
GT = ">" # Greater than GT = ">" # Greater than
GTE = ">=" # Greater than or equal GTE = ">=" # Greater than or equal
LT = "<" # Less than LT = "<" # Less than
LTE = "<=" # Less than or equal LTE = "<=" # Less than or equal
EQ = "==" # Equal to EQ = "==" # Equal to
NEQ = "!=" # Not equal to NEQ = "!=" # Not equal to
NAGIOS = "nagios" # Nagios exit-code semantics: 0=OK 1=WARNING 2=CRITICAL 3=UNKNOWN
class AlertState: class AlertState:
@@ -229,33 +230,43 @@ class ThresholdConfig:
def evaluate(self, value: float) -> AlertLevel: def evaluate(self, value: float) -> AlertLevel:
""" """
Evaluate a value against this threshold. Evaluate a value against this threshold.
Args: Args:
value: Metric value to check value: Metric value to check
Returns: Returns:
AlertLevel indicating the severity AlertLevel indicating the severity
""" """
if not self.enabled: if not self.enabled:
return AlertLevel.OK return AlertLevel.OK
# Nagios exit-code semantics: value IS the severity
if self.operator == ComparisonOperator.NAGIOS:
try:
code = int(value)
except (TypeError, ValueError):
return AlertLevel.UNKNOWN
return {0: AlertLevel.OK, 1: AlertLevel.WARNING, 2: AlertLevel.CRITICAL}.get(
code, AlertLevel.UNKNOWN
)
try: try:
# Convert value to float for comparison # Convert value to float for comparison
value = float(value) value = float(value)
except (TypeError, ValueError): except (TypeError, ValueError):
logger.warning("Cannot convert value %s to float for %s", value, self.metric_path) logger.warning("Cannot convert value %s to float for %s", value, self.metric_path)
return AlertLevel.UNKNOWN return AlertLevel.UNKNOWN
# Check critical threshold first # Check critical threshold first
if self.critical is not None: if self.critical is not None:
if self._compare(value, self.critical): if self._compare(value, self.critical):
return AlertLevel.CRITICAL return AlertLevel.CRITICAL
# Then check warning threshold # Then check warning threshold
if self.warning is not None: if self.warning is not None:
if self._compare(value, self.warning): if self._compare(value, self.warning):
return AlertLevel.WARNING return AlertLevel.WARNING
return AlertLevel.OK return AlertLevel.OK
def evaluate_with_hysteresis( def evaluate_with_hysteresis(
@@ -274,7 +285,11 @@ class ThresholdConfig:
New alert level considering hysteresis New alert level considering hysteresis
""" """
new_level = self.evaluate(value) new_level = self.evaluate(value)
# Nagios exit codes are discrete integers — hysteresis doesn't apply
if self.operator == ComparisonOperator.NAGIOS:
return new_level
# If no hysteresis, return new level # If no hysteresis, return new level
if self.hysteresis == 0.0: if self.hysteresis == 0.0:
return new_level return new_level
@@ -557,11 +572,14 @@ class ThresholdChecker:
warning = threshold_config.get("warning") warning = threshold_config.get("warning")
critical = threshold_config.get("critical") critical = threshold_config.get("critical")
operator = threshold_config.get("operator", ">") operator = threshold_config.get("operator", ">")
display = threshold_config.get("display", "(threshold: {op_symbol} {threshold_value})") # Nagios operator maps exit codes directly; no numeric thresholds needed
hysteresis = threshold_config.get("hysteresis", 0.02) # 2% default is_nagios_op = (operator == "nagios")
default_display = "{check_name}: {output}" if is_nagios_op else "(threshold: {op_symbol} {threshold_value})"
display = threshold_config.get("display", default_display)
hysteresis = threshold_config.get("hysteresis", 0.0 if is_nagios_op else 0.02)
enabled = threshold_config.get("enabled", True) enabled = threshold_config.get("enabled", True)
if warning is None and critical is None: if warning is None and critical is None and not is_nagios_op:
logger.warning("No thresholds defined for %s, skipping", metric_path) logger.warning("No thresholds defined for %s, skipping", metric_path)
continue continue
@@ -1016,48 +1034,12 @@ class ThresholdChecker:
import math import math
display_value = "overdue" if isinstance(value, float) and math.isinf(value) else value display_value = "overdue" if isinstance(value, float) and math.isinf(value) else value
# Format message # Format message — for the nagios operator there is no numeric threshold_value;
if new_level == AlertLevel.OK: # render the display template whenever one is available.
lvl = "RECOVER" has_display = threshold_value is not None or threshold.operator == ComparisonOperator.NAGIOS
message = f"{metric_path} = {display_value} ({old_level.name} -> OK)"
elif new_level == AlertLevel.WARNING:
lvl = "WARNING"
if threshold_value is not None:
threshold_info = self._format_display(
threshold.display,
value=display_value,
threshold_value=threshold_value,
op_symbol=op_symbol,
plugin_data=plugin_data,
check_name=check_name,
metric_name=metric_name,
)
message = f"{metric_path} = {display_value} {threshold_info}"
else:
message = f"{metric_path} = {display_value}"
elif new_level == AlertLevel.CRITICAL:
lvl = "CRITICAL"
if threshold_value is not None:
threshold_info = self._format_display(
threshold.display,
value=display_value,
threshold_value=threshold_value,
op_symbol=op_symbol,
plugin_data=plugin_data,
check_name=check_name,
metric_name=metric_name,
)
message = f"{metric_path} = {display_value} {threshold_info}"
else:
message = f"{metric_path} = {display_value}"
else:
lvl = "UNKNOWN"
message = f"{metric_path} = {display_value}"
# Return the formatted threshold info for storing in AlertState def _fmt():
formatted_threshold_msg = None return self._format_display(
if threshold_value is not None and new_level != AlertLevel.OK:
formatted_threshold_msg = self._format_display(
threshold.display, threshold.display,
value=display_value, value=display_value,
threshold_value=threshold_value, threshold_value=threshold_value,
@@ -1067,6 +1049,31 @@ class ThresholdChecker:
metric_name=metric_name, metric_name=metric_name,
) )
if new_level == AlertLevel.OK:
lvl = "RECOVER"
message = f"{metric_path} = {display_value} ({old_level.name} -> OK)"
elif new_level == AlertLevel.WARNING:
lvl = "WARNING"
if has_display:
message = f"{metric_path} = {display_value} {_fmt()}"
else:
message = f"{metric_path} = {display_value}"
elif new_level == AlertLevel.CRITICAL:
lvl = "CRITICAL"
if has_display:
message = f"{metric_path} = {display_value} {_fmt()}"
else:
message = f"{metric_path} = {display_value}"
else:
lvl = "UNKNOWN"
if has_display:
message = f"{metric_path} = {display_value} {_fmt()}"
else:
message = f"{metric_path} = {display_value}"
# Formatted threshold info stored on AlertState for the UI
formatted_threshold_msg = _fmt() if has_display and new_level != AlertLevel.OK else None
return lvl, message, formatted_threshold_msg return lvl, message, formatted_threshold_msg
def _send_notification( def _send_notification(
@@ -1114,7 +1121,7 @@ class ThresholdChecker:
self, self,
display_format: str, display_format: str,
value: Any, value: Any,
threshold_value: float, threshold_value: Optional[float],
op_symbol: str, op_symbol: str,
plugin_data: Optional[Dict[str, Any]] = None, plugin_data: Optional[Dict[str, Any]] = None,
check_name: Optional[str] = None, check_name: Optional[str] = None,
@@ -1139,9 +1146,10 @@ class ThresholdChecker:
# Build format context with standard variables # Build format context with standard variables
format_context = { format_context = {
'value': value, 'value': value,
'threshold_value': threshold_value,
'op_symbol': op_symbol, 'op_symbol': op_symbol,
} }
if threshold_value is not None:
format_context['threshold_value'] = threshold_value
# Add generic-match context variables when available # Add generic-match context variables when available
if check_name is not None: if check_name is not None:
-5
View File
@@ -388,7 +388,6 @@ class NagiosRunnerPlugin(MonitorPlugin):
async def _collect_metrics(self) -> Dict[str, Any]: async def _collect_metrics(self) -> Dict[str, Any]:
results: Dict[str, Any] = {} results: Dict[str, Any] = {}
worst = 0
for cmd_cfg in self.commands: for cmd_cfg in self.commands:
name = cmd_cfg.get("name") name = cmd_cfg.get("name")
command = cmd_cfg.get("command") command = cmd_cfg.get("command")
@@ -399,10 +398,6 @@ class NagiosRunnerPlugin(MonitorPlugin):
results[f"{name}_status_code"] = rc results[f"{name}_status_code"] = rc
results[f"{name}_output"] = msg results[f"{name}_output"] = msg
results.update({f"{name}_{k}": v for k, v in perf.items()}) results.update({f"{name}_{k}": v for k, v in perf.items()})
worst = max(worst, rc)
results["overall_status"] = _NAGIOS_STATUS.get(worst, "UNKNOWN")
results["overall_status_code"] = worst
results["plugin_count"] = len(self.commands)
return results return results
+1 -2
View File
@@ -68,8 +68,7 @@ async def test_nagios_runner():
print(f" ✓ Collected {len(data)} data points") print(f" ✓ Collected {len(data)} data points")
print(f"\n4. Results:") print(f"\n4. Results:")
print(f" Overall Status: {data.get('overall_status')} (code: {data.get('overall_status_code')})") print(f" Data points collected: {len(data)}")
print(f" Plugins Executed: {data.get('plugin_count')}")
# Show individual plugin results # Show individual plugin results
print(f"\n5. Individual Plugin Results:") print(f"\n5. Individual Plugin Results:")