feat: nagios operator for direct exit-code severity mapping
Add ComparisonOperator.NAGIOS ("nagios") that maps Nagios exit codes
directly to alert levels (0=OK 1=WARNING 2=CRITICAL 3=UNKNOWN) without
requiring numeric warning/critical thresholds. Hysteresis is bypassed for
discrete codes. Display template defaults to "{check_name}: {output}".
_format_display() handles None threshold_value gracefully.
Add nagios_runner.status_code as a built-in default threshold config so
nagios checks alert out of the box.
Also: fix alerts.html scrolling (override html,body), make hostname a link
to /plugins#<hostname>, remove overall_status/overall_status_code/plugin_count
from nagios_runner and hbc_mini, replace with computed worst-status in
plugins.html via nagiosWorstStatus() helper.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -296,7 +296,7 @@ Available variables:
|
||||
|---|---|
|
||||
| `{value}` | Current metric value |
|
||||
| `{threshold_value}` | Threshold that was crossed |
|
||||
| `{op_symbol}` | Comparison operator (`>`, `<`, `>=`, …) |
|
||||
| `{op_symbol}` | Comparison operator (`>`, `<`, `>=`, …); `"nagios"` for the nagios operator |
|
||||
| `{check_name}` | Prefix stripped by generic matching (see below) |
|
||||
| `{metric_name}` | Full field name within the plugin data |
|
||||
| `{output}` | For `nagios_runner` generic matches: the matched check's status text (alias for `{check_name}_output`) |
|
||||
@@ -316,15 +316,13 @@ nagios_runner.root_status_code → no match
|
||||
nagios_runner.status_code → matched ✓
|
||||
```
|
||||
|
||||
Configure the generic threshold once:
|
||||
Configure the generic threshold once using the `nagios` operator, which maps exit codes directly to alert severity without requiring numeric warning/critical values:
|
||||
|
||||
```yaml
|
||||
nagios_runner:
|
||||
status_code:
|
||||
warning: 1
|
||||
critical: 2
|
||||
operator: ">="
|
||||
display: "{check_name}: exit {value}"
|
||||
operator: "nagios" # 0=OK 1=WARNING 2=CRITICAL 3=UNKNOWN
|
||||
display: "{check_name}: {output}"
|
||||
```
|
||||
|
||||
The stripped prefix (`check_disk_root` in the example above) is available as `{check_name}` in the display template, so you can identify which check triggered the alert without writing a separate threshold entry per command.
|
||||
|
||||
@@ -104,11 +104,6 @@ The `nagios_runner` plugin collects:
|
||||
- `{name}_{metric}_min` - Minimum value (if present)
|
||||
- `{name}_{metric}_max` - Maximum value (if present)
|
||||
|
||||
**Overall:**
|
||||
- `overall_status` - Worst status from all commands
|
||||
- `overall_status_code` - Worst status code
|
||||
- `plugin_count` - Number of Nagios plugins executed
|
||||
|
||||
## Configuration Options
|
||||
|
||||
```yaml
|
||||
|
||||
@@ -1110,33 +1110,6 @@ hosts:
|
||||
db-02:
|
||||
threshold_config: [tight_memory, db_disk]
|
||||
```
|
||||
|
||||
### Backward Compatibility
|
||||
|
||||
The legacy single threshold configuration is fully supported:
|
||||
|
||||
```yaml
|
||||
# Old format - still works
|
||||
thresholds:
|
||||
cpu_monitor:
|
||||
cpu_percent:
|
||||
warning: 80.0
|
||||
critical: 90.0
|
||||
```
|
||||
|
||||
This is equivalent to:
|
||||
|
||||
```yaml
|
||||
# New format
|
||||
threshold_configs:
|
||||
default:
|
||||
thresholds:
|
||||
cpu_monitor:
|
||||
cpu_percent:
|
||||
warning: 80.0
|
||||
critical: 90.0
|
||||
```
|
||||
|
||||
### Configuration Priority
|
||||
|
||||
1. **Host `threshold_config` (list)**: Layer each named config's overrides left-to-right on top of the defaults
|
||||
|
||||
@@ -95,6 +95,12 @@ THRESHOLD_DEFAULTS = {
|
||||
'warning': 200,
|
||||
'critical': 250.0,
|
||||
'count': 3 # Optional: number of consecutive breaches before alerting
|
||||
},
|
||||
'nagios_runner': {
|
||||
'status_code': {
|
||||
'display': '{check_name} {output}',
|
||||
'operator': "nagios"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -437,7 +437,7 @@
|
||||
<div class="alert-main">
|
||||
<div class="alert-header">
|
||||
<span class="alert-level ${level}">${alert.level}</span>
|
||||
<a class="alert-hostname" href="/plugins/${alert.hostname}">${alert.hostname}</a>
|
||||
<a class="alert-hostname" href="/plugins#${alert.hostname}">${alert.hostname}</a>
|
||||
</div>
|
||||
<div class="alert-metric">${alert.metric_path}</div>
|
||||
<div class="alert-details">
|
||||
|
||||
@@ -499,6 +499,17 @@
|
||||
return pluginCache[hostname]?.[pluginName] ?? null;
|
||||
}
|
||||
|
||||
// Return worst nagios exit code (0-3) found in a nagios_runner data object.
|
||||
function nagiosWorstStatus(data) {
|
||||
let worst = 0;
|
||||
for (const [k, v] of Object.entries(data || {})) {
|
||||
if (k.endsWith('_status_code') && typeof v === 'number' && v > worst) {
|
||||
worst = v;
|
||||
}
|
||||
}
|
||||
return worst;
|
||||
}
|
||||
|
||||
// ── Fetch helpers ───────────────────────────────────────────────────────
|
||||
|
||||
async function fetchPlugin(hostname, pluginName) {
|
||||
@@ -600,13 +611,13 @@
|
||||
? chips.join('')
|
||||
: '<span class="glance-loading">—</span>';
|
||||
|
||||
// Nagios badge
|
||||
// Nagios badge — derive worst status from individual check codes
|
||||
const nagios = getCache(hostname, 'nagios_runner');
|
||||
if (nagosBadge && nagios) {
|
||||
const status = (nagios.data.overall_status || '—').toUpperCase();
|
||||
const cls = status === 'OK' ? 'ok'
|
||||
: status === 'WARNING' ? 'warning'
|
||||
: status === 'CRITICAL' ? 'critical' : '';
|
||||
const worst = nagiosWorstStatus(nagios.data);
|
||||
const names = {0:'OK', 1:'WARNING', 2:'CRITICAL', 3:'UNKNOWN'};
|
||||
const status = names[worst] || '—';
|
||||
const cls = worst === 0 ? 'ok' : worst === 1 ? 'warning' : worst >= 2 ? 'critical' : '';
|
||||
nagosBadge.className = `nagios-badge ${cls}`;
|
||||
nagosBadge.textContent = status;
|
||||
}
|
||||
@@ -715,9 +726,10 @@
|
||||
break;
|
||||
}
|
||||
case 'nagios_runner': {
|
||||
const status = (d.overall_status || '?').toUpperCase();
|
||||
const count = d.plugin_count;
|
||||
text = status + (count != null ? ` — ${count} checks` : '');
|
||||
const worst = nagiosWorstStatus(d);
|
||||
const names = {0:'OK', 1:'WARNING', 2:'CRITICAL', 3:'UNKNOWN'};
|
||||
const codes = Object.keys(d).filter(k => k.endsWith('_status_code'));
|
||||
text = (names[worst] || '?') + (codes.length ? ` — ${codes.length} checks` : '');
|
||||
break;
|
||||
}
|
||||
case 'filesystem_info': {
|
||||
|
||||
+60
-52
@@ -30,12 +30,13 @@ class AlertLevel(Enum):
|
||||
|
||||
class ComparisonOperator(Enum):
|
||||
"""Supported comparison operators for threshold checks."""
|
||||
GT = ">" # Greater than
|
||||
GTE = ">=" # Greater than or equal
|
||||
LT = "<" # Less than
|
||||
LTE = "<=" # Less than or equal
|
||||
EQ = "==" # Equal to
|
||||
NEQ = "!=" # Not equal to
|
||||
GT = ">" # Greater than
|
||||
GTE = ">=" # Greater than or equal
|
||||
LT = "<" # Less than
|
||||
LTE = "<=" # Less than or equal
|
||||
EQ = "==" # Equal to
|
||||
NEQ = "!=" # Not equal to
|
||||
NAGIOS = "nagios" # Nagios exit-code semantics: 0=OK 1=WARNING 2=CRITICAL 3=UNKNOWN
|
||||
|
||||
|
||||
class AlertState:
|
||||
@@ -239,6 +240,16 @@ class ThresholdConfig:
|
||||
if not self.enabled:
|
||||
return AlertLevel.OK
|
||||
|
||||
# Nagios exit-code semantics: value IS the severity
|
||||
if self.operator == ComparisonOperator.NAGIOS:
|
||||
try:
|
||||
code = int(value)
|
||||
except (TypeError, ValueError):
|
||||
return AlertLevel.UNKNOWN
|
||||
return {0: AlertLevel.OK, 1: AlertLevel.WARNING, 2: AlertLevel.CRITICAL}.get(
|
||||
code, AlertLevel.UNKNOWN
|
||||
)
|
||||
|
||||
try:
|
||||
# Convert value to float for comparison
|
||||
value = float(value)
|
||||
@@ -275,6 +286,10 @@ class ThresholdConfig:
|
||||
"""
|
||||
new_level = self.evaluate(value)
|
||||
|
||||
# Nagios exit codes are discrete integers — hysteresis doesn't apply
|
||||
if self.operator == ComparisonOperator.NAGIOS:
|
||||
return new_level
|
||||
|
||||
# If no hysteresis, return new level
|
||||
if self.hysteresis == 0.0:
|
||||
return new_level
|
||||
@@ -557,11 +572,14 @@ class ThresholdChecker:
|
||||
warning = threshold_config.get("warning")
|
||||
critical = threshold_config.get("critical")
|
||||
operator = threshold_config.get("operator", ">")
|
||||
display = threshold_config.get("display", "(threshold: {op_symbol} {threshold_value})")
|
||||
hysteresis = threshold_config.get("hysteresis", 0.02) # 2% default
|
||||
# Nagios operator maps exit codes directly; no numeric thresholds needed
|
||||
is_nagios_op = (operator == "nagios")
|
||||
default_display = "{check_name}: {output}" if is_nagios_op else "(threshold: {op_symbol} {threshold_value})"
|
||||
display = threshold_config.get("display", default_display)
|
||||
hysteresis = threshold_config.get("hysteresis", 0.0 if is_nagios_op else 0.02)
|
||||
enabled = threshold_config.get("enabled", True)
|
||||
|
||||
if warning is None and critical is None:
|
||||
if warning is None and critical is None and not is_nagios_op:
|
||||
logger.warning("No thresholds defined for %s, skipping", metric_path)
|
||||
continue
|
||||
|
||||
@@ -1016,48 +1034,12 @@ class ThresholdChecker:
|
||||
import math
|
||||
display_value = "overdue" if isinstance(value, float) and math.isinf(value) else value
|
||||
|
||||
# Format message
|
||||
if new_level == AlertLevel.OK:
|
||||
lvl = "RECOVER"
|
||||
message = f"{metric_path} = {display_value} ({old_level.name} -> OK)"
|
||||
elif new_level == AlertLevel.WARNING:
|
||||
lvl = "WARNING"
|
||||
if threshold_value is not None:
|
||||
threshold_info = self._format_display(
|
||||
threshold.display,
|
||||
value=display_value,
|
||||
threshold_value=threshold_value,
|
||||
op_symbol=op_symbol,
|
||||
plugin_data=plugin_data,
|
||||
check_name=check_name,
|
||||
metric_name=metric_name,
|
||||
)
|
||||
message = f"{metric_path} = {display_value} {threshold_info}"
|
||||
else:
|
||||
message = f"{metric_path} = {display_value}"
|
||||
elif new_level == AlertLevel.CRITICAL:
|
||||
lvl = "CRITICAL"
|
||||
if threshold_value is not None:
|
||||
threshold_info = self._format_display(
|
||||
threshold.display,
|
||||
value=display_value,
|
||||
threshold_value=threshold_value,
|
||||
op_symbol=op_symbol,
|
||||
plugin_data=plugin_data,
|
||||
check_name=check_name,
|
||||
metric_name=metric_name,
|
||||
)
|
||||
message = f"{metric_path} = {display_value} {threshold_info}"
|
||||
else:
|
||||
message = f"{metric_path} = {display_value}"
|
||||
else:
|
||||
lvl = "UNKNOWN"
|
||||
message = f"{metric_path} = {display_value}"
|
||||
# Format message — for the nagios operator there is no numeric threshold_value;
|
||||
# render the display template whenever one is available.
|
||||
has_display = threshold_value is not None or threshold.operator == ComparisonOperator.NAGIOS
|
||||
|
||||
# Return the formatted threshold info for storing in AlertState
|
||||
formatted_threshold_msg = None
|
||||
if threshold_value is not None and new_level != AlertLevel.OK:
|
||||
formatted_threshold_msg = self._format_display(
|
||||
def _fmt():
|
||||
return self._format_display(
|
||||
threshold.display,
|
||||
value=display_value,
|
||||
threshold_value=threshold_value,
|
||||
@@ -1067,6 +1049,31 @@ class ThresholdChecker:
|
||||
metric_name=metric_name,
|
||||
)
|
||||
|
||||
if new_level == AlertLevel.OK:
|
||||
lvl = "RECOVER"
|
||||
message = f"{metric_path} = {display_value} ({old_level.name} -> OK)"
|
||||
elif new_level == AlertLevel.WARNING:
|
||||
lvl = "WARNING"
|
||||
if has_display:
|
||||
message = f"{metric_path} = {display_value} {_fmt()}"
|
||||
else:
|
||||
message = f"{metric_path} = {display_value}"
|
||||
elif new_level == AlertLevel.CRITICAL:
|
||||
lvl = "CRITICAL"
|
||||
if has_display:
|
||||
message = f"{metric_path} = {display_value} {_fmt()}"
|
||||
else:
|
||||
message = f"{metric_path} = {display_value}"
|
||||
else:
|
||||
lvl = "UNKNOWN"
|
||||
if has_display:
|
||||
message = f"{metric_path} = {display_value} {_fmt()}"
|
||||
else:
|
||||
message = f"{metric_path} = {display_value}"
|
||||
|
||||
# Formatted threshold info stored on AlertState for the UI
|
||||
formatted_threshold_msg = _fmt() if has_display and new_level != AlertLevel.OK else None
|
||||
|
||||
return lvl, message, formatted_threshold_msg
|
||||
|
||||
def _send_notification(
|
||||
@@ -1114,7 +1121,7 @@ class ThresholdChecker:
|
||||
self,
|
||||
display_format: str,
|
||||
value: Any,
|
||||
threshold_value: float,
|
||||
threshold_value: Optional[float],
|
||||
op_symbol: str,
|
||||
plugin_data: Optional[Dict[str, Any]] = None,
|
||||
check_name: Optional[str] = None,
|
||||
@@ -1139,9 +1146,10 @@ class ThresholdChecker:
|
||||
# Build format context with standard variables
|
||||
format_context = {
|
||||
'value': value,
|
||||
'threshold_value': threshold_value,
|
||||
'op_symbol': op_symbol,
|
||||
}
|
||||
if threshold_value is not None:
|
||||
format_context['threshold_value'] = threshold_value
|
||||
|
||||
# Add generic-match context variables when available
|
||||
if check_name is not None:
|
||||
|
||||
@@ -388,7 +388,6 @@ class NagiosRunnerPlugin(MonitorPlugin):
|
||||
|
||||
async def _collect_metrics(self) -> Dict[str, Any]:
|
||||
results: Dict[str, Any] = {}
|
||||
worst = 0
|
||||
for cmd_cfg in self.commands:
|
||||
name = cmd_cfg.get("name")
|
||||
command = cmd_cfg.get("command")
|
||||
@@ -399,10 +398,6 @@ class NagiosRunnerPlugin(MonitorPlugin):
|
||||
results[f"{name}_status_code"] = rc
|
||||
results[f"{name}_output"] = msg
|
||||
results.update({f"{name}_{k}": v for k, v in perf.items()})
|
||||
worst = max(worst, rc)
|
||||
results["overall_status"] = _NAGIOS_STATUS.get(worst, "UNKNOWN")
|
||||
results["overall_status_code"] = worst
|
||||
results["plugin_count"] = len(self.commands)
|
||||
return results
|
||||
|
||||
|
||||
|
||||
+1
-2
@@ -68,8 +68,7 @@ async def test_nagios_runner():
|
||||
print(f" ✓ Collected {len(data)} data points")
|
||||
|
||||
print(f"\n4. Results:")
|
||||
print(f" Overall Status: {data.get('overall_status')} (code: {data.get('overall_status_code')})")
|
||||
print(f" Plugins Executed: {data.get('plugin_count')}")
|
||||
print(f" Data points collected: {len(data)}")
|
||||
|
||||
# Show individual plugin results
|
||||
print(f"\n5. Individual Plugin Results:")
|
||||
|
||||
Reference in New Issue
Block a user