diff --git a/README.md b/README.md
index c42dd67..5308a96 100644
--- a/README.md
+++ b/README.md
@@ -296,7 +296,7 @@ Available variables:
|---|---|
| `{value}` | Current metric value |
| `{threshold_value}` | Threshold that was crossed |
-| `{op_symbol}` | Comparison operator (`>`, `<`, `>=`, …) |
+| `{op_symbol}` | Comparison operator (`>`, `<`, `>=`, …); `"nagios"` for the nagios operator |
| `{check_name}` | Prefix stripped by generic matching (see below) |
| `{metric_name}` | Full field name within the plugin data |
| `{output}` | For `nagios_runner` generic matches: the matched check's status text (alias for `{check_name}_output`) |
@@ -316,15 +316,13 @@ nagios_runner.root_status_code → no match
nagios_runner.status_code → matched ✓
```
-Configure the generic threshold once:
+Configure the generic threshold once using the `nagios` operator, which maps exit codes directly to alert severity without requiring numeric warning/critical values:
```yaml
nagios_runner:
status_code:
- warning: 1
- critical: 2
- operator: ">="
- display: "{check_name}: exit {value}"
+ operator: "nagios" # 0=OK 1=WARNING 2=CRITICAL, anything else (incl. 3)=UNKNOWN
+ display: "{check_name}: {output}"
```
The stripped prefix (`check_disk_root` in the example above) is available as `{check_name}` in the display template, so you can identify which check triggered the alert without writing a separate threshold entry per command.
diff --git a/docs/NAGIOS_INTEGRATION.md b/docs/NAGIOS_INTEGRATION.md
index d0bca47..f372429 100644
--- a/docs/NAGIOS_INTEGRATION.md
+++ b/docs/NAGIOS_INTEGRATION.md
@@ -104,11 +104,6 @@ The `nagios_runner` plugin collects:
- `{name}_{metric}_min` - Minimum value (if present)
- `{name}_{metric}_max` - Maximum value (if present)
-**Overall:**
-- `overall_status` - Worst status from all commands
-- `overall_status_code` - Worst status code
-- `plugin_count` - Number of Nagios plugins executed
-
## Configuration Options
```yaml
diff --git a/docs/THRESHOLD_ALERTING.md b/docs/THRESHOLD_ALERTING.md
index 48bf66c..1ad94fb 100644
--- a/docs/THRESHOLD_ALERTING.md
+++ b/docs/THRESHOLD_ALERTING.md
@@ -1110,33 +1110,6 @@ hosts:
db-02:
threshold_config: [tight_memory, db_disk]
```
-
-### Backward Compatibility
-
-The legacy single threshold configuration is fully supported:
-
-```yaml
-# Old format - still works
-thresholds:
- cpu_monitor:
- cpu_percent:
- warning: 80.0
- critical: 90.0
-```
-
-This is equivalent to:
-
-```yaml
-# New format
-threshold_configs:
- default:
- thresholds:
- cpu_monitor:
- cpu_percent:
- warning: 80.0
- critical: 90.0
-```
-
### Configuration Priority
1. **Host `threshold_config` (list)**: Layer each named config's overrides left-to-right on top of the defaults
diff --git a/hbd/server/config.py b/hbd/server/config.py
index 58074fb..becac2b 100644
--- a/hbd/server/config.py
+++ b/hbd/server/config.py
@@ -95,6 +95,12 @@ THRESHOLD_DEFAULTS = {
'warning': 200,
'critical': 250.0,
'count': 3 # Optional: number of consecutive breaches before alerting
+ },
+ 'nagios_runner': {
+ 'status_code': {
+ 'display': '{check_name} {output}',
+ 'operator': "nagios"
+ }
}
}
}
diff --git a/hbd/server/templates/alerts.html b/hbd/server/templates/alerts.html
index 9718301..733b231 100644
--- a/hbd/server/templates/alerts.html
+++ b/hbd/server/templates/alerts.html
@@ -437,7 +437,7 @@
${alert.metric_path}
diff --git a/hbd/server/templates/plugins.html b/hbd/server/templates/plugins.html
index d295beb..3d81bfb 100644
--- a/hbd/server/templates/plugins.html
+++ b/hbd/server/templates/plugins.html
@@ -499,6 +499,17 @@
return pluginCache[hostname]?.[pluginName] ?? null;
}
+ // Return the highest numeric "*_status_code" value in a nagios_runner data object (0 = OK when none is present or data is null/undefined).
+ function nagiosWorstStatus(data) {
+ let worst = 0;
+ for (const [k, v] of Object.entries(data || {})) {
+ if (k.endsWith('_status_code') && typeof v === 'number' && v > worst) {
+ worst = v;
+ }
+ }
+ return worst;
+ }
+
// ── Fetch helpers ───────────────────────────────────────────────────────
async function fetchPlugin(hostname, pluginName) {
@@ -600,13 +611,13 @@
? chips.join('')
: '—';
- // Nagios badge
+ // Nagios badge — derive worst status from individual check codes
const nagios = getCache(hostname, 'nagios_runner');
if (nagosBadge && nagios) {
- const status = (nagios.data.overall_status || '—').toUpperCase();
- const cls = status === 'OK' ? 'ok'
- : status === 'WARNING' ? 'warning'
- : status === 'CRITICAL' ? 'critical' : '';
+ const worst = nagiosWorstStatus(nagios.data);
+ const names = {0:'OK', 1:'WARNING', 2:'CRITICAL', 3:'UNKNOWN'};
+ const status = names[worst] || '—';
+ const cls = worst === 0 ? 'ok' : worst === 1 ? 'warning' : worst >= 2 ? 'critical' : '';
nagosBadge.className = `nagios-badge ${cls}`;
nagosBadge.textContent = status;
}
@@ -715,9 +726,10 @@
break;
}
case 'nagios_runner': {
- const status = (d.overall_status || '?').toUpperCase();
- const count = d.plugin_count;
- text = status + (count != null ? ` — ${count} checks` : '');
+ const worst = nagiosWorstStatus(d);
+ const names = {0:'OK', 1:'WARNING', 2:'CRITICAL', 3:'UNKNOWN'};
+ const codes = Object.keys(d).filter(k => k.endsWith('_status_code'));
+ text = (names[worst] || '?') + (codes.length ? ` — ${codes.length} checks` : '');
break;
}
case 'filesystem_info': {
diff --git a/hbd/server/threshold.py b/hbd/server/threshold.py
index f63a1fd..06125f9 100644
--- a/hbd/server/threshold.py
+++ b/hbd/server/threshold.py
@@ -30,12 +30,13 @@ class AlertLevel(Enum):
class ComparisonOperator(Enum):
"""Supported comparison operators for threshold checks."""
- GT = ">" # Greater than
- GTE = ">=" # Greater than or equal
- LT = "<" # Less than
- LTE = "<=" # Less than or equal
- EQ = "==" # Equal to
- NEQ = "!=" # Not equal to
+ GT = ">" # Greater than
+ GTE = ">=" # Greater than or equal
+ LT = "<" # Less than
+ LTE = "<=" # Less than or equal
+ EQ = "==" # Equal to
+ NEQ = "!=" # Not equal to
+ NAGIOS = "nagios" # Nagios exit-code semantics: 0=OK 1=WARNING 2=CRITICAL; 3 and any other code=UNKNOWN
class AlertState:
@@ -229,33 +230,43 @@ class ThresholdConfig:
def evaluate(self, value: float) -> AlertLevel:
"""
Evaluate a value against this threshold.
-
+
Args:
value: Metric value to check
-
+
Returns:
AlertLevel indicating the severity
"""
if not self.enabled:
return AlertLevel.OK
-
+
+ # Nagios exit-code semantics: value IS the severity; anything other than 0/1/2 (including non-numeric, NaN or infinite values) maps to UNKNOWN
+ if self.operator == ComparisonOperator.NAGIOS:
+ try:
+ code = int(value)
+ except (TypeError, ValueError, OverflowError):  # int(float('inf')) raises OverflowError, int(nan) ValueError
+ return AlertLevel.UNKNOWN
+ return {0: AlertLevel.OK, 1: AlertLevel.WARNING, 2: AlertLevel.CRITICAL}.get(
+ code, AlertLevel.UNKNOWN
+ )
+
try:
# Convert value to float for comparison
value = float(value)
except (TypeError, ValueError):
logger.warning("Cannot convert value %s to float for %s", value, self.metric_path)
return AlertLevel.UNKNOWN
-
+
# Check critical threshold first
if self.critical is not None:
if self._compare(value, self.critical):
return AlertLevel.CRITICAL
-
+
# Then check warning threshold
if self.warning is not None:
if self._compare(value, self.warning):
return AlertLevel.WARNING
-
+
return AlertLevel.OK
def evaluate_with_hysteresis(
@@ -274,7 +285,11 @@ class ThresholdConfig:
New alert level considering hysteresis
"""
new_level = self.evaluate(value)
-
+
+ # Nagios exit codes are discrete integers — hysteresis doesn't apply
+ if self.operator == ComparisonOperator.NAGIOS:
+ return new_level
+
# If no hysteresis, return new level
if self.hysteresis == 0.0:
return new_level
@@ -557,11 +572,14 @@ class ThresholdChecker:
warning = threshold_config.get("warning")
critical = threshold_config.get("critical")
operator = threshold_config.get("operator", ">")
- display = threshold_config.get("display", "(threshold: {op_symbol} {threshold_value})")
- hysteresis = threshold_config.get("hysteresis", 0.02) # 2% default
+ # Nagios operator maps exit codes directly; no numeric thresholds needed
+ is_nagios_op = (operator == "nagios")
+ default_display = "{check_name}: {output}" if is_nagios_op else "(threshold: {op_symbol} {threshold_value})"
+ display = threshold_config.get("display", default_display)
+ hysteresis = threshold_config.get("hysteresis", 0.0 if is_nagios_op else 0.02)
enabled = threshold_config.get("enabled", True)
-
- if warning is None and critical is None:
+
+ if warning is None and critical is None and not is_nagios_op:
logger.warning("No thresholds defined for %s, skipping", metric_path)
continue
@@ -1016,48 +1034,12 @@ class ThresholdChecker:
import math
display_value = "overdue" if isinstance(value, float) and math.isinf(value) else value
- # Format message
- if new_level == AlertLevel.OK:
- lvl = "RECOVER"
- message = f"{metric_path} = {display_value} ({old_level.name} -> OK)"
- elif new_level == AlertLevel.WARNING:
- lvl = "WARNING"
- if threshold_value is not None:
- threshold_info = self._format_display(
- threshold.display,
- value=display_value,
- threshold_value=threshold_value,
- op_symbol=op_symbol,
- plugin_data=plugin_data,
- check_name=check_name,
- metric_name=metric_name,
- )
- message = f"{metric_path} = {display_value} {threshold_info}"
- else:
- message = f"{metric_path} = {display_value}"
- elif new_level == AlertLevel.CRITICAL:
- lvl = "CRITICAL"
- if threshold_value is not None:
- threshold_info = self._format_display(
- threshold.display,
- value=display_value,
- threshold_value=threshold_value,
- op_symbol=op_symbol,
- plugin_data=plugin_data,
- check_name=check_name,
- metric_name=metric_name,
- )
- message = f"{metric_path} = {display_value} {threshold_info}"
- else:
- message = f"{metric_path} = {display_value}"
- else:
- lvl = "UNKNOWN"
- message = f"{metric_path} = {display_value}"
+ # Format message — for the nagios operator there is no numeric threshold_value;
+ # render the display template whenever one is available.
+ has_display = threshold_value is not None or threshold.operator == ComparisonOperator.NAGIOS
- # Return the formatted threshold info for storing in AlertState
- formatted_threshold_msg = None
- if threshold_value is not None and new_level != AlertLevel.OK:
- formatted_threshold_msg = self._format_display(
+ def _fmt():
+ return self._format_display(
threshold.display,
value=display_value,
threshold_value=threshold_value,
@@ -1067,6 +1049,31 @@ class ThresholdChecker:
metric_name=metric_name,
)
+ if new_level == AlertLevel.OK:
+ lvl = "RECOVER"
+ message = f"{metric_path} = {display_value} ({old_level.name} -> OK)"
+ elif new_level == AlertLevel.WARNING:
+ lvl = "WARNING"
+ if has_display:
+ message = f"{metric_path} = {display_value} {_fmt()}"
+ else:
+ message = f"{metric_path} = {display_value}"
+ elif new_level == AlertLevel.CRITICAL:
+ lvl = "CRITICAL"
+ if has_display:
+ message = f"{metric_path} = {display_value} {_fmt()}"
+ else:
+ message = f"{metric_path} = {display_value}"
+ else:
+ lvl = "UNKNOWN"
+ if has_display:
+ message = f"{metric_path} = {display_value} {_fmt()}"
+ else:
+ message = f"{metric_path} = {display_value}"
+
+ # Formatted threshold info stored on AlertState for the UI
+ formatted_threshold_msg = _fmt() if has_display and new_level != AlertLevel.OK else None
+
return lvl, message, formatted_threshold_msg
def _send_notification(
@@ -1114,7 +1121,7 @@ class ThresholdChecker:
self,
display_format: str,
value: Any,
- threshold_value: float,
+ threshold_value: Optional[float],
op_symbol: str,
plugin_data: Optional[Dict[str, Any]] = None,
check_name: Optional[str] = None,
@@ -1139,9 +1146,10 @@ class ThresholdChecker:
# Build format context with standard variables
format_context = {
'value': value,
- 'threshold_value': threshold_value,
'op_symbol': op_symbol,
}
+ if threshold_value is not None:
+ format_context['threshold_value'] = threshold_value
# Add generic-match context variables when available
if check_name is not None:
diff --git a/scripts/hbc_mini.py b/scripts/hbc_mini.py
index 5114e7a..5c37e52 100755
--- a/scripts/hbc_mini.py
+++ b/scripts/hbc_mini.py
@@ -388,7 +388,6 @@ class NagiosRunnerPlugin(MonitorPlugin):
async def _collect_metrics(self) -> Dict[str, Any]:
results: Dict[str, Any] = {}
- worst = 0
for cmd_cfg in self.commands:
name = cmd_cfg.get("name")
command = cmd_cfg.get("command")
@@ -399,10 +398,6 @@ class NagiosRunnerPlugin(MonitorPlugin):
results[f"{name}_status_code"] = rc
results[f"{name}_output"] = msg
results.update({f"{name}_{k}": v for k, v in perf.items()})
- worst = max(worst, rc)
- results["overall_status"] = _NAGIOS_STATUS.get(worst, "UNKNOWN")
- results["overall_status_code"] = worst
- results["plugin_count"] = len(self.commands)
return results
diff --git a/test_nagios.py b/test_nagios.py
index 12bf849..e2955e6 100644
--- a/test_nagios.py
+++ b/test_nagios.py
@@ -68,8 +68,7 @@ async def test_nagios_runner():
print(f" ✓ Collected {len(data)} data points")
print(f"\n4. Results:")
- print(f" Overall Status: {data.get('overall_status')} (code: {data.get('overall_status_code')})")
- print(f" Plugins Executed: {data.get('plugin_count')}")
+ print(f" Data points collected: {len(data)}")
# Show individual plugin results
print(f"\n5. Individual Plugin Results:")