diff --git a/README.md b/README.md index c42dd67..5308a96 100644 --- a/README.md +++ b/README.md @@ -296,7 +296,7 @@ Available variables: |---|---| | `{value}` | Current metric value | | `{threshold_value}` | Threshold that was crossed | -| `{op_symbol}` | Comparison operator (`>`, `<`, `>=`, …) | +| `{op_symbol}` | Comparison operator (`>`, `<`, `>=`, …); `"nagios"` for the nagios operator | | `{check_name}` | Prefix stripped by generic matching (see below) | | `{metric_name}` | Full field name within the plugin data | | `{output}` | For `nagios_runner` generic matches: the matched check's status text (alias for `{check_name}_output`) | @@ -316,15 +316,13 @@ nagios_runner.root_status_code → no match nagios_runner.status_code → matched ✓ ``` -Configure the generic threshold once: +Configure the generic threshold once using the `nagios` operator, which maps exit codes directly to alert severity without requiring numeric warning/critical values: ```yaml nagios_runner: status_code: - warning: 1 - critical: 2 - operator: ">=" - display: "{check_name}: exit {value}" + operator: "nagios" # 0=OK 1=WARNING 2=CRITICAL 3=UNKNOWN + display: "{check_name}: {output}" ``` The stripped prefix (`check_disk_root` in the example above) is available as `{check_name}` in the display template, so you can identify which check triggered the alert without writing a separate threshold entry per command. diff --git a/docs/NAGIOS_INTEGRATION.md b/docs/NAGIOS_INTEGRATION.md index d0bca47..f372429 100644 --- a/docs/NAGIOS_INTEGRATION.md +++ b/docs/NAGIOS_INTEGRATION.md @@ -104,11 +104,6 @@ The `nagios_runner` plugin collects: - `{name}_{metric}_min` - Minimum value (if present) - `{name}_{metric}_max` - Maximum value (if present) -**Overall:** -- `overall_status` - Worst status from all commands -- `overall_status_code` - Worst status code -- `plugin_count` - Number of Nagios plugins executed - ## Configuration Options ```yaml diff --git a/docs/THRESHOLD_ALERTING.md b/docs/THRESHOLD_ALERTING.md index 48bf66c..1ad94fb 100644 --- a/docs/THRESHOLD_ALERTING.md +++ b/docs/THRESHOLD_ALERTING.md @@ -1110,33 +1110,6 @@ hosts: db-02: threshold_config: [tight_memory, db_disk] ``` - -### Backward Compatibility - -The legacy single threshold configuration is fully supported: - -```yaml -# Old format - still works -thresholds: - cpu_monitor: - cpu_percent: - warning: 80.0 - critical: 90.0 -``` - -This is equivalent to: - -```yaml -# New format -threshold_configs: - default: - thresholds: - cpu_monitor: - cpu_percent: - warning: 80.0 - critical: 90.0 -``` - ### Configuration Priority 1. **Host `threshold_config` (list)**: Layer each named config's overrides left-to-right on top of the defaults diff --git a/hbd/server/config.py b/hbd/server/config.py index 58074fb..becac2b 100644 --- a/hbd/server/config.py +++ b/hbd/server/config.py @@ -95,6 +95,12 @@ THRESHOLD_DEFAULTS = { 'warning': 200, 'critical': 250.0, 'count': 3 # Optional: number of consecutive breaches before alerting + }, + 'nagios_runner': { + 'status_code': { + 'display': '{check_name} {output}', + 'operator': "nagios" + } } } } diff --git a/hbd/server/templates/alerts.html b/hbd/server/templates/alerts.html index 9718301..733b231 100644 --- a/hbd/server/templates/alerts.html +++ b/hbd/server/templates/alerts.html @@ -437,7 +437,7 @@
${alert.level} - ${alert.hostname} + ${alert.hostname}
${alert.metric_path}
diff --git a/hbd/server/templates/plugins.html b/hbd/server/templates/plugins.html index d295beb..3d81bfb 100644 --- a/hbd/server/templates/plugins.html +++ b/hbd/server/templates/plugins.html @@ -499,6 +499,17 @@ return pluginCache[hostname]?.[pluginName] ?? null; } + // Return worst nagios exit code (0-3) found in a nagios_runner data object. + function nagiosWorstStatus(data) { + let worst = 0; + for (const [k, v] of Object.entries(data || {})) { + if (k.endsWith('_status_code') && typeof v === 'number' && v > worst) { + worst = v; + } + } + return worst; + } + // ── Fetch helpers ─────────────────────────────────────────────────────── async function fetchPlugin(hostname, pluginName) { @@ -600,13 +611,13 @@ ? chips.join('') : ''; - // Nagios badge + // Nagios badge — derive worst status from individual check codes const nagios = getCache(hostname, 'nagios_runner'); if (nagosBadge && nagios) { - const status = (nagios.data.overall_status || '—').toUpperCase(); - const cls = status === 'OK' ? 'ok' - : status === 'WARNING' ? 'warning' - : status === 'CRITICAL' ? 'critical' : ''; + const worst = nagiosWorstStatus(nagios.data); + const names = {0:'OK', 1:'WARNING', 2:'CRITICAL', 3:'UNKNOWN'}; + const status = names[worst] || '—'; + const cls = worst === 0 ? 'ok' : worst === 1 ? 'warning' : worst >= 2 ? 'critical' : ''; nagosBadge.className = `nagios-badge ${cls}`; nagosBadge.textContent = status; } @@ -715,9 +726,10 @@ break; } case 'nagios_runner': { - const status = (d.overall_status || '?').toUpperCase(); - const count = d.plugin_count; - text = status + (count != null ? ` — ${count} checks` : ''); + const worst = nagiosWorstStatus(d); + const names = {0:'OK', 1:'WARNING', 2:'CRITICAL', 3:'UNKNOWN'}; + const codes = Object.keys(d).filter(k => k.endsWith('_status_code')); + text = (names[worst] || '?') + (codes.length ? ` — ${codes.length} checks` : ''); break; } case 'filesystem_info': { diff --git a/hbd/server/threshold.py b/hbd/server/threshold.py index f63a1fd..06125f9 100644 --- a/hbd/server/threshold.py +++ b/hbd/server/threshold.py @@ -30,12 +30,13 @@ class AlertLevel(Enum): class ComparisonOperator(Enum): """Supported comparison operators for threshold checks.""" - GT = ">" # Greater than - GTE = ">=" # Greater than or equal - LT = "<" # Less than - LTE = "<=" # Less than or equal - EQ = "==" # Equal to - NEQ = "!=" # Not equal to + GT = ">" # Greater than + GTE = ">=" # Greater than or equal + LT = "<" # Less than + LTE = "<=" # Less than or equal + EQ = "==" # Equal to + NEQ = "!=" # Not equal to + NAGIOS = "nagios" # Nagios exit-code semantics: 0=OK 1=WARNING 2=CRITICAL 3=UNKNOWN class AlertState: @@ -229,33 +230,43 @@ class ThresholdConfig: def evaluate(self, value: float) -> AlertLevel: """ Evaluate a value against this threshold. - + Args: value: Metric value to check - + Returns: AlertLevel indicating the severity """ if not self.enabled: return AlertLevel.OK - + + # Nagios exit-code semantics: value IS the severity + if self.operator == ComparisonOperator.NAGIOS: + try: + code = int(value) + except (TypeError, ValueError): + return AlertLevel.UNKNOWN + return {0: AlertLevel.OK, 1: AlertLevel.WARNING, 2: AlertLevel.CRITICAL}.get( + code, AlertLevel.UNKNOWN + ) + try: # Convert value to float for comparison value = float(value) except (TypeError, ValueError): logger.warning("Cannot convert value %s to float for %s", value, self.metric_path) return AlertLevel.UNKNOWN - + # Check critical threshold first if self.critical is not None: if self._compare(value, self.critical): return AlertLevel.CRITICAL - + # Then check warning threshold if self.warning is not None: if self._compare(value, self.warning): return AlertLevel.WARNING - + return AlertLevel.OK def evaluate_with_hysteresis( @@ -274,7 +285,11 @@ class ThresholdConfig: New alert level considering hysteresis """ new_level = self.evaluate(value) - + + # Nagios exit codes are discrete integers — hysteresis doesn't apply + if self.operator == ComparisonOperator.NAGIOS: + return new_level + # If no hysteresis, return new level if self.hysteresis == 0.0: return new_level @@ -557,11 +572,14 @@ class ThresholdChecker: warning = threshold_config.get("warning") critical = threshold_config.get("critical") operator = threshold_config.get("operator", ">") - display = threshold_config.get("display", "(threshold: {op_symbol} {threshold_value})") - hysteresis = threshold_config.get("hysteresis", 0.02) # 2% default + # Nagios operator maps exit codes directly; no numeric thresholds needed + is_nagios_op = (operator == "nagios") + default_display = "{check_name}: {output}" if is_nagios_op else "(threshold: {op_symbol} {threshold_value})" + display = threshold_config.get("display", default_display) + hysteresis = threshold_config.get("hysteresis", 0.0 if is_nagios_op else 0.02) enabled = threshold_config.get("enabled", True) - - if warning is None and critical is None: + + if warning is None and critical is None and not is_nagios_op: logger.warning("No thresholds defined for %s, skipping", metric_path) continue @@ -1016,48 +1034,12 @@ class ThresholdChecker: import math display_value = "overdue" if isinstance(value, float) and math.isinf(value) else value - # Format message - if new_level == AlertLevel.OK: - lvl = "RECOVER" - message = f"{metric_path} = {display_value} ({old_level.name} -> OK)" - elif new_level == AlertLevel.WARNING: - lvl = "WARNING" - if threshold_value is not None: - threshold_info = self._format_display( - threshold.display, - value=display_value, - threshold_value=threshold_value, - op_symbol=op_symbol, - plugin_data=plugin_data, - check_name=check_name, - metric_name=metric_name, - ) - message = f"{metric_path} = {display_value} {threshold_info}" - else: - message = f"{metric_path} = {display_value}" - elif new_level == AlertLevel.CRITICAL: - lvl = "CRITICAL" - if threshold_value is not None: - threshold_info = self._format_display( - threshold.display, - value=display_value, - threshold_value=threshold_value, - op_symbol=op_symbol, - plugin_data=plugin_data, - check_name=check_name, - metric_name=metric_name, - ) - message = f"{metric_path} = {display_value} {threshold_info}" - else: - message = f"{metric_path} = {display_value}" - else: - lvl = "UNKNOWN" - message = f"{metric_path} = {display_value}" + # Format message — for the nagios operator there is no numeric threshold_value; + # render the display template whenever one is available. + has_display = threshold_value is not None or threshold.operator == ComparisonOperator.NAGIOS - # Return the formatted threshold info for storing in AlertState - formatted_threshold_msg = None - if threshold_value is not None and new_level != AlertLevel.OK: - formatted_threshold_msg = self._format_display( + def _fmt(): + return self._format_display( threshold.display, value=display_value, threshold_value=threshold_value, @@ -1067,6 +1049,31 @@ class ThresholdChecker: metric_name=metric_name, ) + if new_level == AlertLevel.OK: + lvl = "RECOVER" + message = f"{metric_path} = {display_value} ({old_level.name} -> OK)" + elif new_level == AlertLevel.WARNING: + lvl = "WARNING" + if has_display: + message = f"{metric_path} = {display_value} {_fmt()}" + else: + message = f"{metric_path} = {display_value}" + elif new_level == AlertLevel.CRITICAL: + lvl = "CRITICAL" + if has_display: + message = f"{metric_path} = {display_value} {_fmt()}" + else: + message = f"{metric_path} = {display_value}" + else: + lvl = "UNKNOWN" + if has_display: + message = f"{metric_path} = {display_value} {_fmt()}" + else: + message = f"{metric_path} = {display_value}" + + # Formatted threshold info stored on AlertState for the UI + formatted_threshold_msg = _fmt() if has_display and new_level != AlertLevel.OK else None + return lvl, message, formatted_threshold_msg def _send_notification( @@ -1114,7 +1121,7 @@ class ThresholdChecker: self, display_format: str, value: Any, - threshold_value: float, + threshold_value: Optional[float], op_symbol: str, plugin_data: Optional[Dict[str, Any]] = None, check_name: Optional[str] = None, @@ -1139,9 +1146,10 @@ class ThresholdChecker: # Build format context with standard variables format_context = { 'value': value, - 'threshold_value': threshold_value, 'op_symbol': op_symbol, } + if threshold_value is not None: + format_context['threshold_value'] = threshold_value # Add generic-match context variables when available if check_name is not None: diff --git a/scripts/hbc_mini.py b/scripts/hbc_mini.py index 5114e7a..5c37e52 100755 --- a/scripts/hbc_mini.py +++ b/scripts/hbc_mini.py @@ -388,7 +388,6 @@ class NagiosRunnerPlugin(MonitorPlugin): async def _collect_metrics(self) -> Dict[str, Any]: results: Dict[str, Any] = {} - worst = 0 for cmd_cfg in self.commands: name = cmd_cfg.get("name") command = cmd_cfg.get("command") @@ -399,10 +398,6 @@ class NagiosRunnerPlugin(MonitorPlugin): results[f"{name}_status_code"] = rc results[f"{name}_output"] = msg results.update({f"{name}_{k}": v for k, v in perf.items()}) - worst = max(worst, rc) - results["overall_status"] = _NAGIOS_STATUS.get(worst, "UNKNOWN") - results["overall_status_code"] = worst - results["plugin_count"] = len(self.commands) return results diff --git a/test_nagios.py b/test_nagios.py index 12bf849..e2955e6 100644 --- a/test_nagios.py +++ b/test_nagios.py @@ -68,8 +68,7 @@ async def test_nagios_runner(): print(f" ✓ Collected {len(data)} data points") print(f"\n4. Results:") - print(f" Overall Status: {data.get('overall_status')} (code: {data.get('overall_status_code')})") - print(f" Plugins Executed: {data.get('plugin_count')}") + print(f" Data points collected: {len(data)}") # Show individual plugin results print(f"\n5. Individual Plugin Results:")