From 0f90be659ed29f5c5c854a9ff5136b1d795086c3 Mon Sep 17 00:00:00 2001 From: Andreas Wrede Date: Wed, 13 May 2026 06:33:06 -0400 Subject: [PATCH] fix: correct ZFS pool status threshold operator and add per-metric grace The default zfs_monitor.*.status threshold used operator '>' with warning=1, so a DEGRADED pool (status=1) never alerted (1 > 1 is false) and a FAULTED pool (status=2) only triggered WARNING instead of CRITICAL. Fix the operator to '>=' in THRESHOLD_DEFAULTS and the example config. Also adds a per-metric grace period override (ThresholdConfig.grace) so individual thresholds can bypass or shorten the global grace delay. Alerts with grace=0 fire immediately on state change rather than waiting for a second collection cycle. Sets grace=0 on zfs_monitor.*.status so pool degradation alerts fire on the first data report after the event. Co-Authored-By: Claude Sonnet 4.6 --- hbd/config_thresholds_example.yaml | 5 +-- hbd/server/config.py | 3 +- hbd/server/http.py | 7 +++++ hbd/server/settings.py | 1 + hbd/server/threshold.py | 49 ++++++++++++++++++++++-------- 5 files changed, 49 insertions(+), 16 deletions(-) diff --git a/hbd/config_thresholds_example.yaml b/hbd/config_thresholds_example.yaml index aceb593..b8dd5d9 100644 --- a/hbd/config_thresholds_example.yaml +++ b/hbd/config_thresholds_example.yaml @@ -146,8 +146,9 @@ thresholds: status: warning: 1 # Alert WARNING when pool is DEGRADED critical: 2 # Alert CRITICAL when pool is SUSPENDED/FAULTED/UNAVAIL - operator: ">" - hysteresis: 0.0 # No hysteresis — a degraded pool is always critical + operator: ">=" + hysteresis: 0.0 # No hysteresis — a degraded pool is always alerting + grace: 0 # Fire immediately — don't wait for a second collection display: "ZFS pool {pool_name} is {health}" # Per-pool capacity thresholds (optional; add pools you care about) diff --git a/hbd/server/config.py b/hbd/server/config.py index 51ec54c..335ad2f 100644 --- a/hbd/server/config.py +++ b/hbd/server/config.py @@ -113,8 +113,9 @@ THRESHOLD_DEFAULTS = { 'status': { 'warning': 1, 'critical': 2, - 'operator': '>', + 'operator': '>=', 'hysteresis': 0.0, + 'grace': 0, 'display': 'ZFS pool {pool_name} is {health}' }, 'capacity': { diff --git a/hbd/server/http.py b/hbd/server/http.py index d2fe421..06b27f9 100644 --- a/hbd/server/http.py +++ b/hbd/server/http.py @@ -61,6 +61,13 @@ def _insert_threshold_metric(thresholds: dict, metric_path: str, values: dict) - except (TypeError, ValueError): pass + grace = values.get("grace") + if grace is not None: + try: + cfg["grace"] = float(grace) + except (TypeError, ValueError): + pass + count = values.get("count") if count is not None: try: diff --git a/hbd/server/settings.py b/hbd/server/settings.py index aef014d..1b8d90f 100644 --- a/hbd/server/settings.py +++ b/hbd/server/settings.py @@ -248,6 +248,7 @@ def get_settings_sections(config: dict, threshold_checker=None) -> list: "count": tc.count, "enabled": tc.enabled, "display": tc.display or "", + "grace": tc.grace, } threshold_config_list = [] diff --git a/hbd/server/threshold.py b/hbd/server/threshold.py index 80b9d80..ec50f03 100644 --- a/hbd/server/threshold.py +++ b/hbd/server/threshold.py @@ -195,6 +195,7 @@ class ThresholdConfig: hysteresis: float = 0.0, enabled: bool = True, count: int = 1, + grace: Optional[float] = None, ): """ Initialize threshold configuration. @@ -207,6 +208,7 @@ class ThresholdConfig: hysteresis: Hysteresis percentage to prevent flapping (0.0-1.0) enabled: Whether this threshold is enabled count: Number of consecutive exceedances required before alerting (default 1) + grace: Per-metric grace period in seconds; overrides global grace when set """ self.metric_path = metric_path self.warning = warning @@ -215,6 +217,7 @@ class ThresholdConfig: self.hysteresis = hysteresis self.display = display self.count = max(1, int(count)) + self.grace = float(grace) if grace is not None else None # Parse operator try: @@ -624,11 +627,12 @@ class ThresholdChecker: display = threshold_config.get("display", default_display) hysteresis = threshold_config.get("hysteresis", 0.0 if is_nagios_op else 0.02) enabled = threshold_config.get("enabled", True) + grace = threshold_config.get("grace", None) if warning is None and critical is None and not is_nagios_op: logger.warning("No thresholds defined for %s, skipping", metric_path) continue - + threshold = ThresholdConfig( metric_path=metric_path, warning=warning, @@ -636,7 +640,8 @@ class ThresholdChecker: operator=operator, hysteresis=hysteresis, enabled=enabled, - display=display + display=display, + grace=grace, ) target_dict[metric_path] = threshold @@ -681,9 +686,10 @@ class ThresholdChecker: hysteresis = threshold_config.get("hysteresis", 0.1) enabled = threshold_config.get("enabled", True) display = threshold_config.get("display") + grace = threshold_config.get("grace", None) if warning is None and critical is None: continue - + threshold = ThresholdConfig( metric_path=metric_path, warning=warning, @@ -691,7 +697,8 @@ class ThresholdChecker: operator=operator, hysteresis=hysteresis, enabled=enabled, - display=display + display=display, + grace=grace, ) target_dict[metric_path] = threshold @@ -734,6 +741,7 @@ class ThresholdChecker: hysteresis = threshold_config.get("hysteresis", 0.02) enabled = threshold_config.get("enabled", True) display = threshold_config.get("display") + grace = threshold_config.get("grace", None) if warning is None and critical is None: continue target_dict[metric_path] = ThresholdConfig( @@ -744,6 +752,7 @@ class ThresholdChecker: hysteresis=hysteresis, enabled=enabled, display=display, + grace=grace, ) def _parse_rtt_thresholds( @@ -779,6 +788,7 @@ class ThresholdChecker: enabled = rtt_thresholds.get("enabled", True) display = rtt_thresholds.get("display") count = rtt_thresholds.get("count", 1) + grace = rtt_thresholds.get("grace", None) if warning is None and critical is None: logger.warning("No RTT thresholds defined, skipping") @@ -793,6 +803,7 @@ class ThresholdChecker: enabled=enabled, display=display, count=count, + grace=grace, ) target_dict[metric_path] = threshold @@ -1353,7 +1364,9 @@ class ThresholdChecker: ) -> None: """Handle a state-change transition with grace-period logic. - Transitioning INTO alert (worsening): defers the notification for grace_seconds. + Transitioning INTO alert (worsening): defers the notification for the effective + grace period (threshold.grace if set, else self.grace_seconds). Grace of 0 fires + the notification immediately with no deferral. De-escalation within alert states (e.g. CRITICAL→WARNING): no new notification; the metric is still alerting so no RECOVER was sent. Transitioning TO OK: @@ -1361,6 +1374,8 @@ class ThresholdChecker: and the recovery — the spike never warranted a page. - Past grace: fires the RECOVER notification normally. """ + effective_grace = threshold.grace if threshold.grace is not None else self.grace_seconds + lvl, message, formatted_msg = self._trigger_notification( host_name, metric_path, old_level, new_level, value, threshold, plugin_data, check_name=check_name, metric_name=metric_name, @@ -1371,18 +1386,25 @@ class ThresholdChecker: if alert_state.pending_since is not None: logger.info( "Alert suppressed (recovered within %.0fs grace): %s on %s", - self.grace_seconds, metric_path, host_name, + effective_grace, metric_path, host_name, ) alert_state.pending_since = None else: self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value) elif new_level.value > old_level.value: - # Worsening (OK→WARNING, OK→CRITICAL, WARNING→CRITICAL): schedule notification. - alert_state.pending_since = time.time() - logger.debug( - "Alert deferred (%.0fs grace): %s on %s = %s", - self.grace_seconds, metric_path, host_name, value, - ) + # Worsening (OK→WARNING, OK→CRITICAL, WARNING→CRITICAL). + if effective_grace <= 0: + # No grace period — fire immediately. + self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value) + now = time.time() + alert_state.last_notification = now + alert_state.notification_count = 1 + else: + alert_state.pending_since = time.time() + logger.debug( + "Alert deferred (%.0fs grace): %s on %s = %s", + effective_grace, metric_path, host_name, value, + ) else: # De-escalation within alert states (e.g. CRITICAL→WARNING): metric is still # alerting but did not recover, so no new notification. @@ -1407,8 +1429,9 @@ class ThresholdChecker: If a deferred notification is pending and grace_seconds have elapsed, fires it now. Otherwise falls through to normal reminder logic. """ + effective_grace = threshold.grace if threshold.grace is not None else self.grace_seconds if alert_state.pending_since is not None: - if time.time() - alert_state.pending_since >= self.grace_seconds: + if time.time() - alert_state.pending_since >= effective_grace: lvl, message, formatted_msg = self._trigger_notification( host_name, metric_path, AlertLevel.OK, alert_state.level, value, threshold, plugin_data, check_name=check_name, metric_name=metric_name,