From 0f90be659ed29f5c5c854a9ff5136b1d795086c3 Mon Sep 17 00:00:00 2001
From: Andreas Wrede <aew.git@wrede.ca>
Date: Wed, 13 May 2026 06:33:06 -0400
Subject: [PATCH] fix: correct ZFS pool status threshold operator and add
 per-metric grace

The default zfs_monitor.*.status threshold used operator '>' with warning=1,
so a DEGRADED pool (status=1) never alerted (1 > 1 is false) and a FAULTED
pool (status=2) only triggered WARNING instead of CRITICAL.

Fix the operator to '>=' in THRESHOLD_DEFAULTS and the example config.

Also add a per-metric grace period override (ThresholdConfig.grace, in
seconds) so an individual threshold can replace the global grace delay
with its own value. Alerts whose effective grace is 0 fire immediately on
a worsening state change rather than waiting for a second collection
cycle. Set grace=0 on zfs_monitor.*.status so pool degradation alerts fire
on the first data report after the event.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 hbd/config_thresholds_example.yaml |  5 +--
 hbd/server/config.py               |  3 +-
 hbd/server/http.py                 |  7 +++++
 hbd/server/settings.py             |  1 +
 hbd/server/threshold.py            | 49 ++++++++++++++++++++++--------
 5 files changed, 49 insertions(+), 16 deletions(-)

diff --git a/hbd/config_thresholds_example.yaml b/hbd/config_thresholds_example.yaml
index aceb593..b8dd5d9 100644
--- a/hbd/config_thresholds_example.yaml
+++ b/hbd/config_thresholds_example.yaml
@@ -146,8 +146,9 @@ thresholds:
         status:
           warning: 1           # Alert WARNING when pool is DEGRADED
           critical: 2           # Alert CRITICAL when pool is SUSPENDED/FAULTED/UNAVAIL
-          operator: ">"
-          hysteresis: 0.0       # No hysteresis — a degraded pool is always critical
+          operator: ">="
+          hysteresis: 0.0       # No hysteresis — a degraded pool is always alerting
+          grace: 0              # Fire immediately — don't wait for a second collection
           display: "ZFS pool {pool_name} is {health}"
 
       # Per-pool capacity thresholds (optional; add pools you care about)
diff --git a/hbd/server/config.py b/hbd/server/config.py
index 51ec54c..335ad2f 100644
--- a/hbd/server/config.py
+++ b/hbd/server/config.py
@@ -113,8 +113,9 @@ THRESHOLD_DEFAULTS = {
                         'status': {
                             'warning': 1,
                             'critical': 2,
-                            'operator': '>',
+                            'operator': '>=',
                             'hysteresis': 0.0,
+                            'grace': 0,
                             'display': 'ZFS pool {pool_name} is {health}'
                         },
                         'capacity': {
diff --git a/hbd/server/http.py b/hbd/server/http.py
index d2fe421..06b27f9 100644
--- a/hbd/server/http.py
+++ b/hbd/server/http.py
@@ -61,6 +61,13 @@ def _insert_threshold_metric(thresholds: dict, metric_path: str, values: dict) -
             except (TypeError, ValueError):
                 pass
 
+    grace = values.get("grace")
+    if grace is not None:
+        try:
+            cfg["grace"] = float(grace)
+        except (TypeError, ValueError):
+            pass
+
     count = values.get("count")
     if count is not None:
         try:
diff --git a/hbd/server/settings.py b/hbd/server/settings.py
index aef014d..1b8d90f 100644
--- a/hbd/server/settings.py
+++ b/hbd/server/settings.py
@@ -248,6 +248,7 @@ def get_settings_sections(config: dict, threshold_checker=None) -> list:
             "count": tc.count,
             "enabled": tc.enabled,
             "display": tc.display or "",
+            "grace": tc.grace,
         }
 
     threshold_config_list = []
diff --git a/hbd/server/threshold.py b/hbd/server/threshold.py
index 80b9d80..ec50f03 100644
--- a/hbd/server/threshold.py
+++ b/hbd/server/threshold.py
@@ -195,6 +195,7 @@ class ThresholdConfig:
         hysteresis: float = 0.0,
         enabled: bool = True,
         count: int = 1,
+        grace: Optional[float] = None,
     ):
         """
         Initialize threshold configuration.
@@ -207,6 +208,7 @@ class ThresholdConfig:
             hysteresis: Hysteresis percentage to prevent flapping (0.0-1.0)
             enabled: Whether this threshold is enabled
             count: Number of consecutive exceedances required before alerting (default 1)
+            grace: Per-metric grace period in seconds; overrides global grace when set
         """
         self.metric_path = metric_path
         self.warning = warning
@@ -215,6 +217,7 @@ class ThresholdConfig:
         self.hysteresis = hysteresis
         self.display = display
         self.count = max(1, int(count))
+        self.grace = float(grace) if grace is not None else None
         
         # Parse operator
         try:
@@ -624,11 +627,12 @@ class ThresholdChecker:
             display = threshold_config.get("display", default_display)
             hysteresis = threshold_config.get("hysteresis", 0.0 if is_nagios_op else 0.02)
             enabled = threshold_config.get("enabled", True)
+            grace = threshold_config.get("grace", None)
 
             if warning is None and critical is None and not is_nagios_op:
                 logger.warning("No thresholds defined for %s, skipping", metric_path)
                 continue
-            
+
             threshold = ThresholdConfig(
                 metric_path=metric_path,
                 warning=warning,
@@ -636,7 +640,8 @@ class ThresholdChecker:
                 operator=operator,
                 hysteresis=hysteresis,
                 enabled=enabled,
-                display=display
+                display=display,
+                grace=grace,
             )
             
             target_dict[metric_path] = threshold
@@ -681,9 +686,10 @@ class ThresholdChecker:
                 hysteresis = threshold_config.get("hysteresis", 0.1)
                 enabled = threshold_config.get("enabled", True)
                 display = threshold_config.get("display")
+                grace = threshold_config.get("grace", None)
                 if warning is None and critical is None:
                     continue
-                
+
                 threshold = ThresholdConfig(
                     metric_path=metric_path,
                     warning=warning,
@@ -691,7 +697,8 @@ class ThresholdChecker:
                     operator=operator,
                     hysteresis=hysteresis,
                     enabled=enabled,
-                    display=display 
+                    display=display,
+                    grace=grace,
                 )
                 
                 target_dict[metric_path] = threshold
@@ -734,6 +741,7 @@ class ThresholdChecker:
                 hysteresis = threshold_config.get("hysteresis", 0.02)
                 enabled = threshold_config.get("enabled", True)
                 display = threshold_config.get("display")
+                grace = threshold_config.get("grace", None)
                 if warning is None and critical is None:
                     continue
                 target_dict[metric_path] = ThresholdConfig(
@@ -744,6 +752,7 @@ class ThresholdChecker:
                     hysteresis=hysteresis,
                     enabled=enabled,
                     display=display,
+                    grace=grace,
                 )
 
     def _parse_rtt_thresholds(
@@ -779,6 +788,7 @@ class ThresholdChecker:
         enabled = rtt_thresholds.get("enabled", True)
         display = rtt_thresholds.get("display")
         count = rtt_thresholds.get("count", 1)
+        grace = rtt_thresholds.get("grace", None)
 
         if warning is None and critical is None:
             logger.warning("No RTT thresholds defined, skipping")
@@ -793,6 +803,7 @@ class ThresholdChecker:
             enabled=enabled,
             display=display,
             count=count,
+            grace=grace,
         )
 
         target_dict[metric_path] = threshold
@@ -1353,7 +1364,9 @@ class ThresholdChecker:
     ) -> None:
         """Handle a state-change transition with grace-period logic.
 
-        Transitioning INTO alert (worsening): defers the notification for grace_seconds.
+        Transitioning INTO alert (worsening): defers the notification for the effective
+        grace period (threshold.grace if set, else self.grace_seconds). A grace of 0 or
+        less fires the notification immediately with no deferral.
         De-escalation within alert states (e.g. CRITICAL→WARNING): no new notification;
           the metric is still alerting so no RECOVER was sent.
         Transitioning TO OK:
@@ -1361,6 +1374,8 @@ class ThresholdChecker:
             and the recovery — the spike never warranted a page.
           - Past grace: fires the RECOVER notification normally.
         """
+        effective_grace = threshold.grace if threshold.grace is not None else self.grace_seconds
+
         lvl, message, formatted_msg = self._trigger_notification(
             host_name, metric_path, old_level, new_level, value, threshold, plugin_data,
             check_name=check_name, metric_name=metric_name,
@@ -1371,18 +1386,25 @@ class ThresholdChecker:
             if alert_state.pending_since is not None:
                 logger.info(
                     "Alert suppressed (recovered within %.0fs grace): %s on %s",
-                    self.grace_seconds, metric_path, host_name,
+                    effective_grace, metric_path, host_name,
                 )
                 alert_state.pending_since = None
             else:
                 self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value)
         elif new_level.value > old_level.value:
-            # Worsening (OK→WARNING, OK→CRITICAL, WARNING→CRITICAL): schedule notification.
-            alert_state.pending_since = time.time()
-            logger.debug(
-                "Alert deferred (%.0fs grace): %s on %s = %s",
-                self.grace_seconds, metric_path, host_name, value,
-            )
+            # Worsening (OK→WARNING, OK→CRITICAL, WARNING→CRITICAL).
+            if effective_grace <= 0:
+                # No grace period — fire immediately.
+                self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value)
+                now = time.time()
+                alert_state.last_notification = now
+                alert_state.notification_count = 1
+            else:
+                alert_state.pending_since = time.time()
+                logger.debug(
+                    "Alert deferred (%.0fs grace): %s on %s = %s",
+                    effective_grace, metric_path, host_name, value,
+                )
         else:
             # De-escalation within alert states (e.g. CRITICAL→WARNING): metric is still
             # alerting but did not recover, so no new notification.
@@ -1407,8 +1429,9 @@ class ThresholdChecker:
         If a deferred notification is pending and grace_seconds have elapsed,
         fires it now. Otherwise falls through to normal reminder logic.
         """
+        effective_grace = threshold.grace if threshold.grace is not None else self.grace_seconds
         if alert_state.pending_since is not None:
-            if time.time() - alert_state.pending_since >= self.grace_seconds:
+            if time.time() - alert_state.pending_since >= effective_grace:
                 lvl, message, formatted_msg = self._trigger_notification(
                     host_name, metric_path, AlertLevel.OK, alert_state.level, value, threshold, plugin_data,
                     check_name=check_name, metric_name=metric_name,