From aef9e7769bd98ab25f38e8d1d201b20281567a57 Mon Sep 17 00:00:00 2001
From: Andreas Wrede <andreas@wrede.ca>
Date: Sat, 9 May 2026 07:42:09 -0400
Subject: [PATCH] fix: zfs_monitor alerts dropped on restart with wildcard pool
 thresholds

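purge_stale_alerts used _find_threshold to validate alert state keys,
but _find_threshold has no wildcard matching. A threshold configured as
"zfs_monitor.*.status" never matched the concrete alert state key
"zfs_monitor.tank.status", so every restart silently purged active ZFS
pool alert states and reset the grace period from scratch.

Fix: when _find_threshold finds nothing for a three-part key, also check
whether the same key with its middle segment replaced by "*" is
configured before treating the alert state as stale.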

Also fix _check_pending_or_renotify to set last_notification (and
notification_count to 1) after the grace-period notification fires, so
the re-notification interval is anchored to when the alert was actually
sent rather than the next PLG cycle.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 hbd/server/threshold.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/hbd/server/threshold.py b/hbd/server/threshold.py
index 19cc78a..3572e19 100644
--- a/hbd/server/threshold.py
+++ b/hbd/server/threshold.py
@@ -1389,6 +1389,9 @@ class ThresholdChecker:
                     host_name, lvl, message, metric_path, AlertLevel.OK, alert_state.level, value
                 )
                 alert_state.pending_since = None
+                now = time.time()
+                alert_state.last_notification = now
+                alert_state.notification_count = 1
             # else: still within grace window, do nothing
         else:
             self._check_renotify(host_name, alert_state, metric_path, value, threshold, plugin_data, check_name=check_name, metric_name=metric_name)
@@ -1497,7 +1500,16 @@ class ThresholdChecker:
             if not host.alert_states:
                 continue
             configured = self.get_thresholds_for_host(hostname)
-            stale = [mp for mp in host.alert_states if self._find_threshold(configured, mp)[0] is None]
+            stale = []
+            for mp in host.alert_states:
+                if self._find_threshold(configured, mp)[0] is not None:
+                    continue
+                # Also match wildcard pool/partition thresholds (e.g. "zfs_monitor.*.status"
+                # covers alert state "zfs_monitor.tank.status").
+                parts = mp.split(".")
+                if len(parts) == 3 and f"{parts[0]}.*.{parts[2]}" in configured:
+                    continue
+                stale.append(mp)
             for mp in stale:
                 logger.info(
                     "Purging stale alert state for %s / %s (no threshold configured)",