From aef9e7769bd98ab25f38e8d1d201b20281567a57 Mon Sep 17 00:00:00 2001 From: Andreas Wrede Date: Sat, 9 May 2026 07:42:09 -0400 Subject: [PATCH] fix: zfs_monitor alerts dropped on restart with wildcard pool thresholds purge_stale_alerts used _find_threshold to validate alert state keys, but _find_threshold has no wildcard matching. A threshold configured as "zfs_monitor.*.status" never matched the concrete alert state key "zfs_monitor.tank.status", so every restart silently purged active ZFS pool alert states and reset the grace period from scratch. Also fix _check_pending_or_renotify to set last_notification after the grace-period notification fires, so the re-notification interval is anchored to when the alert was actually sent rather than the next PLG cycle. Co-Authored-By: Claude Sonnet 4.6 --- hbd/server/threshold.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/hbd/server/threshold.py b/hbd/server/threshold.py index 19cc78a..3572e19 100644 --- a/hbd/server/threshold.py +++ b/hbd/server/threshold.py @@ -1389,6 +1389,9 @@ class ThresholdChecker: host_name, lvl, message, metric_path, AlertLevel.OK, alert_state.level, value ) alert_state.pending_since = None + now = time.time() + alert_state.last_notification = now + alert_state.notification_count = 1 # else: still within grace window, do nothing else: self._check_renotify(host_name, alert_state, metric_path, value, threshold, plugin_data, check_name=check_name, metric_name=metric_name) @@ -1497,7 +1500,16 @@ class ThresholdChecker: if not host.alert_states: continue configured = self.get_thresholds_for_host(hostname) - stale = [mp for mp in host.alert_states if self._find_threshold(configured, mp)[0] is None] + stale = [] + for mp in host.alert_states: + if self._find_threshold(configured, mp)[0] is not None: + continue + # Also match wildcard pool/partition thresholds (e.g. "zfs_monitor.*.status" + # covers alert state "zfs_monitor.tank.status"). + parts = mp.split(".") + if len(parts) == 3 and f"{parts[0]}.*.{parts[2]}" in configured: + continue + stale.append(mp) for mp in stale: logger.info( "Purging stale alert state for %s / %s (no threshold configured)",