fix: zfs_monitor alerts dropped on restart with wildcard pool thresholds

purge_stale_alerts used _find_threshold to validate alert state keys, but _find_threshold performs no wildcard matching. A threshold configured as "zfs_monitor.*.status" therefore never matched the concrete alert state key "zfs_monitor.tank.status", so every restart silently purged active ZFS pool alert states and restarted the grace period from scratch. The purge now also keeps a three-part alert state key when a threshold with the middle segment replaced by "*" is configured.

Also fix _check_pending_or_renotify to set last_notification (and set notification_count to 1) as soon as the grace-period notification fires, so the re-notification interval is anchored to when the alert was actually sent rather than to the next PLG cycle.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-09 07:42:09 -04:00
parent 58c2b9d996
commit aef9e7769b
1 changed files with 13 additions and 1 deletions
@@ -1389,6 +1389,9 @@ class ThresholdChecker:
                    host_name, lvl, message, metric_path, AlertLevel.OK, alert_state.level, value
                )
                alert_state.pending_since = None
                now = time.time()
                alert_state.last_notification = now
                alert_state.notification_count = 1
            # else: still within grace window, do nothing
        else:
            self._check_renotify(host_name, alert_state, metric_path, value, threshold, plugin_data, check_name=check_name, metric_name=metric_name)
@@ -1497,7 +1500,16 @@ class ThresholdChecker:
            if not host.alert_states:
                continue
            configured = self.get_thresholds_for_host(hostname)
-            stale = [mp for mp in host.alert_states if self._find_threshold(configured, mp)[0] is None]
+            stale = []
            for mp in host.alert_states:
                if self._find_threshold(configured, mp)[0] is not None:
                    continue
                # Also match wildcard pool/partition thresholds (e.g. "zfs_monitor.*.status"
                # covers alert state "zfs_monitor.tank.status").
                parts = mp.split(".")
                if len(parts) == 3 and f"{parts[0]}.*.{parts[2]}" in configured:
                    continue
                stale.append(mp)
            for mp in stale:
                logger.info(
                    "Purging stale alert state for %s / %s (no threshold configured)",