fix: zfs_monitor alerts dropped on restart with wildcard pool thresholds

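purge_stale_alerts used _find_threshold to validate alert state keys,
but _find_threshold has no wildcard matching. A threshold configured as
"zfs_monitor.*.status" never matched the concrete alert state key
"zfs_monitor.tank.status", so every restart silently purged active ZFS
pool alert states and reset the grace period from scratch.
purge_stale_alerts now also keeps a three-part alert state key when the
same key with its middle segment replaced by "*" is present in the
thresholds configured for the host.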

Also fix _check_pending_or_renotify to set last_notification (and set
notification_count to 1) after the grace-period notification fires, so the
re-notification interval is anchored to when the alert was actually sent
rather than the next PLG cycle.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-09 07:42:09 -04:00
parent 58c2b9d996
commit aef9e7769b
+13 -1
View File
@@ -1389,6 +1389,9 @@ class ThresholdChecker:
host_name, lvl, message, metric_path, AlertLevel.OK, alert_state.level, value host_name, lvl, message, metric_path, AlertLevel.OK, alert_state.level, value
) )
alert_state.pending_since = None alert_state.pending_since = None
now = time.time()
alert_state.last_notification = now
alert_state.notification_count = 1
# else: still within grace window, do nothing # else: still within grace window, do nothing
else: else:
self._check_renotify(host_name, alert_state, metric_path, value, threshold, plugin_data, check_name=check_name, metric_name=metric_name) self._check_renotify(host_name, alert_state, metric_path, value, threshold, plugin_data, check_name=check_name, metric_name=metric_name)
@@ -1497,7 +1500,16 @@ class ThresholdChecker:
if not host.alert_states: if not host.alert_states:
continue continue
configured = self.get_thresholds_for_host(hostname) configured = self.get_thresholds_for_host(hostname)
stale = [mp for mp in host.alert_states if self._find_threshold(configured, mp)[0] is None] stale = []
for mp in host.alert_states:
if self._find_threshold(configured, mp)[0] is not None:
continue
# Also match wildcard pool/partition thresholds (e.g. "zfs_monitor.*.status"
# covers alert state "zfs_monitor.tank.status").
parts = mp.split(".")
if len(parts) == 3 and f"{parts[0]}.*.{parts[2]}" in configured:
continue
stale.append(mp)
for mp in stale: for mp in stale:
logger.info( logger.info(
"Purging stale alert state for %s / %s (no threshold configured)", "Purging stale alert state for %s / %s (no threshold configured)",