fix: zfs_monitor alerts dropped on restart with wildcard pool thresholds
purge_stale_alerts used _find_threshold to validate alert state keys, but _find_threshold does no wildcard matching. A threshold configured as "zfs_monitor.*.status" therefore never matched the concrete alert state key "zfs_monitor.tank.status", so every restart silently purged the active ZFS pool alert states and restarted the grace period from scratch.

Also fix _check_pending_or_renotify to set last_notification when the grace-period notification fires, so the re-notification interval is anchored to the time the alert was actually sent rather than to the next PLG cycle.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+13
-1
@@ -1389,6 +1389,9 @@ class ThresholdChecker:
|
||||
host_name, lvl, message, metric_path, AlertLevel.OK, alert_state.level, value
|
||||
)
|
||||
alert_state.pending_since = None
|
||||
now = time.time()
|
||||
alert_state.last_notification = now
|
||||
alert_state.notification_count = 1
|
||||
# else: still within grace window, do nothing
|
||||
else:
|
||||
self._check_renotify(host_name, alert_state, metric_path, value, threshold, plugin_data, check_name=check_name, metric_name=metric_name)
|
||||
@@ -1497,7 +1500,16 @@ class ThresholdChecker:
|
||||
if not host.alert_states:
|
||||
continue
|
||||
configured = self.get_thresholds_for_host(hostname)
|
||||
stale = [mp for mp in host.alert_states if self._find_threshold(configured, mp)[0] is None]
|
||||
stale = []
|
||||
for mp in host.alert_states:
|
||||
if self._find_threshold(configured, mp)[0] is not None:
|
||||
continue
|
||||
# Also match wildcard pool/partition thresholds (e.g. "zfs_monitor.*.status"
|
||||
# covers alert state "zfs_monitor.tank.status").
|
||||
parts = mp.split(".")
|
||||
if len(parts) == 3 and f"{parts[0]}.*.{parts[2]}" in configured:
|
||||
continue
|
||||
stale.append(mp)
|
||||
for mp in stale:
|
||||
logger.info(
|
||||
"Purging stale alert state for %s / %s (no threshold configured)",
|
||||
|
||||
Reference in New Issue
Block a user