fix: correct ZFS pool status threshold operator and add per-metric grace

The default zfs_monitor.*.status threshold used operator '>' with warning=1,
so a DEGRADED pool (status=1) never alerted (1 > 1 is false) and a FAULTED
pool (status=2) only triggered WARNING instead of CRITICAL.

Fix the operator to '>=' in THRESHOLD_DEFAULTS and the example config.

Also add a per-metric grace period override (ThresholdConfig.grace, in
seconds) so individual thresholds can bypass, shorten, or otherwise replace
the global grace delay; when unset, the global grace still applies. Alerts
with grace=0 fire immediately on a worsening state change rather than
waiting for a second collection cycle. Set grace=0 on zfs_monitor.*.status
so pool degradation alerts fire on the first data report after the event.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Andreas Wrede
2026-05-13 06:33:06 -04:00
parent 4160e34a96
commit 0f90be659e
5 changed files with 49 additions and 16 deletions
+3 -2
View File
@@ -146,8 +146,9 @@ thresholds:
status: status:
warning: 1 # Alert WARNING when pool is DEGRADED warning: 1 # Alert WARNING when pool is DEGRADED
critical: 2 # Alert CRITICAL when pool is SUSPENDED/FAULTED/UNAVAIL critical: 2 # Alert CRITICAL when pool is SUSPENDED/FAULTED/UNAVAIL
operator: ">" operator: ">="
hysteresis: 0.0 # No hysteresis — a degraded pool is always critical hysteresis: 0.0 # No hysteresis — a degraded pool is always alerting
grace: 0 # Fire immediately — don't wait for a second collection
display: "ZFS pool {pool_name} is {health}" display: "ZFS pool {pool_name} is {health}"
# Per-pool capacity thresholds (optional; add pools you care about) # Per-pool capacity thresholds (optional; add pools you care about)
+2 -1
View File
@@ -113,8 +113,9 @@ THRESHOLD_DEFAULTS = {
'status': { 'status': {
'warning': 1, 'warning': 1,
'critical': 2, 'critical': 2,
'operator': '>', 'operator': '>=',
'hysteresis': 0.0, 'hysteresis': 0.0,
'grace': 0,
'display': 'ZFS pool {pool_name} is {health}' 'display': 'ZFS pool {pool_name} is {health}'
}, },
'capacity': { 'capacity': {
+7
View File
@@ -61,6 +61,13 @@ def _insert_threshold_metric(thresholds: dict, metric_path: str, values: dict) -
except (TypeError, ValueError): except (TypeError, ValueError):
pass pass
grace = values.get("grace")
if grace is not None:
try:
cfg["grace"] = float(grace)
except (TypeError, ValueError):
pass
count = values.get("count") count = values.get("count")
if count is not None: if count is not None:
try: try:
+1
View File
@@ -248,6 +248,7 @@ def get_settings_sections(config: dict, threshold_checker=None) -> list:
"count": tc.count, "count": tc.count,
"enabled": tc.enabled, "enabled": tc.enabled,
"display": tc.display or "", "display": tc.display or "",
"grace": tc.grace,
} }
threshold_config_list = [] threshold_config_list = []
+36 -13
View File
@@ -195,6 +195,7 @@ class ThresholdConfig:
hysteresis: float = 0.0, hysteresis: float = 0.0,
enabled: bool = True, enabled: bool = True,
count: int = 1, count: int = 1,
grace: Optional[float] = None,
): ):
""" """
Initialize threshold configuration. Initialize threshold configuration.
@@ -207,6 +208,7 @@ class ThresholdConfig:
hysteresis: Hysteresis percentage to prevent flapping (0.0-1.0) hysteresis: Hysteresis percentage to prevent flapping (0.0-1.0)
enabled: Whether this threshold is enabled enabled: Whether this threshold is enabled
count: Number of consecutive exceedances required before alerting (default 1) count: Number of consecutive exceedances required before alerting (default 1)
grace: Per-metric grace period in seconds; overrides global grace when set
""" """
self.metric_path = metric_path self.metric_path = metric_path
self.warning = warning self.warning = warning
@@ -215,6 +217,7 @@ class ThresholdConfig:
self.hysteresis = hysteresis self.hysteresis = hysteresis
self.display = display self.display = display
self.count = max(1, int(count)) self.count = max(1, int(count))
self.grace = float(grace) if grace is not None else None
# Parse operator # Parse operator
try: try:
@@ -624,11 +627,12 @@ class ThresholdChecker:
display = threshold_config.get("display", default_display) display = threshold_config.get("display", default_display)
hysteresis = threshold_config.get("hysteresis", 0.0 if is_nagios_op else 0.02) hysteresis = threshold_config.get("hysteresis", 0.0 if is_nagios_op else 0.02)
enabled = threshold_config.get("enabled", True) enabled = threshold_config.get("enabled", True)
grace = threshold_config.get("grace", None)
if warning is None and critical is None and not is_nagios_op: if warning is None and critical is None and not is_nagios_op:
logger.warning("No thresholds defined for %s, skipping", metric_path) logger.warning("No thresholds defined for %s, skipping", metric_path)
continue continue
threshold = ThresholdConfig( threshold = ThresholdConfig(
metric_path=metric_path, metric_path=metric_path,
warning=warning, warning=warning,
@@ -636,7 +640,8 @@ class ThresholdChecker:
operator=operator, operator=operator,
hysteresis=hysteresis, hysteresis=hysteresis,
enabled=enabled, enabled=enabled,
display=display display=display,
grace=grace,
) )
target_dict[metric_path] = threshold target_dict[metric_path] = threshold
@@ -681,9 +686,10 @@ class ThresholdChecker:
hysteresis = threshold_config.get("hysteresis", 0.1) hysteresis = threshold_config.get("hysteresis", 0.1)
enabled = threshold_config.get("enabled", True) enabled = threshold_config.get("enabled", True)
display = threshold_config.get("display") display = threshold_config.get("display")
grace = threshold_config.get("grace", None)
if warning is None and critical is None: if warning is None and critical is None:
continue continue
threshold = ThresholdConfig( threshold = ThresholdConfig(
metric_path=metric_path, metric_path=metric_path,
warning=warning, warning=warning,
@@ -691,7 +697,8 @@ class ThresholdChecker:
operator=operator, operator=operator,
hysteresis=hysteresis, hysteresis=hysteresis,
enabled=enabled, enabled=enabled,
display=display display=display,
grace=grace,
) )
target_dict[metric_path] = threshold target_dict[metric_path] = threshold
@@ -734,6 +741,7 @@ class ThresholdChecker:
hysteresis = threshold_config.get("hysteresis", 0.02) hysteresis = threshold_config.get("hysteresis", 0.02)
enabled = threshold_config.get("enabled", True) enabled = threshold_config.get("enabled", True)
display = threshold_config.get("display") display = threshold_config.get("display")
grace = threshold_config.get("grace", None)
if warning is None and critical is None: if warning is None and critical is None:
continue continue
target_dict[metric_path] = ThresholdConfig( target_dict[metric_path] = ThresholdConfig(
@@ -744,6 +752,7 @@ class ThresholdChecker:
hysteresis=hysteresis, hysteresis=hysteresis,
enabled=enabled, enabled=enabled,
display=display, display=display,
grace=grace,
) )
def _parse_rtt_thresholds( def _parse_rtt_thresholds(
@@ -779,6 +788,7 @@ class ThresholdChecker:
enabled = rtt_thresholds.get("enabled", True) enabled = rtt_thresholds.get("enabled", True)
display = rtt_thresholds.get("display") display = rtt_thresholds.get("display")
count = rtt_thresholds.get("count", 1) count = rtt_thresholds.get("count", 1)
grace = rtt_thresholds.get("grace", None)
if warning is None and critical is None: if warning is None and critical is None:
logger.warning("No RTT thresholds defined, skipping") logger.warning("No RTT thresholds defined, skipping")
@@ -793,6 +803,7 @@ class ThresholdChecker:
enabled=enabled, enabled=enabled,
display=display, display=display,
count=count, count=count,
grace=grace,
) )
target_dict[metric_path] = threshold target_dict[metric_path] = threshold
@@ -1353,7 +1364,9 @@ class ThresholdChecker:
) -> None: ) -> None:
"""Handle a state-change transition with grace-period logic. """Handle a state-change transition with grace-period logic.
Transitioning INTO alert (worsening): defers the notification for grace_seconds. Transitioning INTO alert (worsening): defers the notification for the effective
grace period (threshold.grace if set, else self.grace_seconds). Grace of 0 fires
the notification immediately with no deferral.
De-escalation within alert states (e.g. CRITICAL→WARNING): no new notification; De-escalation within alert states (e.g. CRITICAL→WARNING): no new notification;
the metric is still alerting so no RECOVER was sent. the metric is still alerting so no RECOVER was sent.
Transitioning TO OK: Transitioning TO OK:
@@ -1361,6 +1374,8 @@ class ThresholdChecker:
and the recovery — the spike never warranted a page. and the recovery — the spike never warranted a page.
- Past grace: fires the RECOVER notification normally. - Past grace: fires the RECOVER notification normally.
""" """
effective_grace = threshold.grace if threshold.grace is not None else self.grace_seconds
lvl, message, formatted_msg = self._trigger_notification( lvl, message, formatted_msg = self._trigger_notification(
host_name, metric_path, old_level, new_level, value, threshold, plugin_data, host_name, metric_path, old_level, new_level, value, threshold, plugin_data,
check_name=check_name, metric_name=metric_name, check_name=check_name, metric_name=metric_name,
@@ -1371,18 +1386,25 @@ class ThresholdChecker:
if alert_state.pending_since is not None: if alert_state.pending_since is not None:
logger.info( logger.info(
"Alert suppressed (recovered within %.0fs grace): %s on %s", "Alert suppressed (recovered within %.0fs grace): %s on %s",
self.grace_seconds, metric_path, host_name, effective_grace, metric_path, host_name,
) )
alert_state.pending_since = None alert_state.pending_since = None
else: else:
self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value) self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value)
elif new_level.value > old_level.value: elif new_level.value > old_level.value:
# Worsening (OK→WARNING, OK→CRITICAL, WARNING→CRITICAL): schedule notification. # Worsening (OK→WARNING, OK→CRITICAL, WARNING→CRITICAL).
alert_state.pending_since = time.time() if effective_grace <= 0:
logger.debug( # No grace period — fire immediately.
"Alert deferred (%.0fs grace): %s on %s = %s", self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value)
self.grace_seconds, metric_path, host_name, value, now = time.time()
) alert_state.last_notification = now
alert_state.notification_count = 1
else:
alert_state.pending_since = time.time()
logger.debug(
"Alert deferred (%.0fs grace): %s on %s = %s",
effective_grace, metric_path, host_name, value,
)
else: else:
# De-escalation within alert states (e.g. CRITICAL→WARNING): metric is still # De-escalation within alert states (e.g. CRITICAL→WARNING): metric is still
# alerting but did not recover, so no new notification. # alerting but did not recover, so no new notification.
@@ -1407,8 +1429,9 @@ class ThresholdChecker:
If a deferred notification is pending and grace_seconds have elapsed, If a deferred notification is pending and grace_seconds have elapsed,
fires it now. Otherwise falls through to normal reminder logic. fires it now. Otherwise falls through to normal reminder logic.
""" """
effective_grace = threshold.grace if threshold.grace is not None else self.grace_seconds
if alert_state.pending_since is not None: if alert_state.pending_since is not None:
if time.time() - alert_state.pending_since >= self.grace_seconds: if time.time() - alert_state.pending_since >= effective_grace:
lvl, message, formatted_msg = self._trigger_notification( lvl, message, formatted_msg = self._trigger_notification(
host_name, metric_path, AlertLevel.OK, alert_state.level, value, threshold, plugin_data, host_name, metric_path, AlertLevel.OK, alert_state.level, value, threshold, plugin_data,
check_name=check_name, metric_name=metric_name, check_name=check_name, metric_name=metric_name,