fix: correct ZFS pool status threshold operator and add per-metric grace
The default zfs_monitor.*.status threshold used operator '>' with warning=1, so a DEGRADED pool (status=1) never alerted (1 > 1 is false) and a FAULTED pool (status=2) only triggered WARNING instead of CRITICAL. Fix the operator to '>=' in THRESHOLD_DEFAULTS and the example config. Also add a per-metric grace period override (ThresholdConfig.grace) so individual thresholds can override (e.g. bypass or shorten) the global grace delay; when set, it takes precedence over the global grace period. Alerts with grace=0 fire immediately on state change rather than waiting for a second collection cycle. Set grace=0 on zfs_monitor.*.status so pool degradation alerts fire on the first data report after the event. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -146,8 +146,9 @@ thresholds:
|
||||
status:
|
||||
warning: 1 # Alert WARNING when pool is DEGRADED
|
||||
critical: 2 # Alert CRITICAL when pool is SUSPENDED/FAULTED/UNAVAIL
|
||||
operator: ">"
|
||||
hysteresis: 0.0 # No hysteresis — a degraded pool is always critical
|
||||
operator: ">="
|
||||
hysteresis: 0.0 # No hysteresis — a degraded pool is always alerting
|
||||
grace: 0 # Fire immediately — don't wait for a second collection
|
||||
display: "ZFS pool {pool_name} is {health}"
|
||||
|
||||
# Per-pool capacity thresholds (optional; add pools you care about)
|
||||
|
||||
@@ -113,8 +113,9 @@ THRESHOLD_DEFAULTS = {
|
||||
'status': {
|
||||
'warning': 1,
|
||||
'critical': 2,
|
||||
'operator': '>',
|
||||
'operator': '>=',
|
||||
'hysteresis': 0.0,
|
||||
'grace': 0,
|
||||
'display': 'ZFS pool {pool_name} is {health}'
|
||||
},
|
||||
'capacity': {
|
||||
|
||||
@@ -61,6 +61,13 @@ def _insert_threshold_metric(thresholds: dict, metric_path: str, values: dict) -
|
||||
except (TypeError, ValueError):
|
||||
pass
|
||||
|
||||
grace = values.get("grace")
|
||||
if grace is not None:
|
||||
try:
|
||||
cfg["grace"] = float(grace)
|
||||
except (TypeError, ValueError):
|
||||
pass
|
||||
|
||||
count = values.get("count")
|
||||
if count is not None:
|
||||
try:
|
||||
|
||||
@@ -248,6 +248,7 @@ def get_settings_sections(config: dict, threshold_checker=None) -> list:
|
||||
"count": tc.count,
|
||||
"enabled": tc.enabled,
|
||||
"display": tc.display or "",
|
||||
"grace": tc.grace,
|
||||
}
|
||||
|
||||
threshold_config_list = []
|
||||
|
||||
+30
-7
@@ -195,6 +195,7 @@ class ThresholdConfig:
|
||||
hysteresis: float = 0.0,
|
||||
enabled: bool = True,
|
||||
count: int = 1,
|
||||
grace: Optional[float] = None,
|
||||
):
|
||||
"""
|
||||
Initialize threshold configuration.
|
||||
@@ -207,6 +208,7 @@ class ThresholdConfig:
|
||||
hysteresis: Hysteresis percentage to prevent flapping (0.0-1.0)
|
||||
enabled: Whether this threshold is enabled
|
||||
count: Number of consecutive exceedances required before alerting (default 1)
|
||||
grace: Per-metric grace period in seconds; overrides global grace when set
|
||||
"""
|
||||
self.metric_path = metric_path
|
||||
self.warning = warning
|
||||
@@ -215,6 +217,7 @@ class ThresholdConfig:
|
||||
self.hysteresis = hysteresis
|
||||
self.display = display
|
||||
self.count = max(1, int(count))
|
||||
self.grace = float(grace) if grace is not None else None
|
||||
|
||||
# Parse operator
|
||||
try:
|
||||
@@ -624,6 +627,7 @@ class ThresholdChecker:
|
||||
display = threshold_config.get("display", default_display)
|
||||
hysteresis = threshold_config.get("hysteresis", 0.0 if is_nagios_op else 0.02)
|
||||
enabled = threshold_config.get("enabled", True)
|
||||
grace = threshold_config.get("grace", None)
|
||||
|
||||
if warning is None and critical is None and not is_nagios_op:
|
||||
logger.warning("No thresholds defined for %s, skipping", metric_path)
|
||||
@@ -636,7 +640,8 @@ class ThresholdChecker:
|
||||
operator=operator,
|
||||
hysteresis=hysteresis,
|
||||
enabled=enabled,
|
||||
display=display
|
||||
display=display,
|
||||
grace=grace,
|
||||
)
|
||||
|
||||
target_dict[metric_path] = threshold
|
||||
@@ -681,6 +686,7 @@ class ThresholdChecker:
|
||||
hysteresis = threshold_config.get("hysteresis", 0.1)
|
||||
enabled = threshold_config.get("enabled", True)
|
||||
display = threshold_config.get("display")
|
||||
grace = threshold_config.get("grace", None)
|
||||
if warning is None and critical is None:
|
||||
continue
|
||||
|
||||
@@ -691,7 +697,8 @@ class ThresholdChecker:
|
||||
operator=operator,
|
||||
hysteresis=hysteresis,
|
||||
enabled=enabled,
|
||||
display=display
|
||||
display=display,
|
||||
grace=grace,
|
||||
)
|
||||
|
||||
target_dict[metric_path] = threshold
|
||||
@@ -734,6 +741,7 @@ class ThresholdChecker:
|
||||
hysteresis = threshold_config.get("hysteresis", 0.02)
|
||||
enabled = threshold_config.get("enabled", True)
|
||||
display = threshold_config.get("display")
|
||||
grace = threshold_config.get("grace", None)
|
||||
if warning is None and critical is None:
|
||||
continue
|
||||
target_dict[metric_path] = ThresholdConfig(
|
||||
@@ -744,6 +752,7 @@ class ThresholdChecker:
|
||||
hysteresis=hysteresis,
|
||||
enabled=enabled,
|
||||
display=display,
|
||||
grace=grace,
|
||||
)
|
||||
|
||||
def _parse_rtt_thresholds(
|
||||
@@ -779,6 +788,7 @@ class ThresholdChecker:
|
||||
enabled = rtt_thresholds.get("enabled", True)
|
||||
display = rtt_thresholds.get("display")
|
||||
count = rtt_thresholds.get("count", 1)
|
||||
grace = rtt_thresholds.get("grace", None)
|
||||
|
||||
if warning is None and critical is None:
|
||||
logger.warning("No RTT thresholds defined, skipping")
|
||||
@@ -793,6 +803,7 @@ class ThresholdChecker:
|
||||
enabled=enabled,
|
||||
display=display,
|
||||
count=count,
|
||||
grace=grace,
|
||||
)
|
||||
|
||||
target_dict[metric_path] = threshold
|
||||
@@ -1353,7 +1364,9 @@ class ThresholdChecker:
|
||||
) -> None:
|
||||
"""Handle a state-change transition with grace-period logic.
|
||||
|
||||
Transitioning INTO alert (worsening): defers the notification for grace_seconds.
|
||||
Transitioning INTO alert (worsening): defers the notification for the effective
|
||||
grace period (threshold.grace if set, else self.grace_seconds). Grace of 0 fires
|
||||
the notification immediately with no deferral.
|
||||
De-escalation within alert states (e.g. CRITICAL→WARNING): no new notification;
|
||||
the metric is still alerting so no RECOVER was sent.
|
||||
Transitioning TO OK:
|
||||
@@ -1361,6 +1374,8 @@ class ThresholdChecker:
|
||||
and the recovery — the spike never warranted a page.
|
||||
- Past grace: fires the RECOVER notification normally.
|
||||
"""
|
||||
effective_grace = threshold.grace if threshold.grace is not None else self.grace_seconds
|
||||
|
||||
lvl, message, formatted_msg = self._trigger_notification(
|
||||
host_name, metric_path, old_level, new_level, value, threshold, plugin_data,
|
||||
check_name=check_name, metric_name=metric_name,
|
||||
@@ -1371,17 +1386,24 @@ class ThresholdChecker:
|
||||
if alert_state.pending_since is not None:
|
||||
logger.info(
|
||||
"Alert suppressed (recovered within %.0fs grace): %s on %s",
|
||||
self.grace_seconds, metric_path, host_name,
|
||||
effective_grace, metric_path, host_name,
|
||||
)
|
||||
alert_state.pending_since = None
|
||||
else:
|
||||
self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value)
|
||||
elif new_level.value > old_level.value:
|
||||
# Worsening (OK→WARNING, OK→CRITICAL, WARNING→CRITICAL): schedule notification.
|
||||
# Worsening (OK→WARNING, OK→CRITICAL, WARNING→CRITICAL).
|
||||
if effective_grace <= 0:
|
||||
# No grace period — fire immediately.
|
||||
self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value)
|
||||
now = time.time()
|
||||
alert_state.last_notification = now
|
||||
alert_state.notification_count = 1
|
||||
else:
|
||||
alert_state.pending_since = time.time()
|
||||
logger.debug(
|
||||
"Alert deferred (%.0fs grace): %s on %s = %s",
|
||||
self.grace_seconds, metric_path, host_name, value,
|
||||
effective_grace, metric_path, host_name, value,
|
||||
)
|
||||
else:
|
||||
# De-escalation within alert states (e.g. CRITICAL→WARNING): metric is still
|
||||
@@ -1407,8 +1429,9 @@ class ThresholdChecker:
|
||||
If a deferred notification is pending and the effective grace period (threshold.grace if set, else self.grace_seconds) has elapsed,
|
||||
fires it now. Otherwise falls through to normal reminder logic.
|
||||
"""
|
||||
effective_grace = threshold.grace if threshold.grace is not None else self.grace_seconds
|
||||
if alert_state.pending_since is not None:
|
||||
if time.time() - alert_state.pending_since >= self.grace_seconds:
|
||||
if time.time() - alert_state.pending_since >= effective_grace:
|
||||
lvl, message, formatted_msg = self._trigger_notification(
|
||||
host_name, metric_path, AlertLevel.OK, alert_state.level, value, threshold, plugin_data,
|
||||
check_name=check_name, metric_name=metric_name,
|
||||
|
||||
Reference in New Issue
Block a user