feat: alert CRITICAL on degraded or suspended ZFS pools

This commit is contained in:
2026-05-08 16:23:49 -04:00
parent 05045bafa2
commit b9db0c552e
3 changed files with 110 additions and 11 deletions
+14 -2
View File
@@ -103,8 +103,20 @@ THRESHOLD_DEFAULTS = {
'status_code': {
'display': '{check_name} {output}',
'operator': "nagios"
}
}
}
},
'zfs_monitor': {
'pools': {
'*': {
'health_ok': {
'critical': 1,
'operator': '<',
'hysteresis': 0.0,
'display': 'ZFS pool {pool_name} is {health}',
}
}
}
},
}
}
+87 -2
View File
@@ -575,10 +575,13 @@ class ThresholdChecker:
if not isinstance(threshold_config, dict):
continue
# Handle nested metrics (e.g., partitions./.percent)
# Handle nested metrics (e.g., partitions./.percent or pools.*.health_ok)
if metric_name == "partitions":
self._parse_partition_thresholds(plugin_name, threshold_config, target_dict)
continue
if metric_name == "pools":
self._parse_pool_thresholds(plugin_name, threshold_config, target_dict)
continue
metric_path = f"{plugin_name}.{metric_name}"
@@ -663,7 +666,56 @@ class ThresholdChecker:
)
target_dict[metric_path] = threshold
def _parse_pool_thresholds(
    self,
    plugin_name: str,
    pools: Dict[str, Any],
    target_dict: Optional[Dict[str, ThresholdConfig]] = None,
):
    """Register per-pool ZFS thresholds from a ``pools`` config mapping.

    Each key of *pools* is a pool name, either a literal name or ``'*'``
    (all pools). The key is embedded verbatim in the stored metric path,
    ``"<plugin_name>.<pool_name>.<metric_name>"``.

    Expected config shape::

        zfs_monitor:
          pools:
            '*':
              health_ok:
                critical: 1
                operator: '<'
            tank:
              capacity:
                warning: 80
                critical: 90

    Args:
        plugin_name: Plugin the thresholds belong to; first path segment.
        pools: Mapping of pool name -> {metric name -> threshold options}.
        target_dict: Mapping that receives the parsed thresholds. When
            ``None``, ``self.thresholds`` is used.

    Entries that are not dicts, and metrics that define neither
    ``warning`` nor ``critical``, are skipped silently.
    """
    destination = self.thresholds if target_dict is None else target_dict

    for pool_name, pool_metrics in pools.items():
        if not isinstance(pool_metrics, dict):
            continue

        for metric_name, spec in pool_metrics.items():
            if not isinstance(spec, dict):
                continue

            warning_level = spec.get("warning")
            critical_level = spec.get("critical")
            # A threshold without any level can never fire; do not store it.
            if warning_level is None and critical_level is None:
                continue

            path = f"{plugin_name}.{pool_name}.{metric_name}"
            destination[path] = ThresholdConfig(
                metric_path=path,
                warning=warning_level,
                critical=critical_level,
                operator=spec.get("operator", ">"),
                hysteresis=spec.get("hysteresis", 0.02),
                enabled=spec.get("enabled", True),
                display=spec.get("display"),
            )
def _parse_rtt_thresholds(
self,
rtt_thresholds: Dict[str, Any],
@@ -967,6 +1019,39 @@ class ThresholdChecker:
# Get host-specific thresholds
thresholds = self.get_thresholds_for_host(host_name)
# ZFS pool health checks
if plugin_name == "zfs_monitor" and "pools" in data:
pools = data["pools"]
if isinstance(pools, dict):
for pool_name, pool_metrics in pools.items():
if not isinstance(pool_metrics, dict):
continue
for metric_name, value in pool_metrics.items():
# Try specific pool name first, then wildcard '*'
metric_path = f"{plugin_name}.{pool_name}.{metric_name}"
wildcard_path = f"{plugin_name}.*.{metric_name}"
threshold = thresholds.get(metric_path) or thresholds.get(wildcard_path)
if threshold is None:
continue
if metric_path not in alert_states:
alert_states[metric_path] = AlertState(metric_path)
alert_state = alert_states[metric_path]
new_level = threshold.evaluate_with_hysteresis(value, alert_state.level)
threshold_value = None
if new_level == AlertLevel.CRITICAL and threshold.critical is not None:
threshold_value = threshold.critical
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
threshold_value = threshold.warning
alert_state.hysteresis = threshold.hysteresis if new_level != AlertLevel.OK else None
pool_context = dict(pool_metrics)
pool_context["pool_name"] = pool_name
old_level = alert_state.level
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
state_changes.append((metric_path, old_level, new_level, value))
self._apply_grace(host_name, alert_state, metric_path, old_level, new_level, value, threshold, pool_context, metric_name=pool_name)
elif new_level != AlertLevel.OK:
self._check_pending_or_renotify(host_name, alert_state, metric_path, value, threshold, pool_context, metric_name=pool_name)
# Look for partition data in disk_monitor
if plugin_name == "disk_monitor" and "partitions" in data:
partitions = data["partitions"]