feat: alert CRITICAL on degraded or suspended ZFS pools
This commit is contained in:
@@ -89,14 +89,16 @@ class ZFSMonitorPlugin(MonitorPlugin):
|
||||
name = parts[0].strip()
|
||||
if self._pools_filter and name not in self._pools_filter:
|
||||
continue
|
||||
health = parts[1].strip()
|
||||
pools[name] = {
|
||||
"health": parts[1].strip(),
|
||||
"size": _int(parts[2]),
|
||||
"alloc": _int(parts[3]),
|
||||
"free": _int(parts[4]),
|
||||
"capacity": _float(parts[5]),
|
||||
"frag": _float(parts[6]),
|
||||
"dedup": _float(parts[7]),
|
||||
"health": health,
|
||||
"health_ok": 1 if health == "ONLINE" else 0,
|
||||
"size": _int(parts[2]),
|
||||
"alloc": _int(parts[3]),
|
||||
"free": _int(parts[4]),
|
||||
"capacity": _float(parts[5]),
|
||||
"frag": _float(parts[6]),
|
||||
"dedup": _float(parts[7]),
|
||||
}
|
||||
return pools
|
||||
|
||||
|
||||
+13
-1
@@ -104,7 +104,19 @@ THRESHOLD_DEFAULTS = {
|
||||
'display': '{check_name} {output}',
|
||||
'operator': "nagios"
|
||||
}
|
||||
}
|
||||
},
|
||||
'zfs_monitor': {
|
||||
'pools': {
|
||||
'*': {
|
||||
'health_ok': {
|
||||
'critical': 1,
|
||||
'operator': '<',
|
||||
'hysteresis': 0.0,
|
||||
'display': 'ZFS pool {pool_name} is {health}',
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
+86
-1
@@ -575,10 +575,13 @@ class ThresholdChecker:
|
||||
if not isinstance(threshold_config, dict):
|
||||
continue
|
||||
|
||||
# Handle nested metrics (e.g., partitions./.percent)
|
||||
# Handle nested metrics (e.g., partitions./.percent or pools.*.health_ok)
|
||||
if metric_name == "partitions":
|
||||
self._parse_partition_thresholds(plugin_name, threshold_config, target_dict)
|
||||
continue
|
||||
if metric_name == "pools":
|
||||
self._parse_pool_thresholds(plugin_name, threshold_config, target_dict)
|
||||
continue
|
||||
|
||||
metric_path = f"{plugin_name}.{metric_name}"
|
||||
|
||||
@@ -664,6 +667,55 @@ class ThresholdChecker:
|
||||
|
||||
target_dict[metric_path] = threshold
|
||||
|
||||
def _parse_pool_thresholds(
    self,
    plugin_name: str,
    pools: Dict[str, Any],
    target_dict: Optional[Dict[str, ThresholdConfig]] = None,
):
    """Register ZFS pool thresholds keyed as ``<plugin>.<pool>.<metric>``.

    A pool key is either a literal pool name or ``'*'`` (all pools); it is
    copied verbatim into the resulting metric path.

    Expected config shape::

        zfs_monitor:
          pools:
            '*':
              health_ok:
                critical: 1
                operator: '<'
            tank:
              capacity:
                warning: 80
                critical: 90

    Args:
        plugin_name: Prefix used for every generated metric path.
        pools: Mapping of pool name to a mapping of metric name to
            threshold settings. Values that are not dicts are ignored.
        target_dict: Dict that receives the parsed thresholds. When
            ``None``, ``self.thresholds`` is used.

    Metrics that define neither ``warning`` nor ``critical`` are skipped.
    Missing optional settings fall back to ``operator='>'``,
    ``hysteresis=0.02``, ``enabled=True`` and ``display=None``.
    """
    destination = self.thresholds if target_dict is None else target_dict

    for pool, pool_metrics in pools.items():
        if isinstance(pool_metrics, dict):
            for metric, spec in pool_metrics.items():
                if not isinstance(spec, dict):
                    continue

                key = f"{plugin_name}.{pool}.{metric}"
                warn_at = spec.get("warning")
                crit_at = spec.get("critical")

                # A threshold without any limit can never fire; do not register it.
                if warn_at is None and crit_at is None:
                    continue

                destination[key] = ThresholdConfig(
                    metric_path=key,
                    warning=warn_at,
                    critical=crit_at,
                    operator=spec.get("operator", ">"),
                    hysteresis=spec.get("hysteresis", 0.02),
                    enabled=spec.get("enabled", True),
                    display=spec.get("display"),
                )
|
||||
|
||||
def _parse_rtt_thresholds(
|
||||
self,
|
||||
rtt_thresholds: Dict[str, Any],
|
||||
@@ -967,6 +1019,39 @@ class ThresholdChecker:
|
||||
# Get host-specific thresholds
|
||||
thresholds = self.get_thresholds_for_host(host_name)
|
||||
|
||||
# ZFS pool health checks
|
||||
if plugin_name == "zfs_monitor" and "pools" in data:
|
||||
pools = data["pools"]
|
||||
if isinstance(pools, dict):
|
||||
for pool_name, pool_metrics in pools.items():
|
||||
if not isinstance(pool_metrics, dict):
|
||||
continue
|
||||
for metric_name, value in pool_metrics.items():
|
||||
# Try specific pool name first, then wildcard '*'
|
||||
metric_path = f"{plugin_name}.{pool_name}.{metric_name}"
|
||||
wildcard_path = f"{plugin_name}.*.{metric_name}"
|
||||
threshold = thresholds.get(metric_path) or thresholds.get(wildcard_path)
|
||||
if threshold is None:
|
||||
continue
|
||||
if metric_path not in alert_states:
|
||||
alert_states[metric_path] = AlertState(metric_path)
|
||||
alert_state = alert_states[metric_path]
|
||||
new_level = threshold.evaluate_with_hysteresis(value, alert_state.level)
|
||||
threshold_value = None
|
||||
if new_level == AlertLevel.CRITICAL and threshold.critical is not None:
|
||||
threshold_value = threshold.critical
|
||||
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
|
||||
threshold_value = threshold.warning
|
||||
alert_state.hysteresis = threshold.hysteresis if new_level != AlertLevel.OK else None
|
||||
pool_context = dict(pool_metrics)
|
||||
pool_context["pool_name"] = pool_name
|
||||
old_level = alert_state.level
|
||||
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
||||
state_changes.append((metric_path, old_level, new_level, value))
|
||||
self._apply_grace(host_name, alert_state, metric_path, old_level, new_level, value, threshold, pool_context, metric_name=pool_name)
|
||||
elif new_level != AlertLevel.OK:
|
||||
self._check_pending_or_renotify(host_name, alert_state, metric_path, value, threshold, pool_context, metric_name=pool_name)
|
||||
|
||||
# Look for partition data in disk_monitor
|
||||
if plugin_name == "disk_monitor" and "partitions" in data:
|
||||
partitions = data["partitions"]
|
||||
|
||||
Reference in New Issue
Block a user