feat: alert CRITICAL on degraded or suspended ZFS pools
This commit is contained in:
@@ -89,14 +89,16 @@ class ZFSMonitorPlugin(MonitorPlugin):
|
|||||||
name = parts[0].strip()
|
name = parts[0].strip()
|
||||||
if self._pools_filter and name not in self._pools_filter:
|
if self._pools_filter and name not in self._pools_filter:
|
||||||
continue
|
continue
|
||||||
|
health = parts[1].strip()
|
||||||
pools[name] = {
|
pools[name] = {
|
||||||
"health": parts[1].strip(),
|
"health": health,
|
||||||
"size": _int(parts[2]),
|
"health_ok": 1 if health == "ONLINE" else 0,
|
||||||
"alloc": _int(parts[3]),
|
"size": _int(parts[2]),
|
||||||
"free": _int(parts[4]),
|
"alloc": _int(parts[3]),
|
||||||
"capacity": _float(parts[5]),
|
"free": _int(parts[4]),
|
||||||
"frag": _float(parts[6]),
|
"capacity": _float(parts[5]),
|
||||||
"dedup": _float(parts[7]),
|
"frag": _float(parts[6]),
|
||||||
|
"dedup": _float(parts[7]),
|
||||||
}
|
}
|
||||||
return pools
|
return pools
|
||||||
|
|
||||||
|
|||||||
+14
-2
@@ -103,8 +103,20 @@ THRESHOLD_DEFAULTS = {
|
|||||||
'status_code': {
|
'status_code': {
|
||||||
'display': '{check_name} {output}',
|
'display': '{check_name} {output}',
|
||||||
'operator': "nagios"
|
'operator': "nagios"
|
||||||
}
|
}
|
||||||
}
|
},
|
||||||
|
'zfs_monitor': {
|
||||||
|
'pools': {
|
||||||
|
'*': {
|
||||||
|
'health_ok': {
|
||||||
|
'critical': 1,
|
||||||
|
'operator': '<',
|
||||||
|
'hysteresis': 0.0,
|
||||||
|
'display': 'ZFS pool {pool_name} is {health}',
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
+87
-2
@@ -575,10 +575,13 @@ class ThresholdChecker:
|
|||||||
if not isinstance(threshold_config, dict):
|
if not isinstance(threshold_config, dict):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Handle nested metrics (e.g., partitions./.percent)
|
# Handle nested metrics (e.g., partitions./.percent or pools.*.health_ok)
|
||||||
if metric_name == "partitions":
|
if metric_name == "partitions":
|
||||||
self._parse_partition_thresholds(plugin_name, threshold_config, target_dict)
|
self._parse_partition_thresholds(plugin_name, threshold_config, target_dict)
|
||||||
continue
|
continue
|
||||||
|
if metric_name == "pools":
|
||||||
|
self._parse_pool_thresholds(plugin_name, threshold_config, target_dict)
|
||||||
|
continue
|
||||||
|
|
||||||
metric_path = f"{plugin_name}.{metric_name}"
|
metric_path = f"{plugin_name}.{metric_name}"
|
||||||
|
|
||||||
@@ -663,7 +666,56 @@ class ThresholdChecker:
|
|||||||
)
|
)
|
||||||
|
|
||||||
target_dict[metric_path] = threshold
|
target_dict[metric_path] = threshold
|
||||||
|
|
||||||
|
def _parse_pool_thresholds(
    self,
    plugin_name: str,
    pools: Dict[str, Any],
    target_dict: Optional[Dict[str, ThresholdConfig]] = None,
):
    """Register per-pool threshold definitions for a plugin.

    ``pools`` maps a pool key to a mapping of ``metric -> settings``. The
    pool key is used verbatim as a path segment, so a ``'*'`` entry is
    stored under ``<plugin_name>.*.<metric>`` and a literal pool such as
    ``tank`` under ``<plugin_name>.tank.<metric>``.

    Expected configuration layout::

        zfs_monitor:
          pools:
            '*':
              health_ok:
                critical: 1
                operator: '<'
            tank:
              capacity:
                warning: 80
                critical: 90

    Recognised settings and their fallbacks when absent: ``warning`` and
    ``critical`` (no fallback), ``operator`` (``">"``), ``hysteresis``
    (``0.02``), ``enabled`` (``True``) and ``display`` (``None``).

    Entries are skipped when the pool value or the metric settings are not
    a dict, or when neither ``warning`` nor ``critical`` is given.

    Results are written into ``target_dict``; when it is ``None`` they go
    into ``self.thresholds`` instead.
    """
    destination = self.thresholds if target_dict is None else target_dict

    for pool_key, pool_metrics in pools.items():
        if not isinstance(pool_metrics, dict):
            continue

        for metric_key, settings in pool_metrics.items():
            if not isinstance(settings, dict):
                continue

            warn_at = settings.get("warning")
            crit_at = settings.get("critical")
            # A threshold with no limit at all can never fire; ignore it.
            if warn_at is None and crit_at is None:
                continue

            path = f"{plugin_name}.{pool_key}.{metric_key}"
            destination[path] = ThresholdConfig(
                metric_path=path,
                warning=warn_at,
                critical=crit_at,
                operator=settings.get("operator", ">"),
                hysteresis=settings.get("hysteresis", 0.02),
                enabled=settings.get("enabled", True),
                display=settings.get("display"),
            )
|
||||||
|
|
||||||
def _parse_rtt_thresholds(
|
def _parse_rtt_thresholds(
|
||||||
self,
|
self,
|
||||||
rtt_thresholds: Dict[str, Any],
|
rtt_thresholds: Dict[str, Any],
|
||||||
@@ -967,6 +1019,39 @@ class ThresholdChecker:
|
|||||||
# Get host-specific thresholds
|
# Get host-specific thresholds
|
||||||
thresholds = self.get_thresholds_for_host(host_name)
|
thresholds = self.get_thresholds_for_host(host_name)
|
||||||
|
|
||||||
|
# ZFS pool health checks
|
||||||
|
if plugin_name == "zfs_monitor" and "pools" in data:
|
||||||
|
pools = data["pools"]
|
||||||
|
if isinstance(pools, dict):
|
||||||
|
for pool_name, pool_metrics in pools.items():
|
||||||
|
if not isinstance(pool_metrics, dict):
|
||||||
|
continue
|
||||||
|
for metric_name, value in pool_metrics.items():
|
||||||
|
# Try specific pool name first, then wildcard '*'
|
||||||
|
metric_path = f"{plugin_name}.{pool_name}.{metric_name}"
|
||||||
|
wildcard_path = f"{plugin_name}.*.{metric_name}"
|
||||||
|
threshold = thresholds.get(metric_path) or thresholds.get(wildcard_path)
|
||||||
|
if threshold is None:
|
||||||
|
continue
|
||||||
|
if metric_path not in alert_states:
|
||||||
|
alert_states[metric_path] = AlertState(metric_path)
|
||||||
|
alert_state = alert_states[metric_path]
|
||||||
|
new_level = threshold.evaluate_with_hysteresis(value, alert_state.level)
|
||||||
|
threshold_value = None
|
||||||
|
if new_level == AlertLevel.CRITICAL and threshold.critical is not None:
|
||||||
|
threshold_value = threshold.critical
|
||||||
|
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
|
||||||
|
threshold_value = threshold.warning
|
||||||
|
alert_state.hysteresis = threshold.hysteresis if new_level != AlertLevel.OK else None
|
||||||
|
pool_context = dict(pool_metrics)
|
||||||
|
pool_context["pool_name"] = pool_name
|
||||||
|
old_level = alert_state.level
|
||||||
|
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
||||||
|
state_changes.append((metric_path, old_level, new_level, value))
|
||||||
|
self._apply_grace(host_name, alert_state, metric_path, old_level, new_level, value, threshold, pool_context, metric_name=pool_name)
|
||||||
|
elif new_level != AlertLevel.OK:
|
||||||
|
self._check_pending_or_renotify(host_name, alert_state, metric_path, value, threshold, pool_context, metric_name=pool_name)
|
||||||
|
|
||||||
# Look for partition data in disk_monitor
|
# Look for partition data in disk_monitor
|
||||||
if plugin_name == "disk_monitor" and "partitions" in data:
|
if plugin_name == "disk_monitor" and "partitions" in data:
|
||||||
partitions = data["partitions"]
|
partitions = data["partitions"]
|
||||||
|
|||||||
Reference in New Issue
Block a user