diff --git a/hbd/client/plugins/zfs_monitor.py b/hbd/client/plugins/zfs_monitor.py index 5a256ef..ef763a1 100644 --- a/hbd/client/plugins/zfs_monitor.py +++ b/hbd/client/plugins/zfs_monitor.py @@ -89,14 +89,16 @@ class ZFSMonitorPlugin(MonitorPlugin): name = parts[0].strip() if self._pools_filter and name not in self._pools_filter: continue + health = parts[1].strip() pools[name] = { - "health": parts[1].strip(), - "size": _int(parts[2]), - "alloc": _int(parts[3]), - "free": _int(parts[4]), - "capacity": _float(parts[5]), - "frag": _float(parts[6]), - "dedup": _float(parts[7]), + "health": health, + "health_ok": 1 if health == "ONLINE" else 0, + "size": _int(parts[2]), + "alloc": _int(parts[3]), + "free": _int(parts[4]), + "capacity": _float(parts[5]), + "frag": _float(parts[6]), + "dedup": _float(parts[7]), } return pools diff --git a/hbd/server/config.py b/hbd/server/config.py index 9cc9021..4b878d0 100644 --- a/hbd/server/config.py +++ b/hbd/server/config.py @@ -103,8 +103,20 @@ THRESHOLD_DEFAULTS = { 'status_code': { 'display': '{check_name} {output}', 'operator': "nagios" - } - } + } + }, + 'zfs_monitor': { + 'pools': { + '*': { + 'health_ok': { + 'critical': 1, + 'operator': '<', + 'hysteresis': 0.0, + 'display': 'ZFS pool {pool_name} is {health}', + } + } + } + }, } } diff --git a/hbd/server/threshold.py b/hbd/server/threshold.py index 11d79a3..1bcbb8a 100644 --- a/hbd/server/threshold.py +++ b/hbd/server/threshold.py @@ -575,10 +575,13 @@ class ThresholdChecker: if not isinstance(threshold_config, dict): continue - # Handle nested metrics (e.g., partitions./.percent) + # Handle nested metrics (e.g., partitions./.percent or pools.*.health_ok) if metric_name == "partitions": self._parse_partition_thresholds(plugin_name, threshold_config, target_dict) continue + if metric_name == "pools": + self._parse_pool_thresholds(plugin_name, threshold_config, target_dict) + continue metric_path = f"{plugin_name}.{metric_name}" @@ -663,7 +666,56 @@ class ThresholdChecker: ) target_dict[metric_path] = threshold - + + def _parse_pool_thresholds( + self, + plugin_name: str, + pools: Dict[str, Any], + target_dict: Optional[Dict[str, ThresholdConfig]] = None, + ): + """Parse ZFS pool thresholds. Pool names may be literal or '*' (all pools). + + Config shape:: + + zfs_monitor: + pools: + '*': + health_ok: + critical: 1 + operator: '<' + tank: + capacity: + warning: 80 + critical: 90 + """ + if target_dict is None: + target_dict = self.thresholds + + for pool_name, metrics in pools.items(): + if not isinstance(metrics, dict): + continue + for metric_name, threshold_config in metrics.items(): + if not isinstance(threshold_config, dict): + continue + metric_path = f"{plugin_name}.{pool_name}.{metric_name}" + warning = threshold_config.get("warning") + critical = threshold_config.get("critical") + operator = threshold_config.get("operator", ">") + hysteresis = threshold_config.get("hysteresis", 0.02) + enabled = threshold_config.get("enabled", True) + display = threshold_config.get("display") + if warning is None and critical is None: + continue + target_dict[metric_path] = ThresholdConfig( + metric_path=metric_path, + warning=warning, + critical=critical, + operator=operator, + hysteresis=hysteresis, + enabled=enabled, + display=display, + ) + def _parse_rtt_thresholds( self, rtt_thresholds: Dict[str, Any], @@ -967,6 +1019,39 @@ class ThresholdChecker: # Get host-specific thresholds thresholds = self.get_thresholds_for_host(host_name) + # ZFS pool health checks + if plugin_name == "zfs_monitor" and "pools" in data: + pools = data["pools"] + if isinstance(pools, dict): + for pool_name, pool_metrics in pools.items(): + if not isinstance(pool_metrics, dict): + continue + for metric_name, value in pool_metrics.items(): + # Try specific pool name first, then wildcard '*' + metric_path = f"{plugin_name}.{pool_name}.{metric_name}" + wildcard_path = f"{plugin_name}.*.{metric_name}" + threshold = thresholds.get(metric_path) or thresholds.get(wildcard_path) + if threshold is None: + continue + if metric_path not in alert_states: + alert_states[metric_path] = AlertState(metric_path) + alert_state = alert_states[metric_path] + new_level = threshold.evaluate_with_hysteresis(value, alert_state.level) + threshold_value = None + if new_level == AlertLevel.CRITICAL and threshold.critical is not None: + threshold_value = threshold.critical + elif new_level == AlertLevel.WARNING and threshold.warning is not None: + threshold_value = threshold.warning + alert_state.hysteresis = threshold.hysteresis if new_level != AlertLevel.OK else None + pool_context = dict(pool_metrics) + pool_context["pool_name"] = pool_name + old_level = alert_state.level + if alert_state.update(new_level, value, threshold_value, threshold.operator.value): + state_changes.append((metric_path, old_level, new_level, value)) + self._apply_grace(host_name, alert_state, metric_path, old_level, new_level, value, threshold, pool_context, metric_name=pool_name) + elif new_level != AlertLevel.OK: + self._check_pending_or_renotify(host_name, alert_state, metric_path, value, threshold, pool_context, metric_name=pool_name) + # Look for partition data in disk_monitor if plugin_name == "disk_monitor" and "partitions" in data: partitions = data["partitions"]