From 217bba1b769679399df64fbab89e8c20eebf1ebf Mon Sep 17 00:00:00 2001 From: Andreas Wrede Date: Fri, 8 May 2026 16:57:45 -0400 Subject: [PATCH] fix: change health_ok to status --- docs/THRESHOLD_ALERTING.md | 14 +++++++------- hbd/client/plugins/zfs_monitor.py | 10 +++++++++- hbd/config_thresholds_example.yaml | 8 ++++---- hbd/server/config.py | 5 ++--- hbd/server/threshold.py | 16 ++++++++-------- 5 files changed, 30 insertions(+), 23 deletions(-) diff --git a/docs/THRESHOLD_ALERTING.md b/docs/THRESHOLD_ALERTING.md index 8ca5222..cb35b12 100644 --- a/docs/THRESHOLD_ALERTING.md +++ b/docs/THRESHOLD_ALERTING.md @@ -268,9 +268,9 @@ The default threshold is equivalent to: zfs_monitor: pools: '*': - health_ok: - critical: 1 - operator: "<" + status: + critical: 0 + operator: ">" hysteresis: 0.0 display: "ZFS pool {pool_name} is {health}" ``` @@ -285,7 +285,7 @@ zfs_monitor: pools: # Suppress health alerts for a scratch pool (not mission-critical) scratch: - health_ok: + status: enabled: false # Capacity threshold for a specific pool @@ -297,12 +297,12 @@ zfs_monitor: hysteresis: 0.05 ``` -**Alert state paths** follow the pattern `zfs_monitor.<pool>.health_ok`, +**Alert state paths** follow the pattern `zfs_monitor.<pool>.status`, so acknowledgements and silences target individual pools: ``` -zfs_monitor.tank.health_ok -zfs_monitor.backup.health_ok +zfs_monitor.tank.status +zfs_monitor.backup.status ``` ### Network Monitor diff --git a/hbd/client/plugins/zfs_monitor.py b/hbd/client/plugins/zfs_monitor.py index ef763a1..3c43437 100644 --- a/hbd/client/plugins/zfs_monitor.py +++ b/hbd/client/plugins/zfs_monitor.py @@ -90,9 +90,17 @@ class ZFSMonitorPlugin(MonitorPlugin): if self._pools_filter and name not in self._pools_filter: continue health = parts[1].strip() + if health == "ONLINE": + status = 0 + elif health in ("DEGRADED", "ONLINE with errors"): + status = 1 + elif health in ("FAULTED", "OFFLINE", "UNAVAIL"): + status = 2 + else: + status = 3 # unknown status pools[name] = { "health": health, - 
"health_ok": 1 if health == "ONLINE" else 0, + "status": status, "size": _int(parts[2]), "alloc": _int(parts[3]), "free": _int(parts[4]), diff --git a/hbd/config_thresholds_example.yaml b/hbd/config_thresholds_example.yaml index 02a7659..698efec 100644 --- a/hbd/config_thresholds_example.yaml +++ b/hbd/config_thresholds_example.yaml @@ -139,13 +139,13 @@ thresholds: # ---------------------------------------------------------------------------- zfs_monitor: # Pool health check — built-in default; shown here for reference/override. - # health_ok is 1 (ONLINE) or 0 (DEGRADED, SUSPENDED, FAULTED, UNAVAIL…). + # status is 0 (ONLINE) or 1 (DEGRADED) or 2 (SUSPENDED, FAULTED, UNAVAIL…). # Use '*' to apply the same rule to every pool, or name a specific pool. pools: '*': - health_ok: - critical: 1 # Alert CRITICAL when pool is not ONLINE - operator: "<" + status: + critical: 0 # Alert CRITICAL when pool is not ONLINE + operator: ">" hysteresis: 0.0 # No hysteresis — a degraded pool is always critical display: "ZFS pool {pool_name} is {health}" diff --git a/hbd/server/config.py b/hbd/server/config.py index 4b878d0..0487b51 100644 --- a/hbd/server/config.py +++ b/hbd/server/config.py @@ -108,9 +108,8 @@ THRESHOLD_DEFAULTS = { 'zfs_monitor': { 'pools': { '*': { - 'health_ok': { - 'critical': 1, - 'operator': '<', + 'status': {0, + 'operator': '>', 'hysteresis': 0.0, 'display': 'ZFS pool {pool_name} is {health}', } diff --git a/hbd/server/threshold.py b/hbd/server/threshold.py index 5a4e21f..5310f5c 100644 --- a/hbd/server/threshold.py +++ b/hbd/server/threshold.py @@ -575,7 +575,7 @@ class ThresholdChecker: if not isinstance(threshold_config, dict): continue - # Handle nested metrics (e.g., partitions./.percent or pools.*.health_ok) + # Handle nested metrics (e.g., partitions./.percent or pools.*.status) if metric_name == "partitions": self._parse_partition_thresholds(plugin_name, threshold_config, target_dict) continue @@ -680,9 +680,9 @@ class ThresholdChecker: zfs_monitor: 
pools: '*': - health_ok: - critical: 1 - operator: '<' + status: + critical: 0 + operator: '>' tank: capacity: warning: 80 @@ -1026,11 +1026,11 @@ class ThresholdChecker: for pool_name, pool_metrics in pools.items(): if not isinstance(pool_metrics, dict): continue - # Synthesize health_ok from health string for older clients - # that predate the health_ok field. + # Synthesize status from health string for older clients + # that predate the status field. pool_metrics_effective = dict(pool_metrics) - if "health" in pool_metrics and "health_ok" not in pool_metrics: - pool_metrics_effective["health_ok"] = 1 if pool_metrics["health"] == "ONLINE" else 0 + if "health" in pool_metrics and "status" not in pool_metrics: + pool_metrics_effective["status"] = 0 if pool_metrics["health"] == "ONLINE" else 1 for metric_name, value in pool_metrics_effective.items(): # Try specific pool name first, then wildcard '*' metric_path = f"{plugin_name}.{pool_name}.{metric_name}"