diff --git a/docs/THRESHOLD_ALERTING.md b/docs/THRESHOLD_ALERTING.md index 1ad94fb..8ca5222 100644 --- a/docs/THRESHOLD_ALERTING.md +++ b/docs/THRESHOLD_ALERTING.md @@ -256,6 +256,55 @@ disk_monitor: operator: "<" ``` +### ZFS Monitor + +ZFS pool health is checked automatically for every pool. A pool in any state +other than `ONLINE` (e.g. `DEGRADED`, `SUSPENDED`, `FAULTED`, `UNAVAIL`) raises +a **CRITICAL** alert by default — no configuration required. + +The default threshold is equivalent to: + +```yaml +zfs_monitor: + pools: + '*': + health_ok: + critical: 1 + operator: "<" + hysteresis: 0.0 + display: "ZFS pool {pool_name} is {health}" +``` + +`'*'` matches every pool on the host. The notification message includes the pool +name and its current health string, e.g. `ZFS pool tank is DEGRADED`. + +**Override for specific pools** — named pool entries take priority over `'*'`: + +```yaml +zfs_monitor: + pools: + # Suppress health alerts for a scratch pool (not mission-critical) + scratch: + health_ok: + enabled: false + + # Capacity threshold for a specific pool + tank: + capacity: + warning: 75.0 + critical: 90.0 + operator: ">" + hysteresis: 0.05 +``` + +**Alert state paths** follow the pattern `zfs_monitor.<pool_name>.health_ok`, +so acknowledgements and silences target individual pools: + +``` +zfs_monitor.tank.health_ok +zfs_monitor.backup.health_ok +``` + ### Network Monitor ```yaml diff --git a/hbd/config_thresholds_example.yaml b/hbd/config_thresholds_example.yaml index 97058e4..02a7659 100644 --- a/hbd/config_thresholds_example.yaml +++ b/hbd/config_thresholds_example.yaml @@ -134,6 +134,29 @@ thresholds: hysteresis: 0.1 enabled: true + # ---------------------------------------------------------------------------- + # ZFS Monitor Thresholds + # ---------------------------------------------------------------------------- + zfs_monitor: + # Pool health check — built-in default; shown here for reference/override. 
+ # health_ok is 1 (ONLINE) or 0 (DEGRADED, SUSPENDED, FAULTED, UNAVAIL…). + # Use '*' to apply the same rule to every pool, or name a specific pool. + pools: + '*': + health_ok: + critical: 1 # Alert CRITICAL when pool is not ONLINE + operator: "<" + hysteresis: 0.0 # No hysteresis — a degraded pool is always critical + display: "ZFS pool {pool_name} is {health}" + + # Per-pool capacity thresholds (optional; add pools you care about) + # tank: + # capacity: + # warning: 75.0 # Warn at 75% used + # critical: 90.0 # Critical at 90% used + # operator: ">" + # hysteresis: 0.05 + # ---------------------------------------------------------------------------- # Network Monitor Thresholds # ---------------------------------------------------------------------------- diff --git a/hbd/server/notify.py b/hbd/server/notify.py index a44067e..d828d2f 100644 --- a/hbd/server/notify.py +++ b/hbd/server/notify.py @@ -141,9 +141,11 @@ def _send_pushover(channel_cfg: dict, notif: Notification) -> bool: logger.warning("pushover: missing token or user") return False params: dict = {"token": token, "user": user, "title": notif.title, "message": notif.body} + if channel_cfg.get("sound"): + params["sound"] = channel_cfg["sound"] if notif.url: params["url"] = notif.url - params["url_title"] = "Plugin metrics" + params["url_title"] = "Heartbeat" conn = http.client.HTTPSConnection("api.pushover.net:443") try: conn.request(