feat: host-level watch flag suppresses notifications; filter dashboard/overview by owner/manager; add ZFS monitor plugin

- watch: true (default) per host; watch: false suppresses all notifications
  for that host in udp.py and threshold.py
- Live Dashboard and Host Overview now show only hosts where the logged-in
  user is owner or manager (admins see all); WebSocket broadcasts filtered
  per-connection by the same rule
- Add hbd/client/plugins/zfs_monitor.py: collects per-pool health, capacity,
  fragmentation, dedup ratio, and cumulative I/O ops/bandwidth via zpool(8)

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Andreas Wrede
2026-05-02 12:42:35 -04:00
parent cffc9805f9
commit 691f62aa69
8 changed files with 234 additions and 45 deletions
+1 -1
View File
@@ -225,7 +225,7 @@ def get_watchhosts(config):
hosts_config = config.get("hosts", {})
if isinstance(hosts_config, dict):
for host_name, host_attrs in hosts_config.items():
if isinstance(host_attrs, dict) and host_attrs.get("watch", False):
if isinstance(host_attrs, dict) and host_attrs.get("watch", True):
watchhosts.append(host_name)
return watchhosts
+2 -1
View File
@@ -286,7 +286,7 @@ class Host:
Host.hosts[name] = self
self.num = num
self.dyn = False
self.watched = False
self.watched = True
self.upcount = 0
self.interval = 0
self.doesack = -1
@@ -304,6 +304,7 @@ class Host:
def statedict(self):
d = {}
d["raw_name"] = self.name
d["name"] = self.name
if self.dyn:
d["name"] += "*"
+4 -2
View File
@@ -258,7 +258,9 @@ async def start(
extra_scripts=extra_scripts,
hbd_version=hbd_version,
hosts=[
hbdclass.Host.hosts[h].stateinfo() for h in sorted(hbdclass.Host.hosts)
hbdclass.Host.hosts[h].stateinfo()
for h in sorted(hbdclass.Host.hosts)
if _can_operate_host(current_user, hbdclass.Host.hosts[h])
],
messages=data.msgs[-30:],
current_user=current_user.to_dict() if current_user else None,
@@ -510,7 +512,7 @@ async def start(
hosts_with_plugins = []
for hostname in sorted(hbdclass.Host.hosts.keys()):
host = hbdclass.Host.hosts[hostname]
if not _can_view_host(current_user, host):
if not _can_operate_host(current_user, host):
continue
if host.plugin_data:
hosts_with_plugins.append({
+1 -1
View File
@@ -188,7 +188,7 @@ def get_settings_sections(config: dict) -> list:
continue
hosts_list.append({
"name": hname,
"watch": bool(hcfg.get("watch", False)),
"watch": bool(hcfg.get("watch", True)),
"dyndns": bool(hcfg.get("dyndns", False)),
"owner": hcfg.get("owner", ""),
"managers": hcfg.get("managers", []),
+18 -10
View File
@@ -9,6 +9,7 @@ This module provides a flexible threshold checking system that:
- Supports multiple comparison operators
"""
import asyncio
import logging
import time
from enum import Enum
@@ -1020,6 +1021,11 @@ class ThresholdChecker:
value: Any,
):
"""Send notification and log to journal/eventlog."""
from . import hbdclass
host = hbdclass.Host.hosts.get(host_name)
if host is not None and not host.watched:
eventlog(host_name, lvl, message, service="threshold")
return
asyncio.get_event_loop().create_task(notify_mod.send_notification(
host_name,
notify_mod.Notification(
@@ -1032,7 +1038,6 @@ class ThresholdChecker:
# Log to journal
if self.journal is not None:
try:
import asyncio
loop = asyncio.get_event_loop()
loop.create_task(self.journal.log_threshold_event(
host_name=host_name,
@@ -1224,17 +1229,20 @@ class ThresholdChecker:
else:
message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (ongoing for {int(now - alert_state.since)}s)"
asyncio.get_event_loop().create_task(notify_mod.send_notification(
host_name,
notify_mod.Notification(
title=f"[REMINDER/{alert_state.level.name}] {host_name}",
body=message,
level=alert_state.level.name,
),
))
from . import hbdclass
host = hbdclass.Host.hosts.get(host_name)
if host is None or host.watched:
asyncio.get_event_loop().create_task(notify_mod.send_notification(
host_name,
notify_mod.Notification(
title=f"[REMINDER/{alert_state.level.name}] {host_name}",
body=message,
level=alert_state.level.name,
),
))
logger.info("Re-notification sent: %s", message)
alert_state.last_notification = now
alert_state.notification_count += 1
logger.info("Re-notification sent: %s", message)
def get_active_alerts(self, alert_states: Dict[str, AlertState]) -> list:
"""
+25 -20
View File
@@ -211,10 +211,11 @@ def _make_timer_callbacks(uname, host, ctx):
connection.newstate(connection.__class__.OVERDUE, now, cfg.get("grace", 2))
msg = f"{connection.afam} overdue"
eventlog(uname, "CRITICAL", msg)
asyncio.create_task(notify_mod.send_notification(
uname,
notify_mod.Notification(title=f"[CRITICAL] {uname}", body=msg, level="CRITICAL"),
))
if host.watched:
asyncio.create_task(notify_mod.send_notification(
uname,
notify_mod.Notification(title=f"[CRITICAL] {uname}", body=msg, level="CRITICAL"),
))
# Track in alert_states so the Alerts Dashboard shows this
_set_connectivity_alert(host, connection.afam, "CRITICAL")
if threshold_checker:
@@ -407,10 +408,11 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
if res:
eventlog(uname, "WARNING", res)
asyncio.create_task(notify_mod.send_notification(
uname,
notify_mod.Notification(title=f"[WARNING] {uname}", body=res, level="WARNING"),
))
if host.watched:
asyncio.create_task(notify_mod.send_notification(
uname,
notify_mod.Notification(title=f"[WARNING] {uname}", body=res, level="WARNING"),
))
interval = int(msg.get("interval", 0) or 0)
shutdown = msg.get("shutdown", 0)
@@ -420,10 +422,11 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
if boot:
eventlog(uname, "INFO", "booted")
asyncio.create_task(notify_mod.send_notification(
uname,
notify_mod.Notification(title=f"[INFO] {uname}", body=f"{host.name} booted", level="INFO"),
))
if host.watched:
asyncio.create_task(notify_mod.send_notification(
uname,
notify_mod.Notification(title=f"[INFO] {uname}", body=f"{host.name} booted", level="INFO"),
))
if message:
eventlog(uname, "INFO", "msg: %s" % message, service=service)
@@ -440,10 +443,11 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
else:
m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d))
eventlog(uname, "RECOVER", m)
asyncio.create_task(notify_mod.send_notification(
uname,
notify_mod.Notification(title=f"[RECOVER] {uname}", body=m, level="RECOVER"),
))
if host.watched:
asyncio.create_task(notify_mod.send_notification(
uname,
notify_mod.Notification(title=f"[RECOVER] {uname}", body=m, level="RECOVER"),
))
if boot or newh:
host.upcount = host.doesack
@@ -453,10 +457,11 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
if shutdown:
m = "%s shutdown" % conn.afam
eventlog(uname, "INFO", m)
asyncio.create_task(notify_mod.send_notification(
uname,
notify_mod.Notification(title=f"[INFO] {uname}", body=m, level="INFO"),
))
if host.watched:
asyncio.create_task(notify_mod.send_notification(
uname,
notify_mod.Notification(title=f"[INFO] {uname}", body=m, level="INFO"),
))
conn.newstate(hbdcls.Connection.DOWN, now)
_set_connectivity_alert(host, conn.afam, "CRITICAL")
+53 -10
View File
@@ -13,7 +13,8 @@ from . import data
logger = logging.getLogger(__name__)
_connections: set = set()
# Map of WebSocket → User object (or None when auth is disabled)
_connections: dict = {}
_loop: Optional[asyncio.AbstractEventLoop] = None
_get_hosts: Optional[Callable[[], Iterable]] = None
_verbose: bool = False
@@ -34,23 +35,53 @@ def setup(
_verbose = verbose
def _user_can_see_host(user, host_name: str) -> bool:
"""Return True if *user* may see updates for *host_name* (manager or higher)."""
from . import hbdclass, users as users_mod
if user is None or not users_mod.users_enabled():
return True
if user.admin:
return True
host = hbdclass.Host.hosts.get(host_name)
if host is None:
return False
return host.is_manager(user.username)
def _get_token(request) -> str:
"""Extract session token from request (mirrors logic in http.py)."""
auth = request.headers.get("Authorization", "")
if auth.startswith("Bearer "):
return auth[7:].strip()
token = request.headers.get("X-Auth-Token", "")
if token:
return token
return request.cookies.get("hbd_session", "")
async def handler(request):
"""aiohttp WebSocket upgrade handler — register as GET /ws."""
from aiohttp import web
from . import users as users_mod
ws = web.WebSocketResponse()
await ws.prepare(request)
_connections.add(ws)
token = _get_token(request)
user = users_mod.get_session_user(token) if token else None
_connections[ws] = user
remote = request.remote
logger.info("WebSocket connected from %s", remote)
try:
# Send current host state to the new client
# Send current host state, filtered to hosts this user may see
if _get_hosts:
try:
for h in list(_get_hosts()):
await ws.send_str(json.dumps({"type": "host", "data": h}))
host_name = h.get("raw_name") or h.get("name", "")
if _user_can_see_host(user, host_name):
await ws.send_str(json.dumps({"type": "host", "data": h}))
except Exception as e:
logger.error("Error sending initial hosts: %s", e)
@@ -74,7 +105,7 @@ async def handler(request):
except Exception as e:
logger.exception("WebSocket handler error from %s: %s", remote, e)
finally:
_connections.discard(ws)
_connections.pop(ws, None)
logger.info("WebSocket disconnected from %s", remote)
return ws
@@ -83,25 +114,37 @@ async def handler(request):
def broadcast(typ: str, payload) -> bool:
"""Thread-safe broadcast to all connected WebSocket clients.
For host and plugin updates, only sends to clients whose user has
manager-or-higher access to that host. Other message types are
broadcast to all clients.
Can be called from any thread; schedules sends on the event loop.
Returns False if the loop is not running yet.
"""
if not _loop:
return False
# Determine the host name for access-filtered message types
host_name: Optional[str] = None
if typ in ("host", "plugin"):
host_name = payload.get("raw_name") or payload.get("host") or payload.get("name")
jmsg = json.dumps({"type": typ, "data": payload})
async def _send_all():
dead = set()
for ws in list(_connections):
for ws, user in list(_connections.items()):
try:
if not ws.closed:
await ws.send_str(jmsg)
else:
if ws.closed:
dead.add(ws)
continue
if host_name is not None and not _user_can_see_host(user, host_name):
continue
await ws.send_str(jmsg)
except Exception:
dead.add(ws)
for ws in dead:
_connections.discard(ws)
_connections.pop(ws, None)
asyncio.run_coroutine_threadsafe(_send_all(), _loop)
return True