version 5.2.6

fix: show human-readable duration in re-notification messages
Replace raw seconds with d h m s format in "ongoing for ..." strings. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-09 06:56:00 -04:00 · 2026-05-09 06:53:41 -04:00 · 2026-05-09 06:46:13 -04:00 · 2026-05-09 06:24:27 -04:00 · 2026-05-08 17:25:50 -04:00 · 2026-05-08 17:18:41 -04:00
110 changed files with 26492 additions and 3192 deletions
@@ -0,0 +1,51 @@
+name: Release
+on:  # triggered by pushing any tag that starts with "v"
+  push:
+    tags:
+      - 'v*'
+
+jobs:
+  release:
+    runs-on: FreeBSD  # label of the runner that executes this job
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      # Disabled alternative, kept for reference:
+      #   - name: Set up Python
+      #     uses: actions/setup-python@v5
+      #     with: {python-version: '3.11'}
+      - name: Set up Python
+        # Use a generic run step for FreeBSD if actions/setup-python
+        # fails in restricted environments.
+        run: |
+          python3 --version
+          python3 -m ensurepip --upgrade
+
+      - name: Install build tools
+        run: |
+          python3 -m pip install --upgrade pip
+          python3 -m pip install build twine
+
+      - name: Build package
+        run: python3 -m build
+
+      - name: Extract version from tag
+        id: get_version  # later steps read steps.get_version.outputs.VERSION
+        run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> "$GITHUB_OUTPUT"
+
+      - name: Upload to Gitea PyPI registry
+        env:  # twine picks its credentials up from these variables
+          TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
+          TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
+        run: |
+          python3 -m twine upload --repository-url https://git.wrede.ca/api/packages/andreas/pypi dist/*
+
+      - name: Create release
+        uses: actions/gitea-release-action@v1
+        with:
+          files: |
+            dist/*.whl
+            dist/*.tar.gz
+          title: "Release ${{ steps.get_version.outputs.VERSION }}"
+          body: "Release version ${{ steps.get_version.outputs.VERSION }}"
@@ -11,3 +11,4 @@ dist/
 *.egg-info/
 ssl/
 uv.lock
+.hb.yaml
@@ -1,44 +0,0 @@
-#name: "w02"
-hb_port: 50003   
-hbd_host: ''
-#logfile: "/home/andreas/public_html/messages/andreas"
-logfile: "/Users/andreas/public_html/messages/andreas"
-logfmt: "msg"
-grace: 40
-interval: 10
-watchhosts: 
-#    "localhost":
-#    "haschloss" :
-#    "cotgate":
-    "wentworth":
-        notify: +4915123456789
-        src: "signal"
-    "y":
-        notify: +4915123456789
-        src: "signal"
-    "winter":
-        notify: +14168226179
-        src: "signal"
-dyndnshosts: {"haschloss", "wayback", "wertvoll", "weekend", "cotgate", "rvgate", "draper", "eris"}
-drophosts: {"unknown", "wookie15", "wort"}
-nsupdate_bin: "/usr/local/bin/nsupdate"
-pushover_token: "ac7NLX2rPjXFareeDgLpXNoDf4iFmf"
-pushover_user: "uDhH33UjQQDYtNzJb1ThRiWb9ingGK"
-pushsrv: "pushover"
-
-dyndomains: {"wrede.org"}
-toemail: ["aew.hbd.notify@wrede.ca"]
-fromemail: "aew.hbd@wrede.ca"
-smtpserver: "smtp.fastmail.com"
-smtpuser: "andreas@wrede.ca"
-smtppassword: "r8psra6wj6gcakkp"
-smtpport: 587
-
-ws_port: 50005
-wss_port: 50006
-cert_path: "/usr/local/etc/letsencrypt/live/hbd.wrede.ca/"
-cert_path: "ssl/"
-# CERT_PATH = "./test/"
-wss_pem: "fullchain.pem"
-wss_key: "privkey.pem"
-
@@ -4,12 +4,13 @@
  // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
  "version": "0.2.0",
  "configurations": [
+
    {
      "name": "Python: Run hbd (module)",
      "type": "debugpy",
      "request": "launch",
-      "module": "hbd.cli",
-      "args": ["-c", ".hb.yaml", "-f", "-v", "-x", "-x", "-x"],
+      "module": "hbd.server.cli",
+      "args": ["-c", "~/.hb.yaml", "-f", "-v"],
      "cwd": "${workspaceFolder}",
      "env": {
        "PYTHONPATH": "${workspaceFolder}"
@@ -28,14 +29,14 @@
      ]
    },
    {
-      "name": "Python: Run hbd with debugpy (listen)",
+      "name": "Python: Run hbc (module)",
      "type": "debugpy",
      "request": "launch",
-      "module": "debugpy",
-      "args": ["--listen", "5678", "--wait-for-client", "-m", "hbd.cli", "-c", ".hb.yaml", "-f", "-v"],
+      "module": "hbd.client.main",
+      "args": ["-c", "~/.hbc.yaml",  "-v", "winter"],
+      "cwd": "${workspaceFolder}",
      "env": { "PYTHONPATH": "${workspaceFolder}" },
      "console": "integratedTerminal",
-      "justMyCode": false
    }
  ]
 }
@@ -2,5 +2,8 @@
    "python.pythonPath": "/usr/bin/python3",
    "python.linting.enabled": true,
    "python.formatting.provider": "black",
-    "python.linting.flake8Enabled": true
+    "python.linting.flake8Enabled": true,
+    "chat.tools.terminal.autoApprove": {
+        "mv": true
+    }
 }
@@ -0,0 +1,4 @@
+1. Don't assume. Don't hide confusion. Surface tradeoffs.
+2. Minimum code that solves the problem. Nothing speculative.
+3. Touch only what you must. Clean up only your own mess.
+4. Define success criteria. Loop until verified.
@@ -11,15 +11,463 @@ A lightweight daemon that listens for UDP heartbeat messages and acts on them: k
 - Queue DNS updates via `nsupdate` and run them in a background thread ✅
 - WebSocket API for live updates (hosts & messages) ✅
 - Notification pipeline (email, Pushover, Mattermost, Signal) ✅
+- **User management & access control** ✅
+  - Optional user accounts with PBKDF2-SHA256 password hashing (stdlib only)
+  - Per-host roles: owner, manager, monitor
+  - Session-based auth with cookie support (browser login page included)
+  - Backwards compatible: no auth required when no users are configured
+- **HTTP API & Web UI** ✅
+  - REST API for plugin data, alerts, host information, and user management
+  - Live dashboard with WebSocket updates
+  - Interactive plugin metrics visualization
+  - Alerts dashboard with filtering and summaries
+- **Message journal with automatic log rotation** ✅
+  - Logs all received messages in JSON format
+  - Size-based automatic rotation
+  - Configurable retention and backup management
+- **Plugin system for extensible monitoring** ✅
+  - Collect system metrics (CPU, memory, disk, network)
+  - Monitor ZFS pool health, capacity, and I/O via `zpool(8)`
+  - Execute existing Nagios monitoring plugins
+  - Create custom plugins with simple Python classes
+- **Threshold alerting system** ✅
+  - Monitor metrics against configurable WARNING/CRITICAL thresholds
+  - Hysteresis to prevent alert flapping
+  - Automatic notifications on state changes
+  - Re-notification for ongoing alerts
+- **Per-host watch flag** — set `watch: false` on any host to silence all notifications for that host without removing its configuration ✅
+- **Role-filtered dashboards** — Live Dashboard and Host Overview show only hosts where the logged-in user is owner or manager (admins see all) ✅
 - Modular codebase suitable for unit testing and CI ✅

 ---

+## 🔌 Plugin System
+
+Heartbeat includes a comprehensive plugin architecture that extends monitoring beyond simple heartbeats. The plugin system allows you to:
+
+- **Collect system information**: OS details, hardware info, system configuration
+- **Monitor resources**: CPU usage, memory, disk space, network statistics
+- **Run Nagios plugins**: Execute thousands of existing Nagios monitoring plugins without modification
+- **Create custom plugins**: Build your own monitoring logic with simple Python classes
+
+### Plugin Types
+
+- **InfoPlugin**: Collects static information once (e.g., OS version, hardware specs)
+- **MonitorPlugin**: Collects metrics periodically (e.g., CPU usage every 30 seconds)
+
+### Built-in Plugins
+
+- `os_info`: Collects OS, kernel, distribution, and architecture information
+- `cpu_monitor`: Monitors CPU usage, load average, frequency, process counts, and uptime
+- `memory_monitor`: Monitors RAM and swap usage, available memory (ZFS ARC-aware)
+- `disk_monitor`: Monitors disk usage, I/O statistics, and filesystem metrics
+- `network_monitor`: Monitors network interface statistics, bandwidth, and connections
+- `ping_monitor`: Measures round-trip latency to configured hosts
+- `filesystem_info`: Collects mounted filesystem information (physical filesystems only by default)
+- `nagios_runner`: Executes Nagios monitoring plugins (check_disk, check_load, check_http, etc.)
+- `zfs_monitor`: Monitors ZFS pool health, capacity, fragmentation, dedup ratio, and cumulative I/O via `zpool(8)`
+
+### Nagios Integration
+
+The `nagios_runner` plugin provides seamless integration with the vast Nagios plugin ecosystem. You can run any Nagios-compatible plugin and have the results automatically parsed and stored:
+
+- Executes plugins asynchronously (non-blocking) with timeout protection
+- Captures both stdout and stderr; if stdout is empty, stderr is used as the status message
+- Handles signal-killed processes (negative exit code → UNKNOWN status)
+- Validates absolute command paths at startup and warns on missing or non-executable files
+- Parses exit codes (OK/WARNING/CRITICAL/UNKNOWN)
+- Extracts performance data with thresholds
+- Reports per-check status, exit code, and output; no aggregate rollup field
+
+See [docs/NAGIOS_INTEGRATION.md](docs/NAGIOS_INTEGRATION.md) for complete integration guide including configuration examples and custom plugin development.
+
+### Creating Custom Plugins
+
+```python
+from hbd.client.plugin import MonitorPlugin
+
+class DiskMonitorPlugin(MonitorPlugin):
+    name = "disk_monitor"
+    interval = 60  # Run every 60 seconds
+    
+    async def collect(self):
+        return {
+            "disk_usage": get_disk_usage(),
+            "timestamp": time.time()
+        }
+```
+
+Place plugins in `hbd/client/plugins/` and they'll be automatically discovered and loaded by the client.
+
+---
+
+## 📝 Message Journal
+
+Heartbeat includes a message journal that logs all received messages with automatic rotation.
+
+### Features
+
+- **JSON Format**: All messages logged in JSONL (JSON Lines) format for easy parsing
+- **Automatic Rotation**: Size-based rotation with configurable thresholds
+- **Backup Management**: Keeps configurable number of rotated log files
+- **Non-blocking**: Async logging with minimal performance impact
+
+### Configuration
+
+```yaml
+# Message journal settings
+journal_enabled: true                    # Enable/disable journaling
+journal_dir: /var/log/heartbeat         # Journal directory
+journal_file: messages.journal           # Base filename
+journal_max_size: 104857600             # Max size (100MB default)
+journal_max_backups: 10                 # Number of backups to keep
+```
+
+### Example Journal Entry
+
+```json
+{"timestamp":1711234567.123,"datetime":"2026-03-28T12:34:56","source_ip":"192.168.1.100","source_port":50003,"message":{"ID":"HTB","name":"webserver1","interval":30}}
+```
+
+### Analyzing Journal Files
+
+```bash
+# View recent messages
+tail -100 /var/log/heartbeat/messages.journal | jq .
+
+# Count messages by type
+cat /var/log/heartbeat/messages.journal | jq -r '.message.ID' | sort | uniq -c
+
+# Filter by hostname
+cat /var/log/heartbeat/messages.journal | jq 'select(.message.name == "webserver1")'
+```
+
+See [docs/MESSAGE_JOURNAL.md](docs/MESSAGE_JOURNAL.md) for complete documentation including rotation behavior, integration with log management systems, and analysis examples.
+
+---
+
+## 🚨 Threshold Alerting
+
+Heartbeat includes a sophisticated threshold alerting system that monitors plugin metrics and triggers notifications when values exceed configured limits.
+
+### Features
+
+- **Multi-level alerts**: WARNING and CRITICAL severity levels
+- **Flexible operators**: Support for >, >=, <, <=, ==, != comparisons
+- **Hysteresis**: Prevents alert flapping with configurable recovery thresholds
+- **Smart notifications**: Alerts only on state changes, not every check; de-escalations (e.g. CRITICAL → WARNING) do not generate a notification
+- **Re-notifications**: Periodic reminders for ongoing alerts
+- **Short-duration suppression**: Recovery notifications are suppressed for down events under 4 seconds (avoids noise from transient blips)
+- **Journal integration**: All threshold events logged for audit trail
+- **`ping_monitor` thresholds**: Latency and packet-loss thresholds use the same format as all other plugin metrics
+
+### Configuration
+
+```yaml
+thresholds:
+  # RTT (Round-Trip Time) thresholds for heartbeat monitoring
+  # These are checked on every HTB message arrival
+  rtt:
+    webserver01:
+      warning: 100.0   # Warn when RTT > 100ms
+      critical: 500.0  # Critical when RTT > 500ms
+    
+    database01:
+      warning: 50.0
+      critical: 200.0
+  
+  # Plugin metric thresholds
+  cpu_monitor:
+    cpu_percent:
+      warning: 80.0      # Warn when CPU > 80%
+      critical: 90.0     # Critical when CPU > 90%
+      operator: ">"
+      hysteresis: 0.02   # 2% hysteresis to prevent flapping
+      display: "(threshold: {op_symbol} {threshold_value}%)"  # optional
+  
+  memory_monitor:
+    percent:
+      warning: 85.0
+      critical: 95.0
+  
+  disk_monitor:
+    partitions:
+      /:
+        percent:
+          warning: 80.0
+          critical: 90.0
+        free_gb:
+          warning: 10.0   # Alert when < 10GB free
+          critical: 5.0
+          operator: "<"   # Inverse threshold
+
+# Global settings
+threshold_renotify_interval: 3600  # Re-notify every hour for ongoing alerts
+```
+
+### RTT Monitoring
+
+Heartbeat monitors network latency (Round-Trip Time) for each host's heartbeat messages. RTT thresholds are **fully integrated with the threshold alerting system**:
+
+- **Per-host configuration**: Set different thresholds for each monitored host
+- **Real-time checking**: Thresholds evaluated on every HTB message arrival
+- **Alert state tracking**: RTT alerts use the same state management as plugin metrics
+- **Hysteresis support**: Configurable hysteresis prevents rapid state transitions
+- **Alerts dashboard**: RTT alerts visible on the `/alerts` web page alongside plugin alerts
+- **Smart notifications**: Only triggers on state changes (OK → WARNING → CRITICAL)
+- **Re-notification**: Periodic reminders for ongoing RTT issues
+- **Event & journal logging**: All RTT events logged for audit trail
+
+**Configuration format:**
+```yaml
+thresholds:
+  rtt:
+    <hostname>:
+      warning: <milliseconds>   # Warn when RTT > this value
+      critical: <milliseconds>  # Critical when RTT > this value
+      hysteresis: 0.02          # Optional: 2% hysteresis (default)
+```
+
+**Example alerts:**
+```
+WARNING: webserver01 - rtt.webserver01 = 125.3
+CRITICAL: database01 - rtt.database01 = 520.1
+RECOVERED: webserver01 - rtt.webserver01 = 45.2 (WARNING -> OK)
+```
+
+RTT alerts appear on the Alerts dashboard and can be filtered by severity level. The `metric_path` format is `rtt.<hostname>`, making it easy to distinguish from plugin metrics.
+
+### Alert Behavior
+
+1. **State Changes**: Notifications sent when crossing thresholds
+   - OK → WARNING: Early notification
+   - WARNING → CRITICAL: Escalation
+   - CRITICAL → OK: Recovery
+
+2. **Hysteresis**: Prevents rapid state transitions
+   ```
+   Critical threshold: 90%
+   Hysteresis: 10%
+   Recovery threshold: 81% (90 - 10% of 90)
+   
+   Value 91% → CRITICAL (threshold crossed)
+   Value 85% → CRITICAL (still above 81%)
+   Value 79% → OK (below recovery threshold)
+   ```
+
+3. **Re-notifications**: Periodic reminders for ongoing alerts
+   - Default: Every 60 minutes
+   - Configurable via `threshold_renotify_interval`
+
+### Example Notifications
+
+```
+WARNING: webserver01 - cpu_monitor.cpu_percent = 85.0
+CRITICAL: webserver01 - memory_monitor.percent = 96.0
+RECOVERED: database01 - disk_monitor./.percent = 75.0 (WARNING -> OK)
+REMINDER (CRITICAL): mailserver - cpu_monitor.load_1min = 12.5 (ongoing for 1h 0m 0s)
+```
+
+### Supported Metrics
+
+All plugin metrics can be thresholded:
+
+- **CPU**: cpu_percent, load_1min, load_5min, load_15min
+- **Memory**: percent, available_mb, swap_percent
+- **Disk**: Per-partition percent, free_gb, free_mb
+- **Network**: errors_total, dropped packets, connection counts
+- **Nagios**: Any field emitted by `nagios_runner` (`<name>_status_code`, `<name>_status`, `<name>_output`, performance data fields)
+
+### Display Format Templates
+
+Each threshold entry accepts an optional `display` field — a Python format string shown in notifications and on the Alerts dashboard:
+
+```yaml
+nagios_runner:
+  status_code:
+    warning: 1
+    critical: 2
+    operator: ">="
+    display: "{check_name}: exit {value} (expected < {threshold_value})"
+```
+
+Available variables:
+
+| Variable | Description |
+|---|---|
+| `{value}` | Current metric value |
+| `{threshold_value}` | Threshold that was crossed |
+| `{op_symbol}` | Comparison operator (`>`, `<`, `>=`, …); `"nagios"` for the nagios operator |
+| `{check_name}` | Prefix stripped by generic matching (see below) |
+| `{metric_name}` | Full field name within the plugin data |
+| `{output}` | For `nagios_runner` generic matches: the matched check's status text (alias for `{check_name}_output`) |
+| `{status}` | For `nagios_runner` generic matches: the matched check's status name — OK/WARNING/CRITICAL/UNKNOWN (alias for `{check_name}_status`) |
+| any plugin field | Any other field present in the plugin's data |
+
+### Generic Threshold Matching
+
+When a metric name has no exact threshold entry, the server progressively strips leading underscore-separated segments and re-tries the lookup. This lets a single generic entry cover an entire family of metrics.
+
+The classic use case is `nagios_runner`, which names each metric after the command that produced it:
+
+```
+nagios_runner.check_disk_root_status_code    → no exact match
+nagios_runner.disk_root_status_code          → no match
+nagios_runner.root_status_code               → no match
+nagios_runner.status_code                    → matched ✓
+```
+
+Configure the generic threshold once using the `nagios` operator, which maps exit codes directly to alert severity without requiring numeric warning/critical values:
+
+```yaml
+nagios_runner:
+  status_code:
+    operator: "nagios"   # 0=OK  1=WARNING  2=CRITICAL  3=UNKNOWN
+    display: "{check_name}: {output}"
+```
+
+The stripped prefix (`check_disk_root` in the example above) is available as `{check_name}` in the display template, so you can identify which check triggered the alert without writing a separate threshold entry per command.
+
+Exact matches always take priority. A generic entry only applies when no specific one is defined.
+
+### Per-Host Threshold Profiles
+
+Named threshold configurations let different hosts use different limits. A host's `threshold_config` can be a single name or a **list** — lists are applied left-to-right so profiles compose without duplication:
+
+```yaml
+threshold_configs:
+  default:
+    thresholds:
+      cpu_monitor:
+        cpu_percent: {warning: 80, critical: 90}
+      memory_monitor:
+        percent: {warning: 85, critical: 95}
+
+  tight_cpu:           # override CPU limits only
+    thresholds:
+      cpu_monitor:
+        cpu_percent: {warning: 60, critical: 75}
+
+  db_disk:             # add a database partition check
+    thresholds:
+      disk_monitor:
+        partitions:
+          /var/lib/postgresql:
+            percent: {warning: 75, critical: 88}
+
+hosts:
+  web-01:
+    threshold_config: default          # single profile
+
+  db-01:
+    threshold_config: [tight_cpu, db_disk]   # layered: CPU override + extra disk check
+```
+
+Each named config's overrides are applied in order on top of the defaults. Metrics not mentioned in a profile are inherited unchanged.
+
+See [docs/THRESHOLD_ALERTING.md](docs/THRESHOLD_ALERTING.md) for comprehensive documentation including best practices, troubleshooting, and advanced configuration.
+
+---
+
+## 👥 User Management
+
+Heartbeat supports optional user accounts with role-based access control per host.
+
+### Roles
+
+- **monitor** — view status, plugin data, alerts
+- **manager** — monitor + queue commands, trigger DNS, queue upgrades
+- **owner** — manager + drop host, transfer ownership, update access
+- **admin** (user flag) — owner-level access on every host
+
+When no users are configured the server runs in **unauthenticated mode** — all existing behaviour is unchanged.
+
+### Quick setup
+
+```yaml
+users:
+  alice:
+    full_name: Alice Smith
+    password: pbkdf2:sha256:...    # hbd passwd alice
+    admin: true
+
+default_owner: alice
+
+hosts:
+  webserver01:
+    owner: alice
+    managers: [bob]
+    monitors: [carol]
+```
+
+```bash
+# Generate a password hash
+hbd passwd alice
+```
+
+Browser users are redirected to `/login` automatically. The session cookie is set on login, so `fetch()` calls from dashboards work without any JavaScript changes.
+
+See [docs/USERS.md](docs/USERS.md) for complete user management documentation.
+
+---
+
+## 🌐 HTTP API & Web UI
+
+Heartbeat includes a built-in HTTP/WebSocket server that provides both a REST API and web-based dashboards for monitoring and visualization.
+
+### Features
+
+- **User auth**: Optional session-based authentication with per-host role enforcement
+- **REST API**: JSON endpoints for accessing plugin data, alerts, host information, and user management
+- **Live Dashboard**: Real-time WebSocket-powered host status view
+- **Plugin Metrics**: Interactive visualization of all plugin data with auto-refresh
+- **Alerts Dashboard**: Comprehensive alert monitoring with filtering and summaries
+
+### Web Dashboards
+
+- **Login** (`/login`): Browser login form (shown automatically when auth is configured)
+- **Live View** (`/live`): Real-time host connectivity, latency, and messages; hostnames link directly to the Host Overview page
+- **Host Overview** (`/plugins/<host>`): Per-host plugin metrics with ZFS pool visualization; filtered to hosts where the logged-in user is owner or manager (admins see all)
+- **Alerts Dashboard** (`/alerts`): Monitor active alerts with severity filtering; alert count pie chart shown in the navigation bar
+- **Settings** (`/settings`): Server configuration, user management, and threshold configuration viewer
+
+### API Endpoints
+
+```bash
+# Log in (when auth is configured) and keep the bearer token from the reply
+TOKEN=$(curl -s -X POST http://localhost:50004/api/0/auth/login \
+  -H 'Content-Type: application/json' \
+  -d '{"username":"alice","password":"secret"}' | jq -r .token)
+# Store only the header value; pass it quoted so it stays a single argument
+AUTH="Authorization: Bearer $TOKEN"
+
+# List all monitored hosts
+curl -H "$AUTH" http://localhost:50004/api/0/hosts
+
+# Get all plugin data for a host
+curl -H "$AUTH" http://localhost:50004/api/0/hosts/webserver01/plugins
+
+# Get detailed plugin history (last 50 samples)
+curl -H "$AUTH" "http://localhost:50004/api/0/hosts/webserver01/plugins/cpu_monitor?limit=50"
+
+# Get alert states for a specific host
+curl -H "$AUTH" http://localhost:50004/api/0/hosts/webserver01/alerts
+
+# Get all active alerts across all hosts
+curl -H "$AUTH" http://localhost:50004/api/0/alerts
+
+# View/update host access roles
+curl -H "$AUTH" http://localhost:50004/api/0/hosts/webserver01/access
+```
+
+See [docs/HTTP_API.md](docs/HTTP_API.md) for complete API documentation including response formats, error handling, and integration examples.
+
+---
+
 ## ⚙️ Quickstart

 Prerequisites:

- Python 3.10+ (project uses language features from recent Python)
+- Python 3.11+ (project uses language features from recent Python)
 - `nsupdate` (for DNS updates) if using dynamic DNS

 Install dependencies (recommended into a venv):
@@ -28,7 +476,7 @@ This project now declares its dependencies in `pyproject.toml`. Instead
 of the old `requirements.txt` flow, install the package into a virtualenv
 using `pip`:

-See `scripts/install.sh` for a way to install.
+See `scripts/hb_install.sh` for a way to install.

 Run the daemon (example):

@@ -40,39 +488,161 @@ hbd -c .hb.yaml -f -v
 You can also run it directly via the package entrypoint after installation:

 ```bash
-python -m hbd.cli -c /path/to/config.yaml
+python -m hbd.server.cli -c /path/to/config.yaml
 ```

+### Running the Client
+
+The heartbeat client (`hbc`) sends periodic heartbeats and plugin data to the server:
+
+```bash
+# Basic usage pointing to server (host is a positional argument)
+hbc your-server.example.com
+
+# Run as daemon with a config file
+hbc -d -c /etc/hbc.yaml your-server.example.com
+
+# Send a one-off boot message
+hbc --boot your-server.example.com
+
+# Verbose output
+hbc -v your-server.example.com
+
+# Send 'boot' and 'shutdown' messages on start and exit 
+hbc -b your-server.example.com
+```
+
+You can also run it via the module entrypoint:
+
+```bash
+python -m hbd.client.main your-server.example.com
+```
+
+Client configuration can also be specified in YAML (`~/.hbc.yaml`):
+
+```yaml
+hb_port: 50003        # Server port (default: 50003)
+interval: 30          # Heartbeat interval in seconds
+plugins:
+  cpu_monitor:
+    interval: 300      # Check every 5 minutes (default)
+    per_core: true
+  memory_monitor:
+    interval: 300      # Check every 5 minutes (default)
+  disk_monitor:
+    interval: 300      # Check every 5 minutes (default)
+  network_monitor:
+    interval: 300      # Check every 5 minutes (default)
+  nagios_runner:
+    interval: 300      # Check every 5 minutes (default)
+    commands:
+      - name: check_load
+        command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
+      - name: check_disk
+        command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
+```
+
+The server hostname is always passed as a positional command-line argument; there is no `server:` config key.
+
+All monitoring plugins default to 5-minute (300 second) intervals, but can be customized as needed.
+
+**Connection retry:** If a server is temporarily unreachable, `hbc` retries `open()` indefinitely on every heartbeat interval. IPv6 connections that never succeeded during early startup are dropped after 3 consecutive failures (to handle hosts without IPv6 routing), while IPv4 connections always retry.
+
+**Daemon logging:** When running with `-d`, `hbc` routes all log output to syslog (`LOG_DAEMON` facility) after daemonizing. Without `-d`, logs go to stderr as usual.
+
+### hbc_mini — single-file client (no external dependencies)
+
+`scripts/hbc_mini.py` is a self-contained version of the heartbeat client that requires only Python 3.8+ and no external packages. Copy it to any host and run it directly — no virtualenv, no `pip install`.
+
+```bash
+# Basic usage
+python3 hbc_mini.py your-server.example.com
+
+# Run as daemon
+python3 hbc_mini.py -d your-server.example.com
+
+# Send a boot message
+python3 hbc_mini.py -b your-server.example.com
+
+# Send a one-off message
+python3 hbc_mini.py -m "maintenance starting" your-server.example.com
+```
+
+**Config:** `~/.hbc.json` (same keys as `~/.hbc.yaml`, JSON format). Example:
+
+```json
+{
+  "hb_port": 50003,
+  "interval": 30,
+  "plugins": {
+    "ping_monitor": {
+      "interval": 60,
+      "hosts": ["8.8.8.8", "192.168.1.1"]
+    },
+    "nagios_runner": {
+      "interval": 300,
+      "commands": [
+        {"name": "check_load", "command": "/usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6"}
+      ]
+    }
+  }
+}
+```
+
+**Plugin availability:**
+
+| Plugin | Platform | Data source |
+|---|---|---|
+| `os_info` | all | `platform` stdlib |
+| `ping_monitor` | all | `ping` subprocess |
+| `nagios_runner` | all (not Windows) | subprocess |
+| `cpu_monitor` | Linux | `/proc/stat` |
+| `memory_monitor` | Linux | `/proc/meminfo` |
+| `disk_monitor` | Linux, macOS, BSD | `df -P` subprocess |
+| `network_monitor` | Linux | `/proc/net/dev` |
+
+**What is not available compared to the full `hbc`:**
+
+- No YAML config (use JSON instead)
+- No `filesystem_info` plugin
+- No `zfs_monitor` plugin (requires `zpool(8)` and the full plugin loader)
+- `cpu_monitor` does not report per-core usage or CPU frequency (no psutil)
+- Plugins cannot be loaded from external `.py` files — all plugins are compiled in
+- No IPv6 early-fail protection — connections that fail to open at startup are silently skipped rather than retried
+
+Everything else — heartbeat protocol, ACK/CMD/UPD handling, `hb_install.sh`-based self-update, daemonize, syslog — is identical to the full client.
+
+---
+
 ## 🐞 Debugging in VS Code

 This repository includes a ready-to-use `.vscode/launch.json` with configurations to run or attach the VS Code debugger to `hbd`.

 - Ensure the **Python** extension is installed and select the project `.venv` as the interpreter (bottom-left of VS Code).
 - Use **F5** and pick one of these configurations from the Run view:
-  - **Python: Run hbd (module)** — runs `hbd.cli` as a module and sets `PYTHONPATH` to the workspace root (recommended).
+  - **Python: Run hbd (module)** — runs `hbd.server.cli` as a module and sets `PYTHONPATH` to the workspace root (recommended).
  - **Python: Run hbc (module)** — runs the client module `hbd.client.main` with `PYTHONPATH` set to the workspace root.
  - **Python: Attach (localhost:5678)** — attach the debugger to a running process started with `debugpy`.

 To start `hbd` manually and wait for the debugger to attach, run:

 ```bash
-PYTHONPATH=. python -m debugpy --listen 5678 --wait-for-client -m hbd.cli -c .hb.yaml -f -v
+PYTHONPATH=. python -m debugpy --listen 5678 --wait-for-client -m hbd.server.cli -c .hb.yaml -f -v
 ```

-Set breakpoints in modules such as `hbd/udp.py`, `hbd/dns.py`, or `hbd/server.py`, and use the **Attach** configuration to connect. Use `justMyCode: false` if you need to step into third-party code.
+Set breakpoints in modules such as `hbd/server/udp.py`, `hbd/server/dns.py`, or `hbd/server/main.py`, and use the **Attach** configuration to connect. Use `justMyCode: false` if you need to step into third-party code.

 ---

 ## 🛠 Configuration

-`hbd` reads YAML configuration (optional). If `PyYAML` is not installed, built-in defaults are used. Example configuration keys (see `hbd/config.py`):
+`hbd` reads YAML configuration (optional). If `PyYAML` is not installed, built-in defaults are used. Example configuration keys (see `hbd/server/config.py`):

 - `hb_port`: UDP port to listen for heartbeats (default: 50003)
 - `hbd_port`: internal control port (default: 50004)
 - `hbd_host`: bind address for HTTP/WSS
 - `pickfile`: path for persisted state
 - `logfile`: path to log file
- `logfmt`: `text` or `msg`
 - `pushsrv`: push service (`pushover`|`mattermost`|`all`)
 - `interval` / `grace`: heartbeat timing configuration
 - `dyndomains`: list of dyndomains to update via `nsupdate`
@@ -84,6 +654,8 @@ Set breakpoints in modules such as `hbd/udp.py`, `hbd/dns.py`, or `hbd/server.py
 - `cert_path`: directory where TLS certificate and key are looked up (default: /usr/local/etc/ssl/)
 - `wss_pem`: filename for the certificate chain (default: fullchain.pem)
 - `wss_key`: filename for the private key (default: privkey.pem)
+- `users`: mapping of username → user attributes (full_name, avatar, password, admin, notification_channels)
+- `default_owner`: username that owns hosts with no explicit owner (falls back to first admin user)

 Example `.hb.yaml` (minimal):

@@ -96,25 +668,39 @@ nsupdate_bin: /usr/bin/nsupdate
 pushsrv: pushover
 ```

-> Tip: `config.DEFAULTS` in `hbd/config.py` contains the canonical defaults and accepted configuration keys.
+> Tip: `SERVER_DEFAULTS` in `hbd/server/config.py` contains the canonical defaults and accepted configuration keys.

 ---

 ## 🔧 Architecture & Modules

- `hbd.proto` — serialization/deserialization of heartbeat messages (supports compressed payloads)
- `hbd.udp` — UDP parsing and `handle_datagram` implementation (main state machine)
- `hbd.dns` — `create_nsupdate_payload`, `nsupdate`, and an asyncio DNS worker (`start_dns_worker`).
-  The DNS worker now runs as an `asyncio` task and the package exposes a
-  small thread-safe bridge so legacy synchronous code can `put()` updates
-  into the queue; there is no longer a permanently-blocking background
-  `threading.Thread`.
- `hbd.notify` — email and push notification helpers
- `hbd.ws` — WebSocket server and thread-safe broadcast helpers
- `hbd.http` — HTTP handler factory for the status UI/API
- `hbd.utils` — small utility helpers (`shortname`, `dur`, `initlog`)
- `hbd.cli` — CLI entrypoint and argument parsing
- `hbd.server` — async orchestration to run UDP/HTTP/WSS components
+The package is organized into three subpackages:
+
+**`hbd.common`** — shared code used by both client and server:
+- `hbd.common.proto` — serialization/deserialization of heartbeat messages (supports compressed payloads and plugin data)
+- `hbd.common.utils` — small utility helpers (`shortname`, `dur`, `initlog`)
+
+**`hbd.server`** — the heartbeat daemon (`hbd`):
+- `hbd.server.cli` — CLI entrypoint and argument parsing
+- `hbd.server.main` — async orchestration to run UDP/HTTP/WSS components
+- `hbd.server.udp` — UDP parsing and `handle_datagram` implementation (main state machine)
+- `hbd.server.dns` — `create_nsupdate_payload`, `nsupdate`, and an asyncio DNS worker (`start_dns_worker`).
+  The DNS worker runs as an `asyncio` task and the package exposes a small thread-safe bridge
+  so legacy synchronous code can `put()` updates into the queue.
+- `hbd.server.notify` — email and push notification helpers
+- `hbd.server.ws` — WebSocket server and thread-safe broadcast helpers
+- `hbd.server.http` — HTTP handler factory for the status UI/API
+- `hbd.server.journal` — message journal with size-based log rotation and backup management
+- `hbd.server.threshold` — threshold alerting engine
+- `hbd.server.monitor` — host state monitoring
+- `hbd.server.hbdclass` — `Host` class and shared server state
+- `hbd.server.config` — configuration loader and defaults
+
+**`hbd.client`** — the heartbeat client (`hbc`):
+- `hbd.client.main` — client entrypoint; sends heartbeats and plugin data to the server
+- `hbd.client.plugin` — plugin framework with base classes, registry, and dynamic loader
+- `hbd.client.plugins/` — built-in plugins (os_info, cpu_monitor, memory_monitor, disk_monitor, network_monitor, filesystem_info, nagios_runner)
+- `hbd.client.config` — client configuration loader

 This modular layout makes the code easier to test and maintain.

@@ -122,12 +708,12 @@ This modular layout makes the code easier to test and maintain.

 - The main runtime is asyncio-based. Services (UDP listener, HTTP server, WebSocket server, monitor, and DNS worker) run as asyncio tasks.
 - On SIGINT/SIGTERM the server triggers a graceful shutdown: it cancels active tasks, signals the DNS worker via a sentinel, and cleans up resources before exit.
- The DNS update worker is implemented as an `asyncio` task; synchronous producers can still enqueue DNS updates via a small thread-safe bridge available at `hbd.hbdclass.Host.dnsQ`.
+- The DNS update worker is implemented as an `asyncio` task; synchronous producers can still enqueue DNS updates via a small thread-safe bridge available at `hbd.server.hbdclass.Host.dnsQ`.

 **Templates & Static Files**

- Template files are located under `hbd/templates` by default. The HTTP server resolves templates relative to the `hbd` package but the path can be overridden with the `templates_dir` config key.
- Static assets (CSS/JS/images) are served from `hbd/static` via the `/static/<path>` HTTP route. Place your static files in that directory or configure the HTTP server as needed.
+- Template files are located under `hbd/server/templates`. The HTTP server resolves templates relative to the `hbd.server` package but the path can be overridden with the `templates_dir` config key.
+- Static assets (CSS/JS/images) are served from `hbd/server/static` via the `/static/<path>` HTTP route.

 ---

@@ -0,0 +1,40 @@
+async def send_sms(hass, user, password, sender_did, call):
+    """Send SMS message using multipart form-data like MMS."""
+    _LOGGER = logging.getLogger(__name__)
+    recipient = call.data.get("recipient")
+    message = call.data.get("message")
+
+    if not recipient or not message:
+        _LOGGER.error("Recipient or message missing.")
+        return
+
+    # Build form data dictionary
+    form_data = {
+        'api_username': str(user),
+        'api_password': str(password),
+        'did': str(sender_did),
+        'dst': str(recipient),
+        'message': str(message),
+        'method': 'sendSMS'
+    }
+
+    async with aiohttp.ClientSession() as session:
+        with aiohttp.MultipartWriter("form-data") as mp:
+            for key, value in form_data.items():
+                part = mp.append(value)
+                part.set_content_disposition('form-data', name=key)
+
+            _LOGGER.error("voipms_sms: sending SMS: %s", mp)
+            async with session.post(REST_ENDPOINT, data=mp) as response:
+                response_text = await response.text()
+                if response.status == 200:
+                    response_json = json.loads(response_text)
+                    if response_json['status'] == "success": 
+                        _LOGGER.info("voipms_sms: SMS sent successfully: %s", response_text)
+                    else:
+                        _LOGGER.error("voipms_sms: SMS not sent: %s", response_text)
+                else:
+                    _LOGGER.error("voipms_sms: Failed to send SMS. Status: %s, Response: %s", response.status, response_text)
+
+
+
@@ -0,0 +1,320 @@
+#!/usr/bin/env python3
+"""
+Demonstration of the threshold alerting system.
+
+This script shows how thresholds work by simulating plugin data
+with values that cross various threshold boundaries.
+"""
+
+import sys
+import time
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from hbd.threshold import ThresholdChecker, AlertLevel
+
+
+def demo_basic_thresholds():
+    """Demonstrate basic threshold checking."""
+    print("=" * 70)
+    print("DEMO 1: Basic Threshold Checking")
+    print("=" * 70)
+    
+    config = {
+        "thresholds": {
+            "cpu_monitor": {
+                "cpu_percent": {
+                    "warning": 80.0,
+                    "critical": 90.0,
+                    "operator": ">",
+                    "hysteresis": 0.1,
+                }
+            }
+        }
+    }
+    
+    notifications = []
+    
+    def notifier(msg):
+        notifications.append(msg)
+        print(f"  📧 NOTIFICATION: {msg}")
+    
+    checker = ThresholdChecker(config, notification_callback=notifier)
+    alert_states = {}
+    
+    # Simulate CPU values over time
+    test_values = [
+        (50.0, "Normal operation"),
+        (85.0, "Crosses WARNING threshold"),
+        (87.0, "Still in WARNING"),
+        (95.0, "Escalates to CRITICAL"),
+        (92.0, "Still CRITICAL (in hysteresis)"),
+        (85.0, "Still CRITICAL (above recovery threshold of 81)"),
+        (79.0, "Recovers to OK"),
+        (50.0, "Back to normal"),
+    ]
+    
+    print("\nSimulating CPU usage over time:")
+    print("-" * 70)
+    
+    for value, description in test_values:
+        print(f"\n📊 CPU: {value}% - {description}")
+        
+        plugin_data = {"cpu_percent": value}
+        state_changes = checker.check_plugin_data(
+            host_name="testhost",
+            plugin_name="cpu_monitor",
+            data=plugin_data,
+            alert_states=alert_states,
+        )
+        
+        current_state = alert_states.get("cpu_monitor.cpu_percent")
+        if current_state:
+            print(f"  Current state: {current_state.level.name}")
+        
+        if state_changes:
+            for metric, old_level, new_level, val in state_changes:
+                print(f"  ⚠️  State change: {old_level.name} → {new_level.name}")
+    
+    print(f"\n📈 Summary: {len(notifications)} notifications sent")
+    print("=" * 70)
+
+
+def demo_multiple_metrics():
+    """Demonstrate monitoring multiple metrics."""
+    print("\n\n" + "=" * 70)
+    print("DEMO 2: Multiple Metrics and Alert Summary")
+    print("=" * 70)
+    
+    config = {
+        "thresholds": {
+            "cpu_monitor": {
+                "cpu_percent": {"warning": 80.0, "critical": 90.0},
+                "load_1min": {"warning": 4.0, "critical": 8.0},
+            },
+            "memory_monitor": {
+                "percent": {"warning": 85.0, "critical": 95.0},
+                "available_mb": {
+                    "warning": 1000,
+                    "critical": 500,
+                    "operator": "<",
+                },
+            },
+        }
+    }
+    
+    notifications = []
+    checker = ThresholdChecker(config, notification_callback=lambda m: notifications.append(m))
+    alert_states = {}
+    
+    # Simulate problematic system state
+    print("\nSimulating a system under load:")
+    print("-" * 70)
+    
+    scenarios = [
+        {
+            "name": "Initial state - all OK",
+            "cpu_monitor": {"cpu_percent": 50.0, "load_1min": 2.0},
+            "memory_monitor": {"percent": 60.0, "available_mb": 2000},
+        },
+        {
+            "name": "CPU spikes to WARNING",
+            "cpu_monitor": {"cpu_percent": 85.0, "load_1min": 2.0},
+            "memory_monitor": {"percent": 60.0, "available_mb": 2000},
+        },
+        {
+            "name": "Memory also reaches WARNING",
+            "cpu_monitor": {"cpu_percent": 85.0, "load_1min": 2.0},
+            "memory_monitor": {"percent": 88.0, "available_mb": 800},
+        },
+        {
+            "name": "CPU escalates to CRITICAL",
+            "cpu_monitor": {"cpu_percent": 95.0, "load_1min": 5.0},
+            "memory_monitor": {"percent": 88.0, "available_mb": 800},
+        },
+        {
+            "name": "System recovering",
+            "cpu_monitor": {"cpu_percent": 70.0, "load_1min": 2.0},
+            "memory_monitor": {"percent": 65.0, "available_mb": 1500},
+        },
+    ]
+    
+    for scenario in scenarios:
+        print(f"\n📍 {scenario['name']}")
+        
+        # Check CPU metrics
+        checker.check_plugin_data(
+            "testhost",
+            "cpu_monitor",
+            scenario["cpu_monitor"],
+            alert_states
+        )
+        
+        # Check memory metrics
+        checker.check_plugin_data(
+            "testhost",
+            "memory_monitor",
+            scenario["memory_monitor"],
+            alert_states
+        )
+        
+        # Show alert summary
+        summary = checker.get_alert_summary(alert_states)
+        print(f"  Alerts: OK={summary['ok']}, WARNING={summary['warning']}, CRITICAL={summary['critical']}")
+        
+        # Show active alerts
+        active = checker.get_active_alerts(alert_states)
+        if active:
+            print(f"  Active alerts:")
+            for alert in active:
+                print(f"    - {alert.metric_path}: {alert.level.name} (value={alert.last_value})")
+    
+    print(f"\n📈 Total notifications sent: {len(notifications)}")
+    print("=" * 70)
+
+
+def demo_hysteresis():
+    """Demonstrate hysteresis effect."""
+    print("\n\n" + "=" * 70)
+    print("DEMO 3: Hysteresis Prevents Flapping")
+    print("=" * 70)
+    
+    config = {
+        "thresholds": {
+            "cpu_monitor": {
+                "cpu_percent": {
+                    "warning": 80.0,
+                    "critical": 90.0,
+                    "hysteresis": 0.1,  # 10% hysteresis
+                }
+            }
+        }
+    }
+    
+    notifications = []
+    checker = ThresholdChecker(config, notification_callback=lambda m: notifications.append(m))
+    alert_states = {}
+    
+    print("\nCritical threshold: 90%")
+    print("Hysteresis: 10%")
+    print("Recovery threshold: 81% (90 - 10% of 90)")
+    print("\nSimulating CPU fluctuating near CRITICAL threshold:")
+    print("-" * 70)
+    
+    # Simulate fluctuating values
+    test_values = [
+        (75.0, "Normal"),
+        (92.0, "Crosses CRITICAL"),
+        (88.0, "Drops but still above 81% (stays CRITICAL)"),
+        (86.0, "Still above 81% (stays CRITICAL)"),
+        (83.0, "Still above 81% (stays CRITICAL)"),
+        (80.0, "Below 81% - recovers to OK"),
+        (88.0, "Rises again but below 90% (stays OK)"),
+        (91.0, "Crosses CRITICAL again"),
+    ]
+    
+    for value, description in test_values:
+        print(f"\n📊 CPU: {value:5.1f}% - {description}")
+        
+        plugin_data = {"cpu_percent": value}
+        state_changes = checker.check_plugin_data(
+            "testhost",
+            "cpu_monitor",
+            plugin_data,
+            alert_states,
+        )
+        
+        current_state = alert_states.get("cpu_monitor.cpu_percent")
+        print(f"  State: {current_state.level.name}")
+        
+        if state_changes:
+            print(f"  📧 Notification sent (state changed)")
+        else:
+            print(f"  ✓  No notification (state unchanged - hysteresis working)")
+    
+    print(f"\n📈 Notifications sent: {len(notifications)} (without hysteresis would be ≥6)")
+    print("=" * 70)
+
+
+def demo_inverse_threshold():
+    """Demonstrate inverse thresholds (less than)."""
+    print("\n\n" + "=" * 70)
+    print("DEMO 4: Inverse Thresholds (Alert When Low)")
+    print("=" * 70)
+    
+    config = {
+        "thresholds": {
+            "memory_monitor": {
+                "available_mb": {
+                    "warning": 1000,   # Warn when < 1000 MB
+                    "critical": 500,   # Critical when < 500 MB
+                    "operator": "<",
+                    "hysteresis": 0.1,
+                }
+            }
+        }
+    }
+    
+    notifications = []
+    checker = ThresholdChecker(config, notification_callback=lambda m: notifications.append(m))
+    alert_states = {}
+    
+    print("\nMonitoring available memory (alert when LOW):")
+    print("WARNING when < 1000 MB, CRITICAL when < 500 MB")
+    print("-" * 70)
+    
+    test_values = [
+        (2000, "Plenty of memory"),
+        (800, "Drops below 1000 MB - WARNING"),
+        (450, "Drops below 500 MB - CRITICAL"),
+        (520, "Rises but still in hysteresis zone - stays CRITICAL"),
+        (600, "Enough recovery - back to WARNING"),
+        (1200, "Fully recovered - OK"),
+    ]
+    
+    for value, description in test_values:
+        print(f"\n💾 Available: {value} MB - {description}")
+        
+        plugin_data = {"available_mb": value}
+        state_changes = checker.check_plugin_data(
+            "testhost",
+            "memory_monitor",
+            plugin_data,
+            alert_states,
+        )
+        
+        current_state = alert_states.get("memory_monitor.available_mb")
+        print(f"  State: {current_state.level.name}")
+        
+        if state_changes:
+            for metric, old_level, new_level, val in state_changes:
+                print(f"  📧 {old_level.name} → {new_level.name}")
+    
+    print(f"\n📈 Notifications sent: {len(notifications)}")
+    print("=" * 70)
+
+
+if __name__ == "__main__":
+    print("\n")
+    print("╔" + "═" * 68 + "╗")
+    print("║" + " " * 15 + "THRESHOLD ALERTING DEMONSTRATION" + " " * 21 + "║")
+    print("╚" + "═" * 68 + "╝")
+    
+    demo_basic_thresholds()
+    demo_multiple_metrics()
+    demo_hysteresis()
+    demo_inverse_threshold()
+    
+    print("\n\n" + "=" * 70)
+    print("DEMONSTRATION COMPLETE")
+    print("=" * 70)
+    print("\nKey takeaways:")
+    print("  • Thresholds detect when metrics exceed configured limits")
+    print("  • Notifications sent only on state changes, not every check")
+    print("  • Hysteresis prevents alert flapping")
+    print("  • Supports both 'greater than' and 'less than' thresholds")
+    print("  • Multiple metrics can be monitored simultaneously")
+    print("\nFor full documentation, see docs/THRESHOLD_ALERTING.md")
+    print("=" * 70)
+    print()
@@ -0,0 +1,291 @@
+# Configuration Reload
+
+The heartbeat daemon (hbd) supports runtime configuration reloading without requiring a full restart. This allows you to update certain configuration settings while the service continues running.
+
+## How to Reload Configuration
+
+Send a SIGHUP signal to the running hbd process:
+
+```bash
+# Find the process ID
+ps aux | grep hbd
+
+# Or use pidof/pgrep
+pidof hbd
+pgrep -f hbd
+
+# Send SIGHUP signal
+kill -HUP <pid>
+
+# Or if using systemd
+systemctl reload heartbeat
+```
+
+## What Can Be Reloaded
+
+The following configuration sections can be reloaded without restarting:
+
+### ✅ Fully Reloadable
+
+- **Notification Channels** (`notification_channels`)
+  - Add, remove, or modify notification channel definitions
+  - Update tokens, API keys, SMTP credentials
+  - Change recipient lists
+
+- **Threshold Configurations** (`threshold_configs`)
+  - Modify warning and critical thresholds
+  - Add or remove threshold rules
+  - Change operators and hysteresis values
+  - Update display formats
+
+- **Host Configuration** (`hosts`)
+  - Change watch status
+  - Update notification channel assignments
+  - Modify threshold config assignments
+  - Change dyndns status
+
+- **Host Lists**
+  - `watchhosts` - hosts to monitor
+  - `dyndnshosts` - hosts with dynamic DNS
+  - `drophosts` - hosts to ignore
+
+- **Runtime Settings**
+  - `grace` - grace period multiplier
+  - `interval` - expected heartbeat interval
+  - `threshold_renotify_interval` - re-notification interval
+  - `debug` - debug level
+  - `verbose` - verbose output
+
+- **DNS Settings**
+  - `dyndomains` - dynamic DNS domains
+  - `nsupdate_bin` - nsupdate binary path
+  - `rndc_key` - RNDC key path
+
+### ⚠️ Requires Restart
+
+The following settings **cannot** be reloaded and require a service restart:
+
+- **Network Ports**
+  - `hb_port` - UDP heartbeat port
+  - `hbd_port` - HTTP API port
+  - `ws_port` - WebSocket port
+  - `wss_port` - Secure WebSocket port
+
+- **SSL/TLS Settings**
+  - `cert_path` - SSL certificate path
+  - `wss_pem` - SSL certificate file
+  - `wss_key` - SSL key file
+
+- **Persistence**
+  - `pickfile` - Pickle file path
+
+- **Logging**
+  - `logfile` - Log file path
+
+- **Journal Settings**
+  - `journal_enabled` - Enable/disable journaling
+  - `journal_dir` - Journal directory
+  - `journal_file` - Journal filename
+  - `journal_max_size` - Maximum journal size
+  - `journal_max_backups` - Number of backup files
+
+## Reload Process
+
+When a SIGHUP signal is received:
+
+1. **Configuration File Loading**
+   - The config file is re-read from disk
+   - YAML parsing is performed
+   - Validation checks are run
+
+2. **Component Updates**
+   - Notification system is updated with new channel definitions
+   - Threshold checker reloads all threshold configurations
+   - Alert states are preserved to maintain hysteresis
+
+3. **Error Handling**
+   - If reload fails, the previous configuration is kept
+   - Error messages are logged
+   - Service continues running with old configuration
+
+4. **Logging**
+   - Reload start and completion are logged
+   - Each component reports its reload status
+   - Total number of thresholds is reported
+
+## Example Reload Session
+
+```bash
+# Terminal 1: Watch the logs
+tail -f /var/log/heartbeat.log
+
+# Terminal 2: Edit configuration
+vim /path/to/.hb.yaml
+
+# Make changes to notification channels or thresholds
+# Save the file
+
+# Terminal 3: Trigger reload
+kill -HUP $(pgrep -f hbd)
+
+# Terminal 1: See reload messages
+2026-04-01 12:34:56 INFO: Received SIGHUP, initiating config reload...
+2026-04-01 12:34:56 INFO: ============================================================
+2026-04-01 12:34:56 INFO: Starting configuration reload...
+2026-04-01 12:34:56 INFO: ============================================================
+2026-04-01 12:34:56 INFO: Configuration reloaded from /path/to/.hb.yaml
+2026-04-01 12:34:56 INFO: Notification configuration reloaded
+2026-04-01 12:34:56 INFO: Reloading threshold configuration...
+2026-04-01 12:34:56 INFO: Threshold configuration reloaded: 42 total thresholds
+2026-04-01 12:34:56 INFO: ============================================================
+2026-04-01 12:34:56 INFO: Configuration reload completed successfully
+2026-04-01 12:34:56 INFO: ============================================================
+```
+
+## Common Use Cases
+
+### 1. Update Notification Credentials
+
+If you need to rotate API keys or update SMTP passwords:
+
+```yaml
+notification_channels:
+  pushover_standard:
+    type: pushover
+    token: new-token-here    # Updated
+    user: new-user-key-here  # Updated
+```
+
+Just edit the config file and send SIGHUP - no restart needed.
+
+### 2. Adjust Threshold Values
+
+Fine-tune alerting thresholds based on observed behavior:
+
+```yaml
+threshold_configs:
+  default:
+    thresholds:
+      cpu_monitor:
+        cpu_percent:
+          warning: 85.0   # Increased from 80.0
+          critical: 95.0  # Increased from 90.0
+```
+
+Send SIGHUP to apply the new thresholds immediately.
+
+### 3. Add New Notification Channels
+
+Add a new notification destination:
+
+```yaml
+notification_channels:
+  email_oncall:
+    type: email
+    recipients: [oncall@example.com]
+    sender: alerts@example.com
+    smtp_server: smtp.example.com
+
+hosts:
+  critical_server:
+    threshold_config: default
+    watch: true
+    notification_channels: [pushover_standard, email_oncall]  # Added
+```
+
+The new channel becomes active immediately after SIGHUP.
+
+### 4. Update Watch List
+
+Start or stop monitoring hosts without restart:
+
+```yaml
+hosts:
+  new_server:
+    threshold_config: default
+    watch: true           # Start watching
+    notification_channels: [pushover_standard]
+```
+
+## Best Practices
+
+1. **Test Configuration Before Reload**
+   - Validate YAML syntax before sending SIGHUP
+   - Check for typos in channel names
+   - Verify threshold values are reasonable
+
+2. **Monitor Reload Logs**
+   - Always check logs after reload to confirm success
+   - Look for error messages if reload fails
+   - Verify expected number of thresholds loaded
+
+3. **Backup Before Changes**
+   - Keep a backup of working configuration
+   - Use version control (git) for config files
+   - Document why changes were made
+
+4. **Gradual Rollout**
+   - Test changes on development server first
+   - Apply to one production server at a time
+   - Verify behavior before applying everywhere
+
+5. **Plan for Restart-Required Changes**
+   - Schedule downtime for port or SSL changes
+   - Use blue-green deployment if possible
+   - Keep service downtime minimal
+
+## Troubleshooting
+
+### Reload Doesn't Apply Changes
+
+**Check:**
+- Is the config file path correct?
+- Did you save the file after editing?
+- Are there YAML syntax errors?
+- Check the logs for error messages
+
+**Solution:**
+```bash
+# Validate YAML syntax
+python -c "import yaml; yaml.safe_load(open('.hb.yaml'))"
+
+# Check file modification time
+ls -l .hb.yaml
+
+# View logs
+journalctl -u heartbeat -f
+```
+
+### Partial Configuration Applied
+
+**Cause:** Some sections reloaded, others didn't.
+
+**Solution:** Check logs to see which components failed. Common issues:
+- Invalid channel type
+- Missing required threshold fields
+- Invalid host references
+
+### Service Becomes Unresponsive
+
+**Cause:** Malformed configuration caused an exception.
+
+**Solution:**
+1. Revert to backup configuration
+2. Send SIGHUP again to reload the good config
+3. If service is completely stuck, restart it
+
+## Implementation Details
+
+The reload mechanism uses:
+
+- **Signal Handling**: SIGHUP triggers reload event
+- **Async-Safe Reloading**: Configuration is loaded asynchronously
+- **Component Coordination**: All affected components are updated atomically
+- **State Preservation**: Alert states and hysteresis information are maintained
+- **Error Recovery**: Failed reloads don't affect running configuration
+
+## See Also
+
+- [NOTIFICATIONS.md](NOTIFICATIONS.md) - Notification channel configuration
+- [THRESHOLD_ALERTING.md](THRESHOLD_ALERTING.md) - Threshold configuration details
+- Configuration examples in `hbd/config_*.yaml`
@@ -0,0 +1,632 @@
+# HTTP API and Web UI Documentation
+
+## Overview
+
+The Heartbeat Daemon provides a comprehensive HTTP API and web-based UI for monitoring plugin data and alert states. The API follows RESTful conventions and returns JSON responses.
+
+## Base URL
+
+All API endpoints are relative to the server base URL:
+```
+http://your-server:50004
+```
+
+Default port is `50004` (configurable via `hbd_port` in configuration).
+
+---
+
+## Authentication
+
+When [user accounts are configured](USERS.md), every request must be authenticated.
+
+- **Browser requests** to HTML pages are redirected to `/login` automatically.  JavaScript `fetch()` calls on the dashboards send the session cookie automatically — no JS changes are needed.
+- **API / programmatic requests** must include the token in an `Authorization: Bearer <token>` header or an `X-Auth-Token` header.
+
+Unauthenticated API requests receive `401 Unauthorized`.  When no users are configured the server runs in unauthenticated mode and all endpoints are open.
+
+### Login
+
+```bash
+TOKEN=$(curl -s -X POST http://localhost:50004/api/0/auth/login \
+  -H 'Content-Type: application/json' \
+  -d '{"username":"alice","password":"secret"}' | jq -r .token)
+
+curl -H "Authorization: Bearer $TOKEN" http://localhost:50004/api/0/hosts
+```
+
+See [User Management](USERS.md) for full authentication documentation.
+
+---
+
+## API Endpoints
+
+### Authentication
+
+| Method | Path | Description | Auth required |
+|--------|------|-------------|---------------|
+| `POST` | `/api/0/auth/login` | Obtain session token | No |
+| `POST` | `/api/0/auth/logout` | Invalidate session | Token |
+
+### Users
+
+| Method | Path | Description | Role |
+|--------|------|-------------|------|
+| `GET` | `/api/0/users` | List all users | Admin |
+| `GET` | `/api/0/users/me` | Own profile | Authenticated |
+
+### Host Management
+
+#### GET /api/0/hosts
+Get list of all monitored hosts with their state information.  When auth is enabled, only hosts the caller has at least **monitor** access to are returned.
+
+**Response:**
+```json
+[
+  {
+    "name": "webserver01",
+    "dyn": false,
+    "owner": "alice",
+    "managers": ["bob"],
+    "monitors": ["carol"],
+    "connections": [...]
+  }
+]
+```
+
+#### GET /api/0/messages
+Get recent heartbeat messages (last 30).
+
+**Response:**
+```json
+[
+  {
+    "time": 1711234567.123,
+    "host": "webserver01",
+    "msg": "heartbeat received"
+  }
+]
+```
+
+---
+
+### Plugin Data Endpoints
+
+#### GET /api/0/hosts/{hostname}/plugins
+Get all plugin data for a specific host.
+
+**Parameters:**
+- `hostname` (path): Name of the host
+
+**Response:**
+```json
+{
+  "hostname": "webserver01",
+  "plugins": {
+    "cpu_monitor": {
+      "timestamp": 1711234567.123,
+      "data": {
+        "cpu_percent": 45.2,
+        "load_1min": 2.5,
+        "load_5min": 2.1,
+        "load_15min": 1.8
+      },
+      "sample_count": 100
+    },
+    "memory_monitor": {
+      "timestamp": 1711234568.456,
+      "data": {
+        "percent": 65.4,
+        "available_mb": 4096,
+        "total_mb": 16384
+      },
+      "sample_count": 100
+    }
+  }
+}
+```
+
+**Example:**
+```bash
+curl http://localhost:50004/api/0/hosts/webserver01/plugins
+```
+
+#### GET /api/0/hosts/{hostname}/plugins/{plugin_name}
+Get detailed historical data for a specific plugin.
+
+**Parameters:**
+- `hostname` (path): Name of the host
+- `plugin_name` (path): Name of the plugin
+- `limit` (query, optional): Number of recent samples to return (default: 10)
+
+**Response:**
+```json
+{
+  "hostname": "webserver01",
+  "plugin": "cpu_monitor",
+  "samples": [
+    {
+      "timestamp": 1711234567.123,
+      "data": {
+        "cpu_percent": 45.2,
+        "load_1min": 2.5
+      }
+    },
+    {
+      "timestamp": 1711234267.123,
+      "data": {
+        "cpu_percent": 42.1,
+        "load_1min": 2.3
+      }
+    }
+  ],
+  "sample_count": 2
+}
+```
+
+**Examples:**
+```bash
+# Get last 1 sample (most recent)
+curl 'http://localhost:50004/api/0/hosts/webserver01/plugins/cpu_monitor?limit=1'
+
+# Get last 50 samples
+curl 'http://localhost:50004/api/0/hosts/webserver01/plugins/memory_monitor?limit=50'
+
+# Get disk monitor data
+curl http://localhost:50004/api/0/hosts/database01/plugins/disk_monitor
+```
+
+---
+
+### Host Access
+
+#### GET /api/0/hosts/{hostname}/access
+Get owner/managers/monitors for a host. Requires **monitor** role or higher.
+
+**Response:**
+```json
+{
+  "owner": "alice",
+  "managers": ["bob"],
+  "monitors": ["carol"]
+}
+```
+
+#### PUT /api/0/hosts/{hostname}/access
+Update owner/managers/monitors. Requires **owner** role or admin.
+
+**Request body** (all fields optional):
+```json
+{ "owner": "bob", "managers": ["carol"], "monitors": [] }
+```
+
+Changes take effect immediately but are not written back to the config file. Update the config file and send `SIGHUP` to make them permanent.
+
+---
+
+### Alert Endpoints
+
+#### GET /api/0/hosts/{hostname}/alerts
+Get alert states for a specific host.
+
+**Parameters:**
+- `hostname` (path): Name of the host
+
+**Response:**
+```json
+{
+  "hostname": "webserver01",
+  "alerts": [
+    {
+      "metric_path": "cpu_monitor.cpu_percent",
+      "level": "WARNING",
+      "since": 1711234000.0,
+      "last_value": 85.5,
+      "last_check": 1711234567.123,
+      "notification_count": 2
+    },
+    {
+      "metric_path": "disk_monitor./.percent",
+      "level": "OK",
+      "since": 1711230000.0,
+      "last_value": 65.0,
+      "last_check": 1711234567.123,
+      "notification_count": 0
+    }
+  ],
+  "summary": {
+    "ok": 15,
+    "warning": 1,
+    "critical": 0,
+    "unknown": 0
+  }
+}
+```
+
+**Example:**
+```bash
+curl http://localhost:50004/api/0/hosts/webserver01/alerts
+```
+
+#### GET /api/0/alerts
+Get all active alerts across all monitored hosts.
+
+**Response:**
+```json
+{
+  "alerts": [
+    {
+      "hostname": "webserver01",
+      "metric_path": "cpu_monitor.cpu_percent",
+      "level": "CRITICAL",
+      "since": 1711234000.0,
+      "last_value": 95.5,
+      "last_check": 1711234567.123,
+      "notification_count": 3
+    },
+    {
+      "hostname": "database01",
+      "metric_path": "memory_monitor.percent",
+      "level": "WARNING",
+      "since": 1711233000.0,
+      "last_value": 88.2,
+      "last_check": 1711234567.123,
+      "notification_count": 1
+    }
+  ],
+  "summary": {
+    "critical": 1,
+    "warning": 1,
+    "unknown": 0,
+    "total": 2
+  },
+  "host_count": 5
+}
+```
+
+**Example:**
+```bash
+curl http://localhost:50004/api/0/alerts | jq .
+```
+
+---
+
+## Web UI Pages
+
+### Login
+**URL:** `/login`
+
+Shown automatically when a browser request is made without a valid session (when users are configured). After successful login the browser is redirected to the originally requested page.
+
+### Logout
+**URL:** `/logout`
+
+Clears the session cookie and redirects to `/login`.
+
+### Live Dashboard
+**URL:** `/live`
+
+Real-time dashboard showing:
+- Host connection states
+- IPv4/IPv6 connectivity
+- Latency metrics
+- Recent messages
+
+**Features:**
+- WebSocket-powered live updates
+- Sortable columns
+- Color-coded status indicators
+
+### Plugin Metrics
+**URL:** `/plugins`
+
+Interactive visualization of plugin metrics:
+- Select host and plugin from dropdown
+- View current metric values
+- Automatic refresh every 30 seconds
+- Support for nested metrics (e.g., per-partition disk stats)
+
+**Features:**
+- Card-based metric display
+- Unit formatting (%, MB, GB)
+- Nested object visualization
+- Auto-refresh
+
+**Examples of available data:**
+- CPU usage, load average, frequency
+- Memory usage, available memory, swap
+- Disk usage per partition, I/O statistics
+- Network interface statistics, connection counts
+- Custom plugin data
+
+### Alerts Dashboard
+**URL:** `/alerts`
+
+Comprehensive alert monitoring:
+- Summary cards (Critical, Warning, Total Hosts)
+- Filter by severity (All, Critical, Warning)
+- Alert details with duration
+- Auto-refresh every 15 seconds
+
+**Features:**
+- Color-coded alert levels
+- Duration tracking
+- Filterable list
+- Real-time updates
+- Summary statistics
+
+---
+
+## Integration Examples
+
+### Monitoring Script
+
+```bash
+#!/bin/bash
+# Check for critical alerts and send notification
+
+# Log in first (when auth is configured)
+TOKEN=$(curl -s -X POST http://localhost:50004/api/0/auth/login \
+  -H 'Content-Type: application/json' \
+  -d '{"username":"monitor","password":"secret"}' | jq -r .token)
+AUTH=(-H "Authorization: Bearer $TOKEN")
+
+RESPONSE=$(curl -s "${AUTH[@]}" http://localhost:50004/api/0/alerts)
+CRITICAL_COUNT=$(echo "$RESPONSE" | jq '.summary.critical')
+
+if [ "$CRITICAL_COUNT" -gt 0 ]; then
+    echo "CRITICAL: $CRITICAL_COUNT critical alerts detected!"
+    echo "$RESPONSE" | jq '.alerts[] | select(.level=="CRITICAL")'
+    # Send notification
+    # mail -s "Critical Alerts" admin@example.com < alert_details.txt
+fi
+```
+
+### Python Client
+
+```python
+import requests
+import json
+
+BASE = 'http://localhost:50004'
+
+# Log in (skip if auth not configured)
+resp = requests.post(f'{BASE}/api/0/auth/login',
+                     json={"username": "alice", "password": "secret"})
+token = resp.json().get("token")
+headers = {"Authorization": f"Bearer {token}"} if token else {}
+
+# Get all plugin data for a host
+response = requests.get(f'{BASE}/api/0/hosts/webserver01/plugins', headers=headers)
+data = response.json()
+
+print(f"Host: {data['hostname']}")
+print(f"Plugins: {', '.join(data['plugins'].keys())}")
+
+for plugin, info in data['plugins'].items():
+    print(f"\n{plugin}:")
+    for metric, value in info['data'].items():
+        print(f"  {metric}: {value}")
+
+# Check for alerts
+response = requests.get(f'{BASE}/api/0/alerts', headers=headers)
+alerts = response.json()
+
+if alerts['summary']['critical'] > 0:
+    print(f"\n⚠️  {alerts['summary']['critical']} CRITICAL ALERTS!")
+    for alert in alerts['alerts']:
+        if alert['level'] == 'CRITICAL':
+            print(f"  - {alert['hostname']}: {alert['metric_path']} = {alert['last_value']}")
+```
+
+### Grafana Integration
+
+The API endpoints can be used with Grafana's JSON datasource plugin:
+
+1. Install the SimpleJSON datasource plugin
+2. Configure datasource URL: `http://your-server:50004`
+3. Create queries:
+   - Metrics: `/api/0/hosts/webserver01/plugins/cpu_monitor?limit=100`
+   - Alerts: `/api/0/alerts`
+
+### Prometheus Integration
+
+Export metrics in Prometheus format (future enhancement):
+
+```python
+# Example prometheus exporter
+from prometheus_client import Gauge, generate_latest
+import requests
+
+cpu_usage = Gauge('heartbeat_cpu_percent', 'CPU usage percentage', ['hostname'])
+memory_usage = Gauge('heartbeat_memory_percent', 'Memory usage percentage', ['hostname'])
+
+def collect_metrics():
+    hosts = requests.get('http://localhost:50004/api/0/hosts').json()
+    for host in hosts:
+        hostname = host['name']
+        plugins = requests.get(f'http://localhost:50004/api/0/hosts/{hostname}/plugins').json()
+        
+        if 'cpu_monitor' in plugins['plugins']:
+            cpu_data = plugins['plugins']['cpu_monitor']['data']
+            cpu_usage.labels(hostname=hostname).set(cpu_data.get('cpu_percent', 0))
+        
+        if 'memory_monitor' in plugins['plugins']:
+            mem_data = plugins['plugins']['memory_monitor']['data']
+            memory_usage.labels(hostname=hostname).set(mem_data.get('percent', 0))
+```
+
+---
+
+## Response Formats
+
+### Success Response
+All successful API calls return HTTP 200 with JSON body:
+```json
+{
+  "field": "value",
+  ...
+}
+```
+
+### Error Response
+API errors return appropriate HTTP status codes with JSON:
+```json
+{
+  "error": "Host 'unknown-host' not found"
+}
+```
+
+**Common Status Codes:**
+- `200 OK` - Success
+- `400 Bad Request` - Invalid parameters
+- `401 Unauthorized` - Missing or invalid session token
+- `403 Forbidden` - Authenticated but insufficient role
+- `404 Not Found` - Resource not found
+- `500 Internal Server Error` - Server error
+
+---
+
+## WebSocket API
+
+For real-time updates, connect to the WebSocket endpoint:
+
+**URL:** `ws://your-server:50005/hbd` (or `wss://` for secure)
+
+**Messages:**
+```json
+{
+  "type": "host",
+  "data": {
+    "name": "webserver01",
+    "state": "UP"
+  }
+}
+```
+
+```json
+{
+  "type": "plugin",
+  "data": {
+    "host": "webserver01",
+    "plugin": "cpu_monitor",
+    "data": {...},
+    "timestamp": 1711234567.123
+  }
+}
+```
+
+---
+
+## Configuration
+
+### Enable HTTP Server
+
+```yaml
+# In your hbd configuration file
+hbd_host: ""           # Listen on all interfaces
+hbd_port: 50004        # HTTP port
+ws_port: 50005         # WebSocket port (optional)
+# wss_port: 50006      # Secure WebSocket (requires SSL)
+```
+
+### SSL/TLS Configuration
+
+For secure WebSocket connections:
+
+```yaml
+wss_port: 50006
+cert_path: /etc/heartbeat/certs/
+wss_pem: server.pem
+wss_key: server.key
+```
+
+---
+
+## Rate Limiting
+
+The API currently does not implement rate limiting. For production use, consider:
+
+- Placing behind a reverse proxy (nginx, Apache)
+- Using API gateway for rate limiting
+- Implementing caching for frequently accessed endpoints
+
+---
+
+## CORS Support
+
+By default, CORS is not enabled. To enable for web applications:
+
+```python
+# In http.py, add CORS middleware
+import aiohttp_cors
+
+app = web.Application()
+cors = aiohttp_cors.setup(app)
+
+# Configure CORS for all routes
+for route in list(app.router.routes()):
+    cors.add(route, {
+        "*": aiohttp_cors.ResourceOptions(
+            allow_credentials=True,
+            expose_headers="*",
+            allow_headers="*",
+        )
+    })
+```
+
+---
+
+## Performance Considerations
+
+### Caching
+- Plugin data is cached in memory (last 100 samples per plugin)
+- No database queries required
+- Responses are fast (<10ms typical)
+
+### Scalability
+- Each host stores its own data independently
+- Memory usage: ~1KB per host + ~1KB per plugin sample
+- For 100 hosts with 5 plugins each, at the full 100 cached samples per plugin: ~50MB memory
+
+### Best Practices
+1. Use `limit` parameter to control response size
+2. Cache responses on client side when appropriate
+3. Use WebSocket for real-time updates instead of polling
+4. Consider pagination for large deployments (future enhancement)
+
+---
+
+## Troubleshooting
+
+### API Returns 401
+- Auth is configured — include `Authorization: Bearer <token>` header
+- Token may have expired (24 h TTL) — log in again
+
+### API Returns 403
+- Authenticated user lacks the required role for this host/action
+- Check host's `owner`, `managers`, `monitors` config
+
+### API Returns 404
+- Verify hostname in URL matches actual host name
+- Check host is sending heartbeats: `curl http://localhost:50004/api/0/hosts`
+
+### No Plugin Data
+- Verify client is configured with plugins
+- Check client logs for plugin errors
+- Ensure plugins are sending data (check journal logs)
+
+### Empty Alerts
+- Verify thresholds are configured
+- Check host is in `watchhosts` list
+- Ensure plugins are collecting metrics
+- Review server logs for threshold checker errors
+
+---
+
+## See Also
+
+- [User Management](USERS.md)
+- [Plugin Development Guide](PLUGIN_DEVELOPMENT.md)
+- [Threshold Alerting Documentation](THRESHOLD_ALERTING.md)
+- [Message Journal Documentation](MESSAGE_JOURNAL.md)
+- Configuration examples: `hbd/config_example.yaml`
@@ -0,0 +1,413 @@
+# Message Journal
+
+The message journal provides persistent logging of all received heartbeat messages with automatic size-based log rotation.
+
+## Overview
+
+The journal logs every message received by the heartbeat daemon (hbd) in JSON format, making it easy to:
+- Audit message history
+- Debug connection issues
+- Analyze traffic patterns
+- Replay messages for testing
+- Create historical reports
+
+## Features
+
+- **JSON Format**: Each message is logged as a single JSON line for easy parsing
+- **Size-Based Rotation**: Automatically rotates logs when size threshold is reached
+- **Automatic Cleanup**: Keeps only a configurable number of backup files
+- **Thread-Safe**: Safe for concurrent access from multiple async tasks
+- **Configurable**: All settings controllable via configuration file
+- **Performance**: Non-blocking async operation with minimal overhead
+
+## Configuration
+
+Add these settings to your hbd configuration file (e.g., `.hb.yaml`):
+
+```yaml
+# Message journal configuration
+journal_enabled: true                          # Enable/disable journaling
+journal_dir: /var/log/heartbeat                # Directory for journal files
+journal_file: messages.journal                 # Base filename
+journal_max_size: 104857600                    # Max size in bytes (100MB default)
+journal_max_backups: 10                        # Number of backup files to keep
+```
+
+### Configuration Options
+
+| Option | Default | Description |
+|--------|---------|-------------|
+| `journal_enabled` | `true` | Enable or disable message journaling |
+| `journal_dir` | `/var/log/heartbeat` | Directory where journal files are stored |
+| `journal_file` | `messages.journal` | Base filename for the journal |
+| `journal_max_size` | `104857600` (100MB) | Maximum file size before rotation |
+| `journal_max_backups` | `10` | Number of rotated backup files to keep |
+
+## File Format
+
+Messages are logged in JSONL (JSON Lines) format - one JSON object per line:
+
+```json
+{"timestamp":1711234567.123,"datetime":"2026-03-28T12:34:56","source_ip":"192.168.1.100","source_port":50003,"message":{"ID":"HTB","name":"webserver1","interval":30}}
+{"timestamp":1711234597.456,"datetime":"2026-03-28T12:35:37","source_ip":"192.168.1.101","source_port":50003,"message":{"ID":"PLG","plugin":"cpu_monitor","cpu_percent":45.2,"load_1min":1.5}}
+```
+
+### Entry Structure
+
+Each journal entry contains:
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `timestamp` | float | Unix timestamp (seconds since epoch) |
+| `datetime` | string | ISO 8601 formatted datetime |
+| `source_ip` | string | Source IP address |
+| `source_port` | integer | Source UDP port |
+| `message` | object | Complete parsed message dictionary |
+
+## Log Rotation
+
+### How Rotation Works
+
+1. Journal writes messages to the current file
+2. When file size exceeds `journal_max_size`, rotation is triggered
+3. Current file is renamed with timestamp: `messages.journal.YYYYMMDD-HHMMSS`
+4. New empty file is created as the current journal
+5. Old backup files exceeding `journal_max_backups` are deleted
+
+### Example File Structure
+
+```
+/var/log/heartbeat/
+├── messages.journal                    # Current active journal
+├── messages.journal.20260328-120000   # Rotated backup (oldest)
+├── messages.journal.20260328-140000   # Rotated backup
+└── messages.journal.20260328-160000   # Rotated backup (newest)
+```
+
+### Rotation Behavior
+
+- Rotation is triggered when the next message would exceed the size limit
+- Rotation is automatic and requires no manual intervention
+- Old backups are deleted in FIFO order (oldest first)
+- Rotation is thread-safe and won't lose messages
+
+## Usage Examples
+
+### Reading Journal Files
+
+#### Using Python
+
+```python
+import json
+
+# Read all entries from current journal
+with open('/var/log/heartbeat/messages.journal', 'r') as f:
+    for line in f:
+        entry = json.loads(line)
+        print(f"{entry['datetime']} - {entry['source_ip']} - {entry['message']['ID']}")
+```
+
+#### Using jq (command line)
+
+```bash
+# View all messages
+cat /var/log/heartbeat/messages.journal | jq .
+
+# Filter by message type
+cat /var/log/heartbeat/messages.journal | jq 'select(.message.ID == "HTB")'
+
+# Filter by hostname
+cat /var/log/heartbeat/messages.journal | jq 'select(.message.name == "webserver1")'
+
+# Count messages by type
+cat /var/log/heartbeat/messages.journal | jq -r '.message.ID' | sort | uniq -c
+
+# Extract timestamps and source IPs
+cat /var/log/heartbeat/messages.journal | jq -r '[.datetime, .source_ip, .message.ID] | @tsv'
+```
+
+#### Using shell tools
+
+```bash
+# Count total messages
+wc -l /var/log/heartbeat/messages.journal
+
+# View recent messages
+tail -n 100 /var/log/heartbeat/messages.journal | jq .
+
+# Search for specific host
+grep -F '"name":"webserver1"' /var/log/heartbeat/messages.journal
+
+# Check journal file size
+du -h /var/log/heartbeat/messages.journal
+```
+
+### Analyzing Historical Data
+
+```bash
+# Combine all journal files (current + backups)
+cat /var/log/heartbeat/messages.journal* | jq . > all_messages.json
+
+# Count messages per host
+cat /var/log/heartbeat/messages.journal* | jq -r '.message.name // "unknown"' | sort | uniq -c
+
+# Find all plugin messages
+cat /var/log/heartbeat/messages.journal* | jq 'select(.message.ID == "PLG")'
+
+# Extract CPU metrics from plugin messages
+cat /var/log/heartbeat/messages.journal* | \
+    jq 'select(.message.plugin == "cpu_monitor") | {time: .datetime, host: .message.name, cpu: .message.cpu_percent}'
+```
+
+## Integration with Log Management
+
+### Logrotate
+
+While the journal has built-in rotation, you can also use logrotate for additional management:
+
+```
+/var/log/heartbeat/messages.journal.* {
+    daily
+    rotate 30
+    compress
+    delaycompress
+    missingok
+    notifempty
+}
+```
+
+### Elasticsearch/OpenSearch
+
+Import journal data into Elasticsearch for advanced analysis:
+
+```python
+from elasticsearch import Elasticsearch
+import json
+
+es = Elasticsearch(['localhost:9200'])
+
+with open('/var/log/heartbeat/messages.journal', 'r') as f:
+    for line in f:
+        entry = json.loads(line)
+        es.index(index='heartbeat-messages', body=entry)
+```
+
+### Splunk
+
+Create a Splunk input for the journal:
+
+```ini
+[monitor:///var/log/heartbeat/messages.journal*]
+sourcetype = heartbeat_json
+index = heartbeat
+```
+
+## Performance Considerations
+
+### Overhead
+
+- Journal writing is async and non-blocking
+- Typical overhead: < 1ms per message
+- Minimal impact on heartbeat processing
+
+### Disk Usage
+
+Calculate expected disk usage:
+
+```
+Messages per day = (86400 seconds / interval) * number_of_hosts
+Average message size ≈ 200-500 bytes
+Daily disk usage = Messages per day * Average message size
+
+Example:
+- 100 hosts
+- 30 second interval  
+- 2880 messages/day per host
+- 288,000 messages/day total
+- ~60-140 MB/day
+```
+
+### Recommendations
+
+- **Small deployments** (< 50 hosts): Default settings work well
+- **Medium deployments** (50-500 hosts): Increase `journal_max_size` to 500MB, `journal_max_backups` to 20
+- **Large deployments** (> 500 hosts): Consider 1GB+ journal files, 30+ backups, or external log aggregation
+
+## Monitoring
+
+### Check Journal Status
+
+The journal exposes statistics that can be queried:
+
+```python
+from hbd.journal import get_journal
+
+journal = get_journal()
+stats = journal.get_stats()
+print(f"Current size: {stats['current_size']:,} bytes")
+print(f"Rotation threshold: {stats['rotation_threshold']}")
+```
+
+### Log Messages
+
+Journal operations are logged at appropriate levels:
+
+- `INFO`: Initialization, rotation events, cleanup
+- `DEBUG`: Individual message logging
+- `WARNING`: Non-critical issues
+- `ERROR`: Critical failures
+
+Check hbd logs for journal-related messages:
+
+```bash
+grep journal /var/log/heartbeat.log
+```
+
+## Troubleshooting
+
+### Journal Files Not Created
+
+**Problem**: No journal files appear in the configured directory.
+
+**Solutions**:
+- Check `journal_enabled: true` in configuration
+- Verify directory exists and hbd has write permissions
+- Check hbd logs for initialization errors
+- Verify disk space is available
+
+### Rotation Not Working
+
+**Problem**: Journal file grows beyond `journal_max_size`.
+
+**Solutions**:
+- Check that `journal_max_size` is properly configured
+- Verify hbd has permission to rename/create files
+- Check for filesystem issues
+- Review hbd logs for rotation errors
+
+### Missing Messages
+
+**Problem**: Some messages don't appear in journal.
+
+**Solutions**:
+- Verify `journal_enabled: true`
+- Check for write errors in hbd logs
+- Verify sufficient disk space
+- Check if filesystem is read-only
+
+### Performance Issues
+
+**Problem**: Journal causing slow message processing.
+
+**Solutions**:
+- Use faster storage (SSD) for journal directory
+- Increase `journal_max_size` to reduce rotation frequency
+- Disable journal if not needed: `journal_enabled: false`
+- Consider async syslog forwarding instead
+
+## Security Considerations
+
+### File Permissions
+
+Ensure proper permissions on journal files:
+
+```bash
+# Journal directory
+chmod 750 /var/log/heartbeat
+chown hbd:hbd /var/log/heartbeat
+
+# Journal files
+chmod 640 /var/log/heartbeat/messages.journal*
+```
+
+### Sensitive Data
+
+Journal files may contain:
+- Hostnames and IP addresses
+- System metrics
+- Custom message content
+
+**Recommendations**:
+- Restrict read access to authorized users only
+- Consider encryption for archived journals
+- Implement log retention policies
+- Sanitize data if sharing for debugging
+
+## API Reference
+
+### MessageJournal Class
+
+```python
+class MessageJournal:
+    def __init__(self, config: Dict[str, Any])
+    async def initialize(self) -> bool
+    async def log_message(self, msg: Dict, addr: tuple, timestamp: float)
+    async def close(self)
+    def get_stats(self) -> Dict[str, Any]
+```
+
+### Module Functions
+
+```python
+def get_journal(config: Dict = None) -> MessageJournal
+async def log_message(msg: Dict, addr: tuple, timestamp: float = None)
+```
+
+## Example: Custom Message Processing
+
+Process journal messages in real-time:
+
+```python
+import asyncio
+import json
+from pathlib import Path
+
+async def tail_journal(journal_path):
+    """Follow journal file and process new messages."""
+    path = Path(journal_path)
+    
+    with open(path, 'r') as f:
+        # Jump to end
+        f.seek(0, 2)
+        
+        while True:
+            line = f.readline()
+            if line:
+                entry = json.loads(line)
+                await process_message(entry)
+            else:
+                await asyncio.sleep(0.1)
+
+async def process_message(entry):
+    """Process a journal entry."""
+    msg = entry['message']
+    
+    # Alert on boot messages
+    if msg.get('boot'):
+        print(f"ALERT: {msg['name']} rebooted at {entry['datetime']}")
+    
+    # Track CPU usage
+    if msg.get('ID') == 'PLG' and msg.get('plugin') == 'cpu_monitor':
+        cpu = msg.get('cpu_percent', 0)
+        if cpu > 90:
+            print(f"WARNING: {entry['source_ip']} CPU usage: {cpu}%")
+```
+
+## Future Enhancements
+
+Potential improvements for future versions:
+
+- Compression of rotated logs (gzip)
+- Time-based rotation in addition to size-based
+- Filtering to exclude certain message types
+- Structured logging output formats (CEF, GELF)
+- Remote syslog forwarding
+- Message deduplication
+- Journal file encryption
+- Signed journal entries
+
+## See Also
+
+- [Configuration Guide](../hbd/config.py) - Full configuration options
+- [UDP Protocol](../hbd/udp.py) - Message handling
+- [Server Architecture](../hbd/server.py) - Server initialization
@@ -0,0 +1,326 @@
+# Nagios Plugin Integration Guide
+
+The Heartbeat monitoring system now supports running existing Nagios-compatible monitoring plugins through the `nagios_runner` plugin. This allows you to leverage the thousands of existing Nagios plugins without modification.
+
+## Quick Start
+
+### 1. Install Nagios Plugins
+
+**Debian/Ubuntu:**
+```bash
+sudo apt-get install nagios-plugins
+```
+
+**RHEL/CentOS/Fedora:**
+```bash
+sudo yum install nagios-plugins-all
+# or
+sudo dnf install nagios-plugins-all
+```
+
+**Arch Linux:**
+```bash
+sudo pacman -S monitoring-plugins
+```
+
+### 2. Configure Heartbeat
+
+Add the `nagios_runner` section to your `~/.hb.yaml` config:
+
+```yaml
+nagios_runner:
+  interval: 60          # Run plugins every 60 seconds
+  timeout: 30           # Command timeout in seconds
+  commands:
+    - name: check_disk_root
+      command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
+    
+    - name: check_load
+      command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
+    
+    - name: check_procs
+      command: /usr/lib/nagios/plugins/check_procs -w 250 -c 400
+```
+
+### 3. Start Heartbeat Client
+
+```bash
+hbc -v localhost
+```
+
+The client will now execute the configured Nagios plugins and send their results to the server.
+
+## How It Works
+
+### Nagios Plugin Standard
+
+Nagios plugins follow a simple interface:
+
+1. **Exit Codes:**
+   - `0` = OK
+   - `1` = WARNING
+   - `2` = CRITICAL
+   - `3` = UNKNOWN
+
+2. **Output Format:**
+   ```
+   STATUS - Message | performance_data
+   ```
+
+3. **Performance Data Format:**
+   ```
+   'label'=value[UOM];[warn];[crit];[min];[max]
+   ```
+
+### Example Plugin Output
+
+```bash
+$ /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
+DISK OK - free space: / 115 GB (72%); | /=44GB;127;142;0;159
+```
+
+This output includes:
+- **Status:** `DISK OK`
+- **Message:** `free space: / 115 GB (72%)`
+- **Performance Data:** `/=44GB;127;142;0;159`
+  - Current value: 44GB
+  - Warning threshold: 127GB
+  - Critical threshold: 142GB
+  - Min: 0GB
+  - Max: 159GB
+
+### Data Collected
+
+The `nagios_runner` plugin collects:
+
+**For each configured command:**
+- `{name}_status` - Status string (OK, WARNING, CRITICAL, UNKNOWN)
+- `{name}_status_code` - Numeric exit code (0-3)
+- `{name}_output` - Status message
+- `{name}_{metric}` - Each performance metric value
+- `{name}_{metric}_uom` - Unit of measurement (if present)
+- `{name}_{metric}_warn` - Warning threshold (if present)
+- `{name}_{metric}_crit` - Critical threshold (if present)
+- `{name}_{metric}_min` - Minimum value (if present)
+- `{name}_{metric}_max` - Maximum value (if present)
+
+## Configuration Options
+
+```yaml
+nagios_runner:
+  # Collection interval in seconds (default: 60)
+  interval: 60
+  
+  # Command execution timeout in seconds (default: 30)
+  timeout: 30
+  
+  # Execute commands via shell (default: true)
+  # Set to false for direct execution (more secure but less flexible)
+  shell: true
+  
+  # List of Nagios plugins to run
+  commands:
+    - name: unique_name       # Required: unique identifier
+      command: /path/to/plugin [args]  # Required: full command to execute
+```
+
+## Common Nagios Plugins
+
+### System Resources
+
+**Disk Space:**
+```yaml
+- name: check_disk_root
+  command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
+```
+
+**Load Average:**
+```yaml
+- name: check_load
+  command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
+```
+
+**Swap Usage:**
+```yaml
+- name: check_swap
+  command: /usr/lib/nagios/plugins/check_swap -w 20% -c 10%
+```
+
+**Process Count:**
+```yaml
+- name: check_procs
+  command: /usr/lib/nagios/plugins/check_procs -w 250 -c 400
+```
+
+**Users Logged In:**
+```yaml
+- name: check_users
+  command: /usr/lib/nagios/plugins/check_users -w 5 -c 10
+```
+
+### Network Services
+
+**SSH:**
+```yaml
+- name: check_ssh
+  command: /usr/lib/nagios/plugins/check_ssh localhost
+```
+
+**HTTP:**
+```yaml
+- name: check_http_local
+  command: /usr/lib/nagios/plugins/check_http -H localhost
+  
+- name: check_http_ssl
+  command: /usr/lib/nagios/plugins/check_http -H example.com --ssl
+```
+
+**DNS:**
+```yaml
+- name: check_dns
+  command: /usr/lib/nagios/plugins/check_dns -H google.com
+```
+
+**Ping:**
+```yaml
+- name: check_ping_gateway
+  command: /usr/lib/nagios/plugins/check_ping -H 192.168.1.1 -w 100,20% -c 500,60%
+```
+
+### Databases
+
+**MySQL:**
+```yaml
+- name: check_mysql
+  command: /usr/lib/nagios/plugins/check_mysql -H localhost -u user -p password
+```
+
+**PostgreSQL:**
+```yaml
+- name: check_pgsql
+  command: /usr/lib/nagios/plugins/check_pgsql -H localhost -d database
+```
+
+## Writing Custom Nagios Plugins
+
+You can write your own Nagios-compatible plugins in any language. Here's a simple example:
+
+**Bash:**
+```bash
+#!/bin/bash
+# /usr/local/bin/check_example.sh
+
+# Get the value to check
+value=$(some_command)
+
+# Define thresholds
+warn=80
+crit=90
+
+# Check and output result
+if [ $value -ge $crit ]; then
+    echo "CRITICAL - Value is $value | value=${value};${warn};${crit};0;100"
+    exit 2
+elif [ $value -ge $warn ]; then
+    echo "WARNING - Value is $value | value=${value};${warn};${crit};0;100"
+    exit 1
+else
+    echo "OK - Value is $value | value=${value};${warn};${crit};0;100"
+    exit 0
+fi
+```
+
+**Python:**
+```python
+#!/usr/bin/env python3
+# /usr/local/bin/check_example.py
+
+import sys
+
+def check_something():
+    value = get_value()  # Your check logic here
+    warn = 80
+    crit = 90
+    
+    perfdata = f"value={value};{warn};{crit};0;100"
+    
+    if value >= crit:
+        print(f"CRITICAL - Value is {value} | {perfdata}")
+        sys.exit(2)
+    elif value >= warn:
+        print(f"WARNING - Value is {value} | {perfdata}")
+        sys.exit(1)
+    else:
+        print(f"OK - Value is {value} | {perfdata}")
+        sys.exit(0)
+
+if __name__ == "__main__":
+    check_something()
+```
+
+Then configure in Heartbeat:
+```yaml
+nagios_runner:
+  commands:
+    - name: my_custom_check
+      command: /usr/local/bin/check_example.sh
+```
+
+## Troubleshooting
+
+### Plugin not found
+```
+Error: Command not found
+```
+**Solution:** Use the full path to the plugin. Common locations:
+- `/usr/lib/nagios/plugins/`
+- `/usr/lib64/nagios/plugins/`
+- `/usr/local/nagios/libexec/`
+
+### Permission denied
+```
+Error: Permission denied
+```
+**Solution:** Ensure the plugin is executable:
+```bash
+chmod +x /path/to/plugin
+```
+
+### Timeout errors
+```
+Command timed out after 30s
+```
+**Solution:** Increase the timeout in config:
+```yaml
+nagios_runner:
+  timeout: 60  # Increase timeout
+```
+
+### No performance data
+If performance data is not being parsed:
+1. Check plugin output includes `|` separator
+2. Verify performance data format: `'label'=value[UOM];...`
+3. Enable debug logging: `hbc -v -x localhost`
+
+## Benefits
+
+1. **Massive Plugin Library:** Thousands of existing Nagios plugins available
+2. **No Rewriting:** Use plugins as-is without modification
+3. **Community Support:** Well-documented and maintained plugins
+4. **Flexibility:** Mix Nagios plugins with native Heartbeat plugins
+5. **Standard Interface:** Consistent exit codes and output format
+6. **Performance Data:** Automatic extraction of metrics
+
+## Resources
+
+- [Nagios Plugin Development Guidelines](https://nagios-plugins.org/doc/guidelines.html)
+- [Monitoring Plugins Project](https://www.monitoring-plugins.org/)
+- [Nagios Exchange](https://exchange.nagios.org/) - Plugin repository
+- [Check_MK Local Checks](https://docs.checkmk.com/latest/en/localchecks.html) - Compatible format
+
+## Next Steps
+
+- Configure threshold alerts based on Nagios plugin status codes
+- View plugin data in the Heartbeat web UI
+- Create custom plugins for your specific monitoring needs
+- Integrate with existing Nagios/Icinga configurations
@@ -0,0 +1,295 @@
+# Notification System
+
+## Overview
+
+Notifications are dispatched to the **owner and managers** of a host, each via their own configured notification channels. Channel definitions are global; users reference them by name. No users configured → no notifications sent.
+
+## Architecture
+
+```
+Alert event (udp.py / threshold.py)
+  └─ notify.send_notification(host_name, Notification)
+       ├─ look up host.owner + host.managers
+       ├─ for each user → user.notification_channels
+       └─ for each channel → _dispatch_to_channel (filtered by min_level)
+```
+
+Every notification carries:
+- **title** — `[LEVEL] hostname` (e.g. `[CRITICAL] webserver01`)
+- **body** — detail message (metric value, threshold, duration)
+- **url** — link to the plugin metrics page (`{base_url}/plugins#{hostname}`)
+- **level** — `RECOVER | WARNING | CRITICAL | INFO`
+
+## Configuration
+
+### Base URL
+
+Set `base_url` so notification links point to your hbd instance:
+
+```yaml
+base_url: https://hbd.example.com
+```
+
+### Global channel definitions
+
+Define channels once; reference them by name from user configs:
+
+```yaml
+notification_channels:
+
+  pushover_ops:
+    type: pushover
+    token: your-app-token
+    user: your-user-key
+    min_level: WARNING        # optional, default: WARNING
+
+  email_ops:
+    type: email
+    recipients: [ops@example.com]
+    sender: hbd@example.com
+    smtp_server: smtp.example.com
+    smtp_port: 587
+    smtp_user: hbd@example.com
+    smtp_password: secret
+    min_level: WARNING
+
+  matrix_oncall:
+    type: matrix
+    homeserver: https://matrix.example.org
+    access_token: syt_xxx
+    room_id: "!abc:matrix.example.org"
+    min_level: CRITICAL       # only send critical alerts to this room
+
+  sms_oncall:
+    type: sms_voipms
+    api_user: me@example.com
+    api_password: secret
+    did: "5551234567"         # your voip.ms DID number
+    dst: "5559876543"         # destination number
+    min_level: CRITICAL
+
+  signal_ops:
+    type: signal
+    cli_path: /usr/local/bin/signal-cli
+    user: "+12025551234"
+    recipient: "+12025559999"
+
+  mattermost_devops:
+    type: mattermost
+    host: mattermost.example.com
+    token: webhook-token
+    channel: devops-alerts
+    username: heartbeat-bot
+```
+
+### Users with notification channels
+
+Each user lists which global channels they receive notifications on:
+
+```yaml
+users:
+  alice:
+    full_name: Alice Smith
+    password: pbkdf2:sha256:...
+    admin: true
+    notification_channels: [pushover_ops, email_ops]
+
+  bob:
+    full_name: Bob Jones
+    password: pbkdf2:sha256:...
+    notification_channels: [sms_oncall, matrix_oncall]
+```
+
+### Host access — owner and managers
+
+Notifications for a host go to its owner and all managers:
+
+```yaml
+hosts:
+  webserver01:
+    owner: alice             # receives all notifications for this host
+    managers: [bob]          # also receives notifications
+    threshold_config: default
+    watch: true              # bold in dashboard (cosmetic only)
+    dyndns: false
+
+  dbserver01:
+    owner: alice
+    managers: [bob]
+    threshold_config: database
+    dyndns: false
+```
+
+`watch: true` only affects display (bold name in the live dashboard). Notifications are now controlled entirely by owner/managers.
+
+## Channel Types
+
+### `min_level` filtering
+
+Every channel accepts an optional `min_level` field:
+
+| Value | Channels receive |
+|---|---|
+| `WARNING` (default) | WARNING, CRITICAL, RECOVER |
+| `CRITICAL` | CRITICAL only (and RECOVER) |
+
+`RECOVER` is always passed through — you don't want to miss a recovery.
+
+### pushover
+
+Sends push notifications via [Pushover](https://pushover.net). Includes title, body, and a clickable URL.
+
+```yaml
+type: pushover
+token: your-app-token     # Required: Pushover application token
+user: your-user-key       # Required: Recipient's user key
+min_level: WARNING
+```
+
+### email
+
+Sends via SMTP. Subject = title, body = message + URL on final line.
+
+```yaml
+type: email
+recipients: [ops@example.com, oncall@example.com]
+sender: hbd@example.com
+smtp_server: smtp.example.com
+smtp_port: 587             # 587 = STARTTLS (default), 465 = SSL
+smtp_user: hbd@example.com
+smtp_password: secret
+min_level: WARNING
+```
+
+### matrix
+
+Sends a formatted HTML message to a Matrix room via [matrix-nio](https://github.com/poljar/matrix-nio).
+
+```yaml
+type: matrix
+homeserver: https://matrix.example.org
+access_token: syt_xxx      # Bot account access token
+room_id: "!abc:matrix.example.org"
+min_level: WARNING
+```
+
+**Setup:**
+1. Create a bot Matrix account
+2. Obtain its access token (Element → Settings → Help & About → Access Token)
+3. Invite the bot to the target room and note the room ID
+
+### sms_voipms
+
+Sends SMS via the [voip.ms REST API](https://voip.ms/api/v1/rest.php). Message is truncated to 160 characters.
+
+```yaml
+type: sms_voipms
+api_user: me@example.com   # voip.ms account email
+api_password: secret       # voip.ms API password
+did: "5551234567"          # Your voip.ms DID (sending number)
+dst: "5559876543"          # Destination number
+min_level: CRITICAL
+```
+
+### signal
+
+Sends via [signal-cli](https://github.com/AsamK/signal-cli).
+
+```yaml
+type: signal
+cli_path: /usr/local/bin/signal-cli
+user: "+12025551234"       # Your registered Signal number
+recipient: "+12025559999"  # Recipient number
+min_level: WARNING
+```
+
+**Setup:**
+```bash
+signal-cli -u +12025551234 register
+signal-cli -u +12025551234 verify CODE
+```
+
+### mattermost
+
+Sends via Mattermost incoming webhook. Message is formatted as Markdown.
+
+```yaml
+type: mattermost
+host: mattermost.example.com
+token: your-webhook-token
+channel: devops-alerts
+username: heartbeat-bot    # Optional: display name
+icon: https://…/icon.png   # Optional: bot icon URL
+min_level: WARNING
+```
+
+## Notification events
+
+| Source | Level | Title example | Body example |
+|---|---|---|---|
+| Host overdue | CRITICAL | `[CRITICAL] webserver01` | `IPv4 overdue` |
+| Host recover | RECOVER | `[RECOVER] webserver01` | `IPv4 back after being overdue for 5:23` |
+| Host boot | INFO | `[INFO] webserver01` | `webserver01 booted` |
+| Host shutdown | INFO | `[INFO] webserver01` | `IPv4 shutdown` |
+| Threshold breach | WARNING/CRITICAL | `[CRITICAL] webserver01` | `cpu_percent = 95.2 (threshold: > 90.0)` |
+| Threshold reminder | CRITICAL | `[REMINDER/CRITICAL] webserver01` | `REMINDER (CRITICAL): … ongoing for 1h 5m 30s` |
+| Connection issue | WARNING | `[WARNING] webserver01` | `new address detected …` |
+
+Reminder notifications (re-notify) are sent only for CRITICAL level alerts.
+
+## API reference
+
+### `send_notification(host_name, notif) -> dict`
+
+Main entry point. Dispatches to owner + managers.
+
+```python
+from hbd.server.notify import send_notification, Notification
+
+send_notification(
+    "webserver01",
+    Notification(
+        title="[CRITICAL] webserver01",
+        body="cpu_percent = 95.2 (threshold: > 90.0)",
+        level="CRITICAL",
+        url="https://hbd.example.com/plugins#webserver01",
+    ),
+)
+```
+
+Returns `{channel_name: bool}` for each channel dispatched.
+
+### `setup(cfg, loop=None)`
+
+Called once at startup from `main.py`. Pass the running asyncio event loop so Matrix sends work correctly.
+
+## Troubleshooting
+
+**No notifications sent:**
+- Check that users are configured (`users:` section in yaml)
+- Check that the host has an `owner` or `managers` set
+- Check that users have `notification_channels` listed
+- Check that the channel names in user config match keys under `notification_channels:`
+
+**min_level filtering too aggressive:**
+- Default is `WARNING` — WARNING, CRITICAL, and RECOVER notifications are all sent
+- If a channel is set to `min_level: CRITICAL` but you expected warnings on it, change it to `min_level: WARNING`
+
+**Matrix sends time out:**
+- Verify the access token is valid and the bot is in the room
+- `matrix-nio` must be installed: `pip install matrix-nio`
+
+**voip.ms SMS fails:**
+- Enable the API in your voip.ms account (Account → API)
+- Verify the DID is SMS-capable in your voip.ms account
+
+**Signal not found:**
+- Specify full `cli_path`
+- Run `signal-cli -u +NUMBER receive` to sync trust store
+
+**Email authentication failed:**
+- Use app-specific passwords for Gmail/Fastmail
+- Verify port: 587 for STARTTLS, 465 for SSL
+
+**Pushover `400` errors:**
+- Double-check `token` (app) and `user` (user key) — they are different values
@@ -0,0 +1,567 @@
+# Plugin Development Guide
+
+This guide explains how to create custom plugins for the Heartbeat monitoring system.
+
+## Table of Contents
+
+- [Plugin Architecture](#plugin-architecture)
+- [Plugin Types](#plugin-types)
+- [Creating a Plugin](#creating-a-plugin)
+- [Plugin Lifecycle](#plugin-lifecycle)
+- [Server-initiated InfoPlugin refresh](#server-initiated-infoplugin-refresh)
+- [Configuration](#configuration)
+- [Best Practices](#best-practices)
+- [Examples](#examples)
+- [Testing](#testing)
+
+## Plugin Architecture
+
+Heartbeat's plugin system is designed to be simple yet powerful. Plugins are Python classes that inherit from one of the base plugin types and implement a few key methods.
+
+### Key Concepts
+
+- **Plugin Registry**: Central registry that manages all loaded plugins
+- **Plugin Loader**: Automatically discovers and loads plugins from the `hbd/plugins/` directory
+- **Plugin Types**: InfoPlugin (static data) and MonitorPlugin (periodic metrics)
+- **Async/Await**: All plugin methods are async for non-blocking operation
+
+## Plugin Types
+
+### InfoPlugin
+
+InfoPlugins collect static information that doesn't change frequently (OS version, hardware specs, etc.).
+
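+- **Runs once** at startup (interval = 0), and again only when the server requests a refresh (see [Server-initiated InfoPlugin refresh](#server-initiated-infoplugin-refresh))
+- **Cached** - data is collected once and reused until such a refresh clears the cache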
+- **Lightweight** - no periodic overhead
+
+**Use InfoPlugin for:**
+- Operating system details
+- Hardware information
+- Software versions
+- Configuration data
+- Static inventory
+
+### MonitorPlugin
+
+MonitorPlugins collect metrics that change over time (CPU usage, memory, network traffic).
+
+- **Runs periodically** based on configured interval
+- **Scheduled** - collected at regular intervals
+- **Dynamic** - captures changing system state
+
+**Use MonitorPlugin for:**
+- Resource usage (CPU, memory, disk, network)
+- Performance metrics
+- Counters and gauges
+- Time-series data
+
+## Creating a Plugin
+
+### Step 1: Choose Plugin Type
+
+Decide whether your plugin collects static information (InfoPlugin) or dynamic metrics (MonitorPlugin).
+
+### Step 2: Create Plugin File
+
+Create a new Python file in `hbd/plugins/` directory:
+
+```python
+"""
+My awesome plugin for Heartbeat.
+
+Brief description of what this plugin does.
+"""
+
+import logging
+from typing import Dict, Any, Optional
+
+# Import psutil or other dependencies if needed
+try:
+    import psutil
+except ImportError:
+    psutil = None
+
+from hbd.plugin import MonitorPlugin  # or InfoPlugin
+
+logger = logging.getLogger(__name__)
+
+
+class MyAwesomePlugin(MonitorPlugin):  # or InfoPlugin
+    """
+    One-line description of the plugin.
+    
+    Collects:
+    - List of metrics/data collected
+    - Another metric
+    
+    Configuration:
+        interval: Collection interval in seconds (default: 60)
+        option1: Description of option1 (default: value)
+        option2: Description of option2 (default: value)
+    """
+    
+    name = "my_awesome_plugin"  # Unique plugin name
+    interval = 60  # For MonitorPlugin, use 0 for InfoPlugin
+    
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        """Initialize the plugin with optional configuration."""
+        super().__init__(config)
+        
+        # Extract configuration options
+        self.option1 = self.config.get('option1', 'default_value')
+        self.option2 = self.config.get('option2', True)
+        
+        # Check dependencies
+        if psutil is None:
+            raise ImportError("psutil is required for my_awesome_plugin")
+    
+    async def initialize(self):
+        """
+        Initialize the plugin.
+        
+        This is called once when the plugin is loaded.
+        Use this to verify dependencies, establish connections, etc.
+        
+        Returns:
+            True if initialization successful, False otherwise
+        """
+        logger.info(f"My awesome plugin initialized (option1: {self.option1})")
+        return True
+    
+    async def collect(self) -> Dict[str, Any]:
+        """
+        Collect data.
+        
+        This is called periodically (MonitorPlugin) or once (InfoPlugin).
+        
+        Returns:
+            Dictionary of collected data (will be sent to server)
+        """
+        try:
+            data = await self._collect_metrics()
+            logger.debug(f"Collected {len(data)} metrics")
+            return data
+        except Exception as e:
+            logger.error(f"Error collecting data: {e}")
+            return {"error": str(e)}
+    
+    async def _collect_metrics(self) -> Dict[str, Any]:
+        """Internal method to collect actual metrics."""
+        metrics = {}
+        
+        # Collect your data here
+        metrics['metric1'] = self._get_metric1()
+        metrics['metric2'] = self._get_metric2()
+        
+        return metrics
+    
+    def _get_metric1(self):
+        """Helper method for metric collection."""
+        # Implementation here
+        return 42
+    
+    def _get_metric2(self):
+        """Helper method for metric collection."""
+        # Implementation here
+        return "hello"
+    
+    async def cleanup(self):
+        """
+        Cleanup resources.
+        
+        This is called when the plugin is unloaded or the client shuts down.
+        Use this to close connections, release resources, etc.
+        """
+        logger.info("My awesome plugin cleanup")
+
+
+# Expose the plugin class (not an instance) for automatic discovery
+plugin = MyAwesomePlugin
+```
+
+### Step 3: Test Your Plugin
+
+Create a test script to verify your plugin works:
+
+```python
+#!/usr/bin/env python3
+import asyncio
+import sys
+from pathlib import Path
+
+# Add this script's directory to the import path
+sys.path.insert(0, str(Path(__file__).parent))
+
+from hbd.plugins.my_awesome_plugin import MyAwesomePlugin
+
+async def test():
+    # Create plugin instance
+    plugin = MyAwesomePlugin({'option1': 'test_value'})
+    
+    # Initialize
+    if not await plugin.initialize():
+        print("Failed to initialize")
+        return False
+    
+    # Collect data
+    data = await plugin.collect()
+    print(f"Collected data: {data}")
+    
+    # Cleanup
+    await plugin.cleanup()
+    
+    return True
+
+if __name__ == '__main__':
+    success = asyncio.run(test())
+    sys.exit(0 if success else 1)
+```
+
+## Plugin Lifecycle
+
+Understanding the plugin lifecycle helps you implement plugins correctly:
+
+```
+1. Plugin Discovery
+   └─> Loader scans hbd/plugins/ directory
+   └─> Finds Python files (except those starting with _)
+   └─> Imports modules
+
+2. Plugin Instantiation
+   └─> Creates instance with configuration
+   └─> __init__() is called
+
+3. Plugin Initialization
+   └─> initialize() is called
+   └─> Plugin verifies dependencies, establishes connections
+   └─> Returns True/False for success/failure
+
+4. Plugin Registration
+   └─> If initialization succeeds, plugin is registered
+   └─> Plugin becomes active
+
+5. Data Collection
+   └─> For InfoPlugin: collect() called once after initialization
+   └─> For MonitorPlugin: collect() called periodically based on interval
+   └─> Data is sent to server via PLG message
+
+6. Plugin Shutdown
+   └─> cleanup() is called
+   └─> Plugin releases resources, closes connections
+```
+
+## Server-initiated InfoPlugin refresh
+
+When a heartbeat packet arrives from a host the server has no plugin data for (e.g. after a server restart), the server sets `request_update = 1` in the ACK reply. The client detects this flag and immediately re-runs all InfoPlugins — clearing their cached results first — then resends the data as PLG messages.
+
+This means InfoPlugin data will always reach the server as soon as possible without requiring a client restart. No action is needed from plugin authors: the framework handles cache invalidation and re-collection automatically.
+
+The lifecycle for this case looks like:
+
+```
+Server restarts, host reconnects
+   └─> hbd receives HTB with no existing plugin_data for host
+   └─> hbd sets request_update=1 in ACK
+
+Client receives ACK
+   └─> Detects request_update flag
+   └─> Clears _cache on every registered InfoPlugin
+   └─> Calls collect() on each InfoPlugin
+   └─> Sends fresh PLG messages to server
+```
+
+If you write an `InfoPlugin` with side effects in `_collect_info()` (opening connections, writing files, etc.), be aware it may be called more than once per client session when this mechanism triggers.
+
+## Configuration
+
+### Plugin-Specific Configuration
+
+Plugins receive configuration through the `config` parameter in `__init__`:
+
+```python
+def __init__(self, config: Optional[Dict[str, Any]] = None):
+    super().__init__(config)
+    
+    # Access configuration with defaults
+    self.interval = self.config.get('interval', 60)
+    self.threshold = self.config.get('threshold', 80)
+    self.enabled_features = self.config.get('features', ['feature1', 'feature2'])
+```
+
+### Client Configuration File
+
+Users configure plugins in the client configuration YAML:
+
+```yaml
+plugins:
+  my_awesome_plugin:
+    enabled: true
+    interval: 120
+    option1: custom_value
+    option2: false
+```
+
+## Best Practices
+
+### 1. Error Handling
+
+Always handle errors gracefully:
+
+```python
+async def collect(self) -> Dict[str, Any]:
+    try:
+        return await self._collect_metrics()
+    except Exception as e:
+        logger.error(f"Error collecting metrics: {e}")
+        return {"error": str(e)}
+```
+
+### 2. Logging
+
+Use appropriate log levels:
+
+```python
+logger.debug("Detailed information for debugging")
+logger.info("Normal operation messages")
+logger.warning("Warning messages for unusual but handled situations")
+logger.error("Error messages for failures")
+```
+
+### 3. Dependencies
+
+Check for optional dependencies:
+
+```python
+try:
+    import some_optional_library
+except ImportError:
+    some_optional_library = None
+
+# Later in __init__:
+if some_optional_library is None:
+    raise ImportError("some_optional_library is required")
+```
+
+### 4. Performance
+
+- Keep collection methods fast (< 1 second)
+- Use async/await for I/O operations
+- Cache expensive computations
+- Don't block the event loop
+
+### 5. Data Structure
+
+Return clean, structured data:
+
+```python
+{
+    'metric_name': value,
+    'nested_data': {
+        'sub_metric': value
+    },
+    'list_data': [item1, item2],
+    'timestamp': time.time()  # Optional timestamp
+}
+```
+
+### 6. Documentation
+
+Document your plugin thoroughly:
+
+- Class docstring with description and configuration
+- Method docstrings explaining purpose and return values
+- Inline comments for complex logic
+
+## Examples
+
+### Example 1: Simple InfoPlugin
+
+```python
+from hbd.plugin import InfoPlugin
+from typing import Any, Dict
+import platform
+
+class SimpleInfoPlugin(InfoPlugin):
+    """Collect basic system information."""
+    
+    name = "simple_info"
+    interval = 0  # InfoPlugin
+    
+    async def initialize(self):
+        return True
+    
+    async def collect(self) -> Dict[str, Any]:
+        return {
+            'hostname': platform.node(),
+            'system': platform.system(),
+            'python_version': platform.python_version()
+        }
+    
+    async def cleanup(self):
+        pass
+
+plugin = SimpleInfoPlugin
+```
+
+### Example 2: MonitorPlugin with State
+
+```python
+from hbd.plugin import MonitorPlugin
+from typing import Any, Dict
+import time
+
+class CounterPlugin(MonitorPlugin):
+    """Track a counter over time."""
+    
+    name = "counter"
+    interval = 30
+    
+    def __init__(self, config=None):
+        super().__init__(config)
+        self._counter = 0
+        self._start_time = time.time()
+    
+    async def initialize(self):
+        return True
+    
+    async def collect(self) -> Dict[str, Any]:
+        self._counter += 1
+        uptime = time.time() - self._start_time
+        
+        return {
+            'count': self._counter,
+            'uptime': uptime,
+            'rate': self._counter / uptime
+        }
+    
+    async def cleanup(self):
+        pass
+
+plugin = CounterPlugin
+```
+
+### Example 3: Plugin with External Command
+
+```python
+from hbd.plugin import MonitorPlugin
+from typing import Any, Dict
+import asyncio
+
+class CommandPlugin(MonitorPlugin):
+    """Execute external command and capture output."""
+    
+    name = "command_executor"
+    interval = 60
+    
+    def __init__(self, config=None):
+        super().__init__(config)
+        self.command = self.config.get('command', 'echo "no command"')
+    
+    async def initialize(self):
+        return True
+    
+    async def collect(self) -> Dict[str, Any]:
+        try:
+            process = await asyncio.create_subprocess_shell(
+                self.command,
+                stdout=asyncio.subprocess.PIPE,
+                stderr=asyncio.subprocess.PIPE
+            )
+            stdout, stderr = await asyncio.wait_for(
+                process.communicate(),
+                timeout=30
+            )
+            
+            return {
+                'exit_code': process.returncode,
+                'stdout': stdout.decode('utf-8'),
+                'stderr': stderr.decode('utf-8')
+            }
+        except Exception as e:
+            return {'error': str(e)}
+    
+    async def cleanup(self):
+        pass
+
+plugin = CommandPlugin
+```
+
+## Testing
+
+### Unit Testing
+
+Create unit tests for your plugins:
+
+```python
+import unittest
+import asyncio
+
+from hbd.plugins.my_awesome_plugin import MyAwesomePlugin
+
+class TestMyPlugin(unittest.TestCase):
+    def setUp(self):
+        self.plugin = MyAwesomePlugin({'option1': 'test'})
+    
+    def test_initialization(self):
+        result = asyncio.run(self.plugin.initialize())
+        self.assertTrue(result)
+    
+    def test_collection(self):
+        asyncio.run(self.plugin.initialize())
+        data = asyncio.run(self.plugin.collect())
+        
+        self.assertIsInstance(data, dict)
+        self.assertIn('metric1', data)
+        self.assertGreater(data['metric1'], 0)
+    
+    def tearDown(self):
+        asyncio.run(self.plugin.cleanup())
+
+if __name__ == '__main__':
+    unittest.main()
+```
+
+### Integration Testing
+
+Test your plugin with the actual client:
+
+```bash
+# Create test configuration
+cat > test_config.yaml <<EOF
+server: localhost
+plugins:
+  my_awesome_plugin:
+    enabled: true
+    interval: 10
+    option1: test_value
+EOF
+
+# Run client in test mode
+python -m hbd.hbc -c test_config.yaml --verbose
+```
+
+## Troubleshooting
+
+### My plugin isn't loading
+
+1. Check filename doesn't start with underscore
+2. Verify plugin class inherits from InfoPlugin or MonitorPlugin
+3. Check `initialize()` returns True
+4. Look for import errors in logs
+
+### Plugin loads but doesn't collect data
+
+1. Check `interval` is set correctly (0 for InfoPlugin, > 0 for MonitorPlugin)
+2. Verify `collect()` returns a dictionary
+3. Check for exceptions in `collect()` method
+4. Enable DEBUG logging to see detailed errors
+
+### Data isn't appearing on server
+
+1. Verify client is connected to server
+2. Check server logs for PLG message handling
+3. Verify returned data is JSON-serializable
+4. Check for large data sizes (may exceed UDP packet size)
+
+## Further Reading
+
+- [Plugin Framework Source](../hbd/plugin.py) - Core plugin implementation
+- [Built-in Plugins](../hbd/plugins/) - Examples of working plugins
+- [Nagios Integration](NAGIOS_INTEGRATION.md) - Running external plugins
+- [Configuration Guide](../hbd/config_example.yaml) - Full configuration reference
@@ -0,0 +1,260 @@
+# User Management
+
+Heartbeat supports optional user accounts with role-based access control per host. When no users are configured the server runs in **unauthenticated mode** — all existing behaviour is unchanged.
+
+---
+
+## Overview
+
+Users are defined in the server config file. Each host can have an **owner**, zero or more **managers**, and zero or more **monitors**. A **default owner** catches any host that does not name an explicit owner.
+
+### Roles
+
+| Role | Inherits | Permissions |
+|------|----------|-------------|
+| **monitor** | — | View host status, plugin data, alerts; acknowledge alerts they were notified for |
+| **manager** | monitor | + Queue commands (`/c`), trigger DNS re-registration (`/n`), queue upgrades (`/u`); add/remove monitors |
+| **owner** | manager | + Drop host (`/d`); add/remove managers; transfer ownership; update host access |
+| **admin** *(flag)* | owner on all hosts | Full access to every host and the user list |
+
+`admin` is a flag on the user, not a per-host role. An admin user has owner-level access on every host without being listed as owner/manager/monitor.
+
+---
+
+## Configuration
+
+### Defining users
+
+```yaml
+users:
+  andreas:
+    full_name: Andreas Wrede
+    avatar: /path/to/avatar.png   # file path, URL, or base64 data URI (optional)
+    password: pbkdf2:sha256:...   # generated with: hbd passwd andreas
+    admin: true                   # optional — grants server-wide owner access
+
+  bob:
+    full_name: Bob Smith
+    password: pbkdf2:sha256:...
+    notification_channels: [pushover_standard]
+
+  carol:
+    full_name: Carol Jones
+    password: pbkdf2:sha256:...
+
+default_owner: andreas            # owns hosts with no explicit owner
+                                  # falls back to the first admin user if omitted
+```
+
+### Client-declared host ownership
+
+A host can declare its own owner directly in the hbc or hbc_mini client configuration. This is useful for hosts that are not listed in the server config, or during initial setup before a server-side config entry has been created.
+
+**`~/.hbc.yaml`** (hbc):
+```yaml
+owner: andreas
+```
+
+**`~/.hbc.json`** (hbc_mini):
+```json
+{ "owner": "andreas" }
+```
+
+When set, the value is included in the `os_info` plugin data sent to the server. The server applies it as `host.owner` the first time `os_info` arrives, provided no owner has been configured server-side for that host. Server-configured ownership always takes precedence.
+
+---
+
+### Assigning roles to hosts
+
+```yaml
+hosts:
+  webserver01:
+    owner: andreas
+    managers: [bob]
+    monitors: [carol]
+    threshold_config: default
+    watch: true
+    notification_channels: [pushover_standard]
+
+  unattended-host:              # no owner → owned by default_owner
+    threshold_config: default
+    watch: true
+```
+
+### Generating a password hash
+
+```bash
+hbd passwd andreas
+```
+
+Enter and confirm the password when prompted. Paste the printed hash into the config file under the user's `password` key.
+
+You can also generate a hash non-interactively from Python:
+
+```python
+from hbd.server.users import hash_password
+print(hash_password("mysecret"))
+```
+
+Passwords are stored as PBKDF2-HMAC-SHA256 hashes (260 000 iterations). No third-party libraries are required — only Python's standard `hashlib`.
+
+---
+
+## Authentication
+
+When at least one user is defined, every request must be authenticated. Unauthenticated requests to HTML pages are redirected to `/login`; unauthenticated API requests receive `401 Unauthorized`.
+
+### Browser login
+
+Navigate to any page — you will be redirected to `/login` automatically. After submitting valid credentials the server sets an `hbd_session` cookie (HttpOnly, SameSite=Lax, 24 h lifetime). All subsequent requests, including JavaScript `fetch()` calls on the dashboards, carry the cookie automatically.
+
+To log out, visit `/logout`.
+
+### API / programmatic login
+
+```bash
+# Log in and capture the token
+TOKEN=$(curl -s -X POST http://localhost:50004/api/0/auth/login \
+  -H 'Content-Type: application/json' \
+  -d '{"username":"andreas","password":"mysecret"}' | jq -r .token)
+
+# Use the token in subsequent requests
+curl -H "Authorization: Bearer $TOKEN" http://localhost:50004/api/0/hosts
+```
+
+The token is identical to the session cookie value — both mechanisms work simultaneously.
+
+```bash
+# Log out
+curl -s -X POST http://localhost:50004/api/0/auth/logout \
+  -H "Authorization: Bearer $TOKEN"
+```
+
+---
+
+## API Endpoints
+
+### Authentication
+
+#### POST /api/0/auth/login
+Obtain a session token.
+
+**Request body:**
+```json
+{ "username": "andreas", "password": "mysecret" }
+```
+
+**Response:**
+```json
+{ "token": "<opaque-hex-token>", "username": "andreas" }
+```
+Also sets the `hbd_session` cookie for browser clients.
+
+**Status codes:** `200 OK`, `401 Unauthorized`, `404` (auth not configured)
+
+---
+
+#### POST /api/0/auth/logout
+Invalidate the current session.
+
+**Headers:** `Authorization: Bearer <token>` or cookie
+
+**Response:** `{ "success": true }`
+
+---
+
+### Users
+
+#### GET /api/0/users
+List all users. **Admin only.**
+
+**Response:**
+```json
+[
+  { "username": "andreas", "full_name": "Andreas Wrede", "avatar": "", "admin": true, "notification_channels": [] },
+  { "username": "bob",     "full_name": "Bob Smith",     "avatar": "", "admin": false, "notification_channels": ["pushover_standard"] }
+]
+```
+
+---
+
+#### GET /api/0/users/me
+Return the currently authenticated user's profile.
+
+**Response:**
+```json
+{ "username": "carol", "full_name": "Carol Jones", "avatar": "", "admin": false, "notification_channels": [] }
+```
+
+---
+
+### Host Access
+
+#### GET /api/0/hosts/{hostname}/access
+Return owner/managers/monitors for a host. Requires at least **monitor** role.
+
+**Response:**
+```json
+{
+  "owner": "andreas",
+  "managers": ["bob"],
+  "monitors": ["carol"]
+}
+```
+
+---
+
+#### PUT /api/0/hosts/{hostname}/access
+Update owner/managers/monitors. Requires **owner** role or admin.
+
+**Request body** (all fields optional):
+```json
+{
+  "owner": "bob",
+  "managers": ["carol"],
+  "monitors": []
+}
+```
+
+Changes take effect immediately in memory. They are not written back to the config file — reload (`SIGHUP`) will re-apply config values. To make changes permanent, update the config file.
+
+---
+
+## Host visibility
+
+When users are configured, `GET /api/0/hosts` only returns hosts the authenticated user has at least monitor access to. Admins see all hosts.
+
+---
+
+## Config reload
+
+On `SIGHUP`, the server reloads the config file, re-loads the user registry, and re-applies `owner`/`managers`/`monitors` from config to all known hosts. Existing sessions remain valid after a reload.
+
+---
+
+## No-auth mode
+
+If `users:` is absent or empty, the server starts in **unauthenticated mode**:
+
+- No login required — all pages and API endpoints are accessible without credentials.
+- All permission checks pass unconditionally.
+- `/login`, `/logout`, and the auth/user API endpoints return `404`.
+
+This preserves full backwards compatibility with existing deployments.
+
+---
+
+## Security notes
+
+- Session tokens are 64-character cryptographically random hex strings (`secrets.token_hex(32)`).
+- Sessions expire after 24 hours (configurable via `users_mod.SESSION_TTL`).
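+- Cookies are `HttpOnly` and `SameSite=Lax` — they are not accessible to JavaScript and are not sent on cross-site subrequests (e.g. `fetch()` calls, images, or form POSTs from other sites); they are still sent on top-level navigations to hbd.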
+- The HTTP API does not yet enforce TLS. For production use, place hbd behind a TLS-terminating reverse proxy (nginx, Caddy, etc.) or enable WSS.
+
+---
+
+## See Also
+
+- [HTTP API Documentation](HTTP_API.md)
+- [Notifications](NOTIFICATIONS.md)
+- Configuration example: `hbd/config_example.yaml`
@@ -0,0 +1,602 @@
+# Plugin Error Checking Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Improve plugin error checking in hbc, especially for nagios_runner, and fix logger messages silently discarded in daemon mode.
+
+**Architecture:** Three focused changes across three files: (1) `hbd/client/plugin.py` gains a `skip_reason` attribute on Plugin and updated PluginLoader messaging; (2) `hbd/client/plugins/nagios_runner.py` gains async subprocess execution, stderr capture, signal-killed process handling, and init-time command path validation; (3) `hbd/client/main.py` gains proper post-fork logging reconfiguration to syslog.
+
+**Tech Stack:** Python 3.11+, asyncio, `logging.handlers.SysLogHandler`, pytest
+
+---
+
+## File Map
+
+| Action | Path | What changes |
+|---|---|---|
+| Modify | `hbd/client/plugin.py` | `Plugin.__init__` gains `skip_reason`; `PluginLoader` checks it |
+| Modify | `hbd/client/plugins/nagios_runner.py` | async subprocess, stderr, signal codes, init validation, `skip_reason` |
+| Modify | `hbd/client/main.py` | `_reconfigure_logging_for_daemon()` helper; remove redundant syslog calls |
+| Create | `tests/test_plugin.py` | PluginLoader messaging tests |
+| Create | `tests/test_nagios_runner.py` | NagiosRunnerPlugin behaviour tests |
+
+Run tests throughout with:
+```bash
+python -m pytest tests/test_plugin.py tests/test_nagios_runner.py -v
+```
+
+---
+
+## Task 1: Plugin.skip_reason + PluginLoader messaging
+
+**Files:**
+- Modify: `hbd/client/plugin.py:40-48` (Plugin.__init__)
+- Modify: `hbd/client/plugin.py:369-381` (PluginLoader.load_from_directory)
+- Create: `tests/test_plugin.py`
+
+- [ ] **Step 1: Write failing tests**
+
+Create `tests/test_plugin.py`:
+
+```python
+import asyncio
+import logging
+import textwrap
+
+from hbd.client.plugin import Plugin, PluginLoader, PluginRegistry
+
+
+def test_plugin_skip_reason_defaults_none(tmp_path):
+    plugin_code = textwrap.dedent("""
+        from hbd.client.plugin import MonitorPlugin
+
+        class MinimalPlugin(MonitorPlugin):
+            name = "minimal"
+            version = "1.0.0"
+            interval = 60
+
+            async def initialize(self):
+                return True
+
+            async def _collect_metrics(self):
+                return {}
+    """)
+    (tmp_path / "minimal.py").write_text(plugin_code)
+    registry = PluginRegistry()
+    loader = PluginLoader(registry)
+    asyncio.run(loader.load_from_directory(tmp_path))
+    plugin = registry.get("minimal")
+    assert plugin is not None
+    assert plugin.skip_reason is None
+
+
+def test_loader_logs_info_when_skip_reason_set(tmp_path, caplog):
+    plugin_code = textwrap.dedent("""
+        from hbd.client.plugin import MonitorPlugin
+
+        class SkippablePlugin(MonitorPlugin):
+            name = "skippable"
+            version = "1.0.0"
+            interval = 60
+
+            async def initialize(self):
+                self.skip_reason = "not configured in yaml"
+                return False
+
+            async def _collect_metrics(self):
+                return {}
+    """)
+    (tmp_path / "skippable.py").write_text(plugin_code)
+    registry = PluginRegistry()
+    loader = PluginLoader(registry)
+
+    with caplog.at_level(logging.INFO, logger="plugin.loader"):
+        count = asyncio.run(loader.load_from_directory(tmp_path))
+
+    assert count == 0
+    assert any("skipped: not configured in yaml" in r.message for r in caplog.records)
+    assert not any("failed initialization" in r.message for r in caplog.records)
+
+
+def test_loader_logs_warning_when_no_skip_reason(tmp_path, caplog):
+    plugin_code = textwrap.dedent("""
+        from hbd.client.plugin import MonitorPlugin
+
+        class FailPlugin(MonitorPlugin):
+            name = "fail"
+            version = "1.0.0"
+            interval = 60
+
+            async def initialize(self):
+                return False
+
+            async def _collect_metrics(self):
+                return {}
+    """)
+    (tmp_path / "fail_plugin.py").write_text(plugin_code)
+    registry = PluginRegistry()
+    loader = PluginLoader(registry)
+
+    with caplog.at_level(logging.WARNING, logger="plugin.loader"):
+        count = asyncio.run(loader.load_from_directory(tmp_path))
+
+    assert count == 0
+    assert any("failed initialization" in r.message for r in caplog.records)
+```
+
+- [ ] **Step 2: Run tests to verify they fail**
+
+```bash
+python -m pytest tests/test_plugin.py -v
+```
+Expected: `test_plugin_skip_reason_defaults_none` FAILS (attribute missing), others may error.
+
+- [ ] **Step 3: Add `skip_reason` to `Plugin.__init__`**
+
+In `hbd/client/plugin.py`, in `Plugin.__init__` (around line 46), add one line:
+
+```python
+def __init__(self, config: Optional[Dict[str, Any]] = None):
+    self.config = config or {}
+    self.logger = logging.getLogger(f"plugin.{self.name}")
+    self._initialized = False
+    self.skip_reason: Optional[str] = None
+```
+
+- [ ] **Step 4: Update PluginLoader messaging**
+
+In `hbd/client/plugin.py`, replace the `if not initialized:` block (around line 372):
+
+```python
+                    if not initialized:
+                        if plugin.skip_reason:
+                            self.logger.info(
+                                f"Plugin {plugin.name} skipped: {plugin.skip_reason}"
+                            )
+                        else:
+                            self.logger.warning(
+                                f"Plugin {plugin.name} failed initialization, skipping"
+                            )
+                        continue
+```
+
+- [ ] **Step 5: Run tests to verify they pass**
+
+```bash
+python -m pytest tests/test_plugin.py -v
+```
+Expected: all 3 tests PASS.
+
+- [ ] **Step 6: Commit**
+
+```bash
+git add hbd/client/plugin.py tests/test_plugin.py
+git commit -m "feat: add skip_reason to Plugin; improve PluginLoader init messaging"
+```
+
+---
+
+## Task 2: NagiosRunnerPlugin — skip_reason when no commands
+
+**Files:**
+- Modify: `hbd/client/plugins/nagios_runner.py:88-105` (initialize)
+- Create: `tests/test_nagios_runner.py`
+
+- [ ] **Step 1: Write failing test**
+
+Create `tests/test_nagios_runner.py`:
+
+```python
+import asyncio
+import logging
+import os
+import stat
+
+import pytest
+
+from hbd.client.plugins.nagios_runner import (
+    NagiosRunnerPlugin,
+    NAGIOS_OK,
+    NAGIOS_WARNING,
+    NAGIOS_CRITICAL,
+    NAGIOS_UNKNOWN,
+)
+
+
+def test_no_commands_sets_skip_reason():
+    plugin = NagiosRunnerPlugin(config={"commands": []})
+    result = asyncio.run(plugin.initialize())
+    assert result is False
+    assert plugin.skip_reason is not None
+    assert "nagios_runner.commands" in plugin.skip_reason
+```
+
+- [ ] **Step 2: Run test to verify it fails**
+
+```bash
+python -m pytest tests/test_nagios_runner.py::test_no_commands_sets_skip_reason -v
+```
+Expected: FAIL — `plugin.skip_reason` is `None`.
+
+- [ ] **Step 3: Set skip_reason in NagiosRunnerPlugin.initialize()**
+
+In `hbd/client/plugins/nagios_runner.py`, replace the early-return block in `initialize()` (around line 96):
+
+```python
+        if not self.commands:
+            self.skip_reason = "no commands configured (add nagios_runner.commands to config)"
+            self.logger.info("No Nagios commands configured")
+            return False
+```
+
+- [ ] **Step 4: Run test to verify it passes**
+
+```bash
+python -m pytest tests/test_nagios_runner.py::test_no_commands_sets_skip_reason -v
+```
+Expected: PASS.
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add hbd/client/plugins/nagios_runner.py tests/test_nagios_runner.py
+git commit -m "feat: set skip_reason on nagios_runner when no commands configured"
+```
+
+---
+
+## Task 3: NagiosRunnerPlugin — async subprocess, stderr capture, negative return codes
+
+**Files:**
+- Modify: `hbd/client/plugins/nagios_runner.py` (imports + `_run_nagios_plugin`)
+- Modify: `tests/test_nagios_runner.py`
+
+- [ ] **Step 1: Write failing tests**
+
+Append to `tests/test_nagios_runner.py`:
+
+```python
+def test_stderr_used_when_stdout_empty(tmp_path):
+    script = tmp_path / "check_err.sh"
+    script.write_text("#!/bin/sh\necho 'error from stderr' >&2\nexit 2\n")
+    script.chmod(script.stat().st_mode | stat.S_IEXEC)
+
+    config = {"commands": [{"name": "t", "command": str(script)}], "timeout": 5}
+    plugin = NagiosRunnerPlugin(config=config)
+    asyncio.run(plugin.initialize())
+    data = asyncio.run(plugin._collect_metrics())
+
+    assert "error from stderr" in data["t_output"]
+    assert data["t_status_code"] == NAGIOS_CRITICAL
+
+
+def test_stderr_appended_when_both_present(tmp_path):
+    script = tmp_path / "check_both.sh"
+    script.write_text("#!/bin/sh\necho 'OK - all good'\necho 'extra detail' >&2\nexit 0\n")
+    script.chmod(script.stat().st_mode | stat.S_IEXEC)
+
+    config = {"commands": [{"name": "t", "command": str(script)}], "timeout": 5}
+    plugin = NagiosRunnerPlugin(config=config)
+    asyncio.run(plugin.initialize())
+    data = asyncio.run(plugin._collect_metrics())
+
+    assert "OK - all good" in data["t_output"]
+    assert "extra detail" in data["t_output"]
+    assert data["t_status_code"] == NAGIOS_OK
+
+
+def test_negative_returncode_maps_to_unknown():
+    # kill -9 $$ kills the shell itself; asyncio sees returncode -9
+    config = {"commands": [{"name": "t", "command": "kill -9 $$"}], "timeout": 5}
+    plugin = NagiosRunnerPlugin(config=config)
+    asyncio.run(plugin.initialize())
+    data = asyncio.run(plugin._collect_metrics())
+
+    assert data["t_status_code"] == NAGIOS_UNKNOWN
+    assert "signal" in data["t_output"].lower()
+```
+
+- [ ] **Step 2: Run tests to verify they fail**
+
+```bash
+python -m pytest tests/test_nagios_runner.py::test_stderr_used_when_stdout_empty \
+    tests/test_nagios_runner.py::test_stderr_appended_when_both_present \
+    tests/test_nagios_runner.py::test_negative_returncode_maps_to_unknown -v
+```
+Expected: all FAIL — current implementation ignores stderr and doesn't handle negative codes.
+
+- [ ] **Step 3: Update imports in nagios_runner.py**
+
+Replace the import block at the top of `hbd/client/plugins/nagios_runner.py`:
+
+```python
+import asyncio
+import os
+import re
+from typing import Any, Dict, List, Optional, Tuple
+
+from hbd.client.plugin import MonitorPlugin
+```
+
+(Remove `import subprocess`; add `import asyncio` and `import os`.)
+
+- [ ] **Step 4: Upgrade collection log level from DEBUG to INFO**
+
+In `hbd/client/plugins/nagios_runner.py`, in `_collect_metrics()`, change the debug log (around line 144) so results are visible at INFO level:
+
+```python
+                self.logger.info(
+                    f"Executed {name}: {STATUS_NAMES.get(status_code, 'UNKNOWN')} - {output[:50]}"
+                )
+```
+
+- [ ] **Step 5: Replace `_run_nagios_plugin` with async implementation**
+
+Replace the entire `_run_nagios_plugin` method in `hbd/client/plugins/nagios_runner.py`:
+
+```python
+    async def _run_nagios_plugin(
+        self,
+        command: str
+    ) -> Tuple[int, str, Dict[str, Any]]:
+        """Execute a Nagios plugin and parse its output."""
+        try:
+            proc = await asyncio.create_subprocess_shell(
+                command,
+                stdout=asyncio.subprocess.PIPE,
+                stderr=asyncio.subprocess.PIPE,
+            )
+            try:
+                stdout_bytes, stderr_bytes = await asyncio.wait_for(
+                    proc.communicate(), timeout=self.timeout
+                )
+            except asyncio.TimeoutError:
+                proc.kill()
+                await proc.communicate()
+                self.logger.error(f"Command timed out: {command}")
+                return NAGIOS_UNKNOWN, f"Command timed out after {self.timeout}s", {}
+
+            status_code = proc.returncode
+
+            if status_code < 0:
+                return NAGIOS_UNKNOWN, f"Process killed by signal {-status_code}", {}
+
+            if status_code > 3:
+                status_code = NAGIOS_UNKNOWN
+
+            stdout = stdout_bytes.decode(errors="replace").strip()
+            stderr = stderr_bytes.decode(errors="replace").strip()
+
+            # Parse perfdata from stdout before mixing in stderr
+            perfdata = self._parse_perfdata(stdout)
+
+            # Build status message
+            status_part = stdout.split('|')[0].strip() if '|' in stdout else stdout
+
+            if not stdout and stderr:
+                output_msg = stderr
+            elif stdout and stderr:
+                output_msg = f"{status_part} [stderr: {stderr}]"
+            else:
+                output_msg = status_part
+
+            return status_code, output_msg, perfdata
+
+        except Exception as e:
+            self.logger.error(f"Error executing command: {e}")
+            return NAGIOS_UNKNOWN, f"Execution error: {str(e)}", {}
+```
+
+Also remove the now-unused `self.shell` line from `__init__` (the `shell` config key is no longer used since `create_subprocess_shell` always uses a shell):
+
+In `NagiosRunnerPlugin.__init__`, remove:
+```python
+        self.shell: bool = config.get("shell", True) if config else True
+```
+
+- [ ] **Step 6: Run tests to verify they pass**
+
+```bash
+python -m pytest tests/test_nagios_runner.py -v
+```
+Expected: all tests PASS including the 3 new ones.
+
+- [ ] **Step 7: Commit**
+
+```bash
+git add hbd/client/plugins/nagios_runner.py tests/test_nagios_runner.py
+git commit -m "feat: async subprocess in nagios_runner with stderr capture and signal handling"
+```
+
+---
+
+## Task 4: NagiosRunnerPlugin — command path validation at init
+
+**Files:**
+- Modify: `hbd/client/plugins/nagios_runner.py` (initialize)
+- Modify: `tests/test_nagios_runner.py`
+
+- [ ] **Step 1: Write failing tests**
+
+Append to `tests/test_nagios_runner.py`:
+
+```python
+def test_absolute_path_not_found_warns(caplog):
+    fake_cmd = "/nonexistent_hbc_test_path/check_something"
+    config = {"commands": [{"name": "t", "command": fake_cmd}]}
+    plugin = NagiosRunnerPlugin(config=config)
+
+    with caplog.at_level(logging.WARNING, logger="plugin.nagios_runner"):
+        asyncio.run(plugin.initialize())
+
+    assert any("not found" in r.message for r in caplog.records)
+
+
+def test_absolute_path_not_executable_warns(caplog, tmp_path):
+    non_exec = tmp_path / "check_test"
+    non_exec.write_text("#!/bin/sh\necho OK\n")
+    non_exec.chmod(0o644)  # readable but not executable
+
+    config = {"commands": [{"name": "t", "command": str(non_exec)}]}
+    plugin = NagiosRunnerPlugin(config=config)
+
+    with caplog.at_level(logging.WARNING, logger="plugin.nagios_runner"):
+        asyncio.run(plugin.initialize())
+
+    assert any("not executable" in r.message for r in caplog.records)
+
+
+def test_relative_path_not_checked(caplog):
+    # Relative paths (resolved via PATH) must not generate warnings
+    config = {"commands": [{"name": "t", "command": "echo OK"}]}
+    plugin = NagiosRunnerPlugin(config=config)
+
+    with caplog.at_level(logging.WARNING, logger="plugin.nagios_runner"):
+        asyncio.run(plugin.initialize())
+
+    assert not any(
+        "not found" in r.message or "not executable" in r.message
+        for r in caplog.records
+    )
+```
+
+- [ ] **Step 2: Run tests to verify they fail**
+
+```bash
+python -m pytest tests/test_nagios_runner.py::test_absolute_path_not_found_warns \
+    tests/test_nagios_runner.py::test_absolute_path_not_executable_warns \
+    tests/test_nagios_runner.py::test_relative_path_not_checked -v
+```
+Expected: `test_absolute_path_not_found_warns` and `test_absolute_path_not_executable_warns` FAIL (no warnings logged); `test_relative_path_not_checked` may pass.
+
+- [ ] **Step 3: Add command path validation to `initialize()`**
+
+In `hbd/client/plugins/nagios_runner.py`, extend `initialize()` by adding validation after the existing "log each command" loop (after line 103, before `return True`):
+
+```python
+        # Validate absolute command paths early
+        for cmd_config in self.commands:
+            name = cmd_config.get("name", "unnamed")
+            command = cmd_config.get("command", "")
+            if not command:
+                continue
+            exe = command.split()[0]
+            if os.path.isabs(exe):
+                if not os.path.isfile(exe):
+                    self.logger.warning(
+                        f"Command '{name}': executable not found: {exe}"
+                    )
+                elif not os.access(exe, os.X_OK):
+                    self.logger.warning(
+                        f"Command '{name}': executable not executable: {exe}"
+                    )
+```
+
+- [ ] **Step 4: Run full test suite to verify all pass**
+
+```bash
+python -m pytest tests/test_plugin.py tests/test_nagios_runner.py -v
+```
+Expected: all tests PASS.
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add hbd/client/plugins/nagios_runner.py tests/test_nagios_runner.py
+git commit -m "feat: validate absolute command paths at nagios_runner init"
+```
+
+---
+
+## Task 5: Daemon mode logging — route to syslog after fork
+
+**Files:**
+- Modify: `hbd/client/main.py` (new helper + updated daemon block)
+
+No automated test for daemonization itself (fork behaviour is hard to unit-test). Manual verification steps are provided below.
+
+- [ ] **Step 1: Add `_reconfigure_logging_for_daemon` helper**
+
+In `hbd/client/main.py`, add this function just before `def build_parser()` (around line 589):
+
+```python
+def _reconfigure_logging_for_daemon(log_level: int) -> None:
+    """Replace StreamHandlers (now writing to /dev/null) with a SysLogHandler."""
+    from logging.handlers import SysLogHandler
+
+    root = logging.getLogger()
+    for handler in root.handlers[:]:
+        root.removeHandler(handler)
+        handler.close()
+
+    try:
+        syslog_handler = SysLogHandler(
+            address="/dev/log",
+            facility=SysLogHandler.LOG_DAEMON,
+        )
+    except OSError:
+        syslog_handler = SysLogHandler(
+            address=("localhost", 514),
+            facility=SysLogHandler.LOG_DAEMON,
+        )
+        # Attach the fallback first so the warning reaches syslog
+        syslog_handler.setFormatter(
+            logging.Formatter("hbc[%(process)d]: %(name)s %(levelname)s: %(message)s")
+        )
+        root.addHandler(syslog_handler)
+        root.setLevel(log_level)
+        logging.warning("/dev/log not found, using syslog UDP localhost:514")
+        return
+
+    syslog_handler.setFormatter(
+        logging.Formatter("hbc[%(process)d]: %(name)s %(levelname)s: %(message)s")
+    )
+    root.addHandler(syslog_handler)
+    root.setLevel(log_level)
+```
+
+- [ ] **Step 2: Update the daemon block in `main()`**
+
+In `hbd/client/main.py`, replace the entire `if args.daemon:` block (lines 664–675):
+
+```python
+    if args.daemon:
+        print("Daemonizing...")
+        daemonize()
+        _reconfigure_logging_for_daemon(log_level)
+        logging.info(f"hbc starting, sending heartbeat to {', '.join(args.hosts)}")
+```
+
+This removes the `import syslog`, `syslog.openlog()`, and `syslog.syslog()` calls (now handled by the logging system) and removes the no-op second `logging.basicConfig()` call.
+
+- [ ] **Step 3: Run existing test suite to confirm no regressions**
+
+```bash
+python -m pytest tests/test_plugin.py tests/test_nagios_runner.py -v
+```
+Expected: all tests still PASS.
+
+- [ ] **Step 4: Manual smoke test — verify syslog output in daemon mode**
+
+```bash
+# In one terminal, tail syslog
+sudo journalctl -f -t hbc
+
+# In another terminal, start hbc in daemon mode (replace localhost with a real or dummy host)
+python -m hbd.client.main -d -v localhost
+
+# Expected in journalctl output:
+#   hbc[<pid>]: hbc.main INFO: Starting hbc for <hostname> -> ['localhost']
+#   hbc[<pid>]: hbc.main INFO: hbc starting, sending heartbeat to localhost
+#   hbc[<pid>]: plugin.loader INFO: ...
+
+# Stop the daemon
+pkill -f "hbd.client.main"
+```
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add hbd/client/main.py
+git commit -m "fix: reconfigure logging to syslog after daemonize() instead of no-op basicConfig"
+```
@@ -0,0 +1,781 @@
+# Gitea OAuth2 Authentication Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Add Gitea as an OAuth2 login provider that coexists with password auth, auto-provisioning new users on first login.
+
+**Architecture:** A new `oauth.py` module owns all Gitea-specific logic (CSRF state, URL building, token exchange, user-info fetch). `users.py` gains one function to upsert an OAuth-sourced user. `http.py` gets two new route handlers and a small login-page change. No new dependencies — `aiohttp.ClientSession` is already used in the codebase.
+
+**Tech Stack:** Python 3.12, aiohttp 3.x, pytest, pytest-asyncio
+
+---
+
+## File Map
+
+| Action | Path | Responsibility |
+|--------|------|----------------|
+| Modify | `hbd/server/config.py` | Add `"oauth": {}` default |
+| Create | `hbd/server/oauth.py` | CSRF state, URL builder, token exchange, user-info fetch |
+| Modify | `hbd/server/users.py` | Add `provision_oauth_user()` |
+| Modify | `hbd/server/http.py` | Import oauth, two new routes, login page button |
+| Create | `tests/test_oauth.py` | All new unit tests |
+
+---
+
+## Task 1: Add config default and `is_enabled()`
+
+**Files:**
+- Modify: `hbd/server/config.py:35` (after the `"default_owner"` line)
+- Create: `hbd/server/oauth.py`
+- Create: `tests/test_oauth.py`
+
+- [ ] **Step 1: Write the failing test**
+
+Create `tests/test_oauth.py`:
+
+```python
+import pytest
+from hbd.server import oauth
+
+
+CFG_OFF = {}
+CFG_ON = {
+    "oauth": {
+        "gitea": {
+            "url": "https://git.example.com",
+            "client_id": "cid",
+            "client_secret": "csec",
+        }
+    }
+}
+CFG_PARTIAL = {"oauth": {"gitea": {"url": "https://git.example.com"}}}
+
+
+def test_is_enabled_when_all_keys_present():
+    assert oauth.is_enabled(CFG_ON) is True
+
+
+def test_is_enabled_false_when_no_oauth_key():
+    assert oauth.is_enabled(CFG_OFF) is False
+
+
+def test_is_enabled_false_when_partial_config():
+    assert oauth.is_enabled(CFG_PARTIAL) is False
+```
+
+- [ ] **Step 2: Run to confirm failure**
+
+```
+pytest tests/test_oauth.py -v
+```
+
+Expected: `ModuleNotFoundError: No module named 'hbd.server.oauth'`
+
+- [ ] **Step 3: Add config default**
+
+In `hbd/server/config.py`, add after the `"default_owner"` line (currently line 35):
+
+```python
+    # OAuth2 providers
+    "oauth": {},                 # oauth.gitea.{url,client_id,client_secret}
+```
+
+- [ ] **Step 4: Create `hbd/server/oauth.py` with `is_enabled`**
+
+```python
+"""Gitea OAuth2 support.
+
+Config shape (in ~/.hb.yaml):
+
+    oauth:
+      gitea:
+        url: https://git.example.com
+        client_id: <client-id>
+        client_secret: <client-secret>
+
+Register a Gitea OAuth2 application at:
+  Gitea → Settings → Applications → OAuth2
+Set the redirect URI to:
+  https://<hbd-host>/login/oauth/gitea/callback
+"""
+
+import logging
+import secrets
+import time
+
+import aiohttp
+
+logger = logging.getLogger(__name__)
+
+STATE_TTL = 600  # 10 minutes
+
+# state_token -> expiry timestamp
+_states: dict[str, float] = {}
+
+
+class OAuthError(Exception):
+    """Raised when the OAuth2 flow fails for any reason."""
+
+
+def _gitea_cfg(config: dict) -> dict:
+    """Return the gitea sub-dict, or {} if absent (completeness is checked by is_enabled)."""
+    return config.get("oauth", {}).get("gitea", {})
+
+
+def is_enabled(config: dict) -> bool:
+    """Return True when all three required Gitea OAuth keys are present."""
+    g = _gitea_cfg(config)
+    return bool(g.get("url") and g.get("client_id") and g.get("client_secret"))
+```
+
+- [ ] **Step 5: Run to confirm tests pass**
+
+```
+pytest tests/test_oauth.py -v
+```
+
+Expected: 3 passed
+
+- [ ] **Step 6: Commit**
+
+```bash
+git add hbd/server/config.py hbd/server/oauth.py tests/test_oauth.py
+git commit -m "feat: add oauth module skeleton and is_enabled()"
+```
+
+---
+
+## Task 2: CSRF state management
+
+**Files:**
+- Modify: `hbd/server/oauth.py` (add `make_state`, `validate_state`)
+- Modify: `tests/test_oauth.py` (add state tests)
+
+- [ ] **Step 1: Write the failing tests**
+
+Append to `tests/test_oauth.py`:
+
+```python
+import time as time_mod
+
+
+def test_make_state_returns_unique_tokens():
+    s1 = oauth.make_state()
+    s2 = oauth.make_state()
+    assert s1 != s2
+    assert len(s1) == 64  # 32 bytes hex
+
+
+def test_validate_state_valid():
+    state = oauth.make_state()
+    assert oauth.validate_state(state) is True
+
+
+def test_validate_state_consumed_on_use():
+    state = oauth.make_state()
+    oauth.validate_state(state)
+    assert oauth.validate_state(state) is False  # replay rejected
+
+
+def test_validate_state_unknown():
+    assert oauth.validate_state("notastate") is False
+
+
+def test_validate_state_expired(monkeypatch):
+    state = oauth.make_state()
+    # Wind expiry into the past
+    monkeypatch.setitem(oauth._states, state, time_mod.time() - 1)
+    assert oauth.validate_state(state) is False
+```
+
+- [ ] **Step 2: Run to confirm failure**
+
+```
+pytest tests/test_oauth.py -v -k "state"
+```
+
+Expected: `AttributeError: module 'hbd.server.oauth' has no attribute 'make_state'`
+
+- [ ] **Step 3: Implement state functions**
+
+Add to `hbd/server/oauth.py` after the `_states` dict definition:
+
+```python
+def make_state() -> str:
+    """Generate a CSRF state token, store it with TTL, and return it."""
+    _purge_states()
+    token = secrets.token_hex(32)
+    _states[token] = time.time() + STATE_TTL
+    return token
+
+
+def validate_state(state: str) -> bool:
+    """Return True if *state* is known and unexpired; always removes it."""
+    expiry = _states.pop(state, None)
+    if expiry is None:
+        return False
+    return time.time() < expiry
+
+
+def _purge_states() -> None:
+    now = time.time()
+    expired = [k for k, exp in list(_states.items()) if exp < now]
+    for k in expired:
+        del _states[k]
+```
+
+- [ ] **Step 4: Run to confirm tests pass**
+
+```
+pytest tests/test_oauth.py -v
+```
+
+Expected: 8 passed
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add hbd/server/oauth.py tests/test_oauth.py
+git commit -m "feat: add OAuth2 CSRF state management"
+```
+
+---
+
+## Task 3: `provision_oauth_user` in users.py
+
+**Files:**
+- Modify: `hbd/server/users.py` (add `provision_oauth_user`)
+- Modify: `tests/test_oauth.py` (add provisioning tests)
+
+- [ ] **Step 1: Write the failing tests**
+
+Append to `tests/test_oauth.py`:
+
+```python
+from hbd.server import users as users_mod
+from hbd.server.users import User
+
+
+def _reset_users(entries=None):
+    users_mod.users = entries or {}
+
+
+def test_provision_oauth_user_new():
+    _reset_users()
+    user = users_mod.provision_oauth_user("gituser", "Git User", "https://example.com/avatar.png")
+    assert user.username == "gituser"
+    assert user.full_name == "Git User"
+    assert user.avatar == "https://example.com/avatar.png"
+    assert user.admin is False
+    assert user.password_hash == ""
+    assert "gituser" in users_mod.users
+
+
+def test_provision_oauth_user_no_password_login():
+    _reset_users()
+    user = users_mod.provision_oauth_user("gituser", "Git User", "")
+    assert user.check_password("anything") is False
+
+
+def test_provision_oauth_user_existing_updates_profile():
+    existing = User(
+        username="alice",
+        full_name="Old Name",
+        avatar="old.png",
+        password_hash="pbkdf2:sha256:1:salt:abc",
+        admin=True,
+        notification_channels=["chan1"],
+    )
+    _reset_users({"alice": existing})
+    user = users_mod.provision_oauth_user("alice", "New Name", "new.png")
+    assert user.full_name == "New Name"
+    assert user.avatar == "new.png"
+    # Preserved
+    assert user.admin is True
+    assert user.password_hash == "pbkdf2:sha256:1:salt:abc"
+    assert user.notification_channels == ["chan1"]
+
+
+def test_provision_oauth_user_does_not_overwrite_with_empty():
+    existing = User(username="bob", full_name="Bob", avatar="bob.png")
+    _reset_users({"bob": existing})
+    user = users_mod.provision_oauth_user("bob", "", "")
+    assert user.full_name == "Bob"
+    assert user.avatar == "bob.png"
+```
+
+- [ ] **Step 2: Run to confirm failure**
+
+```
+pytest tests/test_oauth.py -v -k "provision"
+```
+
+Expected: `AttributeError: module 'hbd.server.users' has no attribute 'provision_oauth_user'`
+
+- [ ] **Step 3: Implement `provision_oauth_user`**
+
+Add to `hbd/server/users.py` after the `authenticate()` function (after line 187):
+
+```python
+def provision_oauth_user(username: str, full_name: str, avatar: str) -> "User":
+    """Create or update a user sourced from an OAuth2 provider.
+
+    New users are inserted with no password_hash — they can only authenticate
+    via OAuth.  Existing users (e.g. defined in config with a password) have
+    their display name and avatar refreshed; all other attributes are preserved.
+    """
+    user = users.get(username)
+    if user is None:
+        user = User(username=username, full_name=full_name, avatar=avatar)
+        users[username] = user
+        logger.info("Provisioned OAuth user %r", username)
+    else:
+        if full_name:
+            user.full_name = full_name
+        if avatar:
+            user.avatar = avatar
+    return user
+```
+
+- [ ] **Step 4: Run to confirm tests pass**
+
+```
+pytest tests/test_oauth.py -v
+```
+
+Expected: 12 passed
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add hbd/server/users.py tests/test_oauth.py
+git commit -m "feat: add provision_oauth_user() to users module"
+```
+
+---
+
+## Task 4: URL builder, token exchange, and user-info fetch
+
+**Files:**
+- Modify: `hbd/server/oauth.py` (add `authorization_url`, `exchange_code`, `fetch_user`)
+- Modify: `tests/test_oauth.py` (add async tests with mocked HTTP)
+
+- [ ] **Step 1: Write the failing tests**
+
+Append to `tests/test_oauth.py`:
+
+```python
+import pytest
+from unittest.mock import AsyncMock, MagicMock, patch
+from urllib.parse import urlparse, parse_qs
+
+
+def test_authorization_url_shape():
+    state = "teststate"
+    redirect_uri = "https://hbd.example.com/login/oauth/gitea/callback"
+    url = oauth.authorization_url(CFG_ON, state, redirect_uri)
+    parsed = urlparse(url)
+    qs = parse_qs(parsed.query)
+    assert parsed.scheme == "https"
+    assert parsed.netloc == "git.example.com"
+    assert parsed.path == "/login/oauth/authorize"
+    assert qs["client_id"] == ["cid"]
+    assert qs["state"] == ["teststate"]
+    assert qs["redirect_uri"] == [redirect_uri]
+    assert qs["scope"] == ["user:email"]
+    assert qs["response_type"] == ["code"]
+
+
+@pytest.mark.asyncio
+async def test_exchange_code_returns_token():
+    redirect_uri = "https://hbd.example.com/login/oauth/gitea/callback"
+    mock_response = AsyncMock()
+    mock_response.status = 200
+    mock_response.json = AsyncMock(return_value={"access_token": "tok123"})
+
+    mock_session = MagicMock()
+    mock_session.post = MagicMock(return_value=AsyncMock(
+        __aenter__=AsyncMock(return_value=mock_response),
+        __aexit__=AsyncMock(return_value=False),
+    ))
+
+    with patch("hbd.server.oauth.aiohttp.ClientSession", return_value=AsyncMock(
+        __aenter__=AsyncMock(return_value=mock_session),
+        __aexit__=AsyncMock(return_value=False),
+    )):
+        token = await oauth.exchange_code(CFG_ON, "mycode", redirect_uri)
+    assert token == "tok123"
+
+
+@pytest.mark.asyncio
+async def test_exchange_code_raises_on_error_status():
+    redirect_uri = "https://hbd.example.com/login/oauth/gitea/callback"
+    mock_response = AsyncMock()
+    mock_response.status = 401
+    mock_response.text = AsyncMock(return_value="unauthorized")
+
+    mock_session = MagicMock()
+    mock_session.post = MagicMock(return_value=AsyncMock(
+        __aenter__=AsyncMock(return_value=mock_response),
+        __aexit__=AsyncMock(return_value=False),
+    ))
+
+    with patch("hbd.server.oauth.aiohttp.ClientSession", return_value=AsyncMock(
+        __aenter__=AsyncMock(return_value=mock_session),
+        __aexit__=AsyncMock(return_value=False),
+    )):
+        with pytest.raises(oauth.OAuthError):
+            await oauth.exchange_code(CFG_ON, "badcode", redirect_uri)
+
+
+@pytest.mark.asyncio
+async def test_fetch_user_returns_profile():
+    mock_response = AsyncMock()
+    mock_response.status = 200
+    mock_response.json = AsyncMock(return_value={
+        "login": "alice",
+        "full_name": "Alice Smith",
+        "avatar_url": "https://git.example.com/avatars/alice.png",
+    })
+
+    mock_session = MagicMock()
+    mock_session.get = MagicMock(return_value=AsyncMock(
+        __aenter__=AsyncMock(return_value=mock_response),
+        __aexit__=AsyncMock(return_value=False),
+    ))
+
+    with patch("hbd.server.oauth.aiohttp.ClientSession", return_value=AsyncMock(
+        __aenter__=AsyncMock(return_value=mock_session),
+        __aexit__=AsyncMock(return_value=False),
+    )):
+        profile = await oauth.fetch_user(CFG_ON, "tok123")
+    assert profile == {
+        "login": "alice",
+        "full_name": "Alice Smith",
+        "avatar_url": "https://git.example.com/avatars/alice.png",
+    }
+```
+
+- [ ] **Step 2: Run to confirm failure**
+
+```
+pytest tests/test_oauth.py -v -k "url or exchange or fetch"
+```
+
+Expected: `AttributeError: module 'hbd.server.oauth' has no attribute 'authorization_url'`
+
+- [ ] **Step 3: Implement the three functions**
+
+Add to `hbd/server/oauth.py`:
+
+```python
+import urllib.parse
+
+
+def authorization_url(config: dict, state: str, redirect_uri: str) -> str:
+    """Return the Gitea OAuth2 authorization URL to redirect the browser to."""
+    g = _gitea_cfg(config)
+    params = urllib.parse.urlencode({
+        "client_id": g["client_id"],
+        "redirect_uri": redirect_uri,
+        "response_type": "code",
+        "scope": "user:email",
+        "state": state,
+    })
+    return f"{g['url'].rstrip('/')}/login/oauth/authorize?{params}"
+
+
+async def exchange_code(config: dict, code: str, redirect_uri: str) -> str:
+    """Exchange an authorization *code* for a Gitea access token.
+
+    Returns the access token string.  Raises OAuthError on any failure.
+    """
+    g = _gitea_cfg(config)
+    url = f"{g['url'].rstrip('/')}/login/oauth/access_token"
+    payload = {
+        "client_id": g["client_id"],
+        "client_secret": g["client_secret"],
+        "code": code,
+        "grant_type": "authorization_code",
+        "redirect_uri": redirect_uri,
+    }
+    timeout = aiohttp.ClientTimeout(total=10)
+    try:
+        async with aiohttp.ClientSession(timeout=timeout) as session:
+            async with session.post(url, json=payload, headers={"Accept": "application/json"}) as resp:
+                if resp.status != 200:
+                    text = await resp.text()
+                    raise OAuthError(f"Token exchange failed ({resp.status}): {text}")
+                data = await resp.json()
+    except aiohttp.ClientError as exc:
+        raise OAuthError(f"Token exchange network error: {exc}") from exc
+    token = data.get("access_token")
+    if not token:
+        raise OAuthError(f"No access_token in response: {data}")
+    return token
+
+
+async def fetch_user(config: dict, token: str) -> dict:
+    """Fetch the authenticated user's profile from Gitea.
+
+    Returns a dict with keys: login, full_name, avatar_url.
+    Raises OAuthError on any failure.
+    """
+    g = _gitea_cfg(config)
+    url = f"{g['url'].rstrip('/')}/api/v1/user"
+    timeout = aiohttp.ClientTimeout(total=10)
+    try:
+        async with aiohttp.ClientSession(timeout=timeout) as session:
+            async with session.get(url, headers={"Authorization": f"token {token}"}) as resp:
+                if resp.status != 200:
+                    text = await resp.text()
+                    raise OAuthError(f"User fetch failed ({resp.status}): {text}")
+                data = await resp.json()
+    except aiohttp.ClientError as exc:
+        raise OAuthError(f"User fetch network error: {exc}") from exc
+    return {
+        "login": data.get("login", ""),
+        "full_name": data.get("full_name", ""),
+        "avatar_url": data.get("avatar_url", ""),
+    }
+```
+
+Also add `import urllib.parse` at the top of `oauth.py` (alongside the existing imports).
+
+- [ ] **Step 4: Run to confirm tests pass**
+
+```
+pytest tests/test_oauth.py -v
+```
+
+Expected: 16 passed
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add hbd/server/oauth.py tests/test_oauth.py
+git commit -m "feat: add authorization_url, exchange_code, fetch_user to oauth module"
+```
+
+---
+
+## Task 5: HTTP routes — redirect and callback
+
+**Files:**
+- Modify: `hbd/server/http.py`
+
+`http.py` defines all handlers inside `async def start(...)`. The two new handlers go in the same block, just before the `app = web.Application()` line (~line 900). The import goes at the top of the file.
+
+- [ ] **Step 1: Add the import**
+
+In `hbd/server/http.py`, add after the existing local imports (after `from . import users as users_mod`):
+
+```python
+from . import oauth as oauth_mod
+```
+
+- [ ] **Step 2: Add the two route handlers**
+
+In `hbd/server/http.py`, add the two handlers immediately before the `app = web.Application()` line:
+
+```python
+    async def oauth_gitea_redirect(request):
+        """GET /login/oauth/gitea — kick off the Gitea OAuth2 flow."""
+        if not oauth_mod.is_enabled(config):
+            return web.Response(status=404, text="OAuth not configured")
+        state = oauth_mod.make_state()
+        redirect_uri = f"{request.url.origin()}/login/oauth/gitea/callback"
+        raise web.HTTPFound(oauth_mod.authorization_url(config, state, redirect_uri))
+
+    async def oauth_gitea_callback(request):
+        """GET /login/oauth/gitea/callback — handle Gitea's redirect back."""
+        if not oauth_mod.is_enabled(config):
+            return web.Response(status=404, text="OAuth not configured")
+        code = request.rel_url.query.get("code", "")
+        state = request.rel_url.query.get("state", "")
+        if not code or not state:
+            return web.Response(status=400, text="Missing code or state")
+        if not oauth_mod.validate_state(state):
+            raise web.HTTPFound("/login?error=1")
+        redirect_uri = f"{request.url.origin()}/login/oauth/gitea/callback"
+        try:
+            token = await oauth_mod.exchange_code(config, code, redirect_uri)
+            profile = await oauth_mod.fetch_user(config, token)
+        except oauth_mod.OAuthError as exc:
+            logger.warning("OAuth error: %s", exc)
+            raise web.HTTPFound("/login?error=1")
+        user = users_mod.provision_oauth_user(
+            profile["login"],
+            profile["full_name"],
+            profile["avatar_url"],
+        )
+        session_token = users_mod.create_session(user.username)
+        resp = web.HTTPFound("/")
+        resp.set_cookie(
+            SESSION_COOKIE,
+            session_token,
+            max_age=users_mod.SESSION_TTL,
+            httponly=True,
+            samesite="Lax",
+        )
+        raise resp
+```
+
+- [ ] **Step 3: Register the routes**
+
+In `hbd/server/http.py`, add to the route list after the existing auth routes (after `web.post("/api/0/auth/logout", api_logout)`):
+
+```python
+            web.get("/login/oauth/gitea",          oauth_gitea_redirect),
+            web.get("/login/oauth/gitea/callback", oauth_gitea_callback),
+```
+
+- [ ] **Step 4: Manual smoke test**
+
+Start the server locally with OAuth configured in `~/.hb.yaml`:
+
+```yaml
+oauth:
+  gitea:
+    url: https://your-gitea-instance.example.com
+    client_id: your-client-id
+    client_secret: your-client-secret
+```
+
+Visit `http://localhost:50004/login/oauth/gitea` — confirm you are redirected to Gitea's authorization page.
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add hbd/server/http.py
+git commit -m "feat: add Gitea OAuth2 redirect and callback routes"
+```
+
+---
+
+## Task 6: Login page — "Sign in with Gitea" button
+
+**Files:**
+- Modify: `hbd/server/http.py` (update `login_page` handler, ~line 625)
+
+- [ ] **Step 1: Replace the login page HTML**
+
+In `hbd/server/http.py`, find the `html = f"""` block inside `login_page` and replace it with:
+
+```python
+        gitea_button = ""
+        if oauth_mod.is_enabled(config):
+            gitea_url = _gitea_cfg_url(config)
+            gitea_button = f"""
+    <div class="divider">or</div>
+    <a href="/login/oauth/gitea" class="gitea-btn">
+      Sign in with Gitea
+    </a>"""
+
+        html = f"""<!DOCTYPE html>
+<html>
+<head>
+  <meta charset="utf-8">
+  <title>Heartbeat — Login</title>
+  <style>
+    body {{ font-family: sans-serif; background: #f5f5f5; display: flex;
+            justify-content: center; align-items: center; height: 100vh; margin: 0; }}
+    .box {{ background: #fff; padding: 2em 2.5em; border-radius: 8px;
+             box-shadow: 0 2px 12px rgba(0,0,0,.15); min-width: 300px; }}
+    h2 {{ margin: 0 0 1.2em; color: #333; font-size: 1.4em; }}
+    label {{ display: block; margin-bottom: .3em; font-size: .9em; color: #555; }}
+    input {{ width: 100%; padding: .5em .7em; border: 1px solid #ccc;
+              border-radius: 4px; font-size: 1em; box-sizing: border-box; }}
+    button {{ margin-top: 1.2em; width: 100%; padding: .6em; background: #0066cc;
+               color: #fff; border: none; border-radius: 4px; font-size: 1em; cursor: pointer; }}
+    button:hover {{ background: #0055aa; }}
+    .error {{ color: #c00; font-size: .9em; margin-bottom: .8em; }}
+    .field {{ margin-bottom: .9em; }}
+    .divider {{ text-align: center; margin: 1.2em 0 .8em; color: #999;
+                font-size: .85em; border-top: 1px solid #eee; padding-top: .8em; }}
+    .gitea-btn {{ display: block; width: 100%; padding: .6em; background: #609926;
+                  color: #fff; border-radius: 4px; font-size: 1em; text-align: center;
+                  text-decoration: none; box-sizing: border-box; }}
+    .gitea-btn:hover {{ background: #4e7d1e; }}
+  </style>
+</head>
+<body>
+  <div class="box">
+    <h2>Heartbeat</h2>
+    {'<p class="error">Invalid username, password, or OAuth error.</p>' if error else ''}
+    <form method="post">
+      <div class="field"><label>Username</label><input name="username" autofocus></div>
+      <div class="field"><label>Password</label><input name="password" type="password"></div>
+      <button type="submit">Sign in</button>
+    </form>{gitea_button}
+  </div>
+</body>
+</html>"""
+```
+
+- [ ] **Step 2: Add the `_gitea_cfg_url` helper**
+
+Add this small helper in `hbd/server/http.py` just before the `login_page` handler (around line 600) so the template can read the Gitea display URL without importing internal oauth details:
+
+```python
+def _gitea_cfg_url(config: dict) -> str:
+    return config.get("oauth", {}).get("gitea", {}).get("url", "")
+```
+
+Also update the `login_page` handler's `error` logic to show the error when the `?error=1` query param is present (set by the callback on OAuth failure):
+
+```python
+    async def login_page(request):
+        """GET /login — show login form; POST /login — process and redirect."""
+        if not users_mod.users_enabled():
+            raise web.HTTPFound("/")
+
+        error = ""
+        if request.method == "POST":
+            form = await request.post()
+            username = form.get("username", "")
+            password = form.get("password", "")
+            user = users_mod.authenticate(username, password)
+            if user:
+                token = users_mod.create_session(username)
+                redirect_to = request.rel_url.query.get("next", "/")
+                resp = web.HTTPFound(redirect_to)
+                resp.set_cookie(
+                    SESSION_COOKIE,
+                    token,
+                    max_age=users_mod.SESSION_TTL,
+                    httponly=True,
+                    samesite="Lax",
+                )
+                raise resp
+            error = "Invalid username or password."
+        elif request.rel_url.query.get("error"):
+            error = "Sign-in failed. Please try again."
+```
+
+- [ ] **Step 3: Manual verification**
+
+Start the server with OAuth configured. Visit `/login`. Confirm:
+- The "Sign in with Gitea" button appears (green, below a divider)
+- Clicking it redirects to Gitea
+- After authorising on Gitea, you are redirected back and land on `/` with a valid session cookie
+
+Without OAuth configured, confirm the button does not appear.
+
+- [ ] **Step 4: Commit**
+
+```bash
+git add hbd/server/http.py
+git commit -m "feat: add Sign in with Gitea button to login page"
+```
+
+---
+
+## Self-Review Notes
+
+- All 5 spec requirements covered: coexist ✓, auto-provision ✓, regular user ✓, any Gitea user ✓, config-driven ✓
+- `exchange_code` signature in Task 4 matches usage in Task 5 (`config, code, redirect_uri`) ✓
+- `fetch_user` returns `{login, full_name, avatar_url}` — matched in callback handler ✓
+- `validate_state` removes state on use (replay protection) ✓
+- `provision_oauth_user` skips empty strings so existing avatar/name aren't erased ✓
+- `_gitea_cfg_url` is a plain `def`, not `async` — safe to call in template prep ✓
@@ -0,0 +1,92 @@
+# Plugin Error Checking & Daemon Logging — Design Spec
+
+**Date:** 2026-04-25  
+**Scope:** hbc client — daemon mode logging, nagios_runner plugin robustness, PluginLoader messaging  
+**Files affected:** `hbd/client/main.py`, `hbd/client/plugins/nagios_runner.py`, `hbd/client/plugin.py`
+
+---
+
+## 1. Daemon Mode Logging
+
+### Problem
+In `main()`, `logging.basicConfig()` is called before `daemonize()` (establishing a StreamHandler to stderr), then called again after `daemonize()`. The second call is a no-op — Python ignores `basicConfig()` when handlers are already configured. After daemonization, stderr is redirected to `/dev/null`, so all subsequent log output is silently discarded.
+
+The existing `syslog.openlog()` / `syslog.syslog()` calls (lines 666–668) write a single startup message but do not integrate with the `logging` system, so plugin and connection log messages never reach syslog.
+
+### Fix
+After `daemonize()`, explicitly reconfigure the root logger:
+
+1. Remove all existing handlers (they now write to `/dev/null`).
+2. Add `logging.handlers.SysLogHandler(address='/dev/log', facility=LOG_DAEMON)`.
+3. Set formatter: `hbc[%(process)d]: %(name)s %(levelname)s: %(message)s`
+4. Preserve the `log_level` already determined from `-v`/`-x` CLI flags.
+
+Remove the redundant `syslog.openlog()` / `syslog.syslog()` calls — the logging system handles routing.
+
+**Fallback:** If `/dev/log` does not exist (containers, some BSDs), fall back to `SysLogHandler(address=('localhost', 514))`. Log one warning (to stderr, before handlers are replaced) so the operator knows.
+
+---
+
+## 2. Nagios Runner Improvements
+
+### 2a — Async Subprocess
+`_run_nagios_plugin()` is declared `async def` but calls `subprocess.run()` synchronously, blocking the event loop for the full command duration.
+
+**Fix:** Replace with `asyncio.create_subprocess_shell()` + `await proc.communicate()`. Enforce timeout with `asyncio.wait_for(..., timeout=self.timeout)` and catch `asyncio.TimeoutError`.
+
+### 2b — Stderr Capture
+Subprocess stderr is currently discarded (only stdout from the sync call is used; stderr content is lost).
+
+**Fix:** Pass `stderr=asyncio.subprocess.PIPE` to `create_subprocess_shell`. After `communicate()`, if stdout is empty but stderr has content, use stderr as the output message. If both have content, append stderr to the output for visibility.
+
+### 2c — Negative Return Codes
+A negative `returncode` means the process was killed by a signal (SIGKILL, OOM, etc.). The current code treats these as-is, which may produce unexpected status values.
+
+**Fix:** If `returncode < 0`, map to `NAGIOS_UNKNOWN` with message `"Process killed by signal {-returncode}"`.
+
+### 2d — Command Path Validation at Init
+`initialize()` currently only checks that the commands list is non-empty.
+
+**Fix:** For each command entry during `initialize()`:
+- Warn and skip the entry if `name` or `command` is missing.
+- Extract the executable (first whitespace-delimited token of the command string).
+- If the executable is an absolute path, check `os.path.isfile()` and `os.access(..., os.X_OK)`. Log a `WARNING` if either check fails.
+- Commands with relative paths or shell builtins are not checked (they may be on PATH) — just noted.
+- Validation warns only; all original entries in `self.commands` are retained and still attempted at collection time (where the existing missing-name/command guard already skips them). The plugin initializes successfully as long as the commands list is non-empty.
+
+---
+
+## 3. PluginLoader Messaging
+
+### Problem
+When `initialize()` returns `False`, the loader always logs:
+> `WARNING: Plugin X failed initialization, skipping`
+
+This is alarming when the real reason is simply "no commands configured". There is no API to distinguish "not configured" from "genuinely broken".
+
+### Fix
+Add an optional `skip_reason` attribute to `Plugin.__init__()` (defaults to `None`).
+
+In `PluginLoader.load_from_directory()`, after `initialize()` returns `False`:
+- If `plugin.skip_reason` is set → `logger.info(f"Plugin {plugin.name} skipped: {plugin.skip_reason}")`
+- If `plugin.skip_reason` is `None` → `logger.warning(f"Plugin {plugin.name} failed initialization, skipping")` (existing behaviour)
+
+In `NagiosRunnerPlugin.initialize()`, when no commands are configured:
+```python
+self.skip_reason = "no commands configured (add nagios_runner.commands to config)"
+return False
+```
+
+Genuine failures (exceptions) continue to go through the existing `except` block in the loader, logging at `ERROR` with traceback — unchanged.
+
+---
+
+## Decisions
+
+| Topic | Decision |
+|---|---|
+| Daemon log destination | syslog only (LOG_DAEMON facility) |
+| Syslog fallback | localhost:514 UDP if `/dev/log` absent |
+| Nagios result log level | INFO for all statuses (OK/WARNING/CRITICAL/UNKNOWN) |
+| Invalid command handling at init | Warn and continue; still attempt at collection time |
+| PluginLoader API change | `skip_reason` attribute on Plugin base class, checked by loader |
@@ -0,0 +1,184 @@
+# Gitea OAuth2 Authentication — Design Spec
+
+Date: 2026-05-08
+
+## Overview
+
+Add Gitea as an OAuth2 login provider alongside the existing username/password
+authentication. Any user on the configured Gitea instance can sign in; their
+local account is auto-provisioned on first login as a regular (non-admin) user.
+Password login continues to work unchanged.
+
+---
+
+## Config
+
+A new optional `oauth.gitea` block in `~/.hb.yaml`. OAuth is disabled when the
+block is absent or any of the three required keys is missing.
+
+```yaml
+oauth:
+  gitea:
+    url: https://git.example.com   # Gitea base URL, no trailing slash
+    client_id: <gitea-app-client-id>
+    client_secret: <gitea-app-client-secret>
+```
+
+**Gitea setup:** Create an OAuth2 application in Gitea under
+*Settings → Applications → OAuth2*. Set the redirect URI to
+`https://<hbd-host>/login/oauth/gitea/callback`.
+
+`config.py` default:
+
+```python
+"oauth": {},
+```
+
+---
+
+## New module: `hbd/server/oauth.py`
+
+Owns all OAuth2 logic. No new dependencies — uses `aiohttp.ClientSession`
+already present in the codebase.
+
+### CSRF state store
+
+```python
+# state -> expires (float)
+_states: dict[str, float] = {}
+STATE_TTL = 600  # 10 minutes
+```
+
+`_states` is an in-memory dict. Entries are created on redirect and deleted on
+use or expiry. A purge runs on every new state generation.
+
+### Public API
+
+| Function | Description |
+|---|---|
+| `is_enabled(config)` | Returns `True` when url, client_id, and client_secret are all set |
+| `make_state()` | Generates a random state token, stores it with TTL, returns it |
+| `validate_state(state)` | Returns `True` and removes the state if valid and unexpired |
+| `authorization_url(config, state, redirect_uri)` | Builds the Gitea `/login/oauth/authorize` redirect URL with `client_id`, `redirect_uri`, `response_type=code`, `scope=user:email`, `state` |
+| `exchange_code(config, code, redirect_uri)` async | POSTs to Gitea `/login/oauth/access_token` with code and redirect_uri, returns the access token string or raises `OAuthError` |
+| `fetch_user(config, token)` async | GETs Gitea `/api/v1/user` with the access token in the `Authorization` header, returns `{"login", "full_name", "avatar_url"}` or raises `OAuthError` |
+
+### Error handling
+
+`OAuthError(message)` is a module-level exception. The callback route catches it
+and renders the login page with an error message — identical to an invalid
+password error in UX terms.
+
+Network timeouts use a 10-second `aiohttp` timeout. Any non-2xx response from
+Gitea raises `OAuthError`.
+
+---
+
+## Change: `hbd/server/users.py`
+
+One new function added to the public API:
+
+```python
+def provision_oauth_user(username: str, full_name: str, avatar: str) -> User:
+```
+
+- If the username does not exist in the live `users` dict, creates a `User`
+  with no `password_hash` (so password login is impossible for this account)
+  and inserts it.
+- If the username already exists (e.g. was defined in config with a password),
+  updates `full_name` and `avatar` from the OAuth profile and returns the
+  existing user unchanged in all other respects (preserving admin flag,
+  notification channels, etc.).
+- Logs a one-line INFO message on first provision.
+
+---
+
+## Changes: `hbd/server/http.py`
+
+### Two new route handlers
+
+**`GET /login/oauth/gitea`**
+
+1. Checks `oauth.is_enabled(config)` — returns 404 if not.
+2. Calls `oauth.make_state()`.
+3. Constructs `redirect_uri` as `{request.url.origin()}/login/oauth/gitea/callback` using aiohttp's `request.url.origin()`.
+4. Redirects the browser to `oauth.authorization_url(config, state, redirect_uri)`.
+
+**`GET /login/oauth/gitea/callback`**
+
+1. Reads `code` and `state` query params; returns 400 if either is missing.
+2. Calls `oauth.validate_state(state)` — redirects to `/login` with error if
+   invalid (CSRF or replay protection).
+3. Reconstructs the same `redirect_uri` as the redirect handler (required by OAuth2 spec for token exchange).
+4. Calls `await oauth.exchange_code(config, code, redirect_uri)` to get the access token.
+5. Calls `await oauth.fetch_user(config, token)` to get the Gitea user profile.
+6. Calls `users_mod.provision_oauth_user(login, full_name, avatar_url)`.
+7. Calls `users_mod.create_session(username)` to get a session token.
+8. Sets `hbd_session` cookie (same flags as password login: httponly, Lax,
+   24h TTL).
+9. Redirects to `/`.
+10. Any `OAuthError` re-renders the login page with a generic error message.
+
+### Login page change
+
+When `oauth.is_enabled(config)` is `True`, the existing login form gains a
+separator and a "Sign in with Gitea" link button pointing to
+`/login/oauth/gitea`. The password form is always rendered regardless.
+
+### Route registration
+
+```python
+web.get("/login/oauth/gitea",          oauth_gitea_redirect),
+web.get("/login/oauth/gitea/callback", oauth_gitea_callback),
+```
+
+Added alongside the existing `/login` and `/logout` routes.
+
+---
+
+## Data flow
+
+```
+Browser                    hbd                        Gitea
+  |                          |                           |
+  |-- GET /login ----------->|                           |
+  |<- login page (+ button) -|                           |
+  |                          |                           |
+  |-- GET /login/oauth/gitea>|                           |
+  |<- 302 Gitea /authorize --|                           |
+  |                          |                           |
+  |-- GET /login/oauth/authorize ----------------------->|
+  |<- 302 /login/oauth/gitea/callback?code=..&state=.. --|
+  |                          |                           |
+  |-- GET /callback -------->|                           |
+  |                          |-- POST /access_token ---->|
+  |                          |<- {access_token} ---------|
+  |                          |-- GET /api/v1/user ------>|
+  |                          |<- {login, name, avatar} --|
+  |                          | provision_oauth_user()    |
+  |                          | create_session()          |
+  |<- 302 / (set cookie) ----|                           |
+```
+
+---
+
+## Testing
+
+- `test_oauth_state`: `make_state` + `validate_state` happy path; expired state
+  returns False; replay (double-use) returns False.
+- `test_provision_oauth_user_new`: new username creates User with no password.
+- `test_provision_oauth_user_existing`: existing config user updates name/avatar,
+  preserves admin flag and notification_channels.
+- `test_oauth_callback_invalid_state`: callback with bad state redirects to login.
+- Integration: mock Gitea endpoints with `aiohttp_client` fixture; full
+  redirect → callback → session cookie flow.
+
+---
+
+## Out of scope
+
+- Restricting login to specific Gitea organisations or teams.
+- Making OAuth users admin automatically.
+- Multiple OAuth providers.
+- Token refresh (Gitea access tokens are long-lived; the hbd session TTL governs
+  re-authentication).
@@ -1,11 +1,17 @@
-"""hbd package - scaffolding for heartbeat daemon
+"""hbd package - heartbeat monitoring system

-This package contains the refactored modules for the original monolithic
-`hbd` script. The initial implementation contains small scaffolds so you can
-start moving functionality into the package.
+This package contains both the heartbeat client (hbc) and server (hbd) components,
+organized into separate subpackages:
+
+- hbd.client: Client component with system monitoring plugins
+- hbd.server: Server/daemon component with web UI and notifications  
+- hbd.common: Shared utilities and protocol definitions
+
+Install options:
+- pip install hbd[client]  # Client only
+- pip install hbd[server]  # Server only
+- pip install hbd[all]     # Both client and server
 """

-__all__ = ["main", "__version__"]
-__version__ = "5.0.4"
-
-from .cli import main
+__all__ = ["__version__"]
+__version__ = "5.2.6"
@@ -1,54 +0,0 @@
-"""Command line interface for hbd package."""
-
-import argparse
-
-from .config import load_config
-from .server import run as run_server
-
-PUSHSRVS = ["all", "pushover", "mattermost"]
-
-
-def build_parser():
-    parser = argparse.ArgumentParser(
-        prog="hbd",
-        description="HeartBeatDaemon - Wait for heartbeat messages and act on them (or their absence)",
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-    )
-    parser.add_argument(
-        "-c", "--config", dest="configfile", help="Config file path (YAML)"
-    )
-    parser.add_argument(
-        "-f", "--foreground", action="store_true", help="Run in foreground"
-    )
-    parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output")
-    parser.add_argument(
-        "-p", "--pushsrv", dest="pushsrv", choices=PUSHSRVS, help="Push service to use"
-    )
-    parser.add_argument(
-        "-x", "--debug", action="count", default=0, help="Increase debug level"
-    )
-    return parser
-
-
-def main(argv=None):
-    parser = build_parser()
-    args = parser.parse_args(argv)
-
-    config = load_config(args.configfile)
-
-    # Apply CLI overrides
-    if args.foreground:
-        config["foreground"] = True
-    if args.verbose:
-        config["verbose"] = True
-    if args.pushsrv:
-        config["pushsrv"] = args.pushsrv
-    if args.debug:
-        config.setdefault("debug", 0)
-        config["debug"] += args.debug
-
-    run_server(config)
-
-
-if __name__ == "__main__":
-    main()
@@ -0,0 +1,3 @@
+"""HeartBeat Client (hbc) - System monitoring client."""
+
+from hbd import __version__
@@ -0,0 +1,61 @@
+"""Configuration loader and defaults for hbc (HeartBeat Client)."""
+
+import logging
+import os
+import logging
+
+logger = logging.getLogger(__name__)
+
+try:
+    import yaml
+except Exception:
+    yaml = None
+
+CLIENT_DEFAULTS = {
+    # Network settings
+    "hb_port": 50003,          # Port where hbd servers listen
+    "interval": 10,             # Heartbeat interval in seconds
+
+    # Host identity
+    "owner": None,             # Optional username to set as this host's owner on the server
+
+    # Runtime flags
+    "foreground": False,
+    "verbose": False,
+    "debug": 0,
+
+    # Plugin configuration
+    "plugins": {},              # Per-plugin configuration
+    "thresholds": {},           # Threshold configuration for monitoring
+}
+
+
+def load_config(path=None):
+    """Load configuration from a YAML file and merge with client defaults.
+
+    If YAML is not available or the file does not exist, defaults are returned.
+    
+    Args:
+        path: Path to YAML config file (default: ~/.hbc.yaml)
+        
+    Returns:
+        Dictionary with configuration
+    """
+    cfg = CLIENT_DEFAULTS.copy()
+    if not path:
+        # default path (~/.hbc.yaml)
+        path = os.path.join(os.path.expanduser("~"), ".hbc.yaml")
+
+    if os.path.exists(path):
+        if yaml:
+            logger.info("Loading configuration from %s", path)
+            with open(path) as fh:
+                data = yaml.safe_load(fh)
+            # Merge YAML data with defaults
+            # Keep all keys from YAML to support plugin configs and future extensions
+            for k, v in data.items():
+                cfg[k] = v
+        else:
+            # yaml not installed: do not attempt to parse; user must ensure defaults
+            logger.warning("PyYAML not available - cannot load config from %s, using defaults", path)
+    return cfg
@@ -0,0 +1,786 @@
+#!/usr/bin/env python3
+"""
+HeartBeat Client (hbc) - Async version with plugin support.
+
+Sends heartbeat messages to HeartBeat Daemon (hbd) servers and collects
+system information via plugins.
+"""
+
+import argparse
+import asyncio
+import logging
+import os
+import signal
+import socket
+import sys
+import time
+from logging.handlers import SysLogHandler
+from pathlib import Path
+from typing import Dict, List, Optional
+
+# Import protocol and config
+from .config import load_config
+from ..common.proto import dicttos, stodict
+from .. import __version__
+
+# Import plugin system
+from .plugin import PluginRegistry, PluginLoader, InfoPlugin, MonitorPlugin
+
+# Constants
+PORT = 50003      # fallback when the config has no "hb_port"
+INTERVAL = 10     # fallback when the config has no "interval" (seconds)
+MAXRECV = 32767   # not referenced in the visible part of this module
+
+# Global state
+running = True        # cleared by stop(); polled by the long-running loops
+dorestart = False     # set when the process should re-exec itself after shutdown
+shutdown_event: Optional[asyncio.Event] = None   # created in async_main()
+active_tasks: List[asyncio.Task] = []            # tasks cancelled by stop()
+# Read by cleanup() and assigned in async_main(); defined here so that the
+# name always exists at module level, whatever the call order.
+send_shutdown = False
+
+
+class AsyncConnection:
+    """Async UDP connection to a single heartbeat server address.
+
+    Holds the datagram transport plus the counters that are reported back to
+    the server: number of ACKs seen, last send/ACK times and the most recent
+    round-trip times (at most ten are kept).
+    """
+
+    def __init__(self, conn_id: int, addr: str, port: int, af: int, name: str):
+        """Record the target and reset all counters; no socket is opened yet.
+
+        Args:
+            conn_id: Identifier sent as ``id`` in every message
+            addr: Server address messages are sent to
+            port: Server UDP port
+            af: Address family passed to the datagram endpoint
+            name: Host name; its short form is sent as ``name``
+        """
+        self.conn_id = conn_id
+        self.addr = addr
+        self.port = port
+        self.af = af
+        self.name = name
+
+        self.ackcount = 0
+        self.lastack = 0.0
+        self.send_count = 0
+        self.lastsend = 0.0
+        self.rtts = [0.0]
+
+        self.transport: Optional[asyncio.DatagramTransport] = None
+        self.protocol: Optional[asyncio.DatagramProtocol] = None
+        self._dead = False
+        self._ever_opened = False
+        self._open_fail_count = 0   # consecutive failures before first success
+        # Set by handle_ack() when the server asks for fresh plugin info.
+        self.request_info_event: asyncio.Event = asyncio.Event()
+
+        self.logger = logging.getLogger(f"hbc.conn.{addr}")
+
+    async def open(self) -> bool:
+        """Open the UDP connection.
+
+        Returns:
+            True if successful, False otherwise
+        """
+        try:
+            # This is a coroutine, so a loop is necessarily running; ask for it
+            # directly rather than going through the get_event_loop() policy
+            # lookup, which is deprecated for this purpose.
+            loop = asyncio.get_running_loop()
+
+            # Create datagram endpoint
+            self.transport, self.protocol = await loop.create_datagram_endpoint(
+                lambda: HeartbeatProtocol(self),
+                family=self.af
+            )
+            self._ever_opened = True
+            self.logger.debug(f"Opened connection to {self.addr}:{self.port}")
+            return True
+        except Exception as e:
+            self.logger.error(f"Failed to open connection: {e}")
+            return False
+
+    def close(self):
+        """Close the transport, if any, and forget it so open() can be retried."""
+        if self.transport:
+            self.transport.close()
+            self.transport = None
+            self.protocol = None
+
+    async def sendto(self, msg: dict, msg_id: str = "HTB"):
+        """Send a message to the server.
+
+        The ``name``, ``id`` and ``time`` fields are written into ``msg``
+        itself before it is encoded. Nothing is sent when the connection has
+        been marked dead or the transport cannot be opened.
+
+        Args:
+            msg: Message dictionary
+            msg_id: Message ID (HTB, PLG, etc.)
+        """
+        if self._dead:
+            return
+
+        if not self.transport:
+            await self.open()
+
+        if not self.transport:
+            self.logger.error("Cannot send - no transport")
+            return
+
+        # Add standard fields
+        msg["name"] = shortname(self.name)
+        msg["id"] = self.conn_id
+        msg["time"] = time.time()
+
+        # Encode message
+        data = dicttos(msg_id, msg)
+
+        # Send
+        self.transport.sendto(data, (self.addr, self.port))
+        self.send_count += 1
+        self.lastsend = time.time()
+
+        self.logger.debug(f"Sent {msg_id} message ({len(data)} bytes)")
+
+    def handle_ack(self, msg: dict, now: float):
+        """Handle ACK message from server.
+
+        RTT is calculated as: (time ACK received) - (time of the last send).
+
+        NOTE(review): ``lastsend`` is updated by every sendto(), whatever the
+        message id, so a PLG sent between the HTB and its ACK shortens the
+        measured RTT. Confirm whether that is acceptable.
+
+        Args:
+            msg: Decoded ACK message
+            now: Receive time in seconds since the epoch
+        """
+        self.lastack = now
+
+        # Calculate RTT: time ACK received minus time of the last send
+        rtt = (now - self.lastsend) * 1000.0  # Convert to ms
+
+        # Keep only the ten most recent samples.
+        self.rtts.append(rtt)
+        if len(self.rtts) > 10:
+            self.rtts.pop(0)
+
+        self.ackcount += 1
+        self.logger.debug(f"ACK received, RTT: {rtt:.1f}ms")
+        if msg.get("request_update"):
+            self.logger.info("server requested plugin info refresh")
+            self.request_info_event.set()
+
+
+class HeartbeatProtocol(asyncio.DatagramProtocol):
+    """Protocol handler for incoming UDP messages."""
+
+    def __init__(self, connection: AsyncConnection):
+        self.connection = connection
+        self.logger = logging.getLogger("hbc.protocol")
+        # The event loop only keeps weak references to tasks, so a handler
+        # task nobody else references may be garbage-collected before it
+        # finishes. Hold them here until they are done.
+        self._background_tasks = set()
+
+    def _spawn(self, coro):
+        """Run ``coro`` as a task and keep it referenced until it completes."""
+        task = asyncio.create_task(coro)
+        self._background_tasks.add(task)
+        task.add_done_callback(self._background_tasks.discard)
+        return task
+
+    def datagram_received(self, data: bytes, addr):
+        """Decode an incoming datagram and dispatch it on its ``ID`` field.
+
+        ACKs are handled inline; CMD and UPD messages are handed to their
+        coroutine handlers as background tasks.
+        """
+        try:
+            msg = stodict(data)
+            if not msg:
+                self.logger.warning(f"Failed to parse message from {addr}")
+                return
+
+            now = time.time()
+            msg_id = msg.get("ID")
+
+            if msg_id == "ACK":
+                self.connection.handle_ack(msg, now)
+            elif msg_id == "CMD":
+                # Command from server
+                self._spawn(handle_command(self.connection, msg))
+            elif msg_id == "UPD":
+                # Update from server
+                self._spawn(handle_update(self.connection, msg))
+            else:
+                self.logger.warning(f"Unknown message type: {msg_id}")
+
+        except Exception as e:
+            self.logger.error(f"Error processing datagram: {e}", exc_info=True)
+
+    def error_received(self, exc):
+        """Handle protocol errors — close transport so the heartbeat sender retries."""
+        self.logger.warning(f"Protocol error on {self.connection.addr}: {exc} — will retry")
+        self.connection.close()
+
+
+async def handle_command(conn: AsyncConnection, msg: dict):
+    """Execute a command received from server and report the outcome.
+
+    The command runs through the shell with a 30 second timeout. It is
+    executed in the default executor so the event loop (and therefore the
+    heartbeats) keeps running while the command is busy.
+
+    NOTE(review): ``cmd`` comes straight from a datagram and is passed to a
+    shell. Whoever can deliver a CMD message to this socket can run arbitrary
+    commands as the hbc user. Confirm that the transport authenticates the
+    server before relying on this.
+
+    Args:
+        conn: Connection the response is sent on
+        msg: Decoded CMD message; the command line is read from ``cmd``
+    """
+    import functools
+    import subprocess
+
+    cmd = msg.get("cmd", "")
+    if not cmd:
+        return
+
+    logger = logging.getLogger("hbc.command")
+    logger.info(f"Executing command: {cmd}")
+
+    run_command = functools.partial(
+        subprocess.check_output,
+        cmd, shell=True, stderr=subprocess.STDOUT, timeout=30
+    )
+
+    try:
+        # check_output() blocks; keep it off the event loop thread.
+        raw = await asyncio.get_running_loop().run_in_executor(None, run_command)
+        result = raw.decode()
+        status = "OK"
+    except subprocess.CalledProcessError as e:
+        result = str(e)
+        status = "CalledProcessError"
+    except subprocess.TimeoutExpired:
+        result = "Command timed out"
+        status = "Timeout"
+    except Exception as e:
+        result = str(e)
+        status = "Error"
+
+    # Send response
+    response = {
+        "service": "command",
+        "msg": f"{status} {result}"
+    }
+    await conn.sendto(response)
+
+
+async def handle_update(conn: AsyncConnection, _msg: dict):  # pyright: ignore[reportUnusedParameter]
+    """Handle self-update by running hb_install.sh.
+
+    The installer is looked up on PATH first, then next to the running
+    program. It is invoked with the single argument ``client`` and given 120
+    seconds to finish. The result is reported to the server; on success a
+    restart is requested and the client is stopped.
+
+    Args:
+        conn: Connection the status message is sent on
+        _msg: Decoded UPD message (unused)
+    """
+    import shutil
+
+    logger = logging.getLogger("hbc.update")
+
+    installer = shutil.which("hb_install.sh")
+    if installer is None:
+        candidate = Path(sys.argv[0]).parent / "hb_install.sh"
+        if candidate.exists():
+            installer = str(candidate)
+
+    if installer is None:
+        error = "hb_install.sh not found in PATH or alongside hbc"
+        logger.error(error)
+        await conn.sendto({"service": "update", "msg": error})
+        return
+
+    logger.info(f"Running installer: {installer}")
+    proc = None
+    try:
+        proc = await asyncio.create_subprocess_exec(
+            installer, "client",
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.STDOUT,
+        )
+        out, _ = await asyncio.wait_for(proc.communicate(), timeout=120)
+    except asyncio.TimeoutError:
+        # wait_for() only cancels communicate(); the child itself keeps
+        # running. Kill and reap it so it is not left behind.
+        if proc is not None and proc.returncode is None:
+            try:
+                proc.kill()
+            except ProcessLookupError:
+                pass  # exited between the timeout and the kill
+            await proc.wait()
+        error = "Installer timed out"
+        logger.error(error)
+        await conn.sendto({"service": "update", "msg": error})
+        return
+    except Exception as e:
+        error = f"Installer failed: {e}"
+        logger.error(error)
+        await conn.sendto({"service": "update", "msg": error})
+        return
+
+    if proc.returncode != 0:
+        # Installer output is not guaranteed to be valid UTF-8.
+        error = f"Installer exited {proc.returncode}: {out.decode(errors='replace').strip()}"
+        logger.error(error)
+        await conn.sendto({"service": "update", "msg": error})
+        return
+
+    logger.info("Update successful, restart required")
+    await conn.sendto({"service": "update", "msg": "OK"})
+
+    # Trigger restart
+    global dorestart
+    dorestart = True
+    stop()
+
+
+async def heartbeat_sender(conn: AsyncConnection, interval: int):
+    """Emit one HTB message per interval, reopening the transport as needed.
+
+    An IPv6 connection that has never been opened successfully is given up
+    after IPV6_EARLY_FAIL_LIMIT consecutive failed attempts, so a network
+    without IPv6 does not keep a useless sender alive. Every other connection
+    is retried without limit.
+
+    Args:
+        conn: Connection to send on
+        interval: Heartbeat interval in seconds
+    """
+    log = logging.getLogger("hbc.heartbeat")
+    IPV6_EARLY_FAIL_LIMIT = 3
+
+    async def pause() -> bool:
+        """Wait one interval; True means the shutdown event fired meanwhile."""
+        if not shutdown_event:
+            await asyncio.sleep(interval)
+            return False
+        try:
+            await asyncio.wait_for(shutdown_event.wait(), timeout=interval)
+        except asyncio.TimeoutError:
+            return False  # Normal timeout, keep going
+        return True
+
+    while running and not conn._dead:
+        # The transport must be up before anything can be sent.
+        if not conn.transport:
+            if await conn.open():
+                conn._open_fail_count = 0
+            else:
+                conn._open_fail_count += 1
+                never_up_ipv6 = not conn._ever_opened and conn.af == socket.AF_INET6
+                if never_up_ipv6 and conn._open_fail_count >= IPV6_EARLY_FAIL_LIMIT:
+                    # Never came up in the first few attempts: most likely
+                    # IPv6 is simply not available on this network.
+                    log.warning(
+                        f"IPv6 connection to {conn.addr} unreachable after "
+                        f"{conn._open_fail_count} attempts, disabling"
+                    )
+                    conn._dead = True
+                    break
+                # Try again after one interval, unless shutdown was requested.
+                if await pause():
+                    break
+                continue
+
+        try:
+            await conn.sendto(
+                {
+                    "acks": conn.ackcount,
+                    "rtt": conn.rtts[-1],
+                    "interval": interval
+                },
+                "HTB",
+            )
+        except asyncio.CancelledError:
+            log.debug("Heartbeat sender cancelled")
+            raise
+        except Exception as e:
+            log.error(f"Error sending heartbeat: {e}", exc_info=True)
+
+        # Sleep until the next beat, or leave when shutdown is signalled.
+        try:
+            if await pause():
+                break
+        except asyncio.CancelledError:
+            log.debug("Heartbeat sender cancelled during sleep")
+            raise
+
+
+async def _info_plugin_refresh_loop(conn: AsyncConnection, info_plugins: List):
+    """Wait for server requests to re-send InfoPlugin data.
+
+    Each time ``conn.request_info_event`` is set, every plugin's cache is
+    invalidated, the plugin is collected again and the result is sent as a
+    PLG message.
+
+    Args:
+        conn: Connection whose request event is watched and used for sending
+        info_plugins: InfoPlugin instances to refresh
+    """
+    logger = logging.getLogger("hbc.plugins")
+    while running:
+        await conn.request_info_event.wait()
+        if not running:
+            break
+        conn.request_info_event.clear()
+        logger.info("refreshing InfoPlugins on server request")
+        for plugin in info_plugins:
+            # InfoPlugin.collect() serves from its cache until
+            # invalidate_cache() is called; without this the "refresh" would
+            # just resend the data gathered at startup.
+            invalidate = getattr(plugin, "invalidate_cache", None)
+            if callable(invalidate):
+                invalidate()
+            # NOTE(review): kept from the previous implementation in case a
+            # concrete plugin maintains its own ``_cache`` attribute; confirm
+            # and drop if none does.
+            plugin._cache = None
+            try:
+                data = await plugin.collect()
+                if data:
+                    await conn.sendto({"plugin": plugin.name, **data}, "PLG")
+                    logger.info(f"Resent {plugin.name} data")
+            except Exception as e:
+                logger.error(f"Error re-collecting {plugin.name}: {e}", exc_info=True)
+
+
+async def plugin_collector(conn: AsyncConnection, registry: PluginRegistry):
+    """Send InfoPlugin data once, then keep the periodic collectors running.
+
+    Args:
+        conn: Connection to send on
+        registry: Plugin registry
+    """
+    from collections import defaultdict
+
+    log = logging.getLogger("hbc.plugins")
+
+    # One-shot pass over the InfoPlugins at startup.
+    info_plugins = registry.get_by_type(InfoPlugin)
+    for info in info_plugins:
+        try:
+            payload = await info.collect()
+            if payload:
+                # PLG message carries the plugin name alongside its data
+                await conn.sendto({"plugin": info.name, **payload}, "PLG")
+                log.info(f"Sent {info.name} data")
+        except Exception as e:
+            log.error(f"Error collecting {info.name}: {e}", exc_info=True)
+
+    # Bucket the MonitorPlugins by their collection interval.
+    buckets = defaultdict(list)
+    for monitor in registry.get_by_type(MonitorPlugin):
+        buckets[monitor.interval].append(monitor)
+
+    # The info-refresh watcher always runs; add one collector per interval.
+    tasks = [asyncio.create_task(_info_plugin_refresh_loop(conn, info_plugins))]
+    tasks.extend(
+        asyncio.create_task(plugin_collector_interval(conn, members, every))
+        for every, members in buckets.items()
+    )
+
+    try:
+        await asyncio.gather(*tasks, return_exceptions=True)
+    except asyncio.CancelledError:
+        log.debug("Plugin collector cancelled, cancelling sub-tasks")
+        for unfinished in (t for t in tasks if not t.done()):
+            unfinished.cancel()
+        raise
+
+
+async def plugin_collector_interval(
+    conn: AsyncConnection,
+    plugins: List,
+    interval: int
+):
+    """Collect a group of plugins repeatedly, once per interval.
+
+    Args:
+        conn: Connection to send on
+        plugins: List of plugins to collect
+        interval: Collection interval in seconds
+    """
+    log = logging.getLogger(f"hbc.plugins.{interval}s")
+
+    while running:
+        for plugin in plugins:
+            try:
+                data = await plugin.collect()
+                if not data:
+                    continue
+                # The message dict is built directly, without an encoder helper
+                await conn.sendto({"plugin": plugin.name, **data}, "PLG")
+                log.debug(f"Sent {plugin.name} data")
+            except asyncio.CancelledError:
+                log.debug("Plugin collector cancelled")
+                raise
+            except Exception as e:
+                log.error(
+                    f"Error collecting {plugin.name}: {e}",
+                    exc_info=True
+                )
+
+        # Sleep one interval; leave the loop if shutdown is signalled first.
+        try:
+            if not shutdown_event:
+                await asyncio.sleep(interval)
+                continue
+            await asyncio.wait_for(
+                shutdown_event.wait(),
+                timeout=interval
+            )
+        except asyncio.TimeoutError:
+            continue  # Normal timeout, next round
+        except asyncio.CancelledError:
+            log.debug("Plugin collector cancelled during sleep")
+            raise
+        break
+
+
+def shortname(name: str) -> str:
+    """Return the part of ``name`` before its first dot (all of it if none)."""
+    host, _dot, _domain = name.partition(".")
+    return host
+
+
+def stop():
+    """Request shutdown: clear the run flag, wake sleepers, cancel tasks."""
+    global running
+    running = False
+
+    # Anything parked on the shutdown event wakes up immediately.
+    if shutdown_event:
+        shutdown_event.set()
+
+    # Cancel whatever has not finished yet.
+    for unfinished in (t for t in active_tasks if not t.done()):
+        unfinished.cancel()
+
+
+async def cleanup(connections: List[AsyncConnection]):
+    """Optionally announce shutdown to the server, then close every connection.
+
+    The shutdown message goes out on the first connection that has a
+    transport, falling back to the first connection in the list, and only
+    when ``send_shutdown`` is set.
+    """
+    log = logging.getLogger("hbc.cleanup")
+    log.info("Cleaning up connections")
+
+    fallback = connections[0] if connections else None
+    target = next(filter(lambda c: c.transport, connections), fallback)
+    if target and send_shutdown:
+        try:
+            await target.sendto({"shutdown": 1, "acks": target.ackcount})
+        except Exception as e:
+            log.error(f"Error sending shutdown: {e}")
+
+    for connection in connections:
+        connection.close()
+
+    # Give messages time to send
+    await asyncio.sleep(0.5)
+
+
+async def async_main(args, config):
+    """Set up connections and plugins, then run until a stop is requested.
+
+    One connection is created per address every host resolves to. An optional
+    boot/service message is sent first; in message-only mode (``-m`` without
+    ``-d``) the function returns right after it. Otherwise the heartbeat
+    senders and the plugin collector run until stop() cancels them.
+
+    Args:
+        args: Parsed command-line namespace from build_parser()
+        config: Configuration dictionary from load_config()
+
+    Returns:
+        1 when no host could be resolved, otherwise 0
+    """
+    global shutdown_event, active_tasks, send_shutdown
+
+    # Create shutdown event
+    shutdown_event = asyncio.Event()
+    active_tasks = []
+
+    logger = logging.getLogger("hbc.main")
+
+    # Setup
+    iam = socket.gethostname()
+    if args.name:
+        iam = args.name
+
+    hb_hosts = args.hosts
+    hb_port = config.get("hb_port", PORT)
+    interval = config.get("interval", INTERVAL)
+
+    logger.info(f"hbc {__version__} on {iam} -> {hb_hosts} port={hb_port}, interval={interval}s")
+
+    # Create connections
+    connections = []
+    conn_id = 1
+
+    for host in hb_hosts:
+        try:
+            # The fifth argument is the protocol number, so use the IPPROTO_
+            # constant rather than the SOL_ socket-option level.
+            addrs = socket.getaddrinfo(host, hb_port, 0, 0, socket.IPPROTO_UDP)
+        except socket.gaierror as e:
+            logger.error(f"Cannot resolve {host}: {e}")
+            continue
+
+        for addr_info in addrs:
+            af = addr_info[0]
+            addr = addr_info[4][0]
+
+            conn = AsyncConnection(conn_id, addr, hb_port, af, iam)
+            if not await conn.open():
+                logger.warning(f"Initial open to {addr} failed, heartbeat sender will retry")
+            # Kept even when the open failed: the sender retries later.
+            connections.append(conn)
+            conn_id += 1
+
+    if not connections:
+        logger.error("No connections established (DNS resolution failed for all hosts)")
+        return 1
+
+    logger.info(f"Created {len(connections)} connections")
+
+    # Send boot/message if requested
+    send_shutdown = False
+    if args.boot or args.message:
+        boot_msg = {}
+        if args.boot:
+            boot_msg["boot"] = 1
+            args.boot = False  # Clear boot flag so we don't send it again in main loop
+            send_shutdown = True
+        if args.message:
+            boot_msg["service"] = "service"
+            boot_msg["msg"] = args.message
+
+        boot_msg["acks"] = 0
+        target = next((c for c in connections if c.transport), connections[0])
+        await target.sendto(boot_msg)
+
+        if args.message and not args.daemon:
+            # Message-only mode
+            await cleanup(connections)
+            return 0
+
+    # Load plugins
+    registry = PluginRegistry()
+    loader = PluginLoader(registry)
+
+    plugin_dir = Path(__file__).parent / "plugins"
+    if plugin_dir.exists():
+        count = await loader.load_from_directory(plugin_dir, config)
+        logger.info(f"Loaded {count} plugins")
+    else:
+        logger.warning(f"Plugin directory not found: {plugin_dir}")
+
+    # Setup signal handlers on the loop that is running this coroutine
+    loop = asyncio.get_running_loop()
+    for sig in (signal.SIGTERM, signal.SIGINT):
+        loop.add_signal_handler(sig, stop)
+
+    def _sighup():
+        # SIGHUP means "restart": stop now, main() re-execs afterwards.
+        global dorestart
+        dorestart = True
+        stop()
+
+    loop.add_signal_handler(signal.SIGHUP, _sighup)
+
+    # Start async tasks
+    # Heartbeat senders (one per connection)
+    for conn in connections:
+        task = asyncio.create_task(heartbeat_sender(conn, interval))
+        active_tasks.append(task)
+
+    # Plugin collector: plugin data is sent on the first connection only
+    if connections and registry.get_enabled():
+        task = asyncio.create_task(plugin_collector(connections[0], registry))
+        active_tasks.append(task)
+
+    # Wait for stop or tasks to complete
+    try:
+        await asyncio.gather(*active_tasks, return_exceptions=True)
+    except asyncio.CancelledError:
+        logger.info("Tasks cancelled")
+
+    # Cleanup
+    logger.info("Shutting down...")
+    await cleanup(connections)
+    await loader.unload_all()
+
+    return 0
+
+
+def daemonize(
+    working_dir="/",
+    stdin="/dev/zero",
+    stdout="/dev/null",
+    stderr="/dev/null"
+):
+    """Detach from the terminal using the UNIX double-fork sequence.
+
+    Args:
+        working_dir: Directory to change into after the first fork
+        stdin: File that replaces standard input
+        stdout: File that replaces standard output (opened for append)
+        stderr: File that replaces standard error (opened for append)
+    """
+    # Fork #1: the original parent exits, the child carries on.
+    try:
+        if os.fork() > 0:
+            os._exit(0)
+    except OSError as e:
+        sys.stderr.write(f"fork #1 failed: {e}\n")
+        os._exit(1)
+
+    os.chdir(working_dir)
+    os.setsid()
+    os.umask(0)
+
+    # Fork #2: the intermediate process exits as well.
+    try:
+        if os.fork() > 0:
+            os._exit(0)
+    except OSError as e:
+        sys.stderr.write(f"fork #2 failed: {e}\n")
+        sys.exit(1)
+
+    for stream in (sys.stdout, sys.stderr):
+        stream.flush()
+
+    # Open all three replacements first, then point the standard
+    # descriptors at them in stdin, stdout, stderr order.
+    replacements = (
+        (open(stdin, "r"), sys.stdin),
+        (open(stdout, "a+"), sys.stdout),
+        (open(stderr, "a+"), sys.stderr),
+    )
+    for new_file, std_stream in replacements:
+        os.dup2(new_file.fileno(), std_stream.fileno())
+
+
+def _reconfigure_logging_for_daemon(log_level: int) -> None:
+    """Swap every root handler for a single SysLogHandler.
+
+    The handler talks to ``/dev/log`` when that path exists and to UDP
+    ``localhost:514`` otherwise; in the latter case a warning is logged
+    through the new handler.
+
+    Args:
+        log_level: Level to set on the root logger
+    """
+    root = logging.getLogger()
+    for stale in list(root.handlers):
+        root.removeHandler(stale)
+        stale.close()
+
+    have_dev_log = os.path.exists("/dev/log")
+    address = "/dev/log" if have_dev_log else ("localhost", 514)
+
+    syslog_handler = SysLogHandler(
+        address=address,
+        facility=SysLogHandler.LOG_DAEMON,
+    )
+    syslog_handler.setFormatter(
+        logging.Formatter("hbc[%(process)d]: %(name)s %(levelname)s: %(message)s")
+    )
+    root.addHandler(syslog_handler)
+    root.setLevel(log_level)
+
+    if not have_dev_log:
+        logging.warning("/dev/log not found, using syslog UDP localhost:514")
+
+
+def build_parser():
+    """Create the command-line parser for hbc.
+
+    Returns:
+        argparse.ArgumentParser with all hbc options and the ``hosts`` positional
+    """
+    parser = argparse.ArgumentParser(
+        prog="hbc",
+        description="HeartBeatClient - send heartbeat messages to HeartBeatDaemon",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    add = parser.add_argument
+
+    # Options are registered in the order they appear in the help output.
+    add("-b", "--boot", action="store_true", help="Send a boot message")
+    add("-c", "--config", dest="configfile", help="Config file path (YAML)")
+    add("-m", "--message", dest="message", help="Send a message")
+    add("-n", "--name", dest="name", help="Name to use in heartbeat message")
+    add("-d", "--daemon", action="store_true", help="Run in daemon mode")
+    add("-v", "--verbose", action="store_true", help="Verbose output")
+    add("-x", "--debug", action="count", default=0, help="Increase debug level")
+    add("hosts", nargs="+", help="Heartbeat daemon hosts to send to")
+
+    return parser
+
+
+def main(argv=None):
+    """Main entry point.
+
+    Parses the command line, configures logging, loads the configuration,
+    optionally daemonizes, runs async_main() and finally either re-executes
+    the program (when a restart was requested) or exits with its exit code.
+
+    Args:
+        argv: Argument list to parse instead of sys.argv[1:] (default: None)
+    """
+    parser = build_parser()
+    args = parser.parse_args(argv)
+
+    # Setup logging
+    log_level = logging.WARNING
+    if args.verbose:
+        log_level = logging.INFO
+    if args.debug:
+        log_level = logging.DEBUG
+
+    logging.basicConfig(
+        level=log_level,
+        format="%(asctime)s %(name)s %(levelname)s: %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S"
+    )
+
+    # Load config
+    config = load_config(args.configfile)
+
+    # Resolve our own path while the original working directory is still
+    # current: daemonize() changes to "/", after which a relative argv[0]
+    # could no longer be found by the restart below.
+    exec_path = os.path.abspath(sys.argv[0])
+
+    # Daemonize if requested
+    if args.daemon:
+        logging.info("Daemonizing...")
+        daemonize()
+        _reconfigure_logging_for_daemon(log_level)
+        logging.info(f"hbc starting, sending heartbeat to {', '.join(args.hosts)}")
+
+    # Run async main
+    try:
+        exit_code = asyncio.run(async_main(args, config))
+    except KeyboardInterrupt:
+        logging.info("Interrupted by user")
+        exit_code = 0
+    except Exception as e:
+        logging.error(f"Fatal error: {e}", exc_info=True)
+        exit_code = 1
+
+    # Handle restart
+    if dorestart:
+        logging.info("Restarting...")
+        try:
+            # Replaces the current process; only returns by raising.
+            os.execv(exec_path, sys.argv)
+        except OSError as e:
+            logging.error(f"Restart failed: {e}")
+            sys.exit(1)
+
+    sys.exit(exit_code)
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,425 @@
+"""Plugin system for extending Heartbeat data collection and monitoring.
+
+This module provides the base classes and infrastructure for the plugin system
+that enables extending hbc (client) data collection and hbd (server) processing.
+
+Plugin Types:
+    - InfoPlugin: Collects static or rarely-changing information (OS, hardware)
+    - MonitorPlugin: Collects periodic monitoring data (CPU, memory, disk usage)
+
+Plugins run on the client (hbc) to gather data, which is then sent to the server
+(hbd) for storage, threshold checking, and display.
+"""
+
+import importlib.util
+import inspect
+import logging
+import sys
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Type
+
+
+class Plugin(ABC):
+    """Abstract root of the plugin hierarchy.
+
+    Attributes:
+        name: Unique plugin identifier (e.g., "os_info", "cpu_monitor")
+        version: Plugin version string
+        description: Human-readable description
+        interval: Collection interval in seconds (0 for InfoPlugin = collect once)
+        enabled: Whether plugin is active (can be disabled via config)
+        skip_reason: Set by plugin before returning False from initialize(); causes loader to log INFO instead of WARNING.
+    """
+
+    name: str = ""
+    version: str = "1.0.0"
+    description: str = ""
+    interval: int = 0
+    enabled: bool = True
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        """Store the configuration and create the per-plugin logger.
+
+        Args:
+            config: Plugin-specific configuration from YAML (e.g., thresholds, paths)
+        """
+        self.skip_reason: Optional[str] = None
+        self._initialized = False
+        self.logger = logging.getLogger(f"plugin.{self.name}")
+        # A missing or empty configuration becomes a fresh empty dict.
+        self.config = config if config else {}
+
+    @abstractmethod
+    async def initialize(self) -> bool:
+        """Prepare the plugin for use.
+
+        Runs once when the plugin is loaded. Implementations should verify
+        their dependencies (e.g., check if psutil is available) and set up
+        whatever resources they need.
+
+        Returns:
+            True if initialization succeeded, False otherwise
+        """
+
+    @abstractmethod
+    async def collect(self) -> Dict[str, Any]:
+        """Gather data from the system.
+
+        Invoked on every collection interval. The result maps metric names
+        (strings) to values, which may be:
+        - Scalars: int, float, str, bool
+        - Lists/dicts (will be serialized appropriately)
+
+        Returns:
+            Dictionary of collected metrics, or empty dict on error
+        """
+
+    async def cleanup(self) -> None:
+        """Release plugin resources before shutdown.
+
+        Runs when the plugin is unloaded or the system shuts down. The base
+        implementation does nothing; override to close connections and the like.
+        """
+        return None
+
+    def validate_data(self, data: Dict[str, Any]) -> bool:
+        """Check collected data before it is sent to the server.
+
+        The base implementation only requires a dict; override for stricter
+        checks.
+
+        Args:
+            data: Data returned from collect()
+
+        Returns:
+            True if data is valid, False otherwise
+        """
+        looks_valid = isinstance(data, dict)
+        return looks_valid
+
+
+class InfoPlugin(Plugin):
+    """Plugin for static or rarely-changing information.
+
+    Typical data:
+    - OS name and version
+    - Hardware specifications (CPU model, RAM size)
+    - Network interface MAC addresses
+
+    Characteristics:
+    - interval = 0 (collected once at startup by default)
+    - Can specify interval > 0 for periodic refresh (e.g., check for hardware changes)
+    - Data is cached and reused until next collection
+    """
+
+    interval: int = 0  # Collect once at startup
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        super().__init__(config)
+        # Result of the last _collect_info() call; None until first collected.
+        self._cached_data: Optional[Dict[str, Any]] = None
+
+    async def get_cached_data(self) -> Optional[Dict[str, Any]]:
+        """Return the cache without triggering a collection.
+
+        Returns:
+            Cached data dict, or None if not yet collected
+        """
+        return self._cached_data
+
+    async def collect(self) -> Dict[str, Any]:
+        """Return the cached information, collecting it first if necessary."""
+        cached = self._cached_data
+        if cached is None:
+            cached = self._cached_data = await self._collect_info()
+        return cached
+
+    @abstractmethod
+    async def _collect_info(self) -> Dict[str, Any]:
+        """Do the actual data collection.
+
+        InfoPlugins override this method rather than collect().
+        """
+
+    def invalidate_cache(self) -> None:
+        """Drop the cache so the next collect() call gathers fresh data."""
+        self._cached_data = None
+
+
+class MonitorPlugin(Plugin):
+    """Plugin for periodic monitoring data.
+
+    Typical time-series metrics:
+    - CPU usage percentage
+    - Memory consumption
+    - Disk I/O statistics
+    - Network traffic
+
+    Characteristics:
+    - interval > 0 (e.g., 30 seconds for CPU, 60 for disk)
+    - Collected continuously on schedule
+    - Data includes timestamps for time-series tracking
+    """
+
+    interval: int = 30  # Default: collect every 30 seconds
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        super().__init__(config)
+        # Most recent non-empty result of collect(); None until then.
+        self._last_reading: Optional[Dict[str, Any]] = None
+
+    def get_last_reading(self) -> Optional[Dict[str, Any]]:
+        """Return the most recent reading.
+
+        Returns:
+            Last reading dict with timestamp, or None if not yet collected
+        """
+        return self._last_reading
+
+    async def collect(self) -> Dict[str, Any]:
+        """Collect metrics, stamp them and remember them as the last reading.
+
+        An empty result is returned as is: it gets no timestamp and does not
+        replace the previous reading.
+        """
+        import time
+
+        reading = await self._collect_metrics()
+        if not reading:
+            return reading
+
+        # Stamp the reading with its collection time
+        reading['_timestamp'] = time.time()
+        self._last_reading = reading
+        return reading
+
+    @abstractmethod
+    async def _collect_metrics(self) -> Dict[str, Any]:
+        """Do the actual metric collection.
+
+        MonitorPlugins override this method rather than collect().
+        """
+
+
+class PluginRegistry:
+    """Registry for managing loaded plugins.
+    
+    Maintains a collection of loaded plugins and provides methods to
+    query plugins by name, type, or interval.
+    """
+    
+    def __init__(self):
+        """Start with an empty registry and its own logger."""
+        self.logger = logging.getLogger("plugin.registry")
+        # Registered plugins, keyed by plugin name.
+        self._plugins: Dict[str, Plugin] = dict()
+    
+    def register(self, plugin: Plugin) -> bool:
+        """Add a plugin instance under its name.
+
+        Args:
+            plugin: Plugin instance to register
+
+        Returns:
+            True if registered successfully, False if name conflict
+        """
+        already_known = plugin.name in self._plugins
+        if not already_known:
+            self._plugins[plugin.name] = plugin
+            self.logger.info(f"Registered plugin: {plugin.name} v{plugin.version}")
+            return True
+
+        # The name is taken: keep the existing plugin and report the clash.
+        self.logger.error(f"Plugin '{plugin.name}' already registered")
+        return False
+    
+    def unregister(self, name: str) -> bool:
+        """Unregister a plugin by name.
+        
+        Args:
+            name: Plugin name to unregister
+            
+        Returns:
+            True if unregistered, False if not found
+        """
+        if name in self._plugins:
+            del self._plugins[name]
+            self.logger.info(f"Unregistered plugin: {name}")
+            return True
+        return False
+    
+    def get(self, name: str) -> Optional[Plugin]:
+        """Get plugin by name.
+        
+        Args:
+            name: Plugin name
+            
+        Returns:
+            Plugin instance or None if not found
+        """
+        return self._plugins.get(name)
+    
+    def get_all(self) -> List[Plugin]:
+        """Get all registered plugins."""
+        return list(self._plugins.values())
+    
+    def get_enabled(self) -> List[Plugin]:
+        """Get all enabled plugins."""
+        return [p for p in self._plugins.values() if p.enabled]
+    
+    def get_by_type(self, plugin_type: Type[Plugin]) -> List[Plugin]:
+        """Get all plugins of a specific type.
+        
+        Args:
+            plugin_type: Plugin class (InfoPlugin or MonitorPlugin)
+            
+        Returns:
+            List of plugins matching the type
+        """
+        return [p for p in self._plugins.values() if isinstance(p, plugin_type)]
+    
+    def get_by_interval(self, interval: int) -> List[Plugin]:
+        """Get all plugins with a specific collection interval.
+        
+        Args:
+            interval: Interval in seconds (0 for one-time collection)
+            
+        Returns:
+            List of plugins with matching interval
+        """
+        return [p for p in self._plugins.values() if p.interval == interval]
+
+
+class PluginLoader:
+    """Load plugins from filesystem and instantiate them.
+    
+    Scans plugin directories for Python modules containing Plugin subclasses,
+    loads them dynamically, and registers them with the PluginRegistry.
+    """
+    
+    def __init__(self, registry: PluginRegistry):
+        self.registry = registry
+        self.logger = logging.getLogger("plugin.loader")
+        self._loaded_modules: Dict[str, Any] = {}
+    
+    async def load_from_directory(
+        self,
+        directory: Path,
+        config: Optional[Dict[str, Any]] = None
+    ) -> int:
+        """Load all plugins from a directory.
+        
+        Scans for .py files, imports them, finds Plugin subclasses,
+        instantiates them with config, initializes, and registers.
+        
+        Args:
+            directory: Path to plugin directory
+            config: Configuration dict (may contain per-plugin config)
+            
+        Returns:
+            Number of plugins successfully loaded
+        """
+        if not directory.exists() or not directory.is_dir():
+            self.logger.warning(f"Plugin directory not found: {directory}")
+            return 0
+        
+        loaded_count = 0
+        raw_config = config or {}
+        # Per-plugin config lives under the 'plugins' key or at top-level.
+        # CLIENT_DEFAULTS seeds "plugins": {} so the key always exists; check
+        # both the subdict and top-level so that either layout in .hbc.yaml works.
+        plugins_subconfig = raw_config.get("plugins", {})
+        
+        # Scan for Python files
+        for plugin_file in directory.glob("*.py"):
+            if plugin_file.name.startswith("_"):
+                continue  # Skip __init__.py and private modules
+            
+            self.logger.debug(f"Processing plugin file: {plugin_file.name}")
+            
+            try:
+                # Load module dynamically
+                module_name = f"plugins.{plugin_file.stem}"
+                spec = importlib.util.spec_from_file_location(module_name, plugin_file)
+                if not spec or not spec.loader:
+                    self.logger.warning(f"Could not create spec for {plugin_file}")
+                    continue
+                
+                module = importlib.util.module_from_spec(spec)
+                sys.modules[module_name] = module
+                spec.loader.exec_module(module)
+                self._loaded_modules[module_name] = module
+                
+                self.logger.debug(f"Loaded module: {module_name}")
+                
+                # Track which plugin classes we've already processed to avoid duplicates
+                processed_classes = set()
+                
+                # Find Plugin subclasses in module
+                for name, obj in inspect.getmembers(module, inspect.isclass):
+                    # Skip base classes and non-Plugin classes
+                    if obj in (Plugin, InfoPlugin, MonitorPlugin):
+                        self.logger.debug(f"Skipping base class: {name}")
+                        continue
+                    if not issubclass(obj, Plugin):
+                        self.logger.debug(f"Skipping non-Plugin class: {name}")
+                        continue
+                    
+                    # Skip if we've already processed this class (handles module-level aliases)
+                    if id(obj) in processed_classes:
+                        self.logger.debug(f"Skipping duplicate reference to: {obj.__name__}")
+                        continue
+                    processed_classes.add(id(obj))
+                    
+                    self.logger.debug(f"Found plugin class: {name}")
+                    
+                    # Instantiate plugin with config — check plugins subdict first,
+                    # then top-level keys (e.g. nagios_runner: ... at root of config).
+                    plugin_instance_config = dict(plugins_subconfig.get(obj.name) or raw_config.get(obj.name) or {})
+                    # Propagate top-level owner so os_info (and any future plugin) can report it.
+                    if "owner" in raw_config and "owner" not in plugin_instance_config:
+                        plugin_instance_config["owner"] = raw_config["owner"]
+                    plugin = obj(config=plugin_instance_config)
+                    
+                    # Initialize plugin
+                    try:
+                        initialized = await plugin.initialize()
+                        if not initialized:
+                            if plugin.skip_reason:
+                                self.logger.info(
+                                    f"Plugin {plugin.name} skipped: {plugin.skip_reason}"
+                                )
+                            else:
+                                self.logger.warning(
+                                    f"Plugin {plugin.name} failed initialization, skipping"
+                                )
+                            continue
+                    except Exception as e:
+                        self.logger.error(
+                            f"Error initializing plugin {plugin.name}: {e}",
+                            exc_info=True
+                        )
+                        continue
+                    
+                    # Register with registry
+                    if self.registry.register(plugin):
+                        loaded_count += 1
+                        self.logger.info(
+                            f"Loaded plugin: {plugin.name} v{plugin.version} "
+                            f"(interval: {plugin.interval}s)"
+                        )
+                
+            except Exception as e:
+                self.logger.error(
+                    f"Error loading plugin from {plugin_file}: {e}",
+                    exc_info=True
+                )
+        
+        return loaded_count
+    
+    async def unload_all(self) -> None:
+        """Unload all plugins and cleanup resources."""
+        for plugin in self.registry.get_all():
+            try:
+                await plugin.cleanup()
+            except Exception as e:
+                self.logger.error(
+                    f"Error cleaning up plugin {plugin.name}: {e}",
+                    exc_info=True
+                )
+            self.registry.unregister(plugin.name)
+        
+        # Remove loaded modules
+        for module_name in self._loaded_modules:
+            if module_name in sys.modules:
+                del sys.modules[module_name]
+        self._loaded_modules.clear()
@@ -0,0 +1,136 @@
+"""CPU Monitoring Plugin for Heartbeat.
+
+Collects CPU usage statistics including overall CPU percentage, per-core usage,
+load average, and process counts.
+"""
+
+from typing import Any, Dict, Optional
+import sys
+from pathlib import Path
+
+# Import from parent package
+from hbd.client.plugin import MonitorPlugin
+
+
+class CPUMonitorPlugin(MonitorPlugin):
+    """Monitor CPU usage and load.
+    
+    Collects:
+    - Overall CPU usage percentage
+    - Per-core CPU usage (if enabled in config)
+    - Load average (1min, 5min, 15min)
+    - Process count
+    - CPU frequency (if available)
+    """
+    
+    name = "cpu_monitor"
+    version = "1.0.0"
+    description = "CPU usage and load monitoring"
+    interval = 300  # MonitorPlugin: collect every 5 minutes by default
+    
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        super().__init__(config)
+        self.psutil = None
+        self.per_core = config.get("per_core", False) if config else False
+        self.interval = config.get("interval", 300) if config else 300
+    
+    async def initialize(self) -> bool:
+        """Initialize the CPU monitor plugin.
+        
+        Checks if psutil is available.
+        
+        Returns:
+            True if psutil is available, False otherwise
+        """
+        self.logger.info(f"Initializing {self.name} plugin")
+        
+        try:
+            import psutil
+            self.psutil = psutil
+            self.logger.info(f"{self.name} initialized successfully")
+            return True
+        except ImportError:
+            self.logger.error(
+                "psutil module not available. Install with: pip install psutil"
+            )
+            return False
+    
+    async def _collect_metrics(self) -> Dict[str, Any]:
+        """Collect CPU metrics.
+        
+        Returns:
+            Dictionary with CPU metrics
+        """
+        if not self.psutil:
+            return {}
+        
+        try:
+            data = {}
+            
+            # Overall CPU usage percentage (non-blocking, interval=1 for accuracy)
+            # Note: first call to cpu_percent() returns 0.0, subsequent calls work correctly
+            data["cpu_percent"] = self.psutil.cpu_percent(interval=1)
+            
+            # Per-core CPU usage (if enabled)
+            if self.per_core:
+                per_core_percents = self.psutil.cpu_percent(interval=0, percpu=True)
+                data["cpu_per_core"] = per_core_percents
+                data["cpu_core_count"] = len(per_core_percents)
+            else:
+                # Just report core count
+                data["cpu_core_count"] = self.psutil.cpu_count()
+            
+            # Load average (Unix-like systems only)
+            try:
+                load_avg = self.psutil.getloadavg()
+                data["load_1min"] = round(load_avg[0], 2)
+                data["load_5min"] = round(load_avg[1], 2)
+                data["load_15min"] = round(load_avg[2], 2)
+            except (AttributeError, OSError):
+                # Not available on Windows
+                pass
+            
+            # Process count
+            try:
+                data["process_count"] = len(self.psutil.pids())
+            except Exception as e:
+                self.logger.warning(f"Could not get process count: {e}")
+            
+            # CPU frequency (if available)
+            try:
+                freq = self.psutil.cpu_freq()
+                if freq:
+                    data["cpu_freq_current"] = round(freq.current, 2)
+                    data["cpu_freq_min"] = round(freq.min, 2)
+                    data["cpu_freq_max"] = round(freq.max, 2)
+            except (AttributeError, OSError, RuntimeError, SystemError) as e:
+                # Not available on all systems, or may fail on FreeBSD with sysctl issues
+                self.logger.debug(f"CPU frequency not available: {e}")
+                pass
+            
+            # CPU times (user, system, idle, etc.)
+            try:
+                cpu_times = self.psutil.cpu_times_percent(interval=0)
+                data["cpu_user"] = round(cpu_times.user, 1)
+                data["cpu_system"] = round(cpu_times.system, 1)
+                data["cpu_idle"] = round(cpu_times.idle, 1)
+                if hasattr(cpu_times, "iowait"):
+                    data["cpu_iowait"] = round(cpu_times.iowait, 1)
+            except Exception as e:
+                self.logger.debug(f"Could not get CPU times: {e}")
+
+            # Uptime in seconds
+            try:
+                import time
+                data["uptime_seconds"] = int(time.time() - self.psutil.boot_time())
+            except Exception as e:
+                self.logger.debug(f"Could not get uptime: {e}")
+            
+            self.logger.debug(
+                f"Collected CPU metrics: {data.get('cpu_percent', 'N/A')}% usage"
+            )
+            return data
+            
+        except Exception as e:
+            self.logger.error(f"Error collecting CPU metrics: {e}", exc_info=True)
+            return {}
@@ -0,0 +1,199 @@
+"""
+Disk monitoring plugin for Heartbeat.
+
+Collects disk usage and I/O statistics using psutil.
+"""
+
+import logging
+from typing import Dict, Any, Optional, List
+
+try:
+    import psutil
+except ImportError:
+    psutil = None
+
+from hbd.client.plugin import MonitorPlugin
+
+logger = logging.getLogger(__name__)
+
+
+class DiskMonitorPlugin(MonitorPlugin):
+    """
+    Monitor disk usage and I/O statistics.
+    
+    Collects:
+    - Disk partition information
+    - Disk usage per partition (total, used, free, percent)
+    - Disk I/O counters (read/write bytes, read/write count)
+    - Disk I/O time statistics
+    
+    Configuration:
+        interval: Collection interval in seconds (default: 300)
+        partitions: List of mount points to monitor (default: all)
+        include_io: Include disk I/O statistics (default: True)
+        exclude_types: List of filesystem types to exclude (default: tmpfs, devtmpfs, squashfs)
+    """
+    
+    name = "disk_monitor"
+    interval = 300  # Collect every 5 minutes by default
+    
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        """
+        Initialize the disk monitor plugin.
+        
+        Args:
+            config: Optional configuration dict with keys:
+                   - interval: Collection interval in seconds (default: 300)
+                   - partitions: List of specific mount points to monitor
+                   - include_io: Include I/O statistics (default: True)
+                   - exclude_types: List of filesystem types to exclude
+        """
+        super().__init__(config)
+        self.partitions = self.config.get('partitions', None)  # None = all partitions
+        self.include_io = self.config.get('include_io', True)
+        self.exclude_types = set(self.config.get('exclude_types', ['tmpfs', 'devtmpfs', 'squashfs']))
+        self.interval = self.config.get('interval', 300)
+        
+        if psutil is None:
+            raise ImportError("psutil library is required for disk_monitor plugin")
+        
+        # Store previous I/O counters for delta calculation
+        self._prev_io = {}
+    
+    async def initialize(self):
+        """Initialize the plugin (check psutil availability)."""
+        if psutil is None:
+            logger.error("psutil not available - disk_monitor cannot run")
+            return False
+        
+        logger.info(f"Disk monitor initialized (interval: {self.interval}s, io: {self.include_io})")
+        
+        # Initialize I/O counters if available
+        if self.include_io:
+            try:
+                self._prev_io = psutil.disk_io_counters(perdisk=True)
+            except Exception as e:
+                logger.warning(f"Could not initialize disk I/O counters: {e}")
+        
+        return True
+    
+    async def collect(self) -> Dict[str, Any]:
+        """
+        Collect current disk statistics.
+        
+        Returns:
+            Dictionary with disk metrics organized by partition:
+            - partitions: Dict of partition data, keyed by mount point
+              - device: Device name (e.g., /dev/sda1)
+              - fstype: Filesystem type (e.g., ext4)
+              - total: Total space in bytes
+              - used: Used space in bytes
+              - free: Free space in bytes
+              - percent: Usage percentage
+            - io_counters: Dict of I/O statistics, keyed by disk name (if include_io)
+              - read_count: Number of reads
+              - write_count: Number of writes
+              - read_bytes: Bytes read
+              - write_bytes: Bytes written
+              - read_time: Time spent reading in ms
+              - write_time: Time spent writing in ms
+              - read_bytes_delta: Bytes read since last collection
+              - write_bytes_delta: Bytes written since last collection
+        """
+        if psutil is None:
+            logger.error("psutil not available")
+            return {}
+        
+        try:
+            data = await self._collect_metrics()
+            logger.debug(f"Collected disk metrics: {len(data.get('partitions', {}))} partitions")
+            return data
+        except Exception as e:
+            logger.error(f"Error collecting disk metrics: {e}")
+            return {"error": str(e)}
+    
+    async def _collect_metrics(self) -> Dict[str, Any]:
+        """Collect disk metrics from psutil."""
+        metrics = {}
+        
+        # Collect partition usage
+        partitions_data = {}
+        partitions = psutil.disk_partitions(all=False)
+        
+        for partition in partitions:
+            # Skip unwanted filesystem types
+            if partition.fstype in self.exclude_types:
+                continue
+            
+            # Skip if we're only monitoring specific partitions
+            if self.partitions and partition.mountpoint not in self.partitions:
+                continue
+            
+            try:
+                usage = psutil.disk_usage(partition.mountpoint)
+                partitions_data[partition.mountpoint] = {
+                    'device': partition.device,
+                    'fstype': partition.fstype,
+                    'total': usage.total,
+                    'used': usage.used,
+                    'free': usage.free,
+                    'percent': usage.percent
+                }
+            except PermissionError:
+                logger.debug(f"Permission denied accessing {partition.mountpoint}")
+                continue
+            except Exception as e:
+                logger.warning(f"Error reading {partition.mountpoint}: {e}")
+                continue
+        
+        metrics['partitions'] = partitions_data
+        
+        # Collect I/O statistics
+        if self.include_io:
+            try:
+                io_counters = psutil.disk_io_counters(perdisk=True)
+                io_data = {}
+                
+                for disk_name, counters in io_counters.items():
+                    disk_stats = {
+                        'read_count': counters.read_count,
+                        'write_count': counters.write_count,
+                        'read_bytes': counters.read_bytes,
+                        'write_bytes': counters.write_bytes,
+                    }
+                    
+                    # Add time statistics if available
+                    if hasattr(counters, 'read_time'):
+                        disk_stats['read_time'] = counters.read_time
+                    if hasattr(counters, 'write_time'):
+                        disk_stats['write_time'] = counters.write_time
+                    if hasattr(counters, 'busy_time'):
+                        disk_stats['busy_time'] = counters.busy_time
+                    
+                    # Calculate deltas from previous collection
+                    if disk_name in self._prev_io:
+                        prev = self._prev_io[disk_name]
+                        disk_stats['read_bytes_delta'] = counters.read_bytes - prev.read_bytes
+                        disk_stats['write_bytes_delta'] = counters.write_bytes - prev.write_bytes
+                        disk_stats['read_count_delta'] = counters.read_count - prev.read_count
+                        disk_stats['write_count_delta'] = counters.write_count - prev.write_count
+                    
+                    io_data[disk_name] = disk_stats
+                
+                metrics['io_counters'] = io_data
+                
+                # Store current counters for next delta calculation
+                self._prev_io = io_counters
+                
+            except Exception as e:
+                logger.warning(f"Could not collect disk I/O statistics: {e}")
+        
+        return metrics
+    
+    async def cleanup(self):
+        """Cleanup (nothing to do for this plugin)."""
+        logger.info("Disk monitor cleanup")
+
+
+# Exposes the plugin class itself (not an instance) under the name `plugin` for automatic discovery
+plugin = DiskMonitorPlugin
@@ -0,0 +1,168 @@
+"""
+Filesystem information plugin for Heartbeat.
+
+Collects static filesystem and partition information using psutil.
+"""
+
+import logging
+from typing import Dict, Any, Optional
+
+try:
+    import psutil
+except ImportError:
+    psutil = None
+
+from hbd.client.plugin import InfoPlugin
+
+logger = logging.getLogger(__name__)
+
+
+class FilesystemInfoPlugin(InfoPlugin):
+    """
+    Collect filesystem and partition information.
+    
+    This is an InfoPlugin that collects static information once during startup.
+    
+    By default, only reports physical mounted filesystems (e.g., ext4, xfs, btrfs).
+    Set include_pseudo=True to also include pseudo filesystems (proc, sysfs, tmpfs, etc.).
+    
+    Collects:
+    - List of mounted filesystems
+    - Partition details (device, mount point, filesystem type, options)
+    - Filesystem capabilities and features
+    
+    Configuration:
+        include_pseudo: Include pseudo/virtual filesystems (default: False)
+        exclude_types: List of additional filesystem types to exclude (default: [])
+    """
+    
+    name = "filesystem_info"
+    interval = 0  # InfoPlugin - collect once
+    
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        """
+        Initialize the filesystem info plugin.
+        
+        Args:
+            config: Optional configuration dict with keys:
+                   - include_pseudo: Include pseudo/virtual filesystems (default: False)
+                   - exclude_types: List of filesystem types to exclude (default: [])
+        """
+        super().__init__(config)
+        self.include_pseudo = self.config.get('include_pseudo', False)
+        # By default, no exclusions since all=False filters most pseudo filesystems
+        # Users can add specific types to exclude if needed
+        self.exclude_types = set(self.config.get('exclude_types', []))
+        
+        if psutil is None:
+            raise ImportError("psutil library is required for filesystem_info plugin")
+    
+    async def initialize(self):
+        """Initialize the plugin (check psutil availability)."""
+        if psutil is None:
+            logger.error("psutil not available - filesystem_info cannot run")
+            return False
+        
+        logger.info(f"Filesystem info initialized (pseudo: {self.include_pseudo})")
+        return True
+    
+    async def collect(self) -> Dict[str, Any]:
+        """
+        Collect filesystem information.
+        
+        Returns only physical mounted filesystems by default.
+        
+        Returns:
+            Dictionary with filesystem data:
+            - filesystems: List of filesystem dictionaries:
+              - device: Device name (e.g., /dev/sda1)
+              - mountpoint: Mount point path
+              - fstype: Filesystem type (e.g., ext4, xfs, btrfs)
+              - opts: Mount options (comma-separated string)
+              - maxfile: Maximum filename length
+              - maxpath: Maximum path length
+            - filesystem_types: List of unique filesystem types found
+            - mount_count: Total number of mounted filesystems
+        """
+        if psutil is None:
+            logger.error("psutil not available")
+            return {}
+        
+        try:
+            data = await self._collect_info()
+            logger.info(f"Collected filesystem info: {len(data.get('filesystems', []))} filesystems")
+            return data
+        except Exception as e:
+            logger.error(f"Error collecting filesystem info: {e}")
+            return {"error": str(e)}
+    
+    async def _collect_info(self) -> Dict[str, Any]:
+        """Collect filesystem information from psutil."""
+        info = {}
+        filesystems = []
+        filesystem_types = set()
+        
+        # Get mounted disk partitions
+        # all=False returns only physical devices (real mounted filesystems)
+        # all=True would include pseudo filesystems (proc, sysfs, etc.)
+        partitions = psutil.disk_partitions(all=self.include_pseudo)
+        
+        for partition in partitions:
+            # Additional filtering if exclude_types is specified
+            if partition.fstype in self.exclude_types:
+                continue
+            
+            fs_info = {
+                'device': partition.device,
+                'mountpoint': partition.mountpoint,
+                'fstype': partition.fstype,
+                'opts': partition.opts,
+            }
+            
+            # Try to get filesystem capabilities
+            try:
+                # Get path configuration for this mount point
+                import os
+                if hasattr(os, 'pathconf'):
+                    try:
+                        # Maximum filename length
+                        max_name = os.pathconf(partition.mountpoint, 'PC_NAME_MAX')
+                        if max_name:
+                            fs_info['maxfile'] = max_name
+                    except (OSError, ValueError):
+                        pass
+                    
+                    try:
+                        # Maximum path length
+                        max_path = os.pathconf(partition.mountpoint, 'PC_PATH_MAX')
+                        if max_path:
+                            fs_info['maxpath'] = max_path
+                    except (OSError, ValueError):
+                        pass
+            except Exception as e:
+                logger.debug(f"Could not get pathconf for {partition.mountpoint}: {e}")
+            
+            filesystems.append(fs_info)
+            filesystem_types.add(partition.fstype)
+        
+        info['filesystems'] = filesystems
+        info['filesystem_types'] = sorted(list(filesystem_types))
+        info['mount_count'] = len(filesystems)
+        
+        # Add some additional filesystem statistics
+        try:
+            # Get boot time (useful for determining filesystem mount times)
+            boot_time = psutil.boot_time()
+            info['boot_time'] = boot_time
+        except Exception as e:
+            logger.debug(f"Could not get boot time: {e}")
+        
+        return info
+    
+    async def cleanup(self):
+        """Cleanup (nothing to do for this plugin)."""
+        logger.info("Filesystem info cleanup")
+
+
+# Exposes the plugin class itself (not an instance) under the name `plugin` for automatic discovery
+plugin = FilesystemInfoPlugin
@@ -0,0 +1,175 @@
+"""
+Memory monitoring plugin for Heartbeat.
+
+Collects memory and swap usage statistics using psutil.
+"""
+
+import logging
+from typing import Dict, Any, Optional
+
+try:
+    import psutil
+except ImportError:
+    psutil = None
+
+from hbd.client.plugin import MonitorPlugin
+
+
+def _zfs_arc_bytes() -> int:
+    """Return current ZFS ARC size in bytes, or 0 if ZFS is not present.
+
+    ZFS ARC is reclaimable but is not included in MemAvailable by the Linux
+    kernel (it is not in SReclaimable), so it would otherwise be counted as
+    used memory.
+    """
+    try:
+        with open("/proc/spl/kstat/zfs/arcstats") as fh:
+            for line in fh:
+                parts = line.split()
+                if len(parts) >= 3 and parts[0] == "size":
+                    return int(parts[2])
+    except (OSError, ValueError):
+        pass
+    return 0
+
+logger = logging.getLogger(__name__)
+
+
+class MemoryMonitorPlugin(MonitorPlugin):
+    """
+    Monitor memory and swap usage.
+    
+    Collects:
+    - Physical memory (RAM) usage and statistics
+    - Virtual memory details
+    - Swap memory usage and statistics
+    - Memory available for applications
+    
+    Configuration:
+        interval: Collection interval in seconds (default: 300)
+        include_swap: Include swap statistics (default: True)
+    """
+    
+    name = "memory_monitor"
+    interval = 300  # Collect every 5 minutes by default
+    
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        """
+        Initialize the memory monitor plugin.
+        
+        Args:
+            config: Optional configuration dict with keys:
+                   - interval: Collection interval in seconds (default: 300)
+                   - include_swap: Include swap statistics (default: True)
+        """
+        super().__init__(config)
+        self.include_swap = self.config.get('include_swap', True)
+        self.interval = self.config.get('interval', 300)
+        
+        if psutil is None:
+            raise ImportError("psutil library is required for memory_monitor plugin")
+    
+    async def initialize(self):
+        """Initialize the plugin (check psutil availability)."""
+        if psutil is None:
+            logger.error("psutil not available - memory_monitor cannot run")
+            return False
+        
+        logger.info(f"Memory monitor initialized (interval: {self.interval}s, swap: {self.include_swap})")
+        return True
+    
+    async def collect(self) -> Dict[str, Any]:
+        """
+        Collect current memory statistics.
+        
+        Returns:
+            Dictionary with memory metrics:
+            - memory_total: Total physical RAM in bytes
+            - memory_available: Available memory in bytes
+            - memory_used: Used memory in bytes
+            - memory_free: Free memory in bytes
+            - memory_percent: Memory usage percentage
+            - memory_active: Active memory (Unix)
+            - memory_inactive: Inactive memory (Unix)
+            - memory_buffers: Buffers (Linux)
+            - memory_cached: Cached (Linux)
+            - memory_shared: Shared (Linux)
+            - swap_total: Total swap in bytes (if include_swap)
+            - swap_used: Used swap in bytes (if include_swap)
+            - swap_free: Free swap in bytes (if include_swap)
+            - swap_percent: Swap usage percentage (if include_swap)
+            - swap_sin: Bytes swapped in from disk (if include_swap)
+            - swap_sout: Bytes swapped out to disk (if include_swap)
+        """
+        if psutil is None:
+            logger.error("psutil not available")
+            return {}
+        
+        try:
+            data = await self._collect_metrics()
+            logger.debug(f"Collected memory metrics: {len(data)} fields")
+            return data
+        except Exception as e:
+            logger.error(f"Error collecting memory metrics: {e}")
+            return {"error": str(e)}
+    
+    async def _collect_metrics(self) -> Dict[str, Any]:
+        """Collect memory metrics from psutil."""
+        metrics = {}
+        
+        # Virtual (physical) memory statistics
+        vmem = psutil.virtual_memory()
+
+        # psutil's available already excludes page cache / file buffers
+        # (uses MemAvailable on Linux). Add ZFS ARC on top because the kernel
+        # does not include it in SReclaimable / MemAvailable even though it is
+        # reclaimable.
+        arc_bytes = _zfs_arc_bytes()
+        available = min(vmem.available + arc_bytes, vmem.total)
+        used = vmem.total - available
+        percent = round(used / vmem.total * 100, 1) if vmem.total else 0.0
+
+        metrics['memory_total'] = vmem.total
+        metrics['memory_available'] = available
+        metrics['memory_used'] = used
+        metrics['memory_free'] = vmem.free
+        metrics['memory_percent'] = percent
+        
+        # Platform-specific memory details
+        if hasattr(vmem, 'active'):
+            metrics['memory_active'] = vmem.active
+        if hasattr(vmem, 'inactive'):
+            metrics['memory_inactive'] = vmem.inactive
+        if hasattr(vmem, 'buffers'):
+            metrics['memory_buffers'] = vmem.buffers
+        if hasattr(vmem, 'cached'):
+            metrics['memory_cached'] = vmem.cached
+        if hasattr(vmem, 'shared'):
+            metrics['memory_shared'] = vmem.shared
+        
+        # Swap memory statistics
+        if self.include_swap:
+            try:
+                swap = psutil.swap_memory()
+                metrics['swap_total'] = swap.total
+                metrics['swap_used'] = swap.used
+                metrics['swap_free'] = swap.free
+                metrics['swap_percent'] = swap.percent
+                
+                # Swap in/out counters (may not be available on all platforms)
+                if hasattr(swap, 'sin'):
+                    metrics['swap_sin'] = swap.sin
+                if hasattr(swap, 'sout'):
+                    metrics['swap_sout'] = swap.sout
+            except Exception as e:
+                logger.warning(f"Could not collect swap statistics: {e}")
+        
+        return metrics
+    
+    async def cleanup(self):
+        """Cleanup (nothing to do for this plugin)."""
+        logger.info("Memory monitor cleanup")
+
+
+# Exposes the plugin class itself (not an instance) under the name `plugin` for automatic discovery
+plugin = MemoryMonitorPlugin
@@ -0,0 +1,287 @@
+"""Nagios Plugin Runner for Heartbeat.
+
+Executes Nagios-compatible monitoring plugins and parses their output.
+
+Nagios Plugin Standard:
+- Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
+- Output format: Single line status message, optional performance data
+- Performance data format: 'label'=value[UOM];[warn];[crit];[min];[max]
+
+Example configuration in ~/.hb.yaml:
+```yaml
+nagios_runner:
+  interval: 60
+  commands:
+    - name: check_disk_root
+      command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
+    - name: check_procs
+      command: /usr/lib/nagios/plugins/check_procs -w 250 -c 400
+    - name: check_load
+      command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
+```
+"""
+
+import asyncio
+import os
+import re
+import shlex
+from typing import Any, Dict, List, Optional, Tuple
+
+from hbd.client.plugin import MonitorPlugin
+
+
+# Nagios exit codes
+# UNKNOWN doubles as the fallback for timeouts, signals, execution errors and
+# exit codes above 3.
+NAGIOS_UNKNOWN = 3
+
+# Exit code -> status name reported in the "<name>_status" metric.
+STATUS_NAMES = {
+    0: "OK",
+    1: "WARNING",
+    2: "CRITICAL",
+    3: "UNKNOWN",
+}
+
+
+class NagiosRunnerPlugin(MonitorPlugin):
+    """Run Nagios-compatible monitoring plugins.
+    
+    This plugin executes external Nagios plugins and collects their output,
+    including status codes, messages, and performance data.
+    
+    Configuration:
+        interval: Collection interval in seconds (default: 300)
+        commands: List of command definitions with 'name' and 'command' keys
+        timeout: Command execution timeout in seconds (default: 30)
+
+    Example:
+        nagios_runner:
+          interval: 300  # Check every 5 minutes
+          timeout: 30
+          commands:
+            - name: check_disk
+              command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10%
+            - name: check_load
+              command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
+    """
+    
+    name = "nagios_runner"
+    version = "1.0.0"
+    description = "Execute Nagios-compatible monitoring plugins"
+    interval = 300  # MonitorPlugin: collect every 5 minutes by default
+    
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        super().__init__(config)
+        
+        # Extract configuration
+        self.commands: List[Dict[str, str]] = config.get("commands", []) if config else []
+        self.timeout: int = config.get("timeout", 30) if config else 30
+        self.interval = config.get("interval", 300) if config else 300
+    
+    async def initialize(self) -> bool:
+        """Initialize the Nagios runner plugin.
+
+        Returns:
+            True if at least one command is configured, False otherwise
+        """
+        self.logger.info(f"Initializing {self.name} plugin")
+
+        if not self.commands:
+            self.skip_reason = "no commands configured (add nagios_runner.commands to config)"
+            return False
+
+        self.logger.info(f"Configured to run {len(self.commands)} Nagios plugin(s)")
+        for cmd_config in self.commands:
+            name = cmd_config.get("name", "unnamed")
+            self.logger.info(f"  - {name}: {cmd_config.get('command', 'N/A')}")
+
+        # Validate absolute command paths early
+        for cmd_config in self.commands:
+            name = cmd_config.get("name", "unnamed")
+            command = cmd_config.get("command", "")
+            if not command:
+                continue
+            try:
+                tokens = shlex.split(command)
+            except ValueError:
+                continue  # malformed command string; skip validation
+            if not tokens:
+                continue
+            exe = tokens[0]
+            if os.path.isabs(exe):
+                if not os.path.isfile(exe):
+                    self.logger.warning(
+                        f"Command '{name}': executable not found: {exe}"
+                    )
+                elif not os.access(exe, os.X_OK):
+                    self.logger.warning(
+                        f"Command '{name}': executable not executable: {exe}"
+                    )
+
+        return True
+    
+    async def _collect_metrics(self) -> Dict[str, Any]:
+        """Collect metrics from all configured Nagios plugins.
+        
+        Returns:
+            Dictionary with results from all plugins
+        """
+        results = {}
+
+        for cmd_config in self.commands:
+            name = cmd_config.get("name")
+            command = cmd_config.get("command")
+
+            if not name or not command:
+                self.logger.warning("Skipping command with missing name or command")
+                continue
+
+            # Execute plugin
+            try:
+                status_code, output, perfdata = await self._run_nagios_plugin(command)
+
+                # Store results
+                results[f"{name}_status"] = STATUS_NAMES.get(status_code, "UNKNOWN")
+                results[f"{name}_status_code"] = status_code
+                results[f"{name}_output"] = output
+
+                # Parse and add performance data
+                if perfdata:
+                    for metric_name, metric_value in perfdata.items():
+                        results[f"{name}_{metric_name}"] = metric_value
+
+                self.logger.info(
+                    f"Executed {name}: {STATUS_NAMES.get(status_code, 'UNKNOWN')} - {output[:50]}"
+                )
+
+            except Exception as e:
+                self.logger.error(f"Error running {name}: {e}", exc_info=True)
+                results[f"{name}_status"] = "ERROR"
+                results[f"{name}_status_code"] = NAGIOS_UNKNOWN
+                results[f"{name}_output"] = str(e)
+
+        return results
+    
+    async def _run_nagios_plugin(
+        self,
+        command: str
+    ) -> Tuple[int, str, Dict[str, Any]]:
+        """Execute a Nagios plugin and parse its output."""
+        try:
+            proc = await asyncio.create_subprocess_shell(
+                command,
+                stdout=asyncio.subprocess.PIPE,
+                stderr=asyncio.subprocess.PIPE,
+            )
+            try:
+                stdout_bytes, stderr_bytes = await asyncio.wait_for(
+                    proc.communicate(), timeout=self.timeout
+                )
+            except asyncio.TimeoutError:
+                proc.kill()
+                await proc.communicate()
+                self.logger.error(f"Command timed out: {command}")
+                return NAGIOS_UNKNOWN, f"Command timed out after {self.timeout}s", {}
+
+            status_code = proc.returncode
+
+            if status_code < 0:
+                return NAGIOS_UNKNOWN, f"Process killed by signal {-status_code}", {}
+
+            if status_code > 3:
+                status_code = NAGIOS_UNKNOWN
+
+            stdout = stdout_bytes.decode(errors="replace").strip()
+            stderr = stderr_bytes.decode(errors="replace").strip()
+
+            # Parse perfdata from stdout before mixing in stderr
+            perfdata = self._parse_perfdata(stdout)
+
+            # Build status message
+            status_part = stdout.split('|')[0].strip() if '|' in stdout else stdout
+
+            if not stdout and stderr:
+                output_msg = stderr
+            elif stdout and stderr:
+                output_msg = f"{status_part} [stderr: {stderr}]"
+            else:
+                output_msg = status_part
+
+            return status_code, output_msg, perfdata
+
+        except Exception as e:
+            self.logger.error(f"Error executing command: {e}")
+            return NAGIOS_UNKNOWN, f"Execution error: {str(e)}", {}
+    
+    def _parse_perfdata(self, output: str) -> Dict[str, Any]:
+        """Parse Nagios performance data from plugin output.
+        
+        Nagios performance data format:
+        'label'=value[UOM];[warn];[crit];[min];[max]
+        
+        Multiple metrics separated by spaces.
+        
+        Args:
+            output: Plugin output string
+            
+        Returns:
+            Dictionary of metric_name: value
+        """
+        perfdata = {}
+        
+        # Performance data comes after the pipe character
+        if '|' not in output:
+            return perfdata
+        
+        perf_section = output.split('|', 1)[1].strip()
+        
+        # Regex to match performance data format
+        # Matches: 'label'=value or label=value
+        perf_regex = r"'?([^'=]+)'?=([\d.]+)([a-zA-Z%]*);?([\d.]*);?([\d.]*);?([\d.]*);?([\d.]*)"
+        
+        for match in re.finditer(perf_regex, perf_section):
+            label = match.group(1).strip()
+            value_str = match.group(2)
+            uom = match.group(3) or ""
+            warn = match.group(4)
+            crit = match.group(5)
+            min_val = match.group(6)
+            max_val = match.group(7)
+            
+            # Convert value to float
+            try:
+                value = float(value_str)
+            except ValueError:
+                continue
+            
+            # Store the value
+            perfdata[label] = value
+            
+            # Optionally store UOM as separate field
+            if uom:
+                perfdata[f"{label}_uom"] = uom
+            
+            # Store thresholds if present
+            if warn:
+                try:
+                    perfdata[f"{label}_warn"] = float(warn)
+                except ValueError:
+                    pass
+            
+            if crit:
+                try:
+                    perfdata[f"{label}_crit"] = float(crit)
+                except ValueError:
+                    pass
+            
+            if min_val:
+                try:
+                    perfdata[f"{label}_min"] = float(min_val)
+                except ValueError:
+                    pass
+            
+            if max_val:
+                try:
+                    perfdata[f"{label}_max"] = float(max_val)
+                except ValueError:
+                    pass
+        
+        return perfdata
@@ -0,0 +1,240 @@
+"""
+Network monitoring plugin for Heartbeat.
+
+Collects network interface statistics and connection information using psutil.
+"""
+
+import logging
+from typing import Dict, Any, Optional, List
+
+try:
+    import psutil
+except ImportError:
+    psutil = None
+
+from hbd.client.plugin import MonitorPlugin
+
+logger = logging.getLogger(__name__)
+
+
+class NetworkMonitorPlugin(MonitorPlugin):
+    """
+    Monitor network interface statistics and connections.
+    
+    Collects:
+    - Network interface I/O counters (bytes sent/received, packets, errors, drops)
+    - Per-interface statistics
+    - Network connection counts by state
+    - Interface addresses and configuration
+    
+    Configuration:
+        interval: Collection interval in seconds (default: 300)
+        interfaces: List of interfaces to monitor (default: all)
+        include_connections: Include connection statistics (default: True)
+        include_addresses: Include interface addresses (default: False)
+    """
+    
+    name = "network_monitor"
+    interval = 300  # Collect every 5 minutes by default
+    
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        """
+        Initialize the network monitor plugin.
+        
+        Args:
+            config: Optional configuration dict with keys:
+                   - interval: Collection interval in seconds (default: 300)
+                   - interfaces: List of specific interfaces to monitor
+                   - include_connections: Include connection stats (default: True)
+                   - include_addresses: Include interface addresses (default: False)
+        """
+        super().__init__(config)
+        self.interfaces = self.config.get('interfaces', None)  # None = all interfaces
+        self.include_connections = self.config.get('include_connections', True)
+        self.include_addresses = self.config.get('include_addresses', False)
+        self.interval = self.config.get('interval', 300)
+        
+        if psutil is None:
+            raise ImportError("psutil library is required for network_monitor plugin")
+        
+        # Store previous I/O counters for delta calculation
+        self._prev_io = {}
+    
+    async def initialize(self):
+        """Initialize the plugin (check psutil availability)."""
+        if psutil is None:
+            logger.error("psutil not available - network_monitor cannot run")
+            return False
+        
+        logger.info(f"Network monitor initialized (interval: {self.interval}s, "
+                   f"connections: {self.include_connections})")
+        
+        # Initialize I/O counters
+        try:
+            self._prev_io = psutil.net_io_counters(pernic=True)
+        except Exception as e:
+            logger.warning(f"Could not initialize network I/O counters: {e}")
+        
+        return True
+    
+    async def collect(self) -> Dict[str, Any]:
+        """
+        Collect current network statistics.
+        
+        Returns:
+            Dictionary with network metrics:
+            - interfaces: Dict of interface statistics, keyed by interface name
+              - bytes_sent: Total bytes sent
+              - bytes_recv: Total bytes received
+              - packets_sent: Total packets sent
+              - packets_recv: Total packets received
+              - errin: Total incoming errors
+              - errout: Total outgoing errors
+              - dropin: Total incoming packets dropped
+              - dropout: Total outgoing packets dropped
+              - bytes_sent_delta: Bytes sent since last collection
+              - bytes_recv_delta: Bytes received since last collection
+              - packets_sent_delta: Packets sent since last collection
+              - packets_recv_delta: Packets received since last collection
+            - connections: Connection statistics by state (if include_connections)
+              - ESTABLISHED: Count of established connections
+              - LISTEN: Count of listening sockets
+              - TIME_WAIT: Count of TIME_WAIT connections
+              - etc.
+            - addresses: Interface address information (if include_addresses)
+              - Dict keyed by interface name with address details
+        """
+        if psutil is None:
+            logger.error("psutil not available")
+            return {}
+        
+        try:
+            data = await self._collect_metrics()
+            logger.debug(f"Collected network metrics: {len(data.get('interfaces', {}))} interfaces")
+            return data
+        except Exception as e:
+            logger.error(f"Error collecting network metrics: {e}")
+            return {"error": str(e)}
+    
+    async def _collect_metrics(self) -> Dict[str, Any]:
+        """Collect network metrics from psutil."""
+        metrics = {}
+        
+        # Collect per-interface I/O counters
+        try:
+            io_counters = psutil.net_io_counters(pernic=True)
+            interfaces_data = {}
+            
+            for iface_name, counters in io_counters.items():
+                # Skip if we're only monitoring specific interfaces
+                if self.interfaces and iface_name not in self.interfaces:
+                    continue
+                
+                iface_stats = {
+                    'bytes_sent': counters.bytes_sent,
+                    'bytes_recv': counters.bytes_recv,
+                    'packets_sent': counters.packets_sent,
+                    'packets_recv': counters.packets_recv,
+                    'errin': counters.errin,
+                    'errout': counters.errout,
+                    'dropin': counters.dropin,
+                    'dropout': counters.dropout,
+                }
+                
+                # Calculate deltas from previous collection
+                if iface_name in self._prev_io:
+                    prev = self._prev_io[iface_name]
+                    iface_stats['bytes_sent_delta'] = counters.bytes_sent - prev.bytes_sent
+                    iface_stats['bytes_recv_delta'] = counters.bytes_recv - prev.bytes_recv
+                    iface_stats['packets_sent_delta'] = counters.packets_sent - prev.packets_sent
+                    iface_stats['packets_recv_delta'] = counters.packets_recv - prev.packets_recv
+                
+                interfaces_data[iface_name] = iface_stats
+            
+            metrics['interfaces'] = interfaces_data
+            
+            # Store current counters for next delta calculation
+            self._prev_io = io_counters
+            
+        except Exception as e:
+            logger.warning(f"Could not collect network I/O counters: {e}")
+        
+        # Collect connection statistics
+        if self.include_connections:
+            try:
+                connections = psutil.net_connections(kind='inet')
+                conn_stats = {}
+                
+                # Count connections by state
+                for conn in connections:
+                    state = conn.status
+                    conn_stats[state] = conn_stats.get(state, 0) + 1
+                
+                metrics['connections'] = conn_stats
+                
+            except (PermissionError, psutil.AccessDenied):
+                logger.debug("Permission denied for net_connections (requires root/admin)")
+            except Exception as e:
+                logger.warning(f"Could not collect connection statistics: {e}")
+        
+        # Collect interface addresses
+        if self.include_addresses:
+            try:
+                addresses = psutil.net_if_addrs()
+                addr_data = {}
+                
+                for iface_name, addrs in addresses.items():
+                    # Skip if we're only monitoring specific interfaces
+                    if self.interfaces and iface_name not in self.interfaces:
+                        continue
+                    
+                    iface_addrs = []
+                    for addr in addrs:
+                        addr_info = {
+                            'family': str(addr.family),
+                            'address': addr.address,
+                        }
+                        if addr.netmask:
+                            addr_info['netmask'] = addr.netmask
+                        if addr.broadcast:
+                            addr_info['broadcast'] = addr.broadcast
+                        iface_addrs.append(addr_info)
+                    
+                    addr_data[iface_name] = iface_addrs
+                
+                metrics['addresses'] = addr_data
+                
+            except Exception as e:
+                logger.warning(f"Could not collect interface addresses: {e}")
+        
+        # Add interface stats (up/down status, speed, mtu)
+        try:
+            if_stats = psutil.net_if_stats()
+            stats_data = {}
+            
+            for iface_name, stats in if_stats.items():
+                # Skip if we're only monitoring specific interfaces
+                if self.interfaces and iface_name not in self.interfaces:
+                    continue
+                
+                stats_data[iface_name] = {
+                    'isup': stats.isup,
+                    'duplex': str(stats.duplex) if hasattr(stats, 'duplex') else None,
+                    'speed': stats.speed,
+                    'mtu': stats.mtu,
+                }
+            
+            metrics['interface_stats'] = stats_data
+            
+        except Exception as e:
+            logger.warning(f"Could not collect interface stats: {e}")
+        
+        return metrics
+    
+    async def cleanup(self) -> None:
+        """Shut the plugin down.
+
+        There is nothing to release here; the method only logs that cleanup ran.
+        """
+        logger.info("Network monitor cleanup")
+
+
+# Plugin class for automatic discovery.
+# Note: this binds the class itself, not an instance of it.
+plugin = NetworkMonitorPlugin
@@ -0,0 +1,142 @@
+"""OS Information Plugin for Heartbeat.
+
+Collects static operating system information including OS name, version,
+kernel, architecture, and distribution details.
+"""
+
+import platform
+import sys
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+# Import from parent package
+from hbd.client.plugin import InfoPlugin
+
+
+class OSInfoPlugin(InfoPlugin):
+    """Collect operating system information.
+    
+    This plugin gathers static OS information that rarely changes:
+    - OS name and version
+    - Kernel version
+    - Architecture (x86_64, arm64, etc.)
+    - Distribution details (for Linux)
+    - Python version (used by hbc)
+    """
+    
+    name = "os_info"
+    version = "1.0.0"
+    description = "Operating system and platform information"
+    interval = 0  # InfoPlugin: collect once at startup
+    
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        """Create the plugin; ``config`` is passed unchanged to the base class."""
+        super().__init__(config)
+    
+    async def initialize(self) -> bool:
+        """Initialize the OS info plugin.
+        
+        Returns:
+            True (always succeeds - platform module is stdlib)
+        """
+        self.logger.info(f"Initializing {self.name} plugin")
+        return True
+    
+    async def _collect_info(self) -> Dict[str, Any]:
+        """Collect OS information.
+        
+        Returns:
+            Dictionary with OS details
+        """
+        try:
+            from hbd import __version__ as hbc_version
+            data = {
+                "system": platform.system(),  # e.g., "Linux", "Darwin", "Windows"
+                "node": platform.node(),  # hostname
+                "release": platform.release(),  # kernel version
+                "version": platform.version(),  # detailed version
+                "machine": platform.machine(),  # e.g., "x86_64", "arm64"
+                "processor": platform.processor(),  # processor name
+                "architecture": platform.architecture()[0],  # e.g., "64bit"
+                "python_version": platform.python_version(),
+                "python_implementation": platform.python_implementation(),
+                "hbc_version": hbc_version,
+                "hbc_type": "full",
+            }
+            if self.config.get("owner"):
+                self.logger.debug(f"Adding owner from config: {self.config['owner']}")
+                data["owner"] = self.config["owner"]
+            
+            # Add Linux-specific distribution info
+            if platform.system() == "Linux":
+                data.update(self._get_linux_distro())
+            
+            # Add macOS-specific info
+            elif platform.system() == "Darwin":
+                data["macos_version"] = platform.mac_ver()[0]
+            
+            # Add Windows-specific info
+            elif platform.system() == "Windows":
+                win_ver = platform.win32_ver()
+                data["windows_release"] = win_ver[0]
+                data["windows_version"] = win_ver[1]
+                data["windows_sp"] = win_ver[2]
+                data["windows_type"] = win_ver[3]
+            
+            self.logger.debug(f"Collected OS info: {data['system']} {data['release']}")
+            return data
+            
+        except Exception as e:
+            self.logger.error(f"Error collecting OS info: {e}", exc_info=True)
+            return {}
+    
+    def _get_linux_distro(self) -> Dict[str, str]:
+        """Get Linux distribution information.
+        
+        Returns:
+            Dictionary with distribution details
+        """
+        distro_info = {}
+        
+        # Try reading /etc/os-release (standard on modern Linux)
+        os_release = Path("/etc/os-release")
+        if os_release.exists():
+            try:
+                with open(os_release) as f:
+                    for line in f:
+                        line = line.strip()
+                        if "=" in line and not line.startswith("#"):
+                            key, value = line.split("=", 1)
+                            # Remove quotes from value
+                            value = value.strip('"').strip("'")
+                            # Map common keys
+                            if key == "NAME":
+                                distro_info["distro_name"] = value
+                            elif key == "VERSION":
+                                distro_info["distro_version"] = value
+                            elif key == "ID":
+                                distro_info["distro_id"] = value
+                            elif key == "VERSION_ID":
+                                distro_info["distro_version_id"] = value
+                            elif key == "PRETTY_NAME":
+                                distro_info["distro_pretty_name"] = value
+            except Exception as e:
+                self.logger.warning(f"Could not read /etc/os-release: {e}")
+        
+        # Fallback: try lsb_release (older systems)
+        elif Path("/etc/lsb-release").exists():
+            try:
+                with open("/etc/lsb-release") as f:
+                    for line in f:
+                        line = line.strip()
+                        if "=" in line:
+                            key, value = line.split("=", 1)
+                            if key == "DISTRIB_ID":
+                                distro_info["distro_id"] = value
+                            elif key == "DISTRIB_RELEASE":
+                                distro_info["distro_version"] = value
+                            elif key == "DISTRIB_DESCRIPTION":
+                                distro_info["distro_name"] = value
+            except Exception as e:
+                self.logger.warning(f"Could not read /etc/lsb-release: {e}")
+        
+        return distro_info
@@ -0,0 +1,147 @@
+"""Ping Monitor Plugin for Heartbeat.
+
+Pings one or more hosts and reports round-trip time.  Results are sent as
+plugin metrics so the server-side threshold system can raise WARNING/CRITICAL
+alerts using the same RTT threshold configuration format used for heartbeat RTT.
+
+Example configuration in ~/.hbc.yaml (or the plugins section of ~/.hb.yaml):
+
+```yaml
+plugins:
+  ping_monitor:
+    interval: 60          # ping every 60 seconds (default)
+    count: 3              # ICMP packets per ping run (default 3)
+    timeout: 5            # seconds before a host is considered unreachable (default 5)
+    hosts:
+      - 8.8.8.8
+      - 192.168.1.1
+```
+
+Reported metrics per host. Each key is prefixed with the host key: the configured
+host with every character other than ASCII letters, digits and underscores
+replaced by an underscore (for example, 8.8.8.8 becomes 8_8_8_8):
+
+  <host_key>_rtt_avg   – average RTT in ms  (float, or inf if unreachable)
+  <host_key>_rtt_min   – minimum RTT in ms
+  <host_key>_rtt_max   – maximum RTT in ms
+  <host_key>_loss      – packet loss percentage (0–100)
+
+Server-side threshold config example:
+
+```yaml
+threshold_configs:
+  default:
+    thresholds:
+      ping_monitor:
+        8_8_8_8_rtt_avg:
+          warning: 20.0
+          critical: 100.0
+```
+"""
+
+import asyncio
+import re
+import sys
+from typing import Any, Dict, Optional
+
+from hbd.client.plugin import MonitorPlugin
+
+
+def _host_key(host: str) -> str:
+    """Convert a hostname/IP to a safe metric key (replace . and : with _)."""
+    return re.sub(r"[^a-zA-Z0-9_]", "_", host)
+
+
+class PingMonitorPlugin(MonitorPlugin):
+    """Ping one or more configured hosts and report RTT metrics."""
+
+    name = "ping_monitor"
+    version = "1.0.0"
+    description = "ICMP ping latency monitoring"
+    interval = 60
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        super().__init__(config)
+        cfg = config or {}
+        self.interval = cfg.get("interval", 60)
+        self.count = int(cfg.get("count", 3))
+        self.timeout = int(cfg.get("timeout", 5))
+        # hosts: dict of {hostname: {warning: x, critical: y}} or list of hostnames
+        raw_hosts = cfg.get("hosts", {})
+        if isinstance(raw_hosts, list):
+            self.hosts = {h: {} for h in raw_hosts}
+        else:
+            self.hosts = dict(raw_hosts)
+
+    async def initialize(self) -> bool:
+        if not self.hosts:
+            self.logger.warning("ping_monitor: no hosts configured, plugin disabled")
+            return False
+        self.logger.info(
+            "ping_monitor initialized: %d host(s), interval=%ds, count=%d, timeout=%ds",
+            len(self.hosts), self.interval, self.count, self.timeout,
+        )
+        return True
+
+    async def _ping(self, host: str) -> Dict[str, float]:
+        """Run a system ping command and return rtt_min/avg/max/loss."""
+        if sys.platform == "win32":
+            cmd = ["ping", "-n", str(self.count), "-w", str(self.timeout * 1000), host]
+        else:
+            cmd = ["ping", "-c", str(self.count), "-W", str(self.timeout), host]
+
+        try:
+            proc = await asyncio.create_subprocess_exec(
+                *cmd,
+                stdout=asyncio.subprocess.PIPE,
+                stderr=asyncio.subprocess.PIPE,
+            )
+            stdout, _ = await asyncio.wait_for(
+                proc.communicate(),
+                timeout=self.timeout * self.count + 2,
+            )
+            output = stdout.decode(errors="replace")
+        except (asyncio.TimeoutError, FileNotFoundError, OSError) as e:
+            self.logger.warning("ping_monitor: ping failed for %s: %s", host, e)
+            return {"rtt_min": float("inf"), "rtt_avg": float("inf"),
+                    "rtt_max": float("inf"), "loss": 100.0}
+
+        # Parse packet loss
+        loss = 100.0
+        loss_match = re.search(r"(\d+(?:\.\d+)?)\s*%\s*packet\s*loss", output)
+        if loss_match:
+            loss = float(loss_match.group(1))
+
+        # Parse rtt min/avg/max — Linux: "rtt min/avg/max/mdev = x/x/x/x ms"
+        #                          macOS: "round-trip min/avg/max/stddev = x/x/x/x ms"
+        rtt_match = re.search(
+            r"(?:rtt|round-trip)\s+min/avg/max/\S+\s*=\s*([\d.]+)/([\d.]+)/([\d.]+)",
+            output,
+        )
+        if rtt_match:
+            return {
+                "rtt_min": float(rtt_match.group(1)),
+                "rtt_avg": float(rtt_match.group(2)),
+                "rtt_max": float(rtt_match.group(3)),
+                "loss": loss,
+            }
+
+        # Host unreachable or all packets lost
+        return {"rtt_min": float("inf"), "rtt_avg": float("inf"),
+                "rtt_max": float("inf"), "loss": loss}
+
+    async def _collect_metrics(self) -> Dict[str, Any]:
+        data: Dict[str, Any] = {}
+        tasks = {host: asyncio.create_task(self._ping(host)) for host in self.hosts}
+        for host, task in tasks.items():
+            try:
+                result = await task
+            except Exception as e:
+                self.logger.error("ping_monitor: error pinging %s: %s", host, e)
+                result = {"rtt_min": float("inf"), "rtt_avg": float("inf"),
+                          "rtt_max": float("inf"), "loss": 100.0}
+            key = _host_key(host)
+            for metric, value in result.items():
+                data[f"{key}_{metric}"] = value
+            status = "unreachable" if result["loss"] == 100.0 else f"{result['rtt_avg']:.1f}ms"
+            self.logger.debug("ping_monitor: %s -> %s", host, status)
+        return data
@@ -0,0 +1,140 @@
+"""
+ZFS pool monitoring plugin for Heartbeat.
+
+Collects per-pool health, capacity, and cumulative I/O statistics via zpool(8).
+"""
+
+import asyncio
+import logging
+import shutil
+from typing import Any, Dict, List, Optional
+
+from hbd.client.plugin import MonitorPlugin
+
+logger = logging.getLogger(__name__)
+
+
+def _int(s: str) -> Optional[int]:
+    try:
+        return int(s.strip().rstrip("KMGTkBkmgt%x"))
+    except (ValueError, AttributeError):
+        return None
+
+
+def _float(s: str) -> Optional[float]:
+    try:
+        return float(s.strip().rstrip("%x"))
+    except (ValueError, AttributeError):
+        return None
+
+
+class ZFSMonitorPlugin(MonitorPlugin):
+    """Monitor ZFS pool health, capacity, and I/O statistics.
+
+    Collects per pool:
+    - health: ONLINE, DEGRADED, FAULTED, etc.
+    - size / alloc / free: total, allocated and free bytes
+    - capacity: percentage used (0-100)
+    - frag: fragmentation percentage
+    - dedup: deduplication ratio
+    - read_ops / write_ops: cumulative I/O operations since last boot/clear
+    - read_bw / write_bw: cumulative bytes transferred since last boot/clear
+
+    Configuration:
+        interval: collection interval in seconds (default: 300)
+        pools: list of pool names to monitor (default: all)
+    """
+
+    name = "zfs_monitor"
+    description = "ZFS pool health, capacity, and I/O statistics"
+    interval = 300
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        super().__init__(config)
+        self.interval = self.config.get("interval", 300)
+        self._pools_filter: Optional[List[str]] = self.config.get("pools", None)
+
+    async def initialize(self) -> bool:
+        if not shutil.which("zpool"):
+            self.skip_reason = "zpool not found"
+            return False
+        logger.info("ZFS monitor initialized (interval: %ds)", self.interval)
+        return True
+
+    async def _run(self, *args: str) -> List[str]:
+        """Run a command and return its stdout lines, or [] on error."""
+        try:
+            proc = await asyncio.create_subprocess_exec(
+                *args,
+                stdout=asyncio.subprocess.PIPE,
+                stderr=asyncio.subprocess.DEVNULL,
+            )
+            stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=15)
+            return stdout.decode(errors="replace").splitlines()
+        except (FileNotFoundError, asyncio.TimeoutError) as exc:
+            logger.warning("zfs_monitor: %s: %s", args[0], exc)
+            return []
+
+    async def _zpool_list(self) -> Dict[str, Dict]:
+        """Return per-pool health and capacity from `zpool list`."""
+        lines = await self._run(
+            "zpool", "list", "-H", "-p",
+            "-o", "name,health,size,alloc,free,cap,frag,dedup",
+        )
+        pools: Dict[str, Dict] = {}
+        for line in lines:
+            parts = line.split("\t")
+            if len(parts) < 8:
+                continue
+            name = parts[0].strip()
+            if self._pools_filter and name not in self._pools_filter:
+                continue
+            health = parts[1].strip()
+            if health == "ONLINE":
+                status = 0
+            elif health in ("DEGRADED", "ONLINE with errors"):
+                status = 1
+            elif health in ("FAULTED", "OFFLINE", "UNAVAIL"):
+                status = 2
+            else:
+                status = 3  # unknown status
+            pools[name] = {
+                "health":    health,
+                "status": status,
+                "size":      _int(parts[2]),
+                "alloc":     _int(parts[3]),
+                "free":      _int(parts[4]),
+                "capacity":  _float(parts[5]),
+                "frag":      _float(parts[6]),
+                "dedup":     _float(parts[7]),
+            }
+        return pools
+
+    async def _zpool_iostat(self) -> Dict[str, Dict]:
+        """Return per-pool cumulative I/O counters from `zpool iostat`."""
+        lines = await self._run("zpool", "iostat", "-H", "-p")
+        io: Dict[str, Dict] = {}
+        for line in lines:
+            parts = line.split("\t")
+            if len(parts) < 7:
+                continue
+            name = parts[0].strip()
+            if not name or name.startswith(" "):
+                continue
+            io[name] = {
+                "read_ops": _int(parts[3]),
+                "write_ops": _int(parts[4]),
+                "read_bw":  _int(parts[5]),
+                "write_bw": _int(parts[6]),
+            }
+        return io
+
+    async def _collect_metrics(self) -> Dict[str, Any]:
+        pools, io = await asyncio.gather(self._zpool_list(), self._zpool_iostat())
+        for name, stats in io.items():
+            if name in pools:
+                pools[name].update(stats)
+        return {"pools": pools}
+
+
+plugin = ZFSMonitorPlugin
@@ -0,0 +1,3 @@
+"""Common utilities shared between hbc and hbd."""
+
+from hbd import __version__
@@ -0,0 +1,162 @@
+"""Message encoding/decoding utilities for hbd protocol.
+
+Message Types:
+    HTB: Heartbeat message (client -> server)
+    ACK: Acknowledgment (server -> client)
+    CMD: Command message (server -> client)
+    UPD: Update message (server -> client)
+    PLG: Plugin data message (client -> server)
+"""
+
+from typing import Dict, Any, Union
+import json
+import zlib
+
+
+def encode_value(v: Any) -> str:
+    """Encode a value for protocol transmission.
+    
+    Args:
+        v: Value to encode (int, float, str, bool, list, dict, etc.)
+        
+    Returns:
+        String representation suitable for protocol
+    """
+    if isinstance(v, float):
+        return f"{v:0.5f}"
+    elif isinstance(v, (list, dict)):
+        # Use JSON encoding for complex types, prefixed with @
+        return "@" + json.dumps(v)
+    elif isinstance(v, bool):
+        return str(int(v))  # True->1, False->0
+    else:
+        return str(v)
+
+
+def decode_value(val: str) -> Any:
+    """Decode a value from protocol format.
+    
+    Args:
+        val: String value from protocol
+        
+    Returns:
+        Decoded Python object
+    """
+    if not val:
+        return val
+    
+    # Check for JSON-encoded complex types
+    if val.startswith("@"):
+        try:
+            return json.loads(val[1:])
+        except Exception:
+            return val[1:]  # Return as string without @
+    
+    # Try numeric conversion (avoid eval to prevent SyntaxWarnings on version strings)
+    if val[0].isdigit() or (val[0] == '-' and len(val) > 1 and val[1].isdigit()):
+        try:
+            return int(val)
+        except ValueError:
+            pass
+        try:
+            return float(val)
+        except ValueError:
+            pass
+        return val
+    
+    return val
+
+
+def dicttos(ID: str, d: Dict[str, Any]):
+    """Serialize a dict to protocol message bytes.
+
+    If compress is True, the payload is zlib-compressed and the message is
+    prefixed with `!ID:` as the original script did. Otherwise the format is
+    `ID:key=value;...` (bytes).
+    """
+    s = []
+    for k in d:
+        v = d[k]
+        encoded_val = encode_value(v)
+        s.append(f"{k}={encoded_val}")
+    pk = ";".join(s)
+    zpk = zlib.compress(pk.encode(), 6)
+    hdr = ("!" + ID + ":").encode()
+    return hdr + zpk
+
+
+def stodict(msg: bytes):
+    """Deserialize a protocol message into a dict.
+
+    Mirrors original behaviour: detects compressed messages starting with
+    '!' and decodes accordingly. Returns a dict with key 'ID' set to the
+    message ID and the parsed key/value pairs.
+    """
+    d = {}
+    if len(msg) > 0 and chr(msg[0]) == "!":
+        # message is: b'!ID:' + compressed_payload
+        # original code used msg[1:4].decode() for ID (3 bytes including colon)
+        try:
+            pk = zlib.decompress(msg[5:]).decode()
+        except Exception:
+            # malformed compressed payload
+            return {}
+        d["ID"] = msg[1:4].decode()
+    else:
+        try:
+            r0 = msg.split(b":", 1)
+            pk = r0[1].decode()
+            d["ID"] = r0[0].decode()
+        except Exception:
+            return {}
+    if not pk:
+        return d
+    parts = pk.split(";")
+    for v in parts:
+        if not v:
+            continue
+        vr = v.split("=", 1)
+        k = vr[0].strip()
+        if len(vr) == 1:
+            d[k] = None
+        else:
+            val = vr[1].strip()
+            d[k] = decode_value(val)
+    return d
+
+
+def oldmtodict(msg: bytes):
+    """Compatibility wrapper for old-style messages (no ID prefix).
+
+    The original implementation prefixed with 'HTB:' and called stodict.
+    """
+    return stodict(b"HTB:" + msg)
+
+
+def encode_plugin_data(plugin_name: str, data: Dict[str, Any]) -> bytes:
+    """Encode plugin data into a PLG message.
+    
+    Args:
+        plugin_name: Name of the plugin (e.g., "os_info", "cpu_monitor")
+        data: Plugin data dictionary
+        compress: Whether to compress the payload
+        
+    Returns:
+        Encoded message bytes
+    """
+    # Add plugin name to data
+    full_data = {"plugin": plugin_name, **data}
+    return dicttos("PLG", full_data)
+
+
+def decode_plugin_data(msg: bytes) -> Dict[str, Any]:
+    """Decode a PLG message into plugin data.
+    
+    Args:
+        msg: Raw message bytes
+        
+    Returns:
+        Dictionary with 'ID', 'plugin', and plugin data fields
+    """
+    return stodict(msg)
+
@@ -1,68 +0,0 @@
-"""Configuration loader and defaults for hbd."""
-
-import logging
-import os
-
-try:
-    import yaml
-except Exception:
-    yaml = None
-
-DEFAULTS = {
-    "hb_port": 50003,
-    "hbd_port": 50004,
-    "hbd_host": "",
-    "pickfile": "/tmp/hb.pick",
-    "logfile": "/var/log/heartbeat.log",
-    "logfmt": "text",
-    "pushsrv": "pushover",
-    "pushover_token": "",
-    "pushover_user": "",
-    "interval": 20,
-    "grace": 2,
-    "dyndomains": ["wrede.org"],
-    "watchhosts": [],
-    "dyndnshosts": [],
-    "drophosts": [],
-    "nsupdate_bin": "/usr/bin/nsupdate",
-    "foreground": False,
-    "verbose": False,
-    "debug": 0,
-    "smtpserver": "smtp.fastmail.com",
-    "smtpuser": "andreas@wrede.ca",
-    "smtppassword": "pvtvefyp5gbhnch2",
-    "smtpport": 587,
-    "toemail": ["aew.hbd.notify@wrede.ca"],
-    "fromemail": "aew.hbd@wrede.ca",
-    "ws_port": 50005,
-    "wss_port": None,
-    "cert_path": "/usr/local/etc/ssl/",
-    "wss_pem": "fullchain.pem",
-    "wss_key": "privkey.pem",
-}
-
-
-def load_config(path=None):
-    """Load configuration from a YAML file and merge with defaults.
-
-    If YAML is not available or the file does not exist, defaults are returned.
-    """
-    cfg = DEFAULTS.copy()
-    if not path:
-        # default path (~/.hb.yaml)
-        path = os.path.join(os.path.expanduser("~"), ".hb.yaml")
-
-    if os.path.exists(path):
-        if yaml:
-            with open(path) as fh:
-                data = yaml.safe_load(fh)
-            # only keep known keys
-            for k, v in data.items():
-                if k in cfg:
-                    cfg[k] = v
-                else:
-                    logging.warning("unknown config key %s in %s", k, path)
-        else:
-            # yaml not installed: do not attempt to parse; user must ensure defaults
-            pass
-    return cfg
@@ -0,0 +1,196 @@
+# Example Heartbeat Client Configuration
+# This file demonstrates all available configuration options for the heartbeat client (hbc)
+# and its plugin system.
+
+# ==============================================================================
+# Server Configuration
+# ==============================================================================
+server: hbd.example.com      # Heartbeat server hostname or IP
+port: 50003                   # Server UDP port (default: 50003)
+interval: 30                  # Heartbeat interval in seconds (default: 30)
+
+# ==============================================================================
+# Plugin Configuration
+# ==============================================================================
+# Plugins are configured under the "plugins" section. Each plugin can be enabled/disabled
+# and configured with plugin-specific settings.
+
+plugins:
+  # --------------------------------------------------------------------------
+  # OS Information Plugin (InfoPlugin - runs once at startup)
+  # --------------------------------------------------------------------------
+  os_info:
+    enabled: true
+    # No additional configuration needed
+  
+  # --------------------------------------------------------------------------
+  # CPU Monitor Plugin (MonitorPlugin - periodic collection)
+  # --------------------------------------------------------------------------
+  cpu_monitor:
+    enabled: true
+    interval: 300             # Collection interval in seconds (default: 300 = 5 minutes)
+    per_core: false           # Collect per-core CPU statistics (default: false)
+    # When per_core is true, will report CPU usage for each core separately
+  
+  # --------------------------------------------------------------------------
+  # Memory Monitor Plugin (MonitorPlugin)
+  # --------------------------------------------------------------------------
+  memory_monitor:
+    enabled: true
+    interval: 300             # Collection interval in seconds (default: 300 = 5 minutes)
+    include_swap: true        # Include swap memory statistics (default: true)
+  
+  # --------------------------------------------------------------------------
+  # Disk Monitor Plugin (MonitorPlugin)
+  # --------------------------------------------------------------------------
+  disk_monitor:
+    enabled: true
+    interval: 300             # Collection interval in seconds (default: 300 = 5 minutes)
+    include_io: true          # Include I/O statistics (default: true)
+    # Optional: Monitor only specific partitions
+    # partitions:
+    #   - /
+    #   - /home
+    #   - /var
+    # Optional: Exclude specific filesystem types
+    exclude_types:
+      - tmpfs
+      - devtmpfs
+      - squashfs
+  
+  # --------------------------------------------------------------------------
+  # Network Monitor Plugin (MonitorPlugin)
+  # --------------------------------------------------------------------------
+  network_monitor:
+    enabled: true
+    interval: 300             # Collection interval in seconds (default: 300 = 5 minutes)
+    include_connections: true # Include connection statistics (default: true)
+    include_addresses: false  # Include interface addresses (default: false)
+    # Optional: Monitor only specific interfaces
+    # interfaces:
+    #   - eth0
+    #   - wlan0
+  
+  # --------------------------------------------------------------------------
+  # Filesystem Info Plugin (InfoPlugin - runs once at startup)
+  # --------------------------------------------------------------------------
+  filesystem_info:
+    enabled: true
+    include_pseudo: false     # Include pseudo/virtual filesystems (default: false)
+    # When false (default), only reports physical mounted filesystems (ext4, zfs, xfs, etc.)
+    # When true, also includes pseudo filesystems (proc, sysfs, tmpfs, devtmpfs, etc.)
+    # Optional: Exclude additional specific filesystem types
+    # exclude_types:
+    #   - squashfs
+    #   - iso9660
+  
+  # --------------------------------------------------------------------------
+  # Nagios Runner Plugin (MonitorPlugin)
+  # --------------------------------------------------------------------------
+  nagios_runner:
+    enabled: true
+    interval: 300             # Collection interval in seconds (default: 300 = 5 minutes)
+    timeout: 30               # Plugin execution timeout in seconds (default: 30)
+    
+    # List of Nagios plugins to execute
+    # Each command is executed as-is, so provide full paths and arguments
+    commands:
+      # System load monitoring
+      - /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
+      
+      # Disk space monitoring
+      - /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
+      - /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /home
+      
+      # Process monitoring
+      - /usr/lib/nagios/plugins/check_procs -w 250 -c 400 -s RSZDT
+      
+      # Swap usage
+      - /usr/lib/nagios/plugins/check_swap -w 20% -c 10%
+      
+      # Custom script example
+      # - /usr/local/bin/check_my_app.sh
+
+# ==============================================================================
+# Advanced Options
+# ==============================================================================
+# These options control client behavior
+
+# Compression: Enable zlib compression for heartbeat messages (default: true)
+compress: true
+
+# Hostname: Override the system hostname (default: auto-detect)
+# hostname: myhost.example.com
+
+# Message: Custom message included in heartbeat (optional)
+# message: "Production web server"
+
+# Logging
+log_level: INFO             # Log level: DEBUG, INFO, WARNING, ERROR (default: INFO)
+# logfile: /var/log/hbc.log # Optional log file path
+
+# ==============================================================================
+# Example Profiles
+# ==============================================================================
+# Below are example configuration profiles for different use cases
+
+# Minimal Configuration (default settings):
+# -----------------------------------------
+# server: hbd.example.com
+# interval: 30
+
+# Monitoring Server (comprehensive metrics):
+# ------------------------------------------
+# server: monitoring.example.com
+# interval: 30
+# plugins:
+#   cpu_monitor:
+#     enabled: true
+#     interval: 15
+#     per_core: true
+#   memory_monitor:
+#     enabled: true
+#     interval: 15
+#   disk_monitor:
+#     enabled: true
+#     interval: 60
+#   network_monitor:
+#     enabled: true
+#     interval: 30
+#     include_connections: true
+
+# Nagios Integration (leverage existing plugins):
+# -----------------------------------------------
+# server: hbd.example.com
+# plugins:
+#   nagios_runner:
+#     enabled: true
+#     interval: 300  # Check every 5 minutes
+#     commands:
+#       - /usr/lib/nagios/plugins/check_http -H localhost -p 80
+#       - /usr/lib/nagios/plugins/check_mysql -H localhost -u monitor -p password
+#       - /usr/lib/nagios/plugins/check_smtp -H mail.example.com
+
+# ==============================================================================
+# Threshold Configuration (for Heartbeat Daemon)
+# ==============================================================================
+# NOTE: Thresholds are configured on the SERVER side (hbd), not the client (hbc).
+# This is just an example - see config_thresholds_example.yaml for comprehensive examples.
+#
+# Basic threshold example:
+# thresholds:
+#   cpu_monitor:
+#     cpu_percent:
+#       warning: 80.0
+#       critical: 90.0
+#   memory_monitor:
+#     percent:
+#       warning: 85.0
+#       critical: 95.0
+#   disk_monitor:
+#     partitions:
+#       /:
+#         percent:
+#           warning: 80.0
+#           critical: 90.0
+
@@ -0,0 +1,296 @@
+# ==============================================================================
+# Heartbeat Daemon Multi-Threshold Configuration Example
+# ==============================================================================
+# This file demonstrates the new multi-threshold configuration feature that allows
+# different threshold settings for different hosts/clients.
+#
+# Features:
+#   - Define multiple named threshold configurations
+#   - Map specific hosts to specific threshold configurations
+#   - Set a default configuration for unmapped hosts
+#   - Backward compatible with single threshold configuration
+# ==============================================================================
+
+# Global threshold settings
+threshold_renotify_interval: 3600  # Re-notify every hour for ongoing alerts (seconds)
+
+# Optional: Set default threshold config (defaults to "default" if not specified)
+default_threshold_config: "default"
+
+# ----------------------------------------------------------------------------
+# Multiple Named Threshold Configurations
+# ----------------------------------------------------------------------------
+# Define multiple threshold configurations with different sensitivity levels
+threshold_configs:
+  
+  # Default configuration - moderate thresholds for most servers
+  default:
+    thresholds:
+      cpu_monitor:
+        cpu_percent:
+          warning: 80.0
+          critical: 90.0
+          operator: ">"
+        load_1min:
+          warning: 4.0
+          critical: 8.0
+          operator: ">"
+      
+      memory_monitor:
+        percent:
+          warning: 85.0
+          critical: 95.0
+          operator: ">"
+      
+      disk_monitor:
+        partitions:
+          /:
+            percent:
+              warning: 85.0
+              critical: 95.0
+              operator: ">"
+      
+      rtt:
+        # RTT thresholds (applies to all hosts)
+        warning: 50.0   # ms
+        critical: 200.0
+  
+  # High sensitivity configuration - lower thresholds for critical systems
+  high_sensitivity:
+    thresholds:
+      cpu_monitor:
+        cpu_percent:
+          warning: 60.0      # Alert earlier
+          critical: 75.0
+          operator: ">"
+          hysteresis: 0.15   # More hysteresis to reduce flapping
+        load_1min:
+          warning: 2.0
+          critical: 4.0
+          operator: ">"
+      
+      memory_monitor:
+        percent:
+          warning: 75.0      # Alert at lower memory usage
+          critical: 85.0
+          operator: ">"
+          display: "(threshold: {op_symbol} {threshold_value}%, total: {total_gb} GB)"
+      
+      disk_monitor:
+        partitions:
+          /:
+            percent:
+              warning: 75.0
+              critical: 85.0
+              operator: ">"
+          /var:
+            percent:
+              warning: 80.0
+              critical: 90.0
+              operator: ">"
+      
+      rtt:
+        warning: 30.0
+        critical: 100.0
+  
+  # Low sensitivity configuration - higher thresholds for development/test systems
+  low_sensitivity:
+    thresholds:
+      cpu_monitor:
+        cpu_percent:
+          warning: 90.0      # Only alert at very high usage
+          critical: 95.0
+          operator: ">"
+      
+      memory_monitor:
+        percent:
+          warning: 90.0
+          critical: 98.0
+          operator: ">"
+      
+      disk_monitor:
+        partitions:
+          /:
+            percent:
+              warning: 90.0
+              critical: 95.0
+              operator: ">"
+      
+      rtt:
+        warning: 100.0
+        critical: 500.0
+  
+  # Production database servers - specialized thresholds
+  database:
+    thresholds:
+      cpu_monitor:
+        cpu_percent:
+          warning: 70.0
+          critical: 85.0
+          operator: ">"
+      
+      memory_monitor:
+        percent:
+          warning: 90.0      # Databases can use high memory
+          critical: 97.0
+          operator: ">"
+          display: "(threshold: {op_symbol} {threshold_value}%, total: {total_gb} GB, available: {available_gb} GB)"
+      
+      disk_monitor:
+        partitions:
+          /:
+            percent:
+              warning: 80.0
+              critical: 90.0
+              operator: ">"
+          /var/lib/mysql:    # Database data partition
+            percent:
+              warning: 75.0  # Alert earlier for DB partition
+              critical: 85.0
+              operator: ">"
+      
+      rtt:
+        warning: 20.0     # Stricter latency requirements
+        critical: 50.0
+
+# ----------------------------------------------------------------------------
+# Host to Threshold Configuration Mapping
+# ----------------------------------------------------------------------------
+# Hosts are mapped to one of the named threshold configurations above through
+# the `threshold_config` key of each entry in the `hosts:` section below.
+
+# ----------------------------------------------------------------------------
+# Notification Channels
+# ----------------------------------------------------------------------------
+# Define notification providers centrally with their credentials
+# Each channel has a type (pushover, email, signal, mattermost) and type-specific config
+notification_channels:
+  # Signal notifications
+  # Phone numbers are quoted so they stay strings: an unquoted +1234567890 is
+  # resolved as an integer, which drops the "+" (and any leading zero).
+  signal_ops:
+    type: signal
+    cli_path: /usr/local/bin/signal-cli
+    user: "+1234567890"
+    recipient: "+1234567890"
+
+  signal_oncall:
+    type: signal
+    cli_path: /usr/local/bin/signal-cli
+    user: "+1234567890"
+    recipient: "+0987654321"
+
+  # Email notifications
+  email_ops:
+    type: email
+    recipients: [ops@example.com, alerts@example.com]
+    sender: heartbeat@example.com
+    smtp_server: smtp.example.com
+    smtp_port: 587
+    smtp_user: heartbeat@example.com
+    smtp_password: your-smtp-password
+
+  # Pushover notifications
+  pushover_urgent:
+    type: pushover
+    token: your-pushover-app-token
+    user: your-pushover-user-key
+
+  # Mattermost notifications
+  mattermost_devops:
+    type: mattermost
+    host: mattermost.example.com
+    token: your-webhook-token
+    channel: devops-alerts
+    username: heartbeat-bot
+    icon: https://example.com/heartbeat-icon.png
+
+# Default notification channels (used if host doesn't specify channels)
+default_notification_channels: [email_ops]
+
+# ----------------------------------------------------------------------------
+# Host Definitions (New Unified Format)
+# ----------------------------------------------------------------------------
+# Define hosts with threshold configs, monitoring, DNS, and notification settings
+hosts:
+  # Critical production servers - high sensitivity, multiple notification channels
+  prod-web-01:
+    threshold_config: high_sensitivity
+    watch: true
+    notification_channels: [signal_oncall, pushover_urgent, email_ops]
+    dyndns: false
+  
+  prod-web-02:
+    threshold_config: high_sensitivity
+    watch: true
+    notification_channels: [signal_oncall, pushover_urgent, email_ops]
+    dyndns: false
+  
+  prod-api-01:
+    threshold_config: high_sensitivity
+    watch: true
+    notification_channels: [signal_oncall, email_ops]
+    dyndns: false
+  
+  # Database servers - database-specific thresholds
+  prod-db-01:
+    threshold_config: database
+    watch: true
+    notification_channels: [signal_ops, email_ops]
+    dyndns: false
+  
+  prod-db-02:
+    threshold_config: database
+    watch: true
+    notification_channels: [signal_ops, email_ops]
+    dyndns: false
+  
+  prod-db-replica:
+    threshold_config: database
+    watch: true
+    notification_channels: [email_ops]  # Replica gets email only
+    dyndns: false
+  
+  # Development servers - low sensitivity, minimal notifications
+  dev-server-01:
+    threshold_config: low_sensitivity
+    watch: false  # Don't monitor dev servers closely
+    notification_channels: [email_ops]
+    dyndns: false
+  
+  dev-server-02:
+    threshold_config: low_sensitivity
+    watch: false
+    notification_channels: [email_ops]
+    dyndns: false
+  
+  # Test servers
+  test-server-01:
+    threshold_config: low_sensitivity
+    watch: false
+    dyndns: false
+    # No notification channels - uses default_notification_channels
+  
+  # Home server with dynamic DNS
+  home-server:
+    threshold_config: default
+    watch: true
+    notification_channels: [signal_ops]
+    dyndns: true  # Update DNS when IP changes
+
+# Hosts not listed in the hosts section will use:
+# - default_threshold_config for thresholds (falls back to "default")
+# - default_notification_channels for notifications
+
+# ----------------------------------------------------------------------------
+# Notes on Configuration Structure
+# ----------------------------------------------------------------------------
+# 
+# All configuration is centralized in the hosts section. Each host can specify:
+#   - threshold_config: Name of threshold configuration to use
+#   - watch: Whether to monitor this host actively (send notifications)
+#   - notification_channels: List of channels to use for this host
+#   - dyndns: Whether to update DNS when IP address changes
+#
+# Notification channels are defined once at the top level and referenced
+# by name in host definitions, allowing easy reuse and updates.
+#
+# For hosts not explicitly listed, the system will still accept heartbeats
+# and track their state, but won't apply thresholds or send notifications
+# unless default settings are configured.
@@ -0,0 +1,111 @@
+# Heartbeat Configuration Example with Nagios Plugin Runner
+
+# This example shows how to configure the Nagios Runner plugin
+# to execute existing Nagios-compatible monitoring plugins
+
+# Basic server settings (existing config)
+hb_port: 50003
+hbd_port: 50004
+interval: 20
+grace: 2
+
+# Plugin configuration
+# Each plugin can have its own configuration section
+
+# CPU Monitor Plugin
+cpu_monitor:
+  interval: 300         # Collect every 5 minutes (default)
+  per_core: false       # Set to true to get per-core CPU usage
+
+# Nagios Runner Plugin
+nagios_runner:
+  interval: 300         # Run Nagios plugins every 5 minutes (default)
+  timeout: 30           # Command execution timeout in seconds
+  shell: true           # Execute commands via shell
+  
+  # List of Nagios plugins to run
+  commands:
+    
+    # Example 1: Check disk space
+    - name: check_disk_root
+      command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
+    
+    # Example 2: Check disk space for /home
+    - name: check_disk_home
+      command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /home
+    
+    # Example 3: Check system load
+    - name: check_load
+      command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
+    
+    # Example 4: Check process count
+    - name: check_procs
+      command: /usr/lib/nagios/plugins/check_procs -w 250 -c 400
+    
+    # Example 5: Check SSH service
+    - name: check_ssh
+      command: /usr/lib/nagios/plugins/check_ssh localhost
+    
+    # Example 6: Check HTTP service
+    - name: check_http
+      command: /usr/lib/nagios/plugins/check_http -H localhost
+    
+    # Example 7: Check swap usage
+    - name: check_swap
+      command: /usr/lib/nagios/plugins/check_swap -w 20% -c 10%
+    
+    # Example 8: Custom script (Nagios plugin format)
+    - name: check_custom
+      command: /usr/local/bin/my_custom_check.sh
+    
+    # Example 9: Check specific log file
+    - name: check_logs
+      command: /usr/lib/nagios/plugins/check_log -F /var/log/syslog -O /var/tmp/check_log.old -q "ERROR"
+
+# Notes:
+# 
+# 1. Nagios Plugin Output Format:
+#    - Single line: STATUS - Message | performance_data
+#    - Performance data format: 'label'=value[UOM];[warn];[crit];[min];[max]
+#
+# 2. Exit Codes:
+#    - 0 = OK
+#    - 1 = WARNING
+#    - 2 = CRITICAL
+#    - 3 = UNKNOWN
+#
+# 3. Performance Data:
+#    - Automatically parsed and included in heartbeat data
+#    - Metrics are stored as: {plugin_name}_{metric_name}
+#    - Example: check_disk_root_/ will contain the disk usage percentage
+#
+# 4. Overall Status:
+#    - The plugin reports the worst status from all commands
+#    - Useful for quick health checks
+#
+# 5. Plugin Paths:
+#    Common Nagios plugin directories:
+#    - Debian/Ubuntu: /usr/lib/nagios/plugins/
+#    - RHEL/CentOS: /usr/lib64/nagios/plugins/
+#    - Custom installs: /usr/local/nagios/libexec/
+#
+# 6. Installing Nagios Plugins:
+#    Debian/Ubuntu: sudo apt-get install nagios-plugins
+#    RHEL/CentOS:   sudo yum install nagios-plugins-all
+#    Arch Linux:    sudo pacman -S monitoring-plugins
+#
+# 7. Writing Custom Nagios Plugins:
+#    Any script can be a Nagios plugin if it:
+#    - Returns appropriate exit codes (0-3)
+#    - Prints status message to stdout
+#    - Optionally includes performance data after "|"
+#
+# Example custom plugin (save as /usr/local/bin/check_example.sh):
+#   #!/bin/bash
+#   users=$(who | wc -l | tr -d ' ')   # number of logged-in sessions
+#   if [ "$users" -gt 50 ]; then
+#     echo "CRITICAL - Too many users | users=$users;40;50;0"
+#     exit 2
+#   else
+#     echo "OK - Normal user count | users=$users;40;50;0"
+#     exit 0
+#   fi
@@ -0,0 +1,278 @@
+# ==============================================================================
+# Heartbeat Daemon Threshold Configuration Example
+# ==============================================================================
+# This file demonstrates threshold configuration for the Heartbeat monitoring system.
+# Thresholds can be defined for any metric collected by monitoring plugins.
+#
+# Threshold levels:
+#   - WARNING: First level of concern, typically for early notification
+#   - CRITICAL: Severe condition requiring immediate attention
+#
+# Alert notifications are sent when:
+#   - A metric crosses from OK to WARNING or CRITICAL
+#   - A metric crosses from WARNING to CRITICAL
+#   - A metric recovers (returns to a lower severity level)
+#
+# Re-notifications are sent for ongoing alerts based on threshold_renotify_interval.
+# ==============================================================================
+
+# Global threshold settings
+threshold_renotify_interval: 3600  # Re-notify every hour for ongoing alerts (seconds)
+
+# Threshold definitions per plugin
+thresholds:
+  
+  # ----------------------------------------------------------------------------
+  # CPU Monitor Thresholds
+  # ----------------------------------------------------------------------------
+  cpu_monitor:
+    # Overall CPU usage percentage (0-100)
+    cpu_percent:
+      warning: 80.0         # Warn when CPU usage exceeds 80%
+      critical: 90.0        # Critical when CPU usage exceeds 90%
+      operator: ">"         # Alert when value is GREATER than threshold
+      hysteresis: 0.1       # 10% hysteresis to prevent flapping
+      enabled: true
+    
+    # 1-minute load average
+    load_1min:
+      warning: 4.0          # Warn when 1-min load exceeds 4.0
+      critical: 8.0         # Critical when 1-min load exceeds 8.0
+      operator: ">"
+      hysteresis: 0.15      # 15% hysteresis
+      enabled: true
+    
+    # 5-minute load average
+    load_5min:
+      warning: 3.0
+      critical: 6.0
+      operator: ">"
+      hysteresis: 0.15
+      enabled: true
+    
+    # 15-minute load average
+    load_15min:
+      warning: 2.0
+      critical: 4.0
+      operator: ">"
+      hysteresis: 0.15
+      enabled: true
+  
+  # ----------------------------------------------------------------------------
+  # Memory Monitor Thresholds
+  # ----------------------------------------------------------------------------
+  memory_monitor:
+    # Memory usage percentage
+    percent:
+      warning: 85.0         # Warn at 85% memory usage
+      critical: 95.0        # Critical at 95% memory usage
+      operator: ">"
+      hysteresis: 0.1
+      enabled: true
+    
+    # Available memory in MB (inverse threshold - alert when LOW)
+    available_mb:
+      warning: 1000         # Warn when less than 1GB available
+      critical: 500         # Critical when less than 500MB available
+      operator: "<"         # Alert when value is LESS than threshold
+      hysteresis: 0.1
+      enabled: true
+    
+    # Swap usage percentage
+    swap_percent:
+      warning: 50.0         # Warn at 50% swap usage
+      critical: 80.0        # Critical at 80% swap usage
+      operator: ">"
+      hysteresis: 0.1
+      enabled: true
+  
+  # ----------------------------------------------------------------------------
+  # Disk Monitor Thresholds
+  # ----------------------------------------------------------------------------
+  disk_monitor:
+    # Partition-specific thresholds
+    # Use the mount point as the key
+    partitions:
+      # Root filesystem
+      /:
+        percent:
+          warning: 80.0     # Warn at 80% disk usage
+          critical: 90.0    # Critical at 90% disk usage
+          operator: ">"
+          hysteresis: 0.05  # 5% hysteresis for disk (more stable)
+          enabled: true
+        
+        free_gb:
+          warning: 10.0     # Warn when less than 10GB free
+          critical: 5.0     # Critical when less than 5GB free
+          operator: "<"
+          hysteresis: 0.1
+          enabled: true
+      
+      # Home filesystem (if separate partition)
+      /home:
+        percent:
+          warning: 85.0
+          critical: 95.0
+          operator: ">"
+          hysteresis: 0.05
+          enabled: true
+      
+      # Var filesystem (logs, etc.)
+      /var:
+        percent:
+          warning: 80.0
+          critical: 90.0
+          operator: ">"
+          hysteresis: 0.05
+          enabled: true
+        
+        free_gb:
+          warning: 5.0      # Var needs space for logs
+          critical: 2.0
+          operator: "<"
+          hysteresis: 0.1
+          enabled: true
+  
+  # ----------------------------------------------------------------------------
+  # ZFS Monitor Thresholds
+  # ----------------------------------------------------------------------------
+  zfs_monitor:
+    # Pool health check — built-in default; shown here for reference/override.
+    # status is 0 (ONLINE), 1 (DEGRADED), or 2 (SUSPENDED, FAULTED, UNAVAIL…).
+    # Use '*' to apply the same rule to every pool, or name a specific pool.
+    pools:
+      '*':
+        status:
+          warning: 1            # Alert WARNING when pool is DEGRADED (status 1)
+          critical: 2           # Alert CRITICAL when pool is SUSPENDED/FAULTED/UNAVAIL
+          operator: ">="        # Inclusive: with ">" status 1 would never reach WARNING
+          hysteresis: 0.0       # No hysteresis — status is a discrete state code
+          display: "ZFS pool {pool_name} is {health}"
+
+      # Per-pool capacity thresholds (optional; add pools you care about)
+      # tank:
+      #   capacity:
+      #     warning: 75.0       # Warn at 75% used
+      #     critical: 90.0      # Critical at 90% used
+      #     operator: ">"
+      #     hysteresis: 0.05
+
+  # ----------------------------------------------------------------------------
+  # Network Monitor Thresholds
+  # ----------------------------------------------------------------------------
+  network_monitor:
+    # Total error count across all interfaces
+    errors_total:
+      warning: 100          # Warn at 100 errors
+      critical: 1000        # Critical at 1000 errors
+      operator: ">"
+      hysteresis: 0.2       # 20% hysteresis for counters
+      enabled: true
+    
+    # Total dropped packets
+    dropin_total:
+      warning: 50
+      critical: 200
+      operator: ">"
+      hysteresis: 0.2
+      enabled: true
+    
+    dropout_total:
+      warning: 50
+      critical: 200
+      operator: ">"
+      hysteresis: 0.2
+      enabled: true
+    
+    # TCP connections in TIME_WAIT state
+    connections_TIME_WAIT:
+      warning: 1000         # Warn at 1000 TIME_WAIT connections
+      critical: 5000        # Critical at 5000 TIME_WAIT connections
+      operator: ">"
+      hysteresis: 0.2
+      enabled: true
+    
+    # Total established connections
+    connections_ESTABLISHED:
+      warning: 500
+      critical: 1000
+      operator: ">"
+      hysteresis: 0.1
+      enabled: true
+  
+  # ----------------------------------------------------------------------------
+  # Nagios Plugin Thresholds (if using nagios_runner)
+  # ----------------------------------------------------------------------------
+  nagios_runner:
+    # Nagios plugins report exit codes:
+    #   0 = OK, 1 = WARNING, 2 = CRITICAL, 3 = UNKNOWN
+    # We can threshold on the exit_code directly
+    exit_code:
+      warning: 1            # Map Nagios WARNING to our WARNING
+      critical: 2           # Map Nagios CRITICAL to our CRITICAL
+      operator: ">="        # Alert when exit code >= threshold
+      hysteresis: 0.0       # No hysteresis for exit codes
+      enabled: true
+
+# ==============================================================================
+# Notification Configuration
+# ==============================================================================
+# Configure notification methods (email, pushover, etc.)
+# These are used when threshold violations occur
+
+# Email notifications
+toemail:
+  - admin@example.com
+  - oncall@example.com
+fromemail: heartbeat@example.com
+smtpserver: smtp.example.com
+smtpport: 587
+smtpuser: heartbeat@example.com
+smtppassword: your-password-here  # Placeholder — replace locally; never commit a real password
+
+# Pushover notifications (optional)
+# pushover_token: your-pushover-app-token
+# pushover_user: your-pushover-user-key
+
+# Mattermost webhook (optional)
+# mattermost_url: https://mattermost.example.com/hooks/your-webhook-id
+
+# ==============================================================================
+# Watched Hosts
+# ==============================================================================
+# Hosts in this list will trigger notifications for:
+#   - Heartbeat timeouts/overdue
+#   - Threshold violations
+#   - Boot messages
+watchhosts:
+  - webserver01
+  - database01
+  - mailserver
+  - critical-app
+
+# ==============================================================================
+# Additional Server Settings
+# ==============================================================================
+hb_port: 50003            # UDP port for heartbeat messages
+hbd_port: 50004           # HTTP port for web interface
+grace: 10                 # Grace period for overdue detection (seconds)
+debug: 0                  # Debug level (0-3)
+verbose: false            # Verbose output
+
+# Journal settings (message logging)
+journal_enabled: true
+journal_path: /var/log/heartbeat/messages.journal
+journal_max_size: 104857600  # 100MB before rotation
+journal_max_backups: 10
+
+# ==============================================================================
+# Production Tuning Checklist
+# ==============================================================================
+# For production systems, consider:
+#   - Higher warning thresholds to reduce alert fatigue
+#   - Appropriate hysteresis values (5-15% typical)
+#   - Re-notification intervals matching on-call rotation
+#   - Multiple escalation contacts
+#   - Integration with incident management systems
+# ==============================================================================
@@ -1,602 +0,0 @@
-#!/usr/bin/env python3
-# $Id: hbc,v 1.9 2012/03/29 02:08:36 andreas Exp $
-# NEW
-import argparse
-import sys
-import time
-import socket
-import os
-import signal
-import select
-import traceback
-from hashlib import md5
-import shutil
-import zlib
-import subprocess
-import syslog
-import codecs
-
-from .config import load_config
-
-PORT = 50003
-INTERVAL = 10
-REOPENC = 6
-PIDFILE = "/tmp/hbc.pid"
-VER = 6
-MAXRECV = 32767
-
-running = True
-dorestart = False
-warned1 = False
-
-msgonly = False
-helpflag = False
-verbose = False
-fdaemon = False
-daemonized = False
-msgboot = {}
-home = os.environ["HOME"]
-configfile = "%s/.hbrc" % home
-cmdargs = []
-iam = socket.gethostname()
-
-
-def log(msg):
-    if fdaemon:
-        syslog.syslog(syslog.LOG_ERR, msg)
-    else:
-        print(msg)
-
-
-def handler(signum, frame):
-    if signum == signal.SIGTERM:
-        cleanup()
-
-
-class NullDevice:
-    def write(self, s):
-        pass
-
-
-class Conn:
-    def __init__(self, conId, addr, port, af):
-        self.conId = conId
-        self.addr = addr
-        self.port = port
-        self.af = af
-
-        self.ackcount = 0  # num of accks received
-        self.lastack = 0  # time() last ACK was received
-        self.send = 0
-        self.lastsend = 0  # time() last msg was sent
-        self.rtts = [0]
-        self.sock = None
-
-    def __str__(self):
-        return "Con(%s, %s %s)" % (self.addr, self.port, self.af)
-
-    def open(self):
-        self.sock = socket.socket(self.af, socket.SOCK_DGRAM)
-        self.sock.setsockopt(
-            socket.SOL_SOCKET,
-            socket.SO_REUSEADDR,
-            self.sock.getsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR) | 1,
-        )
-
-    def sendto(self, msg, ID="HTB"):  # default ID is HearTBeat
-        global warned1
-
-        if self.send % REOPENC == 0:
-            self.close()
-        if not self.sock:
-            self.open()
-        msg["name"] = shortname(iam)
-        msg["id"] = self.conId
-        msg["ver"] = VER
-        msg["time"] = time.time()
-        m = dicttos(ID, msg)  # always compress
-        if verbose:
-            log("conn.send('%s', (%s:%s) %s)" % (msg, self.addr, self.port, len(m)))
-        try:
-            self.sock.sendto(m, (self.addr, self.port))
-        except socket.error as e:
-            if not warned1:
-                log("socket error: %s %s:%s" % (e, self.addr, self.port))
-            warned1 = True
-            self.close()
-            return
-        self.send += 1
-        self.lastsend = time.time()
-
-    def ack(self, msgDict, now):
-        try:
-            self.lastack = msgDict["time"]
-            mul = 2
-        except Exception:
-            self.lastack = now
-            mul = 1
-        rtt = (self.lastack - self.lastsend) * mul
-        if verbose:
-            log("ack RTT: %0.1f ms (now %s)" % (rtt * 1000.0, now))
-        self.rtts.append(rtt * 1000.0)
-        if len(self.rtts) > 10:
-            del self.rtts[0]
-        self.ackcount += 1
-
-    def close(self):
-        if self.sock:
-            self.sock.close()
-        self.sock = None
-
-
-def shortname(name):
-    r = name.split(".")
-    return r[0]
-
-
-def dicttos(ID, d):
-    s = []
-    for k in d:
-        if isinstance(d[k], float):
-            s.append("%s=%0.5f" % (k, d[k]))
-        else:
-            s.append("%s=%s" % (k, d[k]))
-    pk = ";".join(s)
-    zpk = zlib.compress(pk.encode(), 6)
-    ID = "!" + ID + ":"
-    return ID.encode() + zpk
-
-
-def stodict(msg):
-    d = {}
-    if len(msg) > 0 and chr(msg[0]) == "!":
-        pk = zlib.decompress(msg[5:]).decode()
-        d["ID"] = msg[1:4].decode()
-    else:
-        r0 = msg.split(":", 1)
-        pk = r0[1]
-        d["ID"] = r0[0]
-    r = pk.split(";")
-    for v in r:
-        vr = v.split("=", 1)
-        k = vr[0].strip()
-        if len(vr) == 1:
-            d[k] = None
-        else:
-            v = vr[1].strip()
-            try:
-                v = eval(v)
-            except Exception:
-                pass
-            d[k] = v
-    if verbose:
-        print("msg is %s" % d)
-    return d
-
-
-def XXstodict(msg):
-    d = {}
-    r0 = msg.split(":", 1)
-    if len(r0) == 1:
-        return None
-    if r0[0][0] == "!":  # compressed
-        pk = zlib.decompress(msg[len(r0[0]) + 1 :])
-        d["ID"] = r0[0][1:]
-    else:
-        pk = r0[1]
-        d["ID"] = r0[0]
-    r = pk.split(";")
-    for v in r:
-        vr = v.split("=", 1)
-        k = vr[0].strip()
-        if len(vr) == 1:
-            d[k] = None
-        else:
-            v = vr[1].strip()
-            try:
-                if v[0].isdigit():
-                    v = eval(v)
-            except Exception:
-                pass
-            d[k] = v
-    return d
-
-
-def syslogtrace(note):
-    logm = "%s hbc died: \n%s" % (note, traceback.format_exc())
-    log(logm)
-    for line in logm.split("\n"):
-        syslog.syslog(syslog.LOG_ERR, " tb: %s" % line)
-    if verbose:
-        print(logm)
-
-
-conId = 1
-
-
-def createConnections(hosts):
-    global conId
-    for host in hosts:
-        if verbose:
-            log("createConnections for %s" % host)
-        try:
-            rs = socket.getaddrinfo(host, hb_port, 0, 0, socket.SOL_UDP)
-        except socket.gaierror:
-            logm = "%s hbc died: \n%s" % ("createConnections", traceback.format_exc())
-            if verbose:
-                log(logm)
-            return None
-        for r in rs:
-            if verbose:
-                log("address %s" % str(r))
-            if r[0] in [10, 24, 28, 30]:  # for Linux, NetBSD, FreeBSD
-                af = socket.AF_INET6
-            elif r[0] == 2:
-                af = socket.AF_INET
-            else:
-                print("dont know this net type: %s" % r[0][0])
-                sys.exit(1)
-
-            addr = r[4][0]
-            conns[conId] = Conn(conId, addr, hb_port, af)
-            if verbose:
-                print("cons[%s] = %s" % (conId, str(conns[conId])))
-            conId += 1
-
-
-def doexec(conn, data):
-    try:
-        ro = subprocess.check_output(
-            data, stderr=subprocess.STDOUT, shell=True
-        ).decode()
-        fail = "OK"
-    except subprocess.CalledProcessError as e:
-        ro = str(e)
-        fail = "CalledProcessError"
-    except Exception as e:
-        syslogtrace("System")
-        ro = "N/A"
-        fail = "cmd failed: %s" % e
-    msg = {"service": "command", "msg": fail + " " + ro}
-    conns[conn].sendto(msg)
-
-
-def doupdate(conn, msgDict):
-    fail = None
-    try:
-        code = codecs.decode(msgDict["code"], "base64").decode()
-        csum = msgDict["csum"]
-    except Exception as e:
-        fail = "csum/code missing: %s" % e
-    if not fail:
-        fail = doupdateone(code, csum)
-
-    msg = {"service": "update", "msg": fail if fail else "OK"}
-    conns[conn].sendto(msg)
-    if not fail:
-        log("hc updates, fs = %s" % (len(code)))
-
-    return fail
-
-
-def doupdateone(code, csum):
-
-    m = md5()
-    m.update(code.encode())
-    icsum = m.hexdigest()
-    if icsum != csum:
-        return "checksum error"
-
-    fn = sys.argv[0]
-    ofn = "%s.sav" % fn
-    try:
-        shutil.copy2(fn, ofn)
-    except Exception as e:
-        return "cannot make backup copy: %s" % e
-
-    try:
-        fh = open(fn, "w")
-        fh.write(code)
-        fh.close()
-    except Exception as e:
-        return "cannot write new code: %s" % e
-
-    return None
-
-
-def restart():
-    if verbose:
-        print("restart: execv %s %s" % (sys.argv[0], [sys.argv[0]] + cmdargs))
-    syslog.syslog(syslog.LOG_ERR, "restart %s" % (sys.argv[0]))
-    e = "fallthrough"
-    try:
-        os.execv(sys.argv[0], [sys.argv[0]] + cmdargs)
-    except Exception:
-        pass
-    print("should not be here:", str(e))
-    log("restart failed: %s" % e)
-
-
-def process():
-    global running, dorestart
-
-    nextReport = time.time()
-
-    while running:
-        while time.time() < nextReport:
-            ifiles = {}
-            conIds = {}
-            for conn in conns:
-                if conns[conn].sock:
-                    ifiles[conns[conn].sock.fileno()] = conns[conn].sock
-                    conIds[conns[conn].sock.fileno()] = conn
-
-            sleep = nextReport - time.time()
-            if sleep <= 0:
-                break
-            try:
-                r = select.select(list(ifiles.keys()), [], [], sleep)
-                now = (
-                    time.time()
-                )  # nb: delay from actual packet arrival to select is ca. 105ms!
-            except KeyboardInterrupt:
-                running = False
-                break
-            except SystemExit:
-                log("daemon exit, running was %s" % running)
-                if running:
-                    running = False
-                break
-            except Exception:
-                if running:
-                    syslogtrace("select")
-                    running = False
-                break
-            for rfh in r[0]:
-                conn = conIds[rfh]
-                data, addr = ifiles[rfh].recvfrom(MAXRECV)
-                if verbose:
-                    print("sock.recvfrom: %s (%s) %s" % (addr, len(data), data[:4]))
-                try:
-                    msgDict = stodict(data)
-                except Exception as e:
-                    print(
-                        "failed to parse incoming data from %s: %s (%s)"
-                        % (addr, data, e)
-                    )
-                    continue
-
-                if verbose:
-                    print(
-                        "sock.recvfrom: %s (%s) %s"
-                        % (addr, len(data), str(msgDict)[:80])
-                    )
-                if msgDict is None:
-                    print("bad backet from %s (%s) %s" % (addr, len(data), data))
-                elif msgDict["ID"] == "ACK":
-                    conns[conn].ack(msgDict, now)
-                elif msgDict["ID"] == "UPD":
-                    if doupdate(conn, msgDict) is None:
-                        if verbose:
-                            print("process: restart after update")
-                        dorestart = True
-                        break
-                elif msgDict["ID"] == "CMD":
-                    doexec(conn, msgDict["cmd"])
-                else:
-                    doexec(conn, data)  # deprecated until no more VER - hbc
-            if dorestart:
-                running = False
-                break
-        if not running:
-            break
-        for conn in conns:
-            msg = {"acks": conns[conn].ackcount, "rtt": conns[conn].rtts[-1]}
-            conns[conn].sendto(msg)
-            time.sleep(
-                0.1
-            )  # N.B. Linux (i.e. Rasperry Pi 3 drops the second pkg unless delayed
-        if nextReport + interval >= time.time():
-            nextReport += interval
-        else:
-            nextReport = time.time() + interval
-
-    if verbose:
-        log("process: done running")
-
-
-def cleanup():
-    global running
-    if not running:
-        return
-    if verbose:
-        log("cleanup")
-    running = False
-    for conn in conns:
-        msg = {"shutdown": 1, "acks": conns[conn].ackcount}
-        conns[conn].sendto(msg)
-        conns[conn].close()
-    time.sleep(1)
-    closeall()
-
-
-def closeall():
-    if verbose:
-        syslog.syslog(syslog.LOG_ERR, "closecall")
-    for conn in conns:
-        conns[conn].close()
-
-
-def daemonize(
-    working_dir="/", stdin="/dev/zero", stdout="/dev/null", stderr="/dev/null"
-):
-    """
-    Does the UNIX double-fork magic, see Stevens' "Advanced Programming in the
-    UNIX Environment" for details (ISBN 0201563177)
-    http://www.yendor.com/programming/unix/apue/proc/fork2.c
-    """
-
-    try:
-        # first fork
-        pid = os.fork()
-        if pid > 0:
-            # exit from first parent
-            os._exit(0)
-    except OSError as e:
-        sys.stderr.write("fork #1 failed: %d (%s)\n" % (e.errno, e.strerror))
-        os._exit(1)
-
-    # decouple from parent environment
-    os.chdir(working_dir)
-    os.setsid()
-    os.umask(0)
-    # second fork
-    try:
-        pid = os.fork()
-        if pid > 0:
-            # exit from second parent
-            os._exit(0)
-    except OSError as e:
-        sys.stderr.write("fork #2 failed: %d (%s)\n" % (e.errno, e.strerror))
-        sys.exit(1)
-
-    # redirects standard file descriptors
-    sys.stdout.flush()
-    sys.stderr.flush()
-    si = open(stdin, "r")
-    so = open(stdout, "a+")
-    se = open(stderr, "a+")
-    os.dup2(si.fileno(), sys.stdin.fileno())
-    os.dup2(so.fileno(), sys.stdout.fileno())
-    os.dup2(se.fileno(), sys.stderr.fileno())
-
-
-#
-# Main program
-#
-def build_parser():
-    parser = argparse.ArgumentParser(
-        prog="hbc",
-        description="HeartBeatClient - send a heatbeat message to a HeartBeatDaemon",
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-    )
-    parser.add_argument("-b", "--boot", action="store_true", help="Send a boot message")
-    parser.add_argument(
-        "-c", "--config", dest="configfile", help="Config file path (YAML)"
-    )
-    parser.add_argument("-m", "--message", dest="message", help="Send a message")
-    parser.add_argument(
-        "-n", "--name", dest="name", help="Name to use in heartbeat message"
-    )
-    parser.add_argument(
-        "-f", "--daemon", action="store_true", help="Run in daemon mode"
-    )
-    parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output")
-    parser.add_argument(
-        "-x", "--debug", action="count", default=0, help="Increase debug level"
-    )
-    parser.add_argument("hosts", nargs="+", help="Heartbeat daemon hosts to send to")
-    return parser
-
-
-def main(argv=None):
-    global msgonly, verbose, fdaemon, daemonized, cmdargs, iam, hb_port, conns, interval, hb_hosts
-    parser = build_parser()
-    args = parser.parse_args(argv)
-
-    config = load_config(args.configfile)
-
-    # Apply CLI overrides
-    if args.boot:
-        msgboot["boot"] = 1
-    if args.message:
-        msgboot["service"] = "service"
-        msgboot["msg"] = args.message
-        msgonly = True
-    if args.name:
-        iam = args.name
-        cmdargs += ["-n", iam]
-    if args.daemon:
-        fdaemon = True
-    if args.verbose:
-        verbose = True
-        cmdargs.append("--verbose")
-    if args.debug:
-        config.setdefault("debug", 0)
-        config["debug"] += args.debug
-        cmdargs.append("-" + "x" * args.debug) 
-
-    if verbose:
-        print("cmdargs for restart are %s" % cmdargs)
-
-    #
-    # set defaults
-
-    hb_hosts = args.hosts
-    hb_port = config.get("hb_port", PORT)
-    interval = config.get("interval", INTERVAL)
-
-    #
-    if verbose:
-        print("notice: hb_hosts: %s" % str(hb_hosts))
-        print("notice: hb_port: %s" % hb_port)
-        print("notice: interval: %s" % interval)
-        print("notice: iam: %s" % iam)
-        print("notice: msgonly: %s" % msgonly)
-        print("notice: msgboot: %s" % msgboot)
-
-    if not msgonly:
-        msgboot["interval"] = interval
-
-    conns = {}
-    while True:
-        if verbose:
-            log("create connections")
-        createConnections(hb_hosts)
-        if len(conns) != 0:
-            break
-        if verbose:
-            log("no connections yet, sleep a bit")
-        time.sleep(2)
-
-    if verbose:
-        log("%s connections created" % (len(conns)))
-
-    if len(msgboot) > 0:
-        if verbose:
-            print("on boot")
-        msgboot["acks"] = 0
-        for conn in conns:
-            conns[conn].sendto(msgboot)
-
-    if msgonly:
-        if verbose:
-            print("msgboot done msgonly=%s" % msgonly)
-        closeall()
-        sys.exit(0)
-
-    #
-    syslog.openlog("hbc", syslog.LOG_PID, syslog.LOG_DAEMON)
-    if fdaemon:
-        print("daemoinizing.")
-        daemonize()
-        daemonized = True
-        syslog.syslog(syslog.LOG_ERR, "starting heartbeat to %s" % ",".join(hb_hosts))
-
-    signal.signal(signal.SIGTERM, handler)
-    try:
-        process()
-    except Exception as e:
-        syslogtrace("process")
-        if verbose:
-            print("err: process exit: %s" % e)
-
-    if verbose:
-        log("main: cleanup")
-    cleanup()
-    if dorestart:
-        restart()
-
-
-if __name__ == "__main__":
-    main()
@@ -1,381 +0,0 @@
-"""
-host and connection class shared between hbd and
-the website's heartbeat.py
-
-"""
-
-import time
-import json
-import copy
-import queue
-
-num = 0
-
-MAXRTTS = 10
-
-DEBUG = 2
-
-
-def log(host, m):
-    if DEBUG:
-        print("class log: %s %s" % (host, m))
-
-
-class Connection:
-    # map of addrs to names
-
-    htab = {}
-    UNKNOWN = "unknown"
-    UP = "up"
-    DOWN = "down"
-    OVERDUE = "overdue"
-
-    def __init__(self, host, cid, addr, afam):
-        self.host = host
-        self.cid = cid
-        if addr[0:7] == "::ffff:":
-            addr = addr[7:]
-        self.addr = addr
-        self.afam = afam
-        self.rtts = [0]
-        self.lastbeat = time.time()
-        self.statetime = self.lastbeat
-        self.deltastatetime = "computed"
-        self.state = Connection.UNKNOWN
-
-        if host:
-            Connection.htab[addr] = self.host.name
-            if self.host.isDynDns():
-                log(self.host.name, "dns update %s" % self.addr)
-                Host.dnsQ.put((self.host.name, self.addr))
-
-    def registerDns(self):
-        Host.dnsQ.put((self.host.name, self.addr))
-
-    def clearstate(self):
-        d = {}
-        d["addr"] = ""
-        d["rtt"] = ""
-        d["lastbeat"] = ""
-        d["state"] = ""
-        d["statetime"] = ""
-        d["deltastatetime"] = ""
-        d["rttstate"] = ""
-        return d
-
-    def statedict(self, Null=False):
-        d = self.clearstate()
-        now = time.time()
-        if not Null:
-            d["addr"] = self.addr
-            if self.rtts[-1]:
-                d["rtt"] = "%0.1f" % self.rtts[-1]
-            elif self.state == Connection.UNKNOWN:
-                d["rtt"] = ""
-            else:
-                d["rtt"] = "?"
-            d["lastbeat"] = self.lastbeat
-            if self.state == Connection.OVERDUE:
-                d["state"] = "<b>%s</b>" % self.state
-            else:
-                d["state"] = self.state
-            if self.state == Connection.UP:
-                d["rttstate"] = d["rtt"]
-            elif self.state == Connection.OVERDUE:
-                d["rttstate"] = ""
-            else:
-                d["rttstate"] = d["state"]
-            d["statetime"] = time.strftime(
-                "%Y-%m-%d %H:%M:%S", time.localtime(self.statetime)
-            )
-            delta = now - self.statetime
-
-            if self.state == Connection.UNKNOWN:
-                d["deltastatetime"] = ""
-            elif delta > 86400:
-                # d['deltastatetime'] = time.strftime("%d %H:%M:%S", time.gmtime(delta))
-                d["deltastatetime"] = "%0.1f days" % (delta / 86400.0)
-            elif delta > 3600:
-                # 	d['deltastatetime'] = time.strftime("%H:%M:%S", time.gmtime(delta))
-                d["deltastatetime"] = time.strftime("%k:%M hrs", time.gmtime(delta))
-            # 		d['deltastatetime'] = "%0.1f hrs" % (delta / 3600.)
-            elif delta > 60:
-                # 	d['deltastatetime'] = time.strftime("%M:%S", time.gmtime(delta))
-                d["deltastatetime"] = time.strftime("%M:%S mins", time.gmtime(delta))
-            # 		d['deltastatetime'] = "%0.1f mins" % (delta / 60.)
-            else:
-                # 	d['deltastatetime'] = time.strftime("%S", time.gmtime(delta))
-                d["deltastatetime"] = "%i secs" % (delta)
-        if self.state == Connection.UNKNOWN and now - self.lastbeat > 86400 * 10:
-            d = self.clearstate()
-
-        return d
-
-    def headerdict(self, afam):
-        return {  # column titles, keyed like clearstate(); afam prefixes "Addr"
-            "addr": "%s Addr" % afam,
-            "rtt": "Latency",  # fixed spelling, was "Latencey"
-            "lastbeat": "Last Contact",
-            "state": "State",
-            "statetime": "Last State",
-            "rttstate": "Reach",
-            "deltastatetime": "Last State",
-        }
-
-    def jsons(self):  # JSON of the instance dict; the host back-pointer becomes its name
-        return json.dumps(dict(self.__dict__, host=self.host.name if self.host else None))
-
-    # set new state, return number of secs in previous state
-    def newstate(self, state, now, when=0):
-        self.state = state
-        delta = now - when
-        s = delta - self.statetime
-        self.statetime = delta
-        return s
-
-    def getstate(self):
-        return self.state
-
-    def newaddr(self, addr, rtt, now):
-        self.lastbeat = now
-        self.rtts.append(rtt)
-        if len(self.rtts) > MAXRTTS:
-            del self.rtts[0]
-
-        if self.addr == addr:
-            r = None
-        else:
-            r = "changed from %s to %s" % (self.addr, addr)
-            try:
-                del Connection.htab[self.addr]
-            except Exception:
-                pass
-            self.addr = addr
-            Connection.htab[addr] = self.host.name
-            if self.host.isDynDns():
-                Host.dnsQ.put((self.host.name, self.addr))
-        return r
-
-
-#
-class Host:
-    # Table of Hosts
-    hosts = {}
-    dnsQ = queue.Queue()
-
-    def __init__(self, name):
-        global num
-        self.name = name
-        if name:
-            num += 1
-            Host.hosts[name] = self
-        self.num = num
-        self.dyn = False
-        self.watched = False
-        self.upcount = 0
-        self.interval = 0
-        self.doesack = -1
-        self.cmds = []
-        self.cver = 0
-        self.connections = {}
-        self.hdwcounts = [[0, 0], [0, 0], [0, 0]]
-
-    def statedict(self):
-        d = {}
-        d["name"] = self.name
-        if self.dyn:
-            d["name"] += "*"
-        if self.watched:
-            d["name"] = "<b>%s</b>" % d["name"]
-        d["dyn"] = str(self.dyn)
-        d["ver"] = str(self.cver)
-        d["num"] = self.num
-        for c in ["IPv4", "IPv6"]:
-            if c in self.connections:
-                cs = self.connections[c].statedict()
-            else:
-                cs = ubConnection.statedict(True)
-            for csv in cs:
-                d["%s.%s" % (c, csv)] = cs[csv]
-
-        return d
-
-    def headerdict(self):
-        d = {}
-        d["name"] = "Name"
-        d["dyn"] = "Dyn"
-        d["ver"] = "Ver"
-        d["num"] = "??"
-        for c in ["IPv4", "IPv6"]:
-            cs = ubConnection.headerdict(c)
-            for csv in cs:
-                d["%s.%s" % (c, csv)] = cs[csv]
-        return d
-
-    def registerDns(self):
-        for af in self.connections:
-            self.connections[af].registerDns()
-
-    def stateinfo(self):
-        ddict = {}
-        for d in self.__dict__:
-            if d == "connections":
-                cl = []
-                for c in ["IPv4", "IPv6"]:
-                    if c not in self.connections:
-                        continue
-                    # dirty ugly hack: fix conn to host backpointer
-                    cld = copy.deepcopy(self.connections[c].__dict__)
-                    cld["host"] = cld["host"].name
-                    cl.append(cld)
-                ddict[d] = cl
-            else:
-                ddict[d] = self.__dict__[d]
-        return ddict
-
-    def jsons(self):
-        return json.dumps(self.stateinfo())
-
-    def setcver(self, cver):
-        self.cver = cver
-
-    def isDynDns(self):
-        return self.dyn
-
-    def isIPv4(self, addr):
-        if isinstance(addr, tuple):
-            return addr[0].find(".") > 0
-        else:
-            return addr.find(".") > 0
-
-    def conndata(self, cid, addr, rtt, now):
-        if addr[0:7] == "::ffff:":
-            addr = addr[7:]
-        if self.isIPv4(addr):
-            afam = "IPv4"
-        else:
-            afam = "IPv6"
-
-        if afam not in self.connections:
-            self.connections[afam] = Connection(self, cid, addr, afam)
-
-        conn = self.connections[afam]
-        res = conn.newaddr(addr, rtt, now)
-        return conn, res
-
-    # called when reloading class from pickle, add new fields here
-    def fixup(self):
-        for c in ["IPv4", "IPv6"]:
-            if c in self.connections:
-                addr = self.connections[c].addr
-                if addr[0:7] == "::ffff:":
-                    addr = addr[7:]
-                self.connections[c].addr = addr
-
-        pass
-
-    # def dispstate(self):
-    #    if self.state in ["down", "overdue"]:
-    #        state = "<b>%s</b>" % self.state
-    #    elif self.state in ["up", "UP"]:
-    #        state = ""
-    #        for x in list(self.connections.keys()):
-    #            try:
-    #                state += " %5.1f" % (self.connections[x].rtts[-1])
-    #            except:
-    #                state += " %5s" % (self.connections[x].rtts[-1])
-    #    elif self.state in ["unknown", "UNKNOWN"]:
-    #        state = ""
-    #    else:
-    #        state = "%s" % self.state
-    #    return state
-
-    def dispstats(self):
-        """Return three <td> cells computed from doesack/upcount against each hdwcounts pair."""
-        if self.doesack == -1:
-            # fixed: the closing tags were scrambled ("<td></td<td></td>>")
-            return '<td align="right">N/A</td><td></td><td></td>'
-        if self.upcount <= 0:
-            return "<td>(%s)</td><td></td><td></td>" % (self.doesack)
-        r = ""
-        for v in range(3):
-            a, u = self.hdwcounts[v]
-            span = self.upcount - u
-            if span != 0:
-                vs = "%0.0f" % (100.0 - (((self.doesack - a) * 100.0) / span))
-                if vs == "0":
-                    vs = ""
-            else:
-                vs = "-"
-            r += '<td align="right">%s</td>' % vs
-        return r
-
-    hostfields_long = [
-        "name",
-        "IPv4.addr",
-        "IPv4.state",
-        ("IPv4.rtt", 'style="text-align: right;"'),
-        ("IPv4.statetime", 'style="text-align: right;"'),
-        "IPv6.addr",
-        "IPv6.state",
-        ("IPv6.rtt", 'style="text-align: right;"'),
-        ("IPv6.statetime", 'style="text-align: right;"'),
-        "ver",
-    ]
-
-    hostfields_short = [
-        "name",
-        ("IPv4.rttstate", 'style="text-align: right;"'),
-        ("IPv4.deltastatetime", 'style="text-align: right;"'),
-        ("IPv6.rttstate", 'style="text-align: right;"'),
-        ("IPv6.deltastatetime", 'style="text-align: right;"'),
-    ]
-
-    def gene(self, tag, v, attrib=None):
-        if attrib:
-            a = " %s" % attrib
-        else:
-            a = ""
-        return "<%s%s>%s</%s>" % (tag, a, v, tag)
-
-    def htmltable(self, tag, hd, short):
-        if short:
-            hostfields = Host.hostfields_short
-        else:
-            hostfields = Host.hostfields_long
-        h = []
-        for f in hostfields:
-            if isinstance(f, tuple):
-                h.append(self.gene(tag, hd[f[0]], f[1]))
-            else:
-                h.append(self.gene(tag, hd[f]))
-        return self.gene("tr", "\n".join(h))
-
-    def buildhosttable(self, short=False):
-        if DEBUG > 1:
-            print("DBG buildhosttable: start")
-        res = []
-        res.append('<table id="ntable" class="sortable">')
-        res.append(ubHost.htmltable("th", ubHost.headerdict(), short))
-        hosts_sorted = list(Host.hosts.keys())
-        if len(hosts_sorted):
-            hosts_sorted.sort()
-            for h in hosts_sorted:
-                res.append(ubHost.htmltable("td", Host.hosts[h].statedict(), short))
-        res.append("</table>")
-        if DEBUG > 1:
-            print("DBG buildhosttable: %s" % res)
-        return res
-
-    def buildmsgtable(self, msgs):
-        res = []
-        le = max(40 - len(Host.hosts), 3)
-        res.append("<h4>Log of Events</h4>")
-        for m in msgs[len(msgs) - le :]:
-            res.append("%s<BR>" % m)
-        return res
-
-
-# create fake "unbound objects", remove in Python 3.0
-ubHost = Host(None)
-ubConnection = Connection(None, "", "", "")
@@ -1,221 +0,0 @@
-"""HTTP server implementation using aiohttp and jinja2."""
-
-import asyncio
-import json
-import time
-import urllib.parse
-import os
-import logging
-from aiohttp import web
-import jinja2
-
-logger = logging.getLogger(__name__)
-
-
-def _render_template(html_str: str, **context) -> str:
-    tmpl = jinja2.Template(html_str)
-    return tmpl.render(**context)
-
-
-async def start(
-    host: str,
-    port: int,
-    config,
-    hbdclass,
-    msgs_getter,
-    log=None,
-    email=None,
-    pushmsg=None,
-    msg_to_websockets=None,
-    tcss=None,
-    DEBUG=0,
-    verbose=False,
-    get_now=None,
-    VER="",
-):
-    """Start an aiohttp web server and block until cancelled.
-
-    This function is intended to be awaited inside the main asyncio event loop.
-    """
-    get_now = get_now or (lambda: time.time())
-
-    async def index(request):
-        res = []
-        res.append('<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">')
-        res.append("<html>")
-        res.append("<head>")
-        res.append("<title>Heartbeat</title>")
-        if tcss:
-            res.append(tcss)
-        res.append("</head>")
-        res.append('<body BGCOLOR = "#FFFFFF" LINK = "#008000" VLINK = "#008000">')
-        res.append(f"<H2>Heartbeat status {VER}</h2>")
-        res += hbdclass.ubHost.buildhosttable()
-        res += hbdclass.ubHost.buildmsgtable(msgs_getter())
-        res.append(
-            "<p> %s (%s)</p>"
-            % (
-                time.strftime("%H:%M:%S", time.localtime(get_now())),
-                config.get("tz", "CET-1CDT"),
-            )
-        )
-        res.append("</body></html>")
-        body = "\n".join(res)
-        return web.Response(text=body, content_type="text/html")
-
-    async def api_hosts(request):
-        lst = [hbdclass.Host.hosts[h].jsons() for h in hbdclass.Host.hosts]
-        return web.json_response(json.loads("[" + ",".join(lst) + "]"))
-
-    async def api_messages(request):
-        lst = msgs_getter()[-30:]
-        return web.json_response(lst)
-
-    async def cmd(request):
-        qa = request.rel_url.query
-        uname = qa.get("h")
-        ucmd = qa.get("c")
-        if not ucmd or not uname:
-            return web.Response(status=400, text="need h= and c= arguments")
-        if uname not in hbdclass.Host.hosts:
-            return web.Response(status=400, text=f"h={uname} not found")
-        hbdclass.Host.hosts[uname].cmds.append(
-            ("CMD", {"cmd": urllib.parse.unquote(ucmd)})
-        )
-        return web.Response(text=f"cmd {uname} queued")
-
-    async def drop(request):
-        qa = request.rel_url.query
-        uname = qa.get("h")
-        if not uname:
-            return web.Response(status=400, text="need h= argument")
-        if uname not in hbdclass.Host.hosts:
-            return web.Response(status=400, text=f"h={uname} not found")
-        if log:
-            log(uname, "dropped")
-        del hbdclass.Host.hosts[uname]
-        return web.Response(text="Done")
-
-    async def register(request):
-        qa = request.rel_url.query
-        uname = qa.get("h")
-        if not uname:
-            return web.Response(status=400, text="need h= argument")
-        if uname not in hbdclass.Host.hosts:
-            return web.Response(status=400, text=f"h={uname} not found")
-        ll = hbdclass.Host.hosts[uname].registerDns()
-        if log:
-            log(uname, ll)
-        return web.Response(text=str(ll))
-
-    async def update(request):
-        qa = request.rel_url.query
-        uname = urllib.parse.unquote(qa.get("h", ""))
-        ucode = qa.get("c")
-        if not ucode or not uname:
-            return web.Response(status=400, text="need h= and c= arguments")
-        if uname != "All" and uname not in hbdclass.Host.hosts:
-            return web.Response(status=400, text=f"h={uname} not found")
-        if uname != "All":
-            names = [uname]
-        else:
-            names = [n for n in hbdclass.Host.hosts if hbdclass.Host.hosts[n].cver >= 2]
-        out = []
-        for n in names:
-            err = None
-            try:
-                r = {"csum": None, "code": ucode}
-                hbdclass.Host.hosts[n].cmds.append(("UPD", r))
-            except Exception as e:
-                err = str(e)
-            out.append(f"update started for {n}: {err if err else 'OK'}")
-        return web.Response(text="\n".join(out))
-
-    async def restart(request):
-        # signal main application to perform restart if needed
-        # not implemented here - return OK
-        if log:
-            log(None, "restart request")
-        return web.Response(text="restart request")
-
-    async def live(request):
-        """Render live.html from templates_dir with hosts, last 30 messages and the ws URL."""
-        pkg_dir = os.path.dirname(__file__)
-        templates_dir = config.get("templates_dir", os.path.join(pkg_dir, "templates"))
-        env = jinja2.Environment(loader=jinja2.FileSystemLoader(templates_dir))
-        extra_scripts = config.get("http_extra_scripts", "")
-        # The websocket URL reuses the host name from this request; the former
-        # config "hb_host" lookup was a dead store, overwritten before any use.
-        host = request.host.split(":")[0]
-        if config.get("wss_port"):
-            heartbeat_ws_url = f"wss://{host}:{config['wss_port']}/hbd"
-        else:
-            heartbeat_ws_url = f"ws://{host}:{config.get('ws_port', 50005)}/hbd"
-        tmpl = env.get_template("live.html")
-        body = tmpl.render(
-            title="Heartbeat",
-            header="Heartbeat",
-            request=request,
-            heartbeat_ws_url=heartbeat_ws_url,
-            extra_scripts=extra_scripts,
-            hosts=[
-                hbdclass.Host.hosts[h].stateinfo() for h in sorted(hbdclass.Host.hosts)
-            ],
-            messages=msgs_getter()[-30:],
-        )
-        return web.Response(text=body, content_type="text/html")
-
-    async def static(request):
-        """Serve files from the package static directory.
-
-        URL form: /static/<path>
-        """
-        p = request.match_info.get("path", "")
-        logger.debug("static file requested: %s", p)
-        base = os.path.abspath(os.path.join(os.path.dirname(__file__), "static"))
-        # normalize and prevent directory traversal
-        target = os.path.abspath(os.path.normpath(os.path.join(base, p)))
-        if not target.startswith(base + os.sep) and target != base:
-            return web.Response(status=403, text="Forbidden")
-        if not os.path.exists(target) or not os.path.isfile(target):
-            return web.Response(status=404, text="Not Found")
-        logger.info("serving static file: %s", target)
-        return web.FileResponse(path=target)
-
-    async def favicon(request):
-        """Serve favicon.ico from the package static directory."""
-        base = os.path.abspath(os.path.join(os.path.dirname(__file__), "static/images"))
-        target = os.path.join(base, "favicon.ico")
-        if not os.path.exists(target) or not os.path.isfile(target):
-            return web.Response(status=404, text="Not Found")
-        return web.FileResponse(path=target)
-
-    app = web.Application()
-    app.add_routes(
-        [
-            web.get("/", index),
-            web.get("/api/0/hosts", api_hosts),
-            web.get("/api/0/messages", api_messages),
-            web.get("/c", cmd),
-            web.get("/d", drop),
-            web.get("/n", register),
-            web.get("/u", update),
-            web.get("/r", restart),
-            web.get("/live", live),
-            web.get("/static/{path:.*}", static),
-            web.get("/favicon.ico", favicon),
-        ]
-    )
-
-    runner = web.AppRunner(app)
-    await runner.setup()
-    site = web.TCPSite(runner, host, port)
-    await site.start()
-
-    if verbose:
-        print(f"HTTP server started on {host}:{port}")
-
-    try:
-        await asyncio.Future()
-    finally:
-        await runner.cleanup()
@@ -1,50 +0,0 @@
-"""monitor helper and thread for heartbeat daemon."""
-
-from __future__ import annotations
-import asyncio
-import time
-
-DROPOVERDUE = 7 * 24 * 3600
-
-
-def checkoverdue(
-    config: dict,
-    hbdclass,
-    log: callable,
-    pushmsg: callable,
-    msg_to_websockets: callable,
-):
-    now = time.time()
-    for h in list(hbdclass.Host.hosts.keys()):
-        pmsg = []
-        for c in hbdclass.Host.hosts[h].connections:
-            conn = hbdclass.Host.hosts[h].connections[c]
-            if conn.state == hbdclass.Connection.DOWN:
-                continue
-            timeout = hbdclass.Host.hosts[h].interval + config.get("grace", 10)
-            if conn.state == hbdclass.Connection.UP and (now - conn.lastbeat) > timeout:
-                conn.newstate(hbdclass.Connection.OVERDUE, now, config.get("grace", 10))
-                pmsg.append(conn.afam)
-            if (
-                conn.state == hbdclass.Connection.OVERDUE
-                and (now - conn.lastbeat) > DROPOVERDUE
-            ):
-                conn.newstate(hbdclass.Connection.UNKNOWN, conn.lastbeat)
-        if pmsg != []:
-            if h in config.get("watchhosts", []):
-                pushmsg("%s %s overdue" % (h, " and ".join(pmsg)))
-            log(h, "%s overdue" % " and ".join(pmsg))
-            msg_to_websockets("host", hbdclass.Host.hosts[h].stateinfo())
-
-
-async def start(
-    config: dict,
-    hbdclass: callable,
-    log=None,
-    pushmsg=None,
-    msg_to_websockets=None,
-):
-    """Run forever, calling checkoverdue() with the given arguments every 15 seconds."""
-    while True:
-        await asyncio.sleep(15)  # 15 seconds between checks
-        checkoverdue(config, hbdclass, log, pushmsg, msg_to_websockets)
@@ -1,202 +0,0 @@
-"""Notification helpers: email, pushover, mattermost, signal and dispatcher."""
-
-import logging
-from typing import Optional
-import http.client
-import urllib.parse
-import subprocess
-import smtplib
-import time
-
-DEFAULT_PUSHPROVIDERS = ["all", "pushover", "mattermost", "signal"]
-
-# module-level configuration set via setup()
-_config = {}
-logger = logging.getLogger(__name__)
-
-
-def setup(cfg: dict):
-    """Initialize notifier defaults from a configuration dict."""
-    global _config
-    _config = dict(cfg)
-
-
-def send_email(toaddrs, smtpserver, sender, subject, body, debug=0):
-    """Send a plain email via SMTP. Returns True on success."""
-    try:
-        smtpport = _config.get("smtpport", 587)
-        server = smtplib.SMTP(smtpserver, smtpport)
-        if debug > 0:
-            server.set_debuglevel(1)
-        if smtpport == 587:
-            server.starttls()
-            server.ehlo()
-            smtpuser = _config.get("smtpuser", None)
-            smtppassword = _config.get("smtppassword", None)
-            if smtpuser and smtppassword:
-                server.login(smtpuser, smtppassword)
-        server.sendmail(sender, toaddrs, body)
-    except Exception as e:
-        logger.warning("email send failed: %s", e)
-        try:
-            server.quit()
-        except Exception:
-            pass
-        return False
-    try:
-        server.quit()
-    except Exception:
-        pass
-    return True
-
-
-def email(subject: str, msg: str, debug: int = 0) -> bool:
-    """Convenience wrapper exposed to the rest of the application.
-
-    Uses module-level configuration to supply recipient list, smtp server
-    and sender address.
-    """
-    toaddrs = _config.get("toemail")
-    fromemail = _config.get("fromemail")
-    smtpserver = _config.get("smtpserver")
-    if not toaddrs or not fromemail or not smtpserver:
-        logger.warning(
-            "email config incomplete: toemail=%s, fromemail=%s, smtpserver=%s",
-            toaddrs,
-            fromemail,
-            smtpserver,
-        )
-        return False
-    date = time.strftime("%a, %d %b %Y %H:%M:%S %z", time.localtime())
-    body = "To: %s\nFrom: %s\nSubject: %s\nDate: %s\n\n%s" % (
-        toaddrs[0] if toaddrs else "",
-        fromemail,
-        subject,
-        date,
-        msg,
-    )
-    return send_email(toaddrs, smtpserver, fromemail, subject, body, debug=debug)
-
-
-def pushover(token: str, user: str, msg: str, debug: int = 0) -> bool:
-    """Send msg via the Pushover API; return True only when it answers HTTP 200."""
-    form = urllib.parse.urlencode({"token": token, "user": user, "message": msg})
-    headers = {"Content-type": "application/x-www-form-urlencoded"}
-    conn = http.client.HTTPSConnection("api.pushover.net:443")
-    try:
-        conn.request("POST", "/1/messages.json", form, headers)
-        r = conn.getresponse()
-        logger.debug("pushover response: %s %s", r.status, r.reason)
-        return r.status == 200
-    except Exception as e:
-        logger.error("pushover error: %s", e)
-        return False
-    finally:
-        # always release the socket; previously it was never closed
-        conn.close()
-
-
-def pushmattermost(
-    host: str,
-    token: str,
-    channel: str,
-    msg: str,
-    username: str = "hbd",
-    icon: Optional[str] = None,
-    debug: int = 0,
-) -> bool:
-    """Send a message to Mattermost via simple webhook driver if available.
-
-    This helper tries to import mattermostdriver.Driver and uses webhooks if present.
-    If the import fails it returns False.
-    """
-    try:
-        from mattermostdriver import Driver
-    except Exception:
-        return False
-    ses = {"url": host, "scheme": "http", "basepath": "/api/v4", "port": 8065}
-    mm = Driver(ses)
-    payload = {"text": msg, "channel": channel, "username": username}
-    if icon:
-        payload["icon_url"] = icon
-    try:
-        rc = mm.webhooks.call_webhook(token, payload)
-        logger.debug("mattermost rc: %s", rc)
-        return bool(rc is None or rc == "")
-    except Exception as e:
-        logger.error("mattermost error: %s", e)
-        return False
-
-
-def pushsignal(
-    signal_cli_bin: str, user: str, recipient: str, msg: str, debug: int = 0
-) -> bool:
-    """Send a message via signal-cli (requires local installation).
-
-    Runs signal-cli through subprocess. Returns True if it exited with status 0.
-    """
-    CLI = [signal_cli_bin, "-u", user, "send", "-m", msg, recipient]
-    logger.debug("signal cli: %s", CLI)
-    try:
-        res = subprocess.run(CLI, capture_output=True)
-        if res.returncode != 0:
-            logger.error("signal failed: %s", res.stderr.decode())  # was "%s".res -> AttributeError
-            return False
-        logger.debug("signal sent: %s", res.stdout.decode())
-        return True
-    except Exception as e:
-        logger.exception("signal exception: %s", e)
-        return False
-
-
-def pushmsg(cfg: dict, msg: str, debug: int = 0):
-    """Dispatch push notifications according to `cfg['pushsrv']`.
-
-    cfg is expected to contain keys for different services when needed, e.g.
-    - cfg['pushsrv'] : one of 'all', 'pushover', 'mattermost', 'signal', 'email'
-    - cfg['pushover_token'], cfg['pushover_user']
-    - cfg['matter_host'], cfg['matter_token'], cfg['matter_channel']
-    - cfg['signal_cli'], cfg['signal_user'], cfg['signal_recipient']
-
-    Returns a dict of results per provider.
-    """
-    results = {}
-    p = cfg.get("pushsrv", "pushover")
-    if p in ("all", "pushover"):
-        ok = pushover(
-            cfg.get("pushover_token", ""),
-            cfg.get("pushover_user", ""),
-            msg,
-            debug=debug,
-        )
-        results["pushover"] = ok
-    if p in ("all", "mattermost"):
-        ok = pushmattermost(
-            cfg.get("matter_host", ""),
-            cfg.get("matter_token", ""),
-            cfg.get("matter_channel", ""),
-            msg,
-            username=cfg.get("matter_username", "hbd"),
-            icon=cfg.get("matter_icon"),
-            debug=debug,
-        )
-        results["mattermost"] = ok
-    if p in ("all", "signal"):
-        ok = pushsignal(
-            cfg.get("signal_cli", "/usr/local/bin/signal-cli"),
-            cfg.get("signal_user", ""),
-            cfg.get("signal_recipient", ""),
-            msg,
-            debug=debug,
-        )
-        results["signal"] = ok
-    if p in ("all", "email"):
-        ok = email("Heartbeat notification", msg, debug=debug)
-        results["email"] = ok
-    logger.debug("push results: %s", results)
-    return results
-
-
-def pushmsg_from_config(msg: str, debug: int = 0) -> dict:
-    """Use the module-level configuration dict to dispatch a push message."""
-    return pushmsg(_config, msg, debug=debug)
@@ -1,82 +0,0 @@
-"""Message encoding/decoding utilities for hbd protocol."""
-
-from typing import Dict, Any
-import zlib
-
-
-def dicttos(ID: str, d: Dict[str, Any], compress: bool = False):
-    """Serialize a dict to protocol message bytes.
-
-    If compress is True, the payload is zlib-compressed and the message is
-    prefixed with `!ID:` as the original script did. Otherwise the format is
-    `ID:key=value;...` (bytes).
-    """
-    s = []
-    for k in d:
-        v = d[k]
-        if isinstance(v, float):
-            s.append(f"{k}={v:0.5f}")
-        else:
-            s.append(f"{k}={v}")
-    pk = ";".join(s)
-    if compress:
-        zpk = zlib.compress(pk.encode(), 6)
-        hdr = ("!" + ID + ":").encode()
-        return hdr + zpk
-    else:
-        return (ID + ":" + pk).encode()
-
-
-def stodict(msg: bytes):
-    """Deserialize a protocol message into a dict.
-
-    Messages starting with '!' carry a zlib-compressed payload after the
-    'ID:' header; all others are plain 'ID:key=value;...'. Returns a dict
-    with 'ID' plus the parsed pairs, or {} when the message is malformed.
-    """
-    d = {}
-    if len(msg) > 0 and chr(msg[0]) == "!":
-        # message is: b'!ID:' + compressed_payload; split at the first colon
-        # so the ID may be any length (it used to be hard-coded to 3 bytes)
-        try:
-            hdr, zpk = msg[1:].split(b":", 1)
-            pk = zlib.decompress(zpk).decode()
-            d["ID"] = hdr.decode()
-        except Exception:
-            return {}
-    else:
-        try:
-            r0 = msg.split(b":", 1)
-            pk = r0[1].decode()
-            d["ID"] = r0[0].decode()
-        except Exception:
-            return {}
-    if not pk:
-        return d
-    parts = pk.split(";")
-    for v in parts:
-        if not v:
-            continue
-        vr = v.split("=", 1)
-        k = vr[0].strip()
-        if len(vr) == 1:
-            d[k] = None
-        else:
-            val = vr[1].strip()
-            if val and val[0].isdigit():
-                try:
-                    val_e = eval(val)  # SECURITY: eval() of message text; prefer ast.literal_eval
-                except Exception:
-                    val_e = val
-                d[k] = val_e
-            else:
-                d[k] = val
-    return d
-
-
-def oldmtodict(msg: bytes):
-    """Compatibility wrapper for old-style messages (no ID prefix).
-
-    The original implementation prefixed with 'HTB:' and called stodict.
-    """
-    return stodict(b"HTB:" + msg)
@@ -1,370 +0,0 @@
-"""Server runtime: starts UDP listener, HTTP server and websocket stubs."""
-
-import asyncio
-import logging
-import socket
-import time
-import signal
-import sys
-import ssl
-from . import __version__
-
-from . import udp
-from . import hbdclass
-
-from . import ws as ws_mod
-
-logger = logging.getLogger(__name__)
-msg_to_websockets = ws_mod.broadcast
-
-logf = None
-lastfm = ["", "", ""]
-
-# shared runtime collections and helpers
-msgs = []
-
-
-def initlog(logfile):
-    try:
-        return open(logfile, "a+")
-    except Exception as e:
-        import sys
-
-        print("cannot open loffile %s, using STDERR: %s" % (logfile, e))
-        return sys.stderr
-
-
-def log(host, m, service=None):
-    ts = time.time()
-    s = f"{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ts))} {host or ''} {m}"
-    msgs.append(s)
-    logger.info(s)
-    if logf:
-        try:
-            logf.write(s + "\n")
-            logf.flush()
-        except Exception as e:
-            logger.warning("failed to write to logfile: %s", e)
-    msg_to_websockets("message", s)
-
-
-def cleanup_function(config):
-    """This function will be executed upon program exit."""
-    logger.info("Running cleanup function...")
-    import pickle
-
-    pickfile = config.get("pickfile", "hbd.pickle")
-
-    pickf = open(pickfile, "wb")
-    pick = pickle.Pickler(pickf)
-    pick.dump(hbdclass.Host.hosts)
-    pick.dump(msgs)
-    pick.dump(lastfm)
-    pickf.close()
-
-    logger.info("Cleanup complete.")
-
-
-async def _run_async(config):
-    loop = asyncio.get_running_loop()
-    shutdown_event = asyncio.Event()
-
-    # Signal handlers for graceful shutdown
-    def signal_handler(signum, frame):
-        sig_name = signal.Signals(signum).name if hasattr(signal, "Signals") else signum
-        logger.info(f"Received {sig_name}, initiating shutdown...")
-        loop.call_soon_threadsafe(shutdown_event.set)
-
-    # Register signal handlers
-    loop.add_signal_handler(signal.SIGINT, signal_handler, signal.SIGINT, None)
-    loop.add_signal_handler(signal.SIGTERM, signal_handler, signal.SIGTERM, None)
-
-    from . import http as http_mod
-    from . import dns as dns_mod
-    from . import notify as notify_mod
-    from . import monitor as monitor_mod
-
-    notify_mod.setup(config)
-
-    pushmsg = notify_mod.pushmsg_from_config
-
-    sock = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM)
-    # Disable IPV6_V6ONLY option to enable dual-stack (listen on IPv4 as well)
-    # This option is system-dependent; on many systems, setting it to False enables
-    # the socket to handle both IPv4 and IPv6 traffic.
-    try:
-        sock.setsockopt(socket.IPPROTO_IPV6, socket.IPV6_V6ONLY, False)
-    except OSError as e:
-        logger.warning(
-            f"Warning: Could not reset IPV6_V6ONLY not supported or dual-stack is unavailable. Error: {e}"
-        )
-
-    # 3. Bind to all interfaces (::) on a specific port
-
-    # UDP server endpoint (handler wired to handle_datagram with context)
-    bind_addr = ("::", config.get("hb_port", 50003))
-    sock.bind(bind_addr)
-    logger.info("Starting UDP server on %s:%s", *bind_addr)
-
-    def udp_handler(msg, addr, transport):
-        ctx = dict(
-            config=config,
-            hbdclass=hbdclass,
-            log=log,
-            pushmsg=pushmsg,
-            msg_to_websockets=msg_to_websockets,
-            DEBUG=config.get("debug", 0),
-            verbose=config.get("verbose", False),
-        )
-        udp.handle_datagram(msg, addr, transport, ctx)
-
-    transport, protocol = await loop.create_datagram_endpoint(
-        lambda: udp.EchoServerProtocol(config=config, handler=udp_handler),
-        sock=sock,
-    )
-
-    # HTTP server (asyncio-based via aiohttp)
-    try:
-        http_task = asyncio.create_task(
-            http_mod.start(
-                host=config.get("hbd_host", ""),
-                port=config.get("hbd_port", 50004),
-                config=config,
-                hbdclass=hbdclass,
-                msgs_getter=lambda: msgs,
-                log=log,
-                pushmsg=pushmsg,
-                msg_to_websockets=msg_to_websockets,
-                tcss=None,
-                DEBUG=config.get("debug", 0),
-                verbose=config.get("verbose", False),
-                get_now=lambda: time.time(),
-                VER="",
-            )
-        )
-        logger.info(
-            "HTTP server started on %s:%s",
-            config.get("hbd_host", ""),
-            config.get("hbd_port", 50004),
-        )
-    except Exception as e:
-        logger.exception("failed to start HTTP server: %s", e)
-
-    # start dns update worker (async)
-    dns_task = None
-    try:
-        dns_task = dns_mod.start_dns_worker(
-            hbdclass, config, log=log, pushmsg=pushmsg, loop=loop
-        )
-        logger.info("dns update worker started")
-    except Exception as e:
-        logger.exception("dns worker failed to start: %s", e)
-
-    # Start the websocket servers as a background task
-    if config.get("wss_port", None):
-        ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
-        ssl_path = config.get("cert_path", "")
-        wss_pem = ssl_path + config.get("wss_pem", "")
-        wss_key = ssl_path + config.get("wss_key", "")
-        try:
-            ssl_context.load_cert_chain(wss_pem, keyfile=wss_key)
-        except FileNotFoundError:
-            logger.error("error: missing SSL keys %s or %s", wss_pem, wss_key)
-            sys.exit(1)
-        logger.info(
-            "Starting secure WebSocket server on port %s with cert %s",
-            config.get("wss_port", None),
-            wss_pem,
-        )
-    else:
-        ssl_context = None
-
-    try:
-        ws_task = asyncio.create_task(
-            ws_mod.start(
-                host=config.get("hbd_host", ""),
-                ws_port=config.get("ws_port", None),
-                wss_port=config.get("wss_port", None),
-                ssl_context=ssl_context,
-                get_hosts=lambda: [
-                    hbdclass.Host.hosts[h].stateinfo()
-                    for h in sorted(hbdclass.Host.hosts)
-                ],
-                get_msgs=lambda: msgs,
-                verbose=config.get("verbose", False),
-            )
-        )
-        logger.info("WebSocket task started")
-    except Exception as e:
-        logger.exception("websocket server failed to start: %s", e)
-
-    # Start the monitor thread as a background task
-    try:
-        monitor_task = asyncio.create_task(
-            monitor_mod.start(
-                config=config,
-                hbdclass=hbdclass,
-                log=log,
-                pushmsg=pushmsg,
-                msg_to_websockets=msg_to_websockets,
-            )
-        )
-        logger.info("Monitor task started")
-    except Exception as e:
-        logger.exception("monitor task failed to start: %s", e)
-
-    try:
-        # run forever until shutdown event is set
-        await shutdown_event.wait()
-        logger.info("Shutdown signal received, stopping services...")
-    except Exception as e:
-        logger.exception("Error in main loop: %s", e)
-    finally:
-        # Cancel all running tasks
-        logger.info("Cancelling tasks...")
-        try:
-            transport.close()
-        except Exception as e:
-            logger.warning("Error closing UDP transport: %s", e)
-
-        tasks_to_cancel = [http_task, ws_task, monitor_task]
-        for task in tasks_to_cancel:
-            if task:
-                try:
-                    task.cancel()
-                    logger.debug("Cancelled task: %s", task)
-                except Exception as e:
-                    logger.warning("Error cancelling task: %s", e)
-
-        # Wait for tasks to finish cancellation with timeout
-        remaining_tasks = [t for t in tasks_to_cancel if t]
-        if remaining_tasks:
-            try:
-                await asyncio.wait_for(
-                    asyncio.gather(*remaining_tasks, return_exceptions=True),
-                    timeout=2.0,
-                )
-            except asyncio.TimeoutError:
-                logger.warning("Timeout waiting for tasks to cancel")
-            except Exception as e:
-                logger.debug("Exception during task cancellation: %s", e)
-
-        # Signal DNS worker to exit and await it
-        try:
-            if "dns_task" in locals() and dns_task:
-                try:
-                    hbdclass.Host.dnsQ.put(None)
-                except Exception:
-                    pass
-                try:
-                    await asyncio.wait_for(dns_task, timeout=2.0)
-                    logger.info("DNS worker finished")
-                except asyncio.TimeoutError:
-                    logger.warning("Timeout waiting for DNS worker to finish")
-                    dns_task.cancel()
-                except asyncio.CancelledError:
-                    logger.info("DNS worker was cancelled")
-                except Exception as e:
-                    logger.warning("Error awaiting DNS worker: %s", e)
-                finally:
-                    # Clear queue bridge to release any held references
-                    hbdclass.Host.dnsQ = None
-        except Exception as e:
-            logger.warning("Error stopping DNS worker: %s", e)
-
-        logger.info("All tasks cancelled")
-
-
-def load_pickled_hosts(config, hbdclass):
-    """Load pickled hosts from file, if available."""
-    global lastfm, msgs
-    import os
-    import pickle
-
-    pickfile = config.get("pickfile", "hbd.pickle")
-    dyndnshosts = config.get("dyndnshosts", [])
-    watchhosts = config.get("watchhosts", [])
-    drophosts = config.get("drophosts", [])
-    if 1 and os.path.exists(pickfile):
-        if config.get("verbose", False):
-            logger.info("opening pickls %s", pickfile)
-        pickf = open(pickfile, "rb")
-        pick = pickle.Unpickler(pickf)
-        try:
-            hbdclass.Host.hosts = pick.load()
-            msgs = pick.load()
-            try:
-                lastfm = pick.load()
-            except Exception:
-                lastfm = ["", "", ""]
-            pickf.close()
-        except Exception as e:
-            logger.exception("load pickled failed: %s", e)
-            os.unlink(pickfile)
-        hbdclass.Connection.htab = {}
-        for h in list(hbdclass.Host.hosts.keys()):
-            hbdclass.Host.hosts[h].dyn = h in dyndnshosts
-            hbdclass.Host.hosts[h].watched = h in watchhosts
-            hbdclass.Host.hosts[h].fixup()
-        for h in drophosts:
-            if h in hbdclass.Host.hosts:
-                del hbdclass.Host.hosts[h]
-        if config.get("verbose", False):
-            logger.info("%s pickled hosts loaded", len(hbdclass.Host.hosts))
-    else:
-        if config.get("verbose", False):
-            logger.info("no pickled data")
-
-
-def run(config):
-    """Start the hbd service (blocking).
-
-    Manually manages the event loop to ensure clean shutdown.
-    """
-    global logf
-    import os
-
-    logging.basicConfig(
-        level=logging.DEBUG if config.get("debug", 0) > 0 else logging.INFO
-    )
-    load_pickled_hosts(config, hbdclass)
-
-    logf = initlog(logfile=config.get("logfile", "messages.log"))
-    log(None, f"hbd version {__version__} starting up")
-
-    # Create and set the event loop manually
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-
-    try:
-        loop.run_until_complete(_run_async(config))
-    except KeyboardInterrupt:
-        logger.info("Received KeyboardInterrupt, shutting down...")
-    except Exception as e:
-        logger.exception("Unhandled exception in main: %s", e)
-    finally:
-        cleanup_function(config)
-        logger.info("hbd shutdown complete")
-        if logf and logf != sys.stderr:
-            try:
-                logf.close()
-            except Exception:
-                pass
-        # Explicitly close the loop
-        try:
-            # Cancel all remaining tasks
-            pending = asyncio.all_tasks(loop)
-            for task in pending:
-                task.cancel()
-            # Run one more cycle to process cancellations
-            if pending:
-                loop.run_until_complete(
-                    asyncio.gather(*pending, return_exceptions=True)
-                )
-        except Exception:
-            pass
-        finally:
-            loop.close()
-
-    # Exit
-    os._exit(0)
@@ -0,0 +1,3 @@
+"""HeartBeat Daemon (hbd) - Server/daemon component."""
+
+from hbd import __version__
@@ -0,0 +1,302 @@
+"""Command line interface for hbd package."""
+
+import argparse
+import getpass
+import sys
+
+from .config import load_config
+from .main import run as run_server
+
+PUSHSRVS = ["all", "pushover", "mattermost"]
+
+
+def build_parser():
+    """Build the argument parser for the ``hbd`` command line.
+
+    The top-level parser carries the legacy server flags so that plain
+    ``hbd -c FILE`` (no subcommand) keeps working, and every subcommand
+    repeats the flags it understands.
+
+    argparse parses a subcommand into a fresh namespace and then copies every
+    attribute of it over the top-level namespace.  A duplicated option with
+    an ordinary default would therefore reset a flag given *before* the
+    subcommand (``hbd -c FILE stop`` would end up with ``configfile=None``).
+    The duplicated subcommand options use ``argparse.SUPPRESS`` as default so
+    they only set an attribute when actually given; the top-level parser
+    supplies the defaults otherwise.
+
+    Returns:
+        argparse.ArgumentParser: the configured parser.  ``args.command`` is
+        the selected subcommand name, or ``None`` when none was given.
+    """
+    # Default for options that exist on both the top-level parser and a
+    # subcommand: leave the attribute alone unless the option is present.
+    keep = argparse.SUPPRESS
+
+    parser = argparse.ArgumentParser(
+        prog="hbd",
+        description="HeartBeatDaemon - Wait for heartbeat messages and act on them (or their absence)",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+
+    subparsers = parser.add_subparsers(dest="command")
+
+    # --- serve (default) ---
+    serve_p = subparsers.add_parser("serve", help="Start the hbd server (default)")
+    serve_p.add_argument("-c", "--config", dest="configfile", default=keep,
+                         help="Config file path (YAML)")
+    serve_p.add_argument("-f", "--foreground", action="store_true", default=keep,
+                         help="Run in foreground")
+    serve_p.add_argument("-v", "--verbose", action="store_true", default=keep,
+                         help="Verbose output")
+    serve_p.add_argument("-p", "--pushsrv", dest="pushsrv", choices=PUSHSRVS, default=keep,
+                         help="Push service to use")
+    serve_p.add_argument("-x", "--debug", action="count", default=keep,
+                         help="Increase debug level")
+
+    # Legacy top-level flags (no subcommand) — kept for backward compatibility.
+    # These also provide the defaults for the suppressed subcommand options.
+    parser.add_argument("-c", "--config", dest="configfile", help="Config file path (YAML)")
+    parser.add_argument("-f", "--foreground", action="store_true", help="Run in foreground")
+    parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output")
+    parser.add_argument("-p", "--pushsrv", dest="pushsrv", choices=PUSHSRVS,
+                        help="Push service to use")
+    parser.add_argument("-x", "--debug", action="count", default=0, help="Increase debug level")
+
+    # --- passwd ---
+    passwd_p = subparsers.add_parser(
+        "passwd",
+        help="Generate a password hash for use in the config file",
+    )
+    passwd_p.add_argument(
+        "username",
+        nargs="?",
+        help="Username (informational only, for display)",
+    )
+
+    # --- notify ---
+    notify_p = subparsers.add_parser(
+        "notify",
+        help="Send a test message via a configured notification channel",
+    )
+    notify_p.add_argument("-c", "--config", dest="configfile", default=keep,
+                          help="Config file path (YAML)")
+    notify_p.add_argument(
+        "channel",
+        help="Channel name as defined in notification_channels",
+    )
+    notify_p.add_argument(
+        "message",
+        nargs="?",
+        default="Test notification from hbd",
+        help="Message body (default: 'Test notification from hbd')",
+    )
+    notify_p.add_argument(
+        "--level",
+        default="WARNING",
+        choices=["INFO", "WARNING", "CRITICAL", "RECOVER"],
+        help="Notification level (default: WARNING)",
+    )
+    notify_p.add_argument(
+        "--title",
+        default=None,
+        help="Notification title (default: '[LEVEL] test')",
+    )
+
+    # --- stop ---
+    stop_p = subparsers.add_parser("stop", help="Stop the running hbd instance")
+    stop_p.add_argument("-c", "--config", dest="configfile", default=keep,
+                        help="Config file path (YAML)")
+
+    # --- reload ---
+    reload_p = subparsers.add_parser("reload", help="Reload configuration (SIGHUP)")
+    reload_p.add_argument("-c", "--config", dest="configfile", default=keep,
+                          help="Config file path (YAML)")
+
+    # --- restart ---
+    restart_p = subparsers.add_parser("restart", help="Restart the running hbd instance")
+    restart_p.add_argument("-c", "--config", dest="configfile", default=keep,
+                           help="Config file path (YAML)")
+    restart_p.add_argument("-f", "--foreground", action="store_true", default=keep,
+                           help="Run in foreground after restart")
+    restart_p.add_argument("-v", "--verbose", action="store_true", default=keep,
+                           help="Verbose output after restart")
+
+    return parser
+
+
+def cmd_passwd(args):
+    """Prompt twice for a password and print its hash for the config file.
+
+    Keeps asking until a non-empty password has been entered and confirmed.
+    ``args.username`` is optional and only changes the wording of the prompt
+    and of the printed hint.
+    """
+    from .users import hash_password
+
+    username = args.username or ""
+    if username:
+        prompt = f"New password for {username}: "
+    else:
+        prompt = "New password: "
+
+    pw = None
+    while pw is None:
+        candidate = getpass.getpass(prompt)
+        if not candidate:
+            print("Password must not be empty.", file=sys.stderr)
+        elif candidate != getpass.getpass("Confirm password: "):
+            # The confirmation is only requested for a non-empty first entry.
+            print("Passwords do not match, try again.", file=sys.stderr)
+        else:
+            pw = candidate
+
+    hashed = hash_password(pw)
+    if not username:
+        print("\nPassword hash (paste into config file under the user's 'password' key):")
+    else:
+        print(f"\nAdd the following to your config under users: -> {username}:")
+    print(f"  password: {hashed}")
+
+
+def cmd_notify(args):
+    """Send a test message via a single notification channel.
+
+    Loads the config named by ``args.configfile``, looks up ``args.channel``
+    in its ``notification_channels`` section and sends one notification built
+    from ``args.message``, ``args.level`` and ``args.title``.
+
+    Exits with status 1 when the channel is not configured, its type has no
+    driver, or the driver reports failure.
+    """
+    import asyncio
+    from .notify import (
+        Notification,
+        _DRIVERS,
+        _send_matrix_async,
+        _send_sms_voipms_async,
+        setup,
+    )
+
+    config = load_config(args.configfile)
+    setup(config)
+
+    # A bare ``notification_channels:`` key in the YAML file loads as None;
+    # treat that the same as "no channels configured" instead of crashing.
+    channels = config.get("notification_channels") or {}
+    if args.channel not in channels:
+        available = ", ".join(channels.keys()) if channels else "(none)"
+        print(f"Error: channel '{args.channel}' not found in notification_channels.", file=sys.stderr)
+        print(f"Available channels: {available}", file=sys.stderr)
+        sys.exit(1)
+
+    channel_cfg = channels[args.channel]
+    level = args.level.upper()
+    title = args.title or f"[{level}] test"
+    base_url = config.get("base_url", "").rstrip("/")
+
+    notif = Notification(
+        title=title,
+        body=args.message,
+        level=level,
+        url=f"{base_url}/plugins" if base_url else "",
+    )
+
+    ch_type = channel_cfg.get("type", "")
+    print(f"Sending via {args.channel} ({ch_type}): {title} — {args.message}")
+
+    # matrix and sms_voipms are coroutine senders; every other type is
+    # looked up in the driver table and called synchronously.
+    if ch_type == "matrix":
+        ok = asyncio.run(_send_matrix_async(channel_cfg, notif))
+    elif ch_type == "sms_voipms":
+        ok = asyncio.run(_send_sms_voipms_async(channel_cfg, notif))
+    else:
+        driver = _DRIVERS.get(ch_type)
+        if driver is None:
+            print(f"Error: unknown channel type '{ch_type}'", file=sys.stderr)
+            sys.exit(1)
+        ok = driver(channel_cfg, notif)
+
+    if ok:
+        print("OK")
+    else:
+        print("FAILED — check logs for details", file=sys.stderr)
+        sys.exit(1)
+
+
+def _read_pid(configfile) -> int | None:
+    """Return the PID recorded in the configured pidfile, or None.
+
+    None is returned (after printing a message to stderr) when no pidfile is
+    configured, the file is missing or unreadable, or the recorded process
+    does not answer a signal-0 probe.
+    """
+    import os
+
+    pidfile = load_config(configfile).get("pidfile", "")
+    if not pidfile:
+        print("Error: no pidfile configured.", file=sys.stderr)
+        return None
+
+    try:
+        with open(pidfile) as f:
+            pid = int(f.read().strip())
+        # Signal 0 delivers nothing; it only checks that the process exists.
+        os.kill(pid, 0)
+    except FileNotFoundError:
+        print(f"PID file not found ({pidfile}). Is hbd running?", file=sys.stderr)
+        return None
+    except ProcessLookupError:
+        print(f"PID file exists but process {pid} is not running.", file=sys.stderr)
+        return None
+    except Exception as e:
+        print(f"Error reading pidfile: {e}", file=sys.stderr)
+        return None
+    else:
+        return pid
+
+
+def cmd_stop(args):
+    """Send SIGTERM to the running hbd and wait for it to exit.
+
+    Exits with status 1 when no running instance is found or when the
+    process is still alive after 20 polls spaced 0.5 s apart.
+    """
+    import os
+    import signal as _signal
+    import time
+
+    pid = _read_pid(args.configfile)
+    if pid is None:
+        sys.exit(1)
+
+    print(f"Stopping hbd (pid {pid})...")
+    os.kill(pid, _signal.SIGTERM)
+
+    # Wait up to 10 s for the process to exit
+    polls_left = 20
+    while polls_left > 0:
+        polls_left -= 1
+        time.sleep(0.5)
+        try:
+            os.kill(pid, 0)
+        except ProcessLookupError:
+            print("hbd stopped.")
+            return
+
+    print("Warning: hbd did not stop within 10 seconds.", file=sys.stderr)
+    sys.exit(1)
+
+
+def cmd_reload(args):
+    """Send SIGHUP to the running hbd; exit with status 1 if none is found."""
+    import os
+    import signal as _signal
+
+    target = _read_pid(args.configfile)
+    if target is None:
+        sys.exit(1)
+
+    print(f"Sending SIGHUP to hbd (pid {target})...")
+    os.kill(target, _signal.SIGHUP)
+    print("Reload signal sent.")
+
+
+def cmd_restart(args):
+    """Stop the running hbd (if any) and launch a new ``serve`` process.
+
+    A running instance gets SIGTERM and is polled 20 times, 0.5 s apart;
+    if it is still alive after that the command exits with status 1.  The
+    new server is started with the same config file and the ``-f`` / ``-v``
+    flags given to ``restart``.
+    """
+    import os
+    import signal as _signal
+    import subprocess
+    import time
+
+    pid = _read_pid(args.configfile)
+    if pid is None:
+        print("hbd does not appear to be running — starting fresh.")
+    else:
+        print(f"Stopping hbd (pid {pid})...")
+        os.kill(pid, _signal.SIGTERM)
+        stopped = False
+        for _ in range(20):
+            time.sleep(0.5)
+            try:
+                os.kill(pid, 0)
+            except ProcessLookupError:
+                stopped = True
+                break
+        if not stopped:
+            print("Warning: hbd did not stop within 10 seconds.", file=sys.stderr)
+            sys.exit(1)
+        print("hbd stopped.")
+
+    # Re-launch hbd with the same config
+    in_foreground = getattr(args, "foreground", False)
+    cmd = [sys.executable, "-m", "hbd.server.cli", "serve"]
+    if args.configfile:
+        cmd.extend(["-c", args.configfile])
+    if in_foreground:
+        cmd.append("-f")
+    if getattr(args, "verbose", False):
+        cmd.append("-v")
+
+    if not in_foreground:
+        subprocess.Popen(cmd, start_new_session=True)
+        print("hbd restarted.")
+    else:
+        # Run in foreground — replace current process
+        os.execv(sys.executable, cmd)
+
+
+def main(argv=None):
+    """Entry point: parse *argv* and run the selected subcommand.
+
+    ``passwd``, ``notify``, ``stop``, ``reload`` and ``restart`` are handed
+    to their ``cmd_*`` function.  Anything else (``serve`` or no subcommand)
+    loads the config, applies the command-line overrides and starts the
+    server.
+    """
+    args = build_parser().parse_args(argv)
+
+    handlers = {
+        "passwd": cmd_passwd,
+        "notify": cmd_notify,
+        "stop": cmd_stop,
+        "reload": cmd_reload,
+        "restart": cmd_restart,
+    }
+    handler = handlers.get(args.command)
+    if handler is not None:
+        handler(args)
+        return
+
+    # Default: run the server (supports both `hbd serve ...` and `hbd ...`)
+    config = load_config(args.configfile)
+
+    # Apply CLI overrides; a flag that was not given leaves the file value alone
+    for flag in ("foreground", "verbose"):
+        if getattr(args, flag):
+            config[flag] = True
+    if args.pushsrv:
+        config["pushsrv"] = args.pushsrv
+    if args.debug > 0:
+        config["debug"] = args.debug
+
+    # Pass config_path for reloading support
+    run_server(config, config_path=args.configfile)
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,342 @@
+"""Configuration loader and defaults for hbd (HeartBeat Daemon/Server)."""
+
+import asyncio
+import logging
+import os
+
+try:
+    import yaml
+except Exception:
+    yaml = None
+
+# Baseline server settings.  load_config() starts from these values and lets
+# every top-level key found in the YAML file replace the matching entry
+# (whole-value replacement, not a deep merge).
+SERVER_DEFAULTS = {
+    # Network settings
+    "hb_port": 50003,           # Port to listen for heartbeats
+    "hbd_port": 50004,          # HTTP API port
+    "hbd_host": "",             # Bind address (empty = all interfaces)
+
+    # Persistence
+    "pickfile": os.path.join(os.path.expanduser("~"), ".hb.pick"),  # File to store host state between restarts
+    "pidfile": os.path.join(os.path.expanduser("~"), ".hb.pid"),    # PID file for stop/restart/reload
+
+    # Logging
+    "logfile": os.path.join(os.path.expanduser("~"), ".hb.log"),  # NOTE(review): presumably the message log path — confirm against the logging setup
+    # Notification channels
+    "notification_channels": {},  # Named channels with type and credentials
+    "base_url": "",               # Base URL for notification links (e.g. https://hbd.example.com)
+
+    # Monitoring settings
+    "interval": 20,             # Expected heartbeat interval (for server checks)
+    "grace": 2,                 # Grace multiplier (interval * grace = timeout)
+    "threshold_renotify_interval": 3600,  # Seconds between threshold re-notifications
+
+    # User management
+    "users": {},                # username -> {full_name, avatar, password, admin, notification_channels}
+    "default_owner": None,      # Username that owns hosts with no explicit owner
+
+    # OAuth2 providers
+    "oauth": {},                 # oauth.gitea.{url,client_id,client_secret}
+
+    # Host management
+    "hosts": {},                # Unified host definitions
+    "dyndnshosts": [],          # Hosts with dynamic DNS (legacy)
+    "drophosts": [],            # Hosts to ignore
+    "dyndomains": ["wrede.org"],  # NOTE(review): site-specific domain shipped as a default — confirm this is intended
+    
+    # DNS updates
+    "nsupdate_bin": "/usr/bin/nsupdate",
+    
+    # WebSocket settings
+    "ws_port": 50005,
+    "wss_port": None,
+    "cert_path": "/usr/local/etc/ssl/",
+    "wss_pem": "fullchain.pem",
+    "wss_key": "privkey.pem",
+    
+    # Message journal configuration
+    "journal_enabled": True,
+    "journal_dir": "/var/log/heartbeat",
+    "journal_file": "messages.journal",
+    "journal_max_size": 100 * 1024 * 1024,  # 100MB
+    "journal_max_backups": 10,
+    
+    # Runtime flags
+    "foreground": False,
+    "verbose": False,
+    "debug": 0,
+    
+    # Plugin/threshold configs (for clients reporting to this server)
+    "plugins": {},
+    "thresholds": {},
+}
+
+# Default threshold rules.  Each key directly under "thresholds" is a sibling
+# entry: cpu_monitor, memory_monitor, disk_monitor, rtt, nagios_runner and
+# zfs_monitor.
+THRESHOLD_DEFAULTS = {
+    "thresholds": {
+        "cpu_monitor": {
+            "cpu_percent": {"warning": 80.0, "critical": 90.0},
+        },
+        "memory_monitor": {
+            "percent": {"warning": 85.0, "critical": 95.0},
+        },
+        "disk_monitor": {
+            "partitions": {
+                "/": {
+                    "percent": {"warning": 85.0, "critical": 90.0},
+                },
+            },
+        },
+        "rtt": {
+            "warning": 200,
+            "critical": 250.0,
+            "count": 3,  # Optional: number of consecutive breaches before alerting
+        },
+        "nagios_runner": {
+            "status_code": {
+                "display": "{check_name} {output}",
+                "operator": "nagios",
+            },
+        },
+        "zfs_monitor": {
+            "pools": {
+                "*": {
+                    "status": {
+                        "warning": 1,
+                        "critical": 2,
+                        "operator": ">",
+                        "hysteresis": 0.0,
+                        "display": "ZFS pool {pool_name} is {health}",
+                    },
+                },
+            },
+        },
+    },
+}
+
+
+def load_config(path=None):
+    """Load configuration from a YAML file and merge it over the server defaults.
+
+    Every top-level key in the file replaces the corresponding default; keys
+    unknown to SERVER_DEFAULTS are kept so plugin configs and future
+    extensions pass through.  When the file does not exist, is empty, or
+    PyYAML is not installed, the defaults are returned unchanged.
+
+    Args:
+        path: Path to YAML config file (default: ~/.hb.yaml)
+
+    Returns:
+        A new dictionary with the configuration.  It shares no mutable
+        objects with SERVER_DEFAULTS, so callers may modify it freely.
+
+    Raises:
+        ValueError: if the file's top-level YAML value is not a mapping.
+    """
+    import copy
+
+    # Deep copy: the defaults contain dicts and lists ("hosts", "users",
+    # "dyndomains", ...) that must not be shared between loaded configs.
+    cfg = copy.deepcopy(SERVER_DEFAULTS)
+    if not path:
+        # default path (~/.hb.yaml)
+        path = os.path.join(os.path.expanduser("~"), ".hb.yaml")
+
+    if not os.path.exists(path):
+        return cfg
+    if not yaml:
+        # yaml not installed: do not attempt to parse; user must ensure defaults
+        return cfg
+
+    with open(path) as fh:
+        data = yaml.safe_load(fh)
+
+    if data is None:
+        # An empty file (or one holding only comments) loads as None.
+        return cfg
+    if not isinstance(data, dict):
+        raise ValueError(
+            f"Config file {path} must contain a YAML mapping at top level, "
+            f"got {type(data).__name__}"
+        )
+
+    # Keep all keys from YAML to support plugin configs and future extensions
+    cfg.update(data)
+    return cfg
+
+
+class ReloadableConfig:
+    """Configuration wrapper that supports reloading at runtime.
+
+    This class wraps the configuration dictionary and provides:
+    - ``reload()``, a coroutine that re-reads the config file
+    - read-only, dict-like access (``get``, ``[]``, ``in``, ``keys``,
+      ``items``, ``values``)
+    - an ``asyncio.Lock`` so that concurrent ``reload()`` calls on the same
+      event loop run one at a time
+
+    The lock is an asyncio primitive: it serialises coroutines, not threads.
+    """
+
+    def __init__(self, initial_config, config_path=None):
+        """Initialize with initial configuration.
+
+        Args:
+            initial_config: Initial configuration dictionary
+            config_path: Path to config file for reloading (optional)
+        """
+        self._config = initial_config
+        self._config_path = config_path
+        self._lock = asyncio.Lock()
+        self._logger = logging.getLogger(__name__)
+
+    async def reload(self, config_path=None):
+        """Reload configuration from file.
+
+        The new dictionary replaces the wrapped one only after it has been
+        loaded successfully, so a failed reload leaves the current
+        configuration in place.
+
+        Args:
+            config_path: Path to config file (uses stored path if not provided)
+
+        Returns:
+            New configuration dictionary
+
+        Raises:
+            ValueError: if neither *config_path* nor a stored path is set.
+            Exception: whatever load_config() raises; it is logged and
+                re-raised.
+        """
+        path = config_path or self._config_path
+        if not path:
+            raise ValueError("No config path specified for reload")
+
+        async with self._lock:
+            try:
+                # load_config() reads the file synchronously.
+                new_config = load_config(path)
+            except Exception as e:
+                self._logger.error(
+                    "Failed to reload config from %s: %s", path, e, exc_info=True
+                )
+                # Keep existing config on error
+                raise
+
+            self._config = new_config
+            self._logger.info("Configuration reloaded from %s", path)
+            return new_config
+
+    def get(self, key, default=None):
+        """Get a config value (dict-compatible)."""
+        return self._config.get(key, default)
+
+    def __getitem__(self, key):
+        """Get a config value via subscript (dict-compatible)."""
+        return self._config[key]
+
+    def __contains__(self, key):
+        """Check if key exists (dict-compatible)."""
+        return key in self._config
+
+    def keys(self):
+        """Return config keys (dict-compatible)."""
+        return self._config.keys()
+
+    def items(self):
+        """Return config items (dict-compatible)."""
+        return self._config.items()
+
+    def values(self):
+        """Return config values (dict-compatible)."""
+        return self._config.values()
+
+    @property
+    def config(self):
+        """Get the underlying config dict (for components that need full dict)."""
+        return self._config
+
+
+def get_watchhosts(config):
+    """Return the names of the hosts that should be watched.
+
+    A host is watched when its entry in the ``hosts`` section is a dict
+    whose ``watch`` value is truthy; a missing ``watch`` key counts as True.
+    Entries that are not dicts are skipped, and a ``hosts`` section that is
+    not a dict yields an empty list.
+
+    Returns:
+        List of hostnames to watch
+    """
+    hosts_config = config.get("hosts", {})
+    if not isinstance(hosts_config, dict):
+        return []
+    return [
+        name
+        for name, attrs in hosts_config.items()
+        if isinstance(attrs, dict) and attrs.get("watch", True)
+    ]
+
+
+def get_dyndnshosts(config):
+    """Extract dyndnshosts from config, supporting both new and legacy formats.
+
+    Hosts come from two places: entries of the ``hosts`` section that are
+    dicts with a truthy ``dyndns`` value, and the legacy ``dyndnshosts``
+    list/set.  Duplicates are removed and the result keeps first-seen order
+    (``hosts`` entries first, then legacy entries), so the list is the same
+    on every run.
+
+    Args:
+        config: Configuration dictionary
+
+    Returns:
+        List of hostnames with dynamic DNS
+    """
+    found = []
+
+    # New format: hosts section with dyndns attribute
+    if "hosts" in config:
+        hosts_config = config["hosts"]
+        if isinstance(hosts_config, dict):
+            found.extend(
+                host_name
+                for host_name, host_attrs in hosts_config.items()
+                if isinstance(host_attrs, dict) and host_attrs.get("dyndns", False)
+            )
+
+    # Legacy format: dyndnshosts list/set
+    if "dyndnshosts" in config:
+        legacy_dyndnshosts = config.get("dyndnshosts", [])
+        if isinstance(legacy_dyndnshosts, (list, set)):
+            found.extend(legacy_dyndnshosts)
+
+    # dict keys preserve insertion order, giving a stable de-duplication.
+    return list(dict.fromkeys(found))
+
+
+def get_host_config(config, hostname):
+    """Look up the attribute dict of *hostname* in the ``hosts`` section.
+
+    Returns:
+        The host's attribute dict, or an empty dict when the ``hosts``
+        section is not a dict, the host is not listed, or its entry is not
+        a dict.
+    """
+    hosts_config = config.get("hosts", {})
+    if not isinstance(hosts_config, dict) or hostname not in hosts_config:
+        return {}
+    entry = hosts_config[hostname]
+    if isinstance(entry, dict):
+        return entry
+    return {}
+
+
+# ---------------------------------------------------------------------------
+# User / host-access helpers
+# ---------------------------------------------------------------------------
+
+def get_default_owner(config) -> str | None:
+    """Return the username that owns hosts lacking an explicit owner.
+
+    A truthy ``default_owner`` setting wins.  Otherwise the first entry of
+    the ``users`` section that is a dict with a truthy ``admin`` value is
+    used, and None is returned when there is no such user.
+    """
+    explicit = config.get("default_owner")
+    if explicit:
+        return explicit
+
+    users_cfg = config.get("users", {})
+    if not isinstance(users_cfg, dict):
+        return None
+
+    # Fall back to first admin user found in config
+    admins = (
+        username
+        for username, attrs in users_cfg.items()
+        if isinstance(attrs, dict) and attrs.get("admin", False)
+    )
+    return next(admins, None)
+
+
+def get_host_access(config, hostname) -> dict:
+    """Return the access dict for *hostname*: owner, managers, monitors.
+
+    The owner is the host's explicit ``owner`` value, or None when the host
+    has none; the fallback to get_default_owner() is currently disabled
+    here.  ``managers`` and ``monitors`` may each be given as a single
+    string, a list, or be absent/empty (including a bare YAML key, which
+    loads as None); they are always returned as new lists.
+
+    Returns:
+        {
+            "owner": str | None,
+            "managers": list[str],
+            "monitors": list[str],
+        }
+    """
+
+    def _as_list(value):
+        # Normalise a config value to a fresh list of names.
+        if value is None:
+            return []
+        if isinstance(value, str):
+            return [value]
+        return list(value)
+
+    host_cfg = get_host_config(config, hostname)
+
+    owner = host_cfg.get("owner")  # or get_default_owner(config)
+
+    return {
+        "owner": owner,
+        "managers": _as_list(host_cfg.get("managers", [])),
+        "monitors": _as_list(host_cfg.get("monitors", [])),
+    }
@@ -0,0 +1,12 @@
+msgs = []  # in-memory list of recent messages for new websocket clients; also logged to file via notify.eventlog
+class Data:
+    """Small key/value holder that keeps the configuration next to a data dict."""
+
+    def __init__(self, config):
+        # The config object is stored as given; the data store starts empty.
+        self.config, self.data = config, {}
+
+    def update(self, new_data):
+        """Merge *new_data* into the stored data, overwriting existing keys."""
+        self.data.update(new_data)
+
+    def get(self, key, default=None):
+        """Return the value stored under *key*, or *default* when it is absent."""
+        try:
+            return self.data[key]
+        except KeyError:
+            return default
@@ -136,16 +136,7 @@ async def dns_update_worker(
            )
            if err:
                m += f", DNS update failed: {err}"
-                if pushmsg:
-                    try:
-                        await loop.run_in_executor(
-                            None,
-                            pushmsg,
-                            "error: nsupdate failed",
-                            f"{name}.dy.{dyndomain}: {m}",
-                        )
-                    except Exception:
-                        pass
+                logger.error("DNS update failed for %s: %s", name, err)
            else:
                m += ", DNS updated."

@@ -171,7 +162,6 @@ def start_dns_worker(
    hbdclass,
    cfg: dict,
    log: Optional[callable] = None,
-    pushmsg: Optional[callable] = None,
    loop: Optional[asyncio.AbstractEventLoop] = None,
 ):
    """Start the async DNS worker and return the Task.
@@ -218,7 +208,7 @@ def start_dns_worker(

    task = loop.create_task(
        dns_update_worker(
-            hbdclass, cfg, async_queue=async_q, log=log, pushmsg=pushmsg, loop=loop
+            hbdclass, cfg, async_queue=async_q, log=log, loop=loop
        )
    )
    return task
@@ -0,0 +1,638 @@
+"""
+Host and Connection classes shared between hbd and
+the website's heartbeat.py
+
+"""
+
+import time
+import json
+import copy
+import queue
+
+num = 0
+
+MAXRTTS = 10
+
+DEBUG = 2
+
+
+def log(host, m):
+    """Print a debug line for *host* with message *m* when DEBUG is truthy."""
+    if not DEBUG:
+        return
+    # print() joins its arguments with single spaces, giving
+    # "class log: <host> <m>".
+    print("class log:", host, m)
+
+
+class Connection:
+    # map of addrs to names
+
+    htab = {}
+    UNKNOWN = "unknown"
+    UP = "up"
+    DOWN = "down"
+    OVERDUE = "overdue"
+
+    def __init__(self, host, cid, addr, afam):
+        """Create a connection record for one address family of *host*.
+
+        Args:
+            host: Owning Host object, or a falsy value for a detached
+                connection (nothing is registered in that case)
+            cid: Connection id supplied by the caller
+            addr: Peer address; a leading "::ffff:" is stripped
+            afam: Address family label stored on the connection
+        """
+        if addr[0:7] == "::ffff:":
+            addr = addr[7:]
+        created = time.time()
+
+        # Keep this assignment order: jsons() serializes __dict__ in order.
+        self.host = host
+        self.cid = cid
+        self.addr = addr
+        self.afam = afam
+        self.rtts = [0]
+        self.lastbeat = created
+        self.statetime = created
+        self.deltastatetime = "computed"
+        self.state = Connection.UNKNOWN
+
+        # Timer-based reachability monitoring
+        self.overdue_timer = None
+        self.overdue_callback = None
+        self.timeout_duration = None
+
+        if not host:
+            return
+
+        # Register the address -> host name mapping and, for dynamic-DNS
+        # hosts, queue a DNS update for the new address.
+        Connection.htab[addr] = self.host.name
+        if self.host.isDynDns():
+            log(self.host.name, "dns update %s" % self.addr)
+            Host.dnsQ.put((self.host.name, self.addr))
+
+    def __getstate__(self):
+        """Return a picklable copy of the instance dict with the timer fields blanked."""
+        snapshot = dict(self.__dict__)
+        # Timer handles and callbacks cannot be pickled; they are set up again
+        # by reset_overdue_timer() after the object has been restored.
+        for field in ("overdue_timer", "overdue_callback", "timeout_duration"):
+            snapshot[field] = None
+        return snapshot
+
+    def __setstate__(self, state):
+        """Restore the instance dict from *state* and make sure the timer fields exist."""
+        self.__dict__.update(state)
+        # Older pickles may lack the timer fields; default any missing one to None.
+        for field in ("overdue_timer", "overdue_callback", "timeout_duration"):
+            if not hasattr(self, field):
+                setattr(self, field, None)
+
+    def registerDns(self):
+        """Queue a (host name, address) pair on Host.dnsQ for this connection."""
+        entry = (self.host.name, self.addr)
+        Host.dnsQ.put(entry)
+
+    def clearstate(self):
+        """Return a fresh state dict with every display field set to ""."""
+        fields = (
+            "addr",
+            "rtt",
+            "lastbeat",
+            "state",
+            "statetime",
+            "deltastatetime",
+            "rttstate",
+        )
+        return dict.fromkeys(fields, "")
+
+    def statedict(self, Null=False):
+        """Build the display dict for this connection.
+
+        Args:
+            Null: When true, skip all per-connection values and return the
+                blank dict from clearstate().
+
+        Returns:
+            Dict with the keys addr, rtt, lastbeat, state, statetime,
+            deltastatetime and rttstate. "state" is wrapped in <b>..</b> when
+            the connection is overdue. "deltastatetime" is the time spent in
+            the current state, formatted as days, "H:MM hrs", "MM:SS mins" or
+            seconds depending on its size. A connection that is still in the
+            unknown state more than ten days after its last beat yields the
+            blank dict.
+        """
+        d = self.clearstate()
+        now = time.time()
+        if not Null:
+            d["addr"] = self.addr
+            if self.rtts[-1]:
+                d["rtt"] = "%d" % round(self.rtts[-1])
+            elif self.state == Connection.UNKNOWN:
+                d["rtt"] = ""
+            else:
+                d["rtt"] = "?"
+            d["lastbeat"] = self.lastbeat
+            if self.state == Connection.OVERDUE:
+                d["state"] = "<b>%s</b>" % self.state
+            else:
+                d["state"] = self.state
+            if self.state == Connection.UP:
+                d["rttstate"] = d["rtt"]
+            elif self.state == Connection.OVERDUE:
+                d["rttstate"] = ""
+            else:
+                d["rttstate"] = d["state"]
+            d["statetime"] = time.strftime(
+                "%Y-%m-%d %H:%M:%S", time.localtime(self.statetime)
+            )
+            delta = now - self.statetime
+
+            if self.state == Connection.UNKNOWN:
+                d["deltastatetime"] = ""
+            elif delta > 86400:
+                d["deltastatetime"] = "%0.1f days" % (delta / 86400.0)
+            elif delta > 3600:
+                # Format the hours by hand: the "%k" strftime directive
+                # (space-padded hour) is a platform extension, not part of
+                # the directives Python documents as portable. The output
+                # keeps the same space-padded "H:MM hrs" layout.
+                hours, rest = divmod(int(delta), 3600)
+                d["deltastatetime"] = "%2d:%02d hrs" % (hours, rest // 60)
+            elif delta > 60:
+                d["deltastatetime"] = time.strftime("%M:%S mins", time.gmtime(delta))
+            else:
+                d["deltastatetime"] = "%i secs" % (delta)
+        if self.state == Connection.UNKNOWN and now - self.lastbeat > 86400 * 10:
+            d = self.clearstate()
+
+        return d
+
+    def headerdict(self, afam):
+        """Return the column header labels for one address family.
+
+        Args:
+            afam: Address family label (e.g. "IPv4"), used as the prefix of
+                the address column header
+
+        Returns:
+            Dict mapping each statedict() field name to its header text.
+        """
+        return {
+            "addr": "%s Addr" % afam,
+            "rtt": "Latency",  # was misspelled "Latencey"
+            "lastbeat": "Last Contact",
+            "state": "State",
+            "statetime": "Last State",
+            "rttstate": "Reach",
+            "deltastatetime": "Last State",
+        }
+
+    def jsons(self):
+        """Return the connection as a JSON string.
+
+        The timer fields are left out and the host back-reference is replaced
+        by the host's name (or null when there is no host).
+        """
+        skipped = ("overdue_timer", "overdue_callback", "timeout_duration")
+        payload = {}
+        for field, val in self.__dict__.items():
+            if field in skipped:
+                continue
+            if field == "host":
+                val = val.name if val else None
+            payload[field] = val
+        return json.dumps(payload)
+
+    def newstate(self, state, now, when=0):
+        """Switch to *state* and return the seconds spent in the previous state.
+
+        The new state is considered to have started at ``now - when``; that
+        value becomes the new statetime, and the return value is its distance
+        from the old statetime.
+        """
+        self.state = state
+        started = now - when
+        elapsed = started - self.statetime
+        self.statetime = started
+        return elapsed
+
+    def getstate(self):
+        """Return the current state value (one of the Connection state constants once set via newstate())."""
+        return self.state
+
+    def newaddr(self, addr, rtt, now):
+        """Record a heartbeat and track address changes.
+
+        Updates lastbeat, appends *rtt* (when not None) to the bounded RTT
+        history, and, if *addr* differs from the stored address, re-registers
+        the address in Connection.htab and queues a DNS update for
+        dynamic-DNS hosts.
+
+        Returns:
+            None when the address is unchanged, otherwise a
+            "changed from X to Y" message.
+        """
+        self.lastbeat = now
+        if rtt is not None:
+            self.rtts.append(rtt)
+        if len(self.rtts) > MAXRTTS:
+            # Drop the oldest sample
+            self.rtts.pop(0)
+
+        if self.addr == addr:
+            return None
+
+        change = "changed from %s to %s" % (self.addr, addr)
+        try:
+            del Connection.htab[self.addr]
+        except Exception:
+            pass
+        self.addr = addr
+        Connection.htab[addr] = self.host.name
+        if self.host.isDynDns():
+            Host.dnsQ.put((self.host.name, self.addr))
+        return change
+    
+    def reset_overdue_timer(self, timeout_seconds, callback):
+        """Reset the overdue timer for this connection.
+        
+        Cancels any existing timer and schedules a new one that, after
+        timeout_seconds, starts a task awaiting ``callback(self)``.
+        
+        Args:
+            timeout_seconds: Seconds before marking as overdue
+            callback: Async function to call when timer expires; it receives
+                this connection as its only argument
+        """
+        import asyncio
+        
+        # Cancel existing timer if any
+        if self.overdue_timer and not self.overdue_timer.cancelled():
+            self.overdue_timer.cancel()
+        
+        # Store parameters for later reference
+        self.timeout_duration = timeout_seconds
+        self.overdue_callback = callback
+        
+        # Create new timer
+        async def timer_expired():
+            await callback(self)
+        
+        try:
+            # NOTE(review): get_event_loop() only raises RuntimeError in some
+            # situations/Python versions when no loop is running; confirm
+            # whether get_running_loop() is what is intended here.
+            loop = asyncio.get_event_loop()
+            # call_later needs a plain callable, so the coroutine is wrapped
+            # in a lambda that turns it into a task when the timer fires.
+            # NOTE(review): the task returned by create_task() is not kept;
+            # the asyncio docs recommend holding a reference to it.
+            self.overdue_timer = loop.call_later(timeout_seconds, 
+                                                  lambda: asyncio.create_task(timer_expired()))
+        except RuntimeError:
+            # No event loop running yet: no new timer is scheduled and
+            # overdue_timer keeps its previous (already cancelled) value.
+            pass
+    
+    def cancel_overdue_timer(self):
+        """Stop a pending overdue timer (best effort) and reset all timer fields to None."""
+        handle = self.overdue_timer
+        if handle:
+            try:
+                if not handle.cancelled():
+                    handle.cancel()
+            except Exception:
+                # Cancelling is best effort; the references are dropped below anyway.
+                pass
+        self.overdue_timer = None
+        self.overdue_callback = None
+        self.timeout_duration = None
+    
+    def get_avg_rtt(self):
+        """Return the mean of the positive RTT samples, or 0 when there are none."""
+        total = 0
+        count = 0
+        for sample in self.rtts:
+            if sample > 0:
+                total += sample
+                count += 1
+        if count:
+            return total / count
+        return 0
+    
+    def get_current_rtt(self):
+        """Return the newest RTT sample, or 0 when the history is empty."""
+        if not self.rtts:
+            return 0
+        return self.rtts[-1]
+    
+    def check_rtt_threshold(self, warning_threshold=None, critical_threshold=None):
+        """Classify the most recent RTT against optional thresholds.
+
+        Args:
+            warning_threshold: RTT in ms for warning level
+            critical_threshold: RTT in ms for critical level
+
+        Returns:
+            Tuple of (level, rtt_value). level is 'CRITICAL' or 'WARNING' when
+            the RTT is positive and strictly above the respective (truthy)
+            threshold, with critical taking precedence; otherwise None.
+        """
+        rtt = self.get_current_rtt()
+        level = None
+        if rtt > 0:
+            if critical_threshold and rtt > critical_threshold:
+                level = 'CRITICAL'
+            elif warning_threshold and rtt > warning_threshold:
+                level = 'WARNING'
+        return (level, rtt)
+
+
+#
+class Host:
+    # Table of Hosts
+    hosts = {}
+    dnsQ = queue.Queue()
+
+    def __init__(self, name):
+        """Create a host record and, when *name* is truthy, register it.
+
+        A named host bumps the module-level counter ``num`` and is stored in
+        Host.hosts under its name. A host with a falsy name is not registered
+        and simply takes the counter's current value as its number.
+        """
+        global num
+        self.name = name
+        if name:
+            num += 1
+            Host.hosts[name] = self
+        self.num = num
+        self.dyn = False
+        self.watched = True
+        self.upcount = 0
+        self.interval = 0
+        self.doesack = -1
+        self.cmds = []
+        # Connections keyed by address family label ("IPv4" / "IPv6")
+        self.connections = {}
+        # Plugin data storage: {plugin_name: [(timestamp, data), ...]}
+        self.plugin_data = {}
+        self.plugin_retention = 100  # Keep last N samples per plugin
+        # Alert state tracking: {metric_path: AlertState}
+        self.alert_states = {}
+        # User access control
+        self.owner: str | None = None       # username of owner
+        self.managers: list = []            # usernames with manager role
+        self.monitors: list = []            # usernames with monitor role
+
+    def statedict(self):
+        """Build the flat display dict for this host.
+
+        Contains the raw and decorated name ("*" suffix for dynamic hosts,
+        bold markup for watched hosts), the dyn flag and host number, the
+        warning/critical alert counts split by acknowledged status, and the
+        per-family connection fields under "IPv4.<field>" / "IPv6.<field>".
+        Families without a connection get blank fields.
+        """
+        label = self.name
+        if self.dyn:
+            label += "*"
+        if self.watched:
+            label = "<b>%s</b>" % label
+
+        row = {
+            "raw_name": self.name,
+            "name": label,
+            "dyn": str(self.dyn),
+            "num": self.num,
+        }
+
+        # Alert counts, each kept as [unacked, acked]
+        warning = [0, 0]
+        critical = [0, 0]
+        if self.alert_states:
+            # Imported lazily (only when there are alerts) to avoid circular imports
+            from .threshold import AlertLevel
+            for alert in self.alert_states.values():
+                if alert.level == AlertLevel.WARNING:
+                    bucket = warning
+                elif alert.level == AlertLevel.CRITICAL:
+                    bucket = critical
+                else:
+                    continue
+                bucket[1 if alert.acknowledged else 0] += 1
+
+        row["alert_warning_unacked"] = warning[0]
+        row["alert_warning_acked"] = warning[1]
+        row["alert_critical_unacked"] = critical[0]
+        row["alert_critical_acked"] = critical[1]
+
+        for fam in ("IPv4", "IPv6"):
+            if fam in self.connections:
+                fields = self.connections[fam].statedict()
+            else:
+                fields = ubConnection.statedict(True)
+            for key in fields:
+                row["%s.%s" % (fam, key)] = fields[key]
+
+        return row
+
+    def headerdict(self):
+        """Return the column header labels matching the keys of statedict()."""
+        headers = {"name": "Name", "dyn": "Dyn", "num": "??"}
+        for fam in ("IPv4", "IPv6"):
+            labels = ubConnection.headerdict(fam)
+            for key in labels:
+                headers["%s.%s" % (fam, key)] = labels[key]
+        return headers
+
+    def registerDns(self):
+        """Ask every connection of this host to queue its DNS registration."""
+        for conn in self.connections.values():
+            conn.registerDns()
+
+    def stateinfo(self):
+        """Return a serializable snapshot of this host.
+
+        Copies every instance attribute except alert_states and plugin_data.
+        "connections" becomes a list of per-connection dicts (IPv4 first,
+        then IPv6) without the timer fields and with the host back-reference
+        replaced by the host name. Alert counts, the access fields and the
+        hbc version reported by the latest "os_info" plugin sample are added.
+        """
+        host_skip = ("alert_states", "plugin_data")
+        conn_skip = ("overdue_timer", "overdue_callback", "timeout_duration")
+
+        info = {}
+        for attr, attr_val in self.__dict__.items():
+            if attr in host_skip:
+                continue
+            if attr != "connections":
+                info[attr] = attr_val
+                continue
+
+            conn_list = []
+            for fam in ("IPv4", "IPv6"):
+                if fam not in self.connections:
+                    continue
+                conn_dict = {}
+                for field, field_val in self.connections[fam].__dict__.items():
+                    # Timer objects cannot be serialized
+                    if field in conn_skip:
+                        continue
+                    # The host back-pointer is reduced to the host name
+                    if field == "host":
+                        conn_dict[field] = field_val.name if field_val else None
+                        continue
+                    try:
+                        conn_dict[field] = copy.deepcopy(field_val)
+                    except Exception:
+                        # Fall back to the original object when it cannot be copied
+                        conn_dict[field] = field_val
+                conn_list.append(conn_dict)
+            info[attr] = conn_list
+
+        # Alert counts (computed from alert_states), each kept as [unacked, acked]
+        warning = [0, 0]
+        critical = [0, 0]
+        if hasattr(self, 'alert_states'):
+            from .threshold import AlertLevel
+            for alert in self.alert_states.values():
+                if alert.level == AlertLevel.WARNING:
+                    bucket = warning
+                elif alert.level == AlertLevel.CRITICAL:
+                    bucket = critical
+                else:
+                    continue
+                bucket[1 if alert.acknowledged else 0] += 1
+
+        info["alert_warning_unacked"] = warning[0]
+        info["alert_warning_acked"] = warning[1]
+        info["alert_critical_unacked"] = critical[0]
+        info["alert_critical_acked"] = critical[1]
+
+        # User access (defaults cover objects restored without these fields)
+        info["owner"] = getattr(self, "owner", None)
+        info["managers"] = list(getattr(self, "managers", []))
+        info["monitors"] = list(getattr(self, "monitors", []))
+
+        # hbc version from latest os_info plugin data
+        version = None
+        newest = self.get_latest_plugin_data("os_info")
+        if newest:
+            version = newest[1].get("hbc_version") if len(newest) == 2 else None
+            _, os_data = newest
+        info["hbc_version"] = version
+
+        return info
+
+    def jsons(self):
+        """Return the stateinfo() snapshot of this host encoded as a JSON string."""
+        return json.dumps(self.stateinfo())
+
+    def isDynDns(self):
+        """Return the host's ``dyn`` flag; connections queue DNS updates only when it is true."""
+        return self.dyn
+
+    def isIPv4(self, addr):
+        """Return True when the address text contains a "." after its first character.
+
+        *addr* may be the address string itself or a tuple whose first element
+        is the address string.
+        """
+        text = addr[0] if isinstance(addr, tuple) else addr
+        return text.find(".") > 0
+
+    def conndata(self, cid, addr, rtt, now):
+        """Feed one heartbeat into the connection for the address's family.
+
+        A leading "::ffff:" is stripped from *addr*, the family ("IPv4" or
+        "IPv6") is derived via isIPv4(), and a Connection is created for that
+        family on first use.
+
+        Returns:
+            Tuple of (connection, result of connection.newaddr(addr, rtt, now)).
+        """
+        if addr[0:7] == "::ffff:":
+            addr = addr[7:]
+        afam = "IPv4" if self.isIPv4(addr) else "IPv6"
+
+        try:
+            conn = self.connections[afam]
+        except KeyError:
+            conn = self.connections[afam] = Connection(self, cid, addr, afam)
+
+        return conn, conn.newaddr(addr, rtt, now)
+
+    def fixup(self):
+        """Bring a host reloaded from pickle up to date; add new fields here.
+
+        Strips a leading "::ffff:" from stored connection addresses and
+        creates any attribute that older pickles do not carry, using the same
+        defaults as __init__.
+        """
+        for fam in ("IPv4", "IPv6"):
+            if fam not in self.connections:
+                continue
+            conn = self.connections[fam]
+            stored = conn.addr
+            conn.addr = stored[7:] if stored[0:7] == "::ffff:" else stored
+
+        # (attribute, factory) pairs; factories give each host its own fresh
+        # container. plugin_* and alert_states are for backward
+        # compatibility, the access fields came with user management.
+        defaults = (
+            ("plugin_data", dict),
+            ("plugin_retention", lambda: 100),
+            ("alert_states", dict),
+            ("owner", lambda: None),
+            ("managers", list),
+            ("monitors", list),
+        )
+        for attr, make in defaults:
+            if not hasattr(self, attr):
+                setattr(self, attr, make())
+
+    def add_plugin_data(self, plugin_name, data, timestamp=None):
+        """Append one (timestamp, data) sample to a plugin's history.
+
+        Args:
+            plugin_name: Name of the plugin (e.g., "cpu_monitor")
+            data: Dict of plugin data
+            timestamp: Optional timestamp (default: current time)
+
+        The history is trimmed to the newest ``plugin_retention`` samples.
+        """
+        when = time.time() if timestamp is None else timestamp
+
+        samples = self.plugin_data.setdefault(plugin_name, [])
+        samples.append((when, data))
+
+        # Keep only the last N samples (stored as a new list)
+        keep = self.plugin_retention
+        if len(samples) > keep:
+            self.plugin_data[plugin_name] = samples[-keep:]
+    
+    def get_plugin_data(self, plugin_name, limit=None):
+        """Return the stored samples of one plugin.
+
+        Args:
+            plugin_name: Name of the plugin
+            limit: Optional limit on number of recent samples to return;
+                a falsy limit returns everything
+
+        Returns:
+            List of (timestamp, data) tuples, most recent last (empty when
+            the plugin has no data)
+        """
+        samples = self.plugin_data.get(plugin_name, [])
+        if not limit or len(samples) <= limit:
+            return samples
+        return samples[-limit:]
+    
+    def get_latest_plugin_data(self, plugin_name):
+        """Return the newest sample stored for a plugin.
+
+        Args:
+            plugin_name: Name of the plugin
+
+        Returns:
+            (timestamp, data) tuple or None if no data
+        """
+        samples = self.plugin_data.get(plugin_name, [])
+        if not samples:
+            return None
+        return samples[-1]
+    
+    def get_all_plugin_data(self):
+        """Get all plugin data for this host.
+
+        Returns:
+            Dict of {plugin_name: [(timestamp, data), ...]}. This is the
+            host's own storage dict, not a copy, so changes made by the
+            caller affect the host.
+        """
+        return self.plugin_data
+
+    # ------------------------------------------------------------------
+    # User-role helpers
+    # ------------------------------------------------------------------
+
+    def apply_access(self, owner, managers, monitors):
+        """Store the access settings on this host (called from config load).
+
+        *managers* and *monitors* are copied into new lists.
+        """
+        self.owner = owner
+        self.managers = [*managers]
+        self.monitors = [*monitors]
+
+    def is_owner(self, username: str) -> bool:
+        """Return True when *username* equals this host's owner."""
+        return self.owner == username
+
+    def is_manager(self, username: str) -> bool:
+        """Return True when *username* is a listed manager or the owner."""
+        if username in self.managers:
+            return True
+        return self.is_owner(username)
+
+    def is_monitor(self, username: str) -> bool:
+        """Return True when *username* is a listed monitor, a manager or the owner."""
+        if username in self.monitors:
+            return True
+        return self.is_manager(username)
+
+    def access_dict(self) -> dict:
+        """Return owner, managers and monitors as a dict; the lists are copies."""
+        return dict(
+            owner=self.owner,
+            managers=[*self.managers],
+            monitors=[*self.monitors],
+        )
+
+    hostfields_long = [
+        "name",
+        "IPv4.addr",
+        "IPv4.state",
+        ("IPv4.rtt", 'style="text-align: right;"'),
+        ("IPv4.statetime", 'style="text-align: right;"'),
+        "IPv6.addr",
+        "IPv6.state",
+        ("IPv6.rtt", 'style="text-align: right;"'),
+        ("IPv6.statetime", 'style="text-align: right;"'),
+    ]
+
+    hostfields_short = [
+        "name",
+        ("IPv4.rttstate", 'style="text-align: right;"'),
+        ("IPv4.deltastatetime", 'style="text-align: right;"'),
+        ("IPv6.rttstate", 'style="text-align: right;"'),
+        ("IPv6.deltastatetime", 'style="text-align: right;"'),
+    ]
+
+    def gene(self, tag, v, attrib=None):
+        """Wrap *v* in an HTML element named *tag*, with *attrib* as optional attribute text."""
+        attr_text = " %s" % attrib if attrib else ""
+        return "<%s%s>%s</%s>" % (tag, attr_text, v, tag)
+
+    def htmltable(self, tag, hd, short):
+        """Render one table row from the field dict *hd*.
+
+        Args:
+            tag: Cell element name ("th" or "td")
+            hd: Dict providing a value for every listed field
+            short: Use Host.hostfields_short when true, else Host.hostfields_long
+
+        Returns:
+            A "<tr>...</tr>" string with one cell per field, joined by newlines.
+        """
+        fields = Host.hostfields_short if short else Host.hostfields_long
+        # A tuple entry is (field name, cell attribute text)
+        cells = [
+            self.gene(tag, hd[f[0]], f[1]) if isinstance(f, tuple) else self.gene(tag, hd[f])
+            for f in fields
+        ]
+        return self.gene("tr", "\n".join(cells))
+
+    def buildhosttable(self, short=False):
+        """Return the host table as a list of HTML lines.
+
+        The list holds the opening table tag, a header row, one row per
+        registered host in name order, and the closing tag. *short* selects
+        the compact column set.
+        """
+        if DEBUG > 1:
+            print("DBG buildhosttable: start")
+        rows = [
+            '<table id="ntable" class="sortable">',
+            ubHost.htmltable("th", ubHost.headerdict(), short),
+        ]
+        for hostname in sorted(Host.hosts.keys()):
+            rows.append(ubHost.htmltable("td", Host.hosts[hostname].statedict(), short))
+        rows.append("</table>")
+        if DEBUG > 1:
+            print("DBG buildhosttable: %s" % rows)
+        return rows
+
+    def buildmsgtable(self, msgs):
+        """Return the event log section as a list of HTML lines.
+
+        Shows the newest messages of *msgs*; the number shown shrinks as more
+        hosts are registered (40 minus the host count) but never drops below 3.
+        """
+        shown = max(40 - len(Host.hosts), 3)
+        lines = ["<h4>Log of Events</h4>"]
+        lines.extend("%s<BR>" % m for m in msgs[len(msgs) - shown :])
+        return lines
+
+
+# create fake "unbound objects", remove in Python 3.0
+ubHost = Host(None)
+ubConnection = Connection(None, "", "", "")
@@ -0,0 +1,342 @@
+"""
+Journal logging for heartbeat messages.
+
+Provides size-based rotating log files for all received heartbeat messages.
+Messages are logged in JSON format for easy parsing and analysis.
+"""
+
+import json
+import logging
+import os
+import asyncio
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, Any, Optional
+
+logger = logging.getLogger(__name__)
+
+
+class MessageJournal:
+    """
+    Journal logger for heartbeat messages with size-based rotation.
+    
+    Features:
+    - Logs all received messages in JSON format
+    - Automatic rotation when file size exceeds threshold
+    - Keeps configurable number of rotated logs
+    - Async-safe operation (writes are serialized with an asyncio.Lock; not thread-safe)
+    - Configurable log directory and file naming
+    
+    Configuration:
+        journal_dir: Directory for journal files (default: /var/log/heartbeat)
+        journal_file: Base filename (default: messages.journal)
+        journal_max_size: Maximum file size in bytes before rotation (default: 100MB)
+        journal_max_backups: Number of backup files to keep (default: 10)
+        journal_enabled: Enable/disable journaling (default: True)
+    """
+    
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        """
+        Set up journal settings and runtime state; no file is opened here.
+
+        Args:
+            config: Configuration dictionary with journal settings
+                (journal_dir, journal_file, journal_max_size,
+                journal_max_backups, journal_enabled); missing keys fall
+                back to their defaults
+        """
+        cfg = config or {}
+        self.config = cfg
+
+        # Settings, with defaults
+        self.journal_dir = Path(cfg.get('journal_dir', '/var/log/heartbeat'))
+        self.journal_file = cfg.get('journal_file', 'messages.journal')
+        self.max_size = cfg.get('journal_max_size', 100 * 1024 * 1024)  # 100MB default
+        self.max_backups = cfg.get('journal_max_backups', 10)
+        self.enabled = cfg.get('journal_enabled', True)
+
+        # Runtime state; the file handle is opened by initialize()
+        self._file_handle = None
+        self._current_size = 0
+        self._lock = asyncio.Lock()
+        self._initialized = False
+
+        # Full path to current journal file
+        self.journal_path = self.journal_dir / self.journal_file
+        
+    async def initialize(self) -> bool:
+        """
+        Initialize the journal.
+        
+        Creates journal directory if needed and opens the journal file in
+        append mode, picking up the size of an existing file so rotation
+        accounting continues from it.
+        
+        Returns:
+            True if initialization successful (or journaling is disabled),
+            False otherwise. On failure journaling is switched off by
+            setting ``enabled`` to False.
+        """
+        if not self.enabled:
+            logger.info("Message journal disabled in configuration")
+            return True
+            
+        try:
+            # Create journal directory if it doesn't exist
+            self.journal_dir.mkdir(parents=True, exist_ok=True)
+            
+            # Open journal file in append mode
+            # NOTE(review): a handle left from an earlier initialize() call is
+            # replaced here without being closed.
+            self._file_handle = open(self.journal_path, 'a', encoding='utf-8')
+            
+            # Get current file size
+            try:
+                self._current_size = os.path.getsize(self.journal_path)
+            except OSError:
+                self._current_size = 0
+            
+            self._initialized = True
+            logger.info(f"Message journal initialized: {self.journal_path} "
+                       f"(current size: {self._current_size:,} bytes, "
+                       f"max: {self.max_size:,} bytes)")
+            return True
+            
+        except Exception as e:
+            logger.error(f"Failed to initialize message journal: {e}")
+            self.enabled = False
+            return False
+    
+    async def log_message(
+        self,
+        msg: Dict[str, Any],
+        addr: tuple,
+        timestamp: Optional[float] = None
+    ):
+        """
+        Log a received message to the journal.
+        
+        Does nothing when journaling is disabled or not initialized, and
+        skips messages whose 'ID' is 'HTB'. Each entry is written as one
+        JSON line; the file is rotated first when the entry would push it
+        past max_size. Errors are logged, never raised.
+        
+        Args:
+            msg: Parsed message dictionary
+            addr: Source address (ip, port) tuple; any other value is
+                recorded via str() with no port
+            timestamp: Message timestamp (defaults to current time)
+        """
+        if not self.enabled or not self._initialized:
+            return
+        
+        # Skip HTB (heartbeat) messages - too verbose
+        msg_id = msg.get('ID', '')
+        if msg_id == 'HTB':
+            return
+        
+        async with self._lock:
+            try:
+                # Prepare journal entry
+                if timestamp is None:
+                    import time
+                    timestamp = time.time()
+                
+                is_pair = isinstance(addr, (tuple, list))
+                source_ip = addr[0] if is_pair else str(addr)
+                source_port = addr[1] if is_pair and len(addr) > 1 else None
+                
+                entry = {
+                    'timestamp': timestamp,
+                    'datetime': datetime.fromtimestamp(timestamp).isoformat(),
+                    'source_ip': source_ip,
+                    'source_port': source_port,
+                    'message': msg
+                }
+                
+                # Serialize to JSON (one line per entry)
+                json_line = json.dumps(entry, separators=(',', ':')) + '\n'
+                json_bytes = json_line.encode('utf-8')
+                
+                # Check if rotation is needed
+                if self._current_size + len(json_bytes) > self.max_size:
+                    await self._rotate()
+                
+                # Write to journal
+                if self._file_handle:
+                    self._file_handle.write(json_line)
+                    self._file_handle.flush()  # Ensure data is written
+                    self._current_size += len(json_bytes)
+                    
+                    # Log the already-normalized source_ip: indexing addr[0]
+                    # here would fail for a non-sequence addr after the entry
+                    # was written and be reported as a write error.
+                    logger.debug(f"Logged message from {source_ip}: {msg.get('ID', 'UNKNOWN')}")
+                    
+            except Exception as e:
+                logger.error(f"Error writing to journal: {e}")
+    
+    async def _rotate(self):
+        """
+        Rotate the journal file.
+        
+        Renames current file with timestamp, opens new file, and removes
+        old backups exceeding max_backups limit.
+        
+        Called by the logging methods while they hold ``self._lock``. If any
+        step fails, the journal file is reopened in append mode; if that
+        fails too, journaling is disabled.
+        """
+        try:
+            # Close current file
+            if self._file_handle:
+                self._file_handle.close()
+                self._file_handle = None
+            
+            # Generate backup filename with timestamp
+            # NOTE(review): the suffix has one-second resolution, so two
+            # rotations within the same second produce the same backup name.
+            timestamp_str = datetime.now().strftime('%Y%m%d-%H%M%S')
+            backup_name = f"{self.journal_file}.{timestamp_str}"
+            backup_path = self.journal_dir / backup_name
+            
+            # Rename current file to backup
+            if self.journal_path.exists():
+                self.journal_path.rename(backup_path)
+                logger.info(f"Rotated journal: {backup_path} "
+                           f"(size: {self._current_size:,} bytes)")
+            
+            # Open new journal file
+            self._file_handle = open(self.journal_path, 'a', encoding='utf-8')
+            self._current_size = 0
+            
+            # Clean up old backups
+            await self._cleanup_old_backups()
+            
+        except Exception as e:
+            logger.error(f"Error rotating journal: {e}")
+            # Try to reopen the file even if rotation failed
+            # (the size counter is left as it was in this path)
+            try:
+                self._file_handle = open(self.journal_path, 'a', encoding='utf-8')
+            except Exception as e2:
+                logger.error(f"Failed to reopen journal after rotation error: {e2}")
+                self.enabled = False
+    
+    async def _cleanup_old_backups(self):
+        """
+        Delete the oldest rotated journals so that at most max_backups remain.
+        
+        Backups are the files in journal_dir matching "<journal_file>.*";
+        they are ordered by name, which sorts by the timestamp suffix.
+        Failures are logged and never raised.
+        """
+        try:
+            backups = sorted(self.journal_dir.glob(f"{self.journal_file}.*"))
+            excess = len(backups) - self.max_backups
+            if excess > 0:
+                # The first `excess` entries are the oldest ones
+                for stale in backups[:excess]:
+                    try:
+                        stale.unlink()
+                        logger.info(f"Removed old backup: {stale.name}")
+                    except Exception as e:
+                        logger.warning(f"Failed to remove old backup {stale}: {e}")
+
+        except Exception as e:
+            logger.error(f"Error cleaning up old backups: {e}")
+    
+    async def log_threshold_event(
+        self,
+        host_name: str,
+        metric_path: str,
+        old_level: str,
+        new_level: str,
+        value: Any,
+        timestamp: Optional[float] = None
+    ):
+        """
+        Log a threshold state change event.
+        
+        Args:
+            host_name: Name of the host
+            metric_path: Full metric path (e.g., "cpu_monitor.cpu_percent")
+            old_level: Previous alert level
+            new_level: New alert level
+            value: Current metric value
+            timestamp: Event timestamp (default: current time)
+        """
+        if not self.enabled or not self._initialized:
+            return
+        
+        try:
+            if timestamp is None:
+                timestamp = __import__('time').time()
+            
+            event = {
+                'timestamp': timestamp,
+                'iso_time': datetime.fromtimestamp(timestamp).isoformat(),
+                'event_type': 'threshold',
+                'host': host_name,
+                'metric': metric_path,
+                'old_level': old_level,
+                'new_level': new_level,
+                'value': value,
+            }
+            
+            async with self._lock:
+                if not self._file_handle:
+                    return
+                
+                # Check if rotation is needed
+                if self._current_size >= self.max_size:
+                    await self._rotate()
+                
+                # Write event
+                line = json.dumps(event) + '\n'
+                self._file_handle.write(line)
+                self._file_handle.flush()
+                
+                # Update size
+                self._current_size += len(line.encode('utf-8'))
+                
+        except Exception as e:
+            logger.error(f"Error logging threshold event: {e}")
+    
+    async def close(self):
+        """
+        Close the journal and release resources.
+        
+        Should be called during shutdown.
+        """
+        async with self._lock:
+            if self._file_handle:
+                try:
+                    self._file_handle.close()
+                    logger.info("Message journal closed")
+                except Exception as e:
+                    logger.error(f"Error closing journal: {e}")
+                finally:
+                    self._file_handle = None
+                    self._initialized = False
+    
+    def get_stats(self) -> Dict[str, Any]:
+        """
+        Get journal statistics.
+        
+        Returns:
+            Dictionary with journal stats
+        """
+        return {
+            'enabled': self.enabled,
+            'initialized': self._initialized,
+            'current_file': str(self.journal_path),
+            'current_size': self._current_size,
+            'max_size': self.max_size,
+            'max_backups': self.max_backups,
+            'rotation_threshold': f"{(self._current_size / self.max_size * 100):.1f}%"
+        }
+
+
+# Process-wide journal singleton; stays None until get_journal() first creates it
+_journal_instance: Optional[MessageJournal] = None
+
+
+def get_journal(config: Optional[Dict[str, Any]] = None) -> MessageJournal:
+    """
+    Get or create the global journal instance.
+    
+    Args:
+        config: Configuration dictionary (only used on first call)
+        
+    Returns:
+        MessageJournal instance
+    """
+    global _journal_instance
+    if _journal_instance is None:
+        _journal_instance = MessageJournal(config)
+    return _journal_instance
+
+
+async def log_message(msg: Dict[str, Any], addr: tuple, timestamp: Optional[float] = None):
+    """
+    Convenience function to log a message using the global journal.
+    
+    Args:
+        msg: Parsed message dictionary
+        addr: Source address (ip, port) tuple
+        timestamp: Message timestamp (defaults to current time)
+    """
+    journal = get_journal()
+    await journal.log_message(msg, addr, timestamp)
@@ -0,0 +1,539 @@
+"""Server runtime: starts UDP listener, HTTP server and websocket stubs."""
+
+import asyncio
+import logging
+import socket
+import time
+import signal
+import sys
+import ssl
+from . import __version__
+
+from . import udp
+from . import hbdclass
+
+from . import ws as ws_mod
+from . import notify as notify_mod
+from . import data
+from . import users as users_mod
+
+# Module logger, named after this module
+logger = logging.getLogger(__name__)
+# Short aliases for the websocket broadcaster and the notify-module event log
+msg_to_websockets = ws_mod.broadcast
+eventlog = notify_mod.eventlog
+
+# shared runtime collections and helpers
+
+def save_state(config, hbdclass):
+    """Save current state to pickle file. Safe to call at any time."""
+    import pickle
+    import os
+    from . import users as users_mod
+
+    # Clear timer references before pickling (they can't be serialized)
+    for hostname, host in list(hbdclass.Host.hosts.items()):
+        for conn_type, conn in host.connections.items():
+            if hasattr(conn, 'cancel_overdue_timer'):
+                conn.cancel_overdue_timer()
+            if hasattr(conn, 'overdue_timer'):
+                conn.overdue_timer = None
+            if hasattr(conn, 'overdue_callback'):
+                conn.overdue_callback = None
+            if hasattr(conn, 'timeout_duration'):
+                conn.timeout_duration = None
+
+    pickfile = config.get("pickfile", "hbd.pickle")
+    tmpfile = pickfile + ".tmp"
+
+    try:
+        with open(tmpfile, "wb") as pickf:
+            pick = pickle.Pickler(pickf)
+            pick.dump(hbdclass.Host.hosts)
+            pick.dump(data.msgs)
+            pick.dump(users_mod.save_sessions())
+        os.replace(tmpfile, pickfile)
+    except Exception as e:
+        logger.error("Failed to save state: %s", e)
+        try:
+            os.unlink(tmpfile)
+        except Exception:
+            pass
+
+
+def cleanup_function(config, hbdclass):
+    """Exit hook: persist state through save_state(), logging before and after.
+
+    Args:
+        config: Configuration passed straight to save_state().
+        hbdclass: Object exposing ``Host.hosts``, passed straight to save_state().
+    """
+    logger.info("Running cleanup function...")
+    save_state(config, hbdclass)
+    logger.info("Cleanup complete.")
+
+
+async def reload_configuration(config_obj, config_path, components):
+    """Reload configuration and update all components.
+    
+    Args:
+        config_obj: ReloadableConfig instance
+        config_path: Path to config file
+        components: Dict with threshold_checker and other components
+        
+    Returns:
+        True if reload succeeded, False otherwise
+    """
+    try:
+        logger.info("=" * 60)
+        logger.info("Starting configuration reload...")
+        logger.info("=" * 60)
+        
+        # Reload config file
+        new_config = await config_obj.reload(config_path)
+        
+        # Update notify module
+        notify_mod.reload_config(new_config)
+
+        # Reload users
+        users_mod.load_users(new_config)
+
+        # Re-apply host attributes from updated config to all known hosts
+        from . import config as config_mod
+        dyndnshosts = config_mod.get_dyndnshosts(new_config)
+        watchhosts = config_mod.get_watchhosts(new_config)
+        for hostname, host in hbdclass.Host.hosts.items():
+            host.dyn = hostname in dyndnshosts
+            host.watched = hostname in watchhosts
+            access = config_mod.get_host_access(new_config, hostname)
+            host.apply_access(access["owner"], access["managers"], access["monitors"])
+
+        # Reload threshold checker and prune alerts orphaned by the new config
+        if 'threshold_checker' in components:
+            components['threshold_checker'].reload(new_config)
+            components['threshold_checker'].purge_stale_alerts(hbdclass)
+        
+        # Note: Changes to the following require restart:
+        # - hb_port, hbd_port, ws_port (already bound)
+        # - SSL certificates (already loaded)
+        # - pickfile (already opened)
+        # - journal settings (journal already initialized)
+        
+        # These are reloadable and effective immediately:
+        # - notification_channels
+        # - threshold_configs
+        # - hosts (watchhosts, dyndnshosts, notification_channels)
+        # - grace period (used on next heartbeat)
+        # - debug/verbose flags (used on next message)
+        
+        logger.info("=" * 60)
+        logger.info("Configuration reload completed successfully")
+        logger.info("=" * 60)
+        return True
+        
+    except Exception as e:
+        logger.error("=" * 60)
+        logger.error(f"Failed to reload configuration: {e}", exc_info=True)
+        logger.error("Keeping previous configuration")
+        logger.error("=" * 60)
+        return False
+
+
+async def _run_async(config, config_path=None):
+    from .config import ReloadableConfig
+    if not isinstance(config, ReloadableConfig):
+        config = ReloadableConfig(config, config_path)
+
+    loop = asyncio.get_running_loop()
+    shutdown_event = asyncio.Event()
+    reload_event = asyncio.Event()
+
+    # Signal handlers for graceful shutdown and reload
+    def signal_handler(signum, frame):
+        sig_name = signal.Signals(signum).name if hasattr(signal, "Signals") else signum
+        logger.info(f"Received {sig_name}, initiating shutdown...")
+        loop.call_soon_threadsafe(shutdown_event.set)
+    
+    def reload_handler(signum, frame):
+        sig_name = signal.Signals(signum).name if hasattr(signal, "Signals") else signum
+        logger.info(f"Received {sig_name}, initiating config reload...")
+        loop.call_soon_threadsafe(reload_event.set)
+
+    # Register signal handlers
+    loop.add_signal_handler(signal.SIGINT, signal_handler, signal.SIGINT, None)
+    loop.add_signal_handler(signal.SIGTERM, signal_handler, signal.SIGTERM, None)
+    loop.add_signal_handler(signal.SIGHUP, reload_handler, signal.SIGHUP, None)
+
+    from . import http as http_mod
+    from . import dns as dns_mod
+    from . import notify as notify_mod
+    from . import journal as journal_mod
+    from . import threshold as threshold_mod
+
+    notify_mod.setup(config, loop=loop)
+    
+    # Initialize message journal
+    msg_journal = journal_mod.get_journal(config)
+    await msg_journal.initialize()
+    
+    # Initialize threshold checker
+    threshold_checker = threshold_mod.ThresholdChecker(
+        config=config,
+        renotify_interval=config.get("threshold_renotify_interval", 3600),
+        journal=msg_journal,
+    )
+    logger.info("Threshold checker initialized")
+    
+    # Components dict for reload orchestration
+    components = {
+        'threshold_checker': threshold_checker,
+        'msg_journal': msg_journal,
+    }
+
+    sock = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM)
+    # Disable IPV6_V6ONLY option to enable dual-stack (listen on IPv4 as well)
+    # This option is system-dependent; on many systems, setting it to False enables
+    # the socket to handle both IPv4 and IPv6 traffic.
+    try:
+        sock.setsockopt(socket.IPPROTO_IPV6, socket.IPV6_V6ONLY, False)
+    except OSError as e:
+        logger.warning(
+            f"Warning: Could not reset IPV6_V6ONLY not supported or dual-stack is unavailable. Error: {e}"
+        )
+
+    bind_addr = ("::", config.get("hb_port", 50003))
+    sock.bind(bind_addr)
+    logger.info("Starting UDP server on %s:%s", *bind_addr)
+
+    # Try to enable kernel receive timestamps (Linux SO_TIMESTAMP).
+    # If supported, read datagrams via recvmsg() so RTT uses the kernel
+    # timestamp rather than the time.time() call after asyncio scheduling.
+    use_kernel_ts = udp.enable_kernel_timestamps(sock)
+    if use_kernel_ts:
+        logger.info("SO_TIMESTAMP enabled: using kernel receive timestamps for RTT")
+    else:
+        logger.info("SO_TIMESTAMP not available: using time.time() for RTT")
+
+    def udp_handler(msg, addr, transport, recv_ts=None):
+        ctx = dict(
+            config=config,
+            hbdclass=hbdclass,
+            msg_to_websockets=msg_to_websockets,
+            msg_journal=msg_journal,
+            threshold_checker=threshold_checker,
+            DEBUG=config.get("debug", 0),
+            verbose=config.get("verbose", False),
+            recv_ts=recv_ts,
+        )
+        udp.handle_datagram(msg, addr, transport, ctx)
+
+    if use_kernel_ts:
+        # recvmsg path: manage the socket ourselves with loop.add_reader()
+        sock.setblocking(False)
+        transport = udp.RecvmsgTransport(loop, sock)
+        reader = udp.make_recvmsg_reader(sock, udp_handler, transport)
+        loop.add_reader(sock.fileno(), reader)
+        protocol = None
+    else:
+        transport, protocol = await loop.create_datagram_endpoint(
+            lambda: udp.EchoServerProtocol(config=config, handler=udp_handler),
+            sock=sock,
+        )
+
+    # Restore connection timers for hosts loaded from pickle
+    restore_ctx = dict(
+        config=config,
+        hbdclass=hbdclass,
+        msg_to_websockets=msg_to_websockets,
+        threshold_checker=threshold_checker,
+    )
+    udp.restore_connection_timers(hbdclass, restore_ctx)
+
+    # Drop alert states that no longer have a matching threshold (stale after
+    # upgrade or config change between runs).
+    threshold_checker.purge_stale_alerts(hbdclass)
+
+    # HTTP server (asyncio-based via aiohttp)
+    try:
+        http_task = asyncio.create_task(
+            http_mod.start(
+                host=config.get("hbd_host", ""),
+                port=config.get("hbd_port", 50004),
+                config=config,
+                hbdclass=hbdclass,
+                tcss=None,
+                threshold_checker=threshold_checker,
+                verbose=config.get("verbose", False),
+                get_now=lambda: time.time(),
+                VER="",
+            )
+        )
+        logger.info(
+            "HTTP server started on %s:%s",
+            config.get("hbd_host", ""),
+            config.get("hbd_port", 50004),
+        )
+    except Exception as e:
+        logger.exception("failed to start HTTP server: %s", e)
+
+    # start dns update worker (async)
+    dns_task = None
+    try:
+        dns_task = dns_mod.start_dns_worker(
+            hbdclass, config, log=eventlog, loop=loop
+        )
+        logger.info("dns update worker started")
+    except Exception as e:
+        logger.exception("dns worker failed to start: %s", e)
+
+    # Register WebSocket state — connections are now served through /ws on the HTTP port
+    ws_task = None
+    ws_mod.setup(
+        loop=loop,
+        get_hosts=lambda: [
+            hbdclass.Host.hosts[h].stateinfo()
+            for h in sorted(hbdclass.Host.hosts)
+        ],
+        verbose=config.get("verbose", False),
+    )
+    logger.info("WebSocket handler registered on /ws (HTTP port %s)", config.get("hbd_port", 50004))
+
+    # Periodic autosave task
+    autosave_interval = config.get("autosave_interval", 300)  # default: 5 minutes
+
+    async def autosave_task():
+        while True:
+            await asyncio.sleep(autosave_interval)
+            logger.debug("Autosaving state...")
+            save_state(config, hbdclass)
+            logger.debug("Autosave complete (%d hosts)", len(hbdclass.Host.hosts))
+
+    autosave = asyncio.create_task(autosave_task())
+    logger.info("Autosave task started (interval: %ds)", autosave_interval)
+
+    # Main event loop - monitor shutdown and reload events
+    try:
+        while True:
+            # Wait for either shutdown or reload event
+            done, pending = await asyncio.wait(
+                [
+                    asyncio.create_task(shutdown_event.wait()),
+                    asyncio.create_task(reload_event.wait()),
+                ],
+                return_when=asyncio.FIRST_COMPLETED
+            )
+            
+            # Check which event was triggered
+            if shutdown_event.is_set():
+                logger.info("Shutdown signal received, stopping services...")
+                # Cancel pending wait tasks
+                for task in pending:
+                    task.cancel()
+                break
+            
+            if reload_event.is_set():
+                # Clear the event for next reload
+                reload_event.clear()
+                
+                # Cancel pending wait tasks
+                for task in pending:
+                    task.cancel()
+                
+                # Perform reload if config_path is available
+                if config_path:
+                    await reload_configuration(config, config_path, components)
+                else:
+                    logger.warning("Cannot reload: no config path available")
+                
+                # Continue main loop
+                continue
+                
+    except Exception as e:
+        logger.exception("Error in main loop: %s", e)
+    finally:
+        # Cancel all running tasks
+        logger.info("Cancelling tasks...")
+        try:
+            transport.close()
+        except Exception as e:
+            logger.warning("Error closing UDP transport: %s", e)
+
+        tasks_to_cancel = [http_task, autosave]
+        for task in tasks_to_cancel:
+            if task:
+                try:
+                    task.cancel()
+                    logger.debug("Cancelled task: %s", task)
+                except Exception as e:
+                    logger.warning("Error cancelling task: %s", e)
+
+        # Wait for tasks to finish cancellation with timeout
+        remaining_tasks = [t for t in tasks_to_cancel if t]
+        if remaining_tasks:
+            try:
+                await asyncio.wait_for(
+                    asyncio.gather(*remaining_tasks, return_exceptions=True),
+                    timeout=2.0,
+                )
+            except asyncio.TimeoutError:
+                logger.warning("Timeout waiting for tasks to cancel")
+            except Exception as e:
+                logger.debug("Exception during task cancellation: %s", e)
+        
+        # Close message journal
+        try:
+            await msg_journal.close()
+        except Exception as e:
+            logger.warning("Error closing message journal: %s", e)
+
+        # Signal DNS worker to exit and await it
+        try:
+            if "dns_task" in locals() and dns_task:
+                try:
+                    hbdclass.Host.dnsQ.put(None)
+                except Exception:
+                    pass
+                try:
+                    await asyncio.wait_for(dns_task, timeout=2.0)
+                    logger.info("DNS worker finished")
+                except asyncio.TimeoutError:
+                    logger.warning("Timeout waiting for DNS worker to finish")
+                    dns_task.cancel()
+                except asyncio.CancelledError:
+                    logger.info("DNS worker was cancelled")
+                except Exception as e:
+                    logger.warning("Error awaiting DNS worker: %s", e)
+                finally:
+                    # Clear queue bridge to release any held references
+                    hbdclass.Host.dnsQ = None
+        except Exception as e:
+            logger.warning("Error stopping DNS worker: %s", e)
+
+        # Save state (hosts + sessions) on clean shutdown
+        try:
+            save_state(config, hbdclass)
+            logger.info("State saved on shutdown")
+        except Exception as e:
+            logger.warning("Error saving state on shutdown: %s", e)
+
+        logger.info("All tasks cancelled")
+
+
+def load_pickled_hosts(config, hbdclass):
+    """Load pickled hosts from file, if available."""
+    import os
+    import pickle
+    from . import config as config_mod
+    from . import users as users_mod
+
+    pickfile = config.get("pickfile", "hbd.pickle")
+    dyndnshosts = config_mod.get_dyndnshosts(config)
+    watchhosts = config_mod.get_watchhosts(config)
+    drophosts = config.get("drophosts", [])
+    if 1 and os.path.exists(pickfile):
+        if config.get("verbose", False):
+            logger.info("opening pickls %s", pickfile)
+        pickf = open(pickfile, "rb")
+        pick = pickle.Unpickler(pickf)
+        try:
+            hbdclass.Host.hosts = pick.load()
+            data.msgs = pick.load()
+            try:
+                users_mod.load_sessions(pick.load())
+            except Exception:
+                pass  # older pickle without sessions — fine
+            pickf.close()
+        except Exception as e:
+            logger.exception("load pickled failed: %s", e)
+            os.unlink(pickfile)
+        hbdclass.Connection.htab = {}
+        for h in list(hbdclass.Host.hosts.keys()):
+            hbdclass.Host.hosts[h].dyn = h in dyndnshosts
+            hbdclass.Host.hosts[h].watched = h in watchhosts
+            hbdclass.Host.hosts[h].fixup()
+            access = config_mod.get_host_access(config, h)
+            hbdclass.Host.hosts[h].apply_access(
+                access["owner"], access["managers"], access["monitors"]
+            )
+        for h in drophosts:
+            if h in hbdclass.Host.hosts:
+                del hbdclass.Host.hosts[h]
+        if config.get("verbose", False):
+            logger.info("%s pickled hosts loaded", len(hbdclass.Host.hosts))
+    else:
+        if config.get("verbose", False):
+            logger.info("no pickled data")
+
+
+def run(config, config_path=None):
+    """Start the hbd service (blocking).
+
+    Manually manages the event loop to ensure clean shutdown.
+    
+    Args:
+        config: Configuration dictionary
+        config_path: Path to config file (for reload support)
+    """
+    import os
+
+    log_level = logging.WARNING
+    if config.get("verbose", False):
+        log_level = logging.INFO
+    if config.get("debug", 0) > 0:
+        log_level = logging.DEBUG
+    logging.basicConfig(level=log_level)
+    if not config.get("debug", 0):
+        logging.getLogger("aiohttp.access").propagate = False
+    load_pickled_hosts(config, hbdclass)
+
+    notify_mod.initlog(logfile=config.get("logfile", "messages.log"))
+    users_mod.load_users(config)
+
+    # Write pidfile
+    pidfile = config.get("pidfile", "")
+    if pidfile:
+        try:
+            with open(pidfile, "w") as f:
+                f.write(str(os.getpid()))
+        except Exception as e:
+            logger.warning("Failed to write pidfile %s: %s", pidfile, e)
+
+    eventlog(None, "INFO", f"hbd version {__version__} starting up")
+    
+    if config_path:
+        logger.info(f"Config file: {config_path} (reload with SIGHUP)")
+    else:
+        logger.warning("No config path provided - reload via SIGHUP disabled")
+
+    # Create and set the event loop manually
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+
+    try:
+        loop.run_until_complete(_run_async(config, config_path=config_path))
+    except KeyboardInterrupt:
+        logger.info("Received KeyboardInterrupt, shutting down...")
+    except Exception as e:
+        logger.exception("Unhandled exception in main: %s", e)
+    finally:
+        cleanup_function(config, hbdclass)
+        logger.info("hbd shutdown complete")
+        eventlog(None, "INFO", f"hbd version {__version__} shutdown")
+        notify_mod.closelog()
+        # Remove pidfile
+        if pidfile:
+            try:
+                os.unlink(pidfile)
+            except Exception:
+                pass
+        # Explicitly close the loop
+        try:
+            # Cancel all remaining tasks
+            pending = asyncio.all_tasks(loop)
+            for task in pending:
+                task.cancel()
+            # Run one more cycle to process cancellations
+            if pending:
+                loop.run_until_complete(
+                    asyncio.gather(*pending, return_exceptions=True)
+                )
+        except Exception:
+            pass
+        finally:
+            loop.close()
+
+    # Exit
+    os._exit(0)
@@ -0,0 +1,28 @@
+"""Monitor helper for heartbeat daemon.
+
+This module provides monitoring tasks for the heartbeat daemon.
+The primary reachability monitoring is now event-driven (timers set/reset 
+on HTB arrival in udp.py) rather than periodic polling.
+
+This module can be extended for additional monitoring tasks.
+"""
+
+from __future__ import annotations
+import asyncio
+import time
+from . import notify as notify_mod
+
+# Seven days expressed in seconds.
+# NOTE(review): the name suggests the age after which overdue hosts are
+# dropped, but nothing in this module reads it -- confirm against callers.
+DROPOVERDUE = 7 * 24 * 3600
+# Alias for the notify module's event-log function
+eventlog = notify_mod.eventlog
+
+
+async def cleanup_connections(hbdclass):
+    """Clean up connection timers on shutdown.
+    
+    Cancels all active overdue timers to prevent callbacks after shutdown.
+    """
+    for hostname, host in list(hbdclass.Host.hosts.items()):
+        for conn_type, conn in host.connections.items():
+            if hasattr(conn, 'cancel_overdue_timer'):
+                conn.cancel_overdue_timer()
+
@@ -0,0 +1,492 @@
+"""Notification helpers: email, pushover, matrix, mattermost, signal, sms and dispatcher.
+
+Channel types supported:
+  pushover      - Pushover app notifications
+  email         - SMTP email
+  matrix        - Matrix (via matrix-nio)
+  mattermost    - Mattermost webhook
+  signal        - Signal via signal-cli subprocess
+  sms_voipms    - SMS via voip.ms REST API
+
+Each channel can specify ``min_level: WARNING|CRITICAL`` (default: WARNING).
+
+Notifications are dispatched to the owner + managers of the host, each via
+their own ``notification_channels`` list.  When no users are configured the
+server runs silently (no notifications sent).
+"""
+
+import asyncio
+import logging
+import smtplib
+import subprocess
+import time
+import sys
+from dataclasses import dataclass, field
+from typing import Optional
+
+from . import data
+from . import ws as ws_mod
+
+# Module logger, named after this module
+logger = logging.getLogger(__name__)
+
+# Short alias for the websocket broadcaster
+msg_to_websockets = ws_mod.broadcast
+
+# Module-level state set via setup()
+_config: dict = {}
+
+# Tracks which channels fired a WARNING/CRITICAL per host.
+# {host_name: set of channel_names}  — used to route RECOVER to the same channels.
+_alerted_channels: dict = {}
+
+# Handle of the plain-text event log; assigned by initlog(), None until then
+logf = None
+
+
+# ---------------------------------------------------------------------------
+# Level ordering
+# ---------------------------------------------------------------------------
+
+_LEVEL_ORDER = {"RECOVER": 0, "INFO": 0, "WARNING": 1, "CRITICAL": 2}
+
+def _level_value(level: str) -> int:
+    return _LEVEL_ORDER.get(level.upper(), 0)
+
+
+# ---------------------------------------------------------------------------
+# Notification dataclass
+# ---------------------------------------------------------------------------
+
+@dataclass
+class Notification:
+    """Structured notification payload."""
+    title: str          # e.g. "[CRITICAL] webserver01"
+    body: str           # detail message
+    level: str          # RECOVER | WARNING | CRITICAL | INFO
+    url: str = ""       # link to plugin metrics page
+
+
+# ---------------------------------------------------------------------------
+# Module setup
+# ---------------------------------------------------------------------------
+
+def setup(cfg: dict, loop: Optional[asyncio.AbstractEventLoop] = None):
+    """Initialize notifier from configuration dict."""
+    global _config
+    _config = dict(cfg)
+
+
+def reload_config(cfg: dict):
+    """Reload notification configuration on SIGHUP."""
+    global _config
+    _config = dict(cfg)
+    logger.info("Notification configuration reloaded")
+
+
+# ---------------------------------------------------------------------------
+# Event log (websocket + file + in-memory)
+# ---------------------------------------------------------------------------
+
+def initlog(logfile):
+    global logf
+    try:
+        logf = open(logfile, "a+")
+    except Exception as e:
+        print("cannot open logfile %s, using STDERR: %s" % (logfile, e))
+        logf = sys.stderr
+    return logf
+
+
+def closelog():
+    global logf
+    if logf and logf != sys.stderr:
+        try:
+            logf.close()
+        except Exception:
+            pass
+
+
+def eventlog(host, lvl, m, service=None):
+    ts = time.time()
+    msg = {
+        "ts": ts,
+        "host": host or None,
+        "level": lvl,
+        "service": service,
+        "message": m,
+    }
+    data.msgs.append(msg)
+    s = f"{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ts))} {lvl} "
+    if host:
+        s += f"{host} "
+    s += m
+    logger.info(s)
+    if logf:
+        try:
+            logf.write(s + "\n")
+            logf.flush()
+        except Exception as e:
+            logger.warning("failed to write to logfile: %s", e)
+    msg_to_websockets("message", msg)
+
+
+# ---------------------------------------------------------------------------
+# Low-level channel drivers
+# ---------------------------------------------------------------------------
+
+def _send_pushover(channel_cfg: dict, notif: Notification) -> bool:
+    import http.client
+    import urllib.parse
+    token = channel_cfg.get("token", "")
+    user = channel_cfg.get("user", "")
+    if not token or not user:
+        logger.warning("pushover: missing token or user")
+        return False
+    params: dict = {"token": token, "user": user, "title": notif.title, "message": notif.body}
+    if channel_cfg.get("sound"):
+        params["sound"] = channel_cfg["sound"]
+    if notif.url:
+        params["url"] = notif.url
+        params["url_title"] = "Heartbeat"
+    conn = http.client.HTTPSConnection("api.pushover.net:443")
+    try:
+        conn.request(
+            "POST",
+            "/1/messages.json",
+            urllib.parse.urlencode(params),
+            {"Content-type": "application/x-www-form-urlencoded"},
+        )
+        r = conn.getresponse()
+        logger.debug("pushover response: %s %s", r.status, r.reason)
+        return r.status == 200
+    except Exception as e:
+        logger.error("pushover error: %s", e)
+        return False
+
+
+def _send_email(channel_cfg: dict, notif: Notification) -> bool:
+    recipients = channel_cfg.get("recipients", [])
+    sender = channel_cfg.get("sender", "")
+    smtp_server = channel_cfg.get("smtp_server", "")
+    smtp_port = channel_cfg.get("smtp_port", 587)
+    smtp_user = channel_cfg.get("smtp_user")
+    smtp_password = channel_cfg.get("smtp_password")
+
+    if not recipients or not sender or not smtp_server:
+        logger.warning("email: missing recipients, sender, or smtp_server")
+        return False
+
+    date = time.strftime("%a, %d %b %Y %H:%M:%S %z", time.localtime())
+    body_text = notif.body
+    if notif.url:
+        body_text += f"\n\n{notif.url}"
+    raw = "To: %s\nFrom: %s\nSubject: %s\nDate: %s\n\n%s" % (
+        recipients[0] if isinstance(recipients, list) else recipients,
+        sender,
+        notif.title,
+        date,
+        body_text,
+    )
+    try:
+        server = smtplib.SMTP(smtp_server, smtp_port)
+        if smtp_port == 587:
+            server.starttls()
+            server.ehlo()
+            if smtp_user and smtp_password:
+                server.login(smtp_user, smtp_password)
+        server.sendmail(sender, recipients, raw)
+        server.quit()
+        return True
+    except Exception as e:
+        logger.warning("email send failed: %s", e)
+        try:
+            server.quit()
+        except Exception:
+            pass
+        return False
+
+
+def _send_mattermost(channel_cfg: dict, notif: Notification) -> bool:
+    """Post *notif* to a Mattermost incoming webhook via ``mattermostdriver``.
+
+    Requires ``host``, ``token`` (the webhook id) and ``channel`` in
+    *channel_cfg*; ``username`` (default "hbd") and ``icon`` are optional.
+    The connection always uses plain http on port 8065.
+
+    Returns True when the webhook call returns ``None`` or an empty string,
+    False when the driver is not installed, settings are missing, or the
+    call raises.
+    """
+    try:
+        from mattermostdriver import Driver
+    except ImportError:
+        logger.error("mattermostdriver not installed")
+        return False
+
+    server_host = channel_cfg.get("host", "")
+    hook_token = channel_cfg.get("token", "")
+    target_channel = channel_cfg.get("channel", "")
+    if not (server_host and hook_token and target_channel):
+        logger.warning("mattermost: missing host, token, or channel")
+        return False
+
+    # Markdown message: bold title, body, optional link line.
+    lines = [f"**{notif.title}**", f"{notif.body}"]
+    if notif.url:
+        lines.append(f"[Plugin metrics] {notif.url}")
+    message = "\n".join(lines)
+
+    driver = Driver({"url": server_host, "scheme": "http", "basepath": "/api/v4", "port": 8065})
+
+    payload: dict = {
+        "text": message,
+        "channel": target_channel,
+        "username": channel_cfg.get("username", "hbd"),
+    }
+    icon_url = channel_cfg.get("icon")
+    if icon_url:
+        payload["icon_url"] = icon_url
+
+    try:
+        reply = driver.webhooks.call_webhook(hook_token, payload)
+    except Exception as e:
+        logger.error("mattermost error: %s", e)
+        return False
+    return bool(reply is None or reply == "")
+
+
+def _send_signal(channel_cfg: dict, notif: Notification) -> bool:
+    """Send *notif* as a Signal message by invoking ``signal-cli``.
+
+    *channel_cfg* must provide ``user`` (the sending account) and
+    ``recipient``; ``cli_path`` overrides the default binary location
+    ``/usr/local/bin/signal-cli``.
+
+    Returns True when the command exits with status 0, otherwise False
+    (stderr or the exception is logged).
+    """
+    account = channel_cfg.get("user", "")
+    target = channel_cfg.get("recipient", "")
+    binary = channel_cfg.get("cli_path", "/usr/local/bin/signal-cli")
+    if not (account and target):
+        logger.warning("signal: missing user or recipient")
+        return False
+
+    # Title, body and (when present) the URL, one per line.
+    pieces = [f"{notif.title}", f"{notif.body}"]
+    if notif.url:
+        pieces.append(f"{notif.url}")
+    message = "\n".join(pieces)
+
+    command = [binary, "-u", account, "send", "-m", message, target]
+    try:
+        proc = subprocess.run(command, capture_output=True)
+        if proc.returncode == 0:
+            return True
+        logger.error("signal failed: %s", proc.stderr.decode())
+        return False
+    except Exception as e:
+        logger.exception("signal exception: %s", e)
+        return False
+
+
+async def _send_sms_voipms_async(channel_cfg: dict, notif: Notification) -> bool:
+    """Send *notif* as an SMS through the voip.ms REST API.
+
+    The request is a multipart form-data POST carrying the credentials
+    (``api_user``, ``api_password``), the sending ``did``, the destination
+    ``dst`` and the message text.  Returns True only when the JSON reply has
+    ``status == "success"``; every other outcome is logged and yields False.
+    """
+    import json
+    import aiohttp
+
+    fields = {
+        "api_username": channel_cfg.get("api_user", ""),
+        "api_password": channel_cfg.get("api_password", ""),
+        "method": "sendSMS",
+        "did": channel_cfg.get("did", ""),
+        "dst": channel_cfg.get("dst", ""),
+    }
+    required = ("api_username", "api_password", "did", "dst")
+    if not all(fields[name] for name in required):
+        logger.warning("sms_voipms: missing api_user, api_password, did, or dst")
+        return False
+
+    # "title: body", cut to 160 characters with a trailing ellipsis.
+    message = f"{notif.title}: {notif.body}"
+    if len(message) > 160:
+        message = f"{message[:157]}..."
+    fields["message"] = message
+
+    try:
+        async with aiohttp.ClientSession() as session:
+            with aiohttp.MultipartWriter("form-data") as writer:
+                for field_name, field_value in fields.items():
+                    writer.append(field_value).set_content_disposition(
+                        "form-data", name=field_name
+                    )
+            async with session.post("https://voip.ms/api/v1/rest.php", data=writer) as resp:
+                raw = await resp.text()
+                if resp.status != 200:
+                    logger.error("sms_voipms HTTP %s: %s", resp.status, raw)
+                    return False
+                status = json.loads(raw).get("status")
+                if status == "success":
+                    return True
+                logger.error("sms_voipms error: %s", status)
+                return False
+    except Exception as e:
+        logger.error("sms_voipms exception: %s", e)
+        return False
+
+
+
+
+async def _send_matrix_async(channel_cfg: dict, notif: Notification) -> bool:
+    """Send *notif* to a Matrix room using matrix-nio.
+
+    *channel_cfg* must provide ``homeserver``, ``access_token`` and
+    ``room_id``.  The event carries a plain-text ``body`` and an HTML
+    ``formatted_body``; all notification text is HTML-escaped before it is
+    placed in the latter, and line breaks in the body become ``<br>``.
+
+    Returns True when the homeserver answers with a ``RoomSendResponse``,
+    False when matrix-nio is missing, settings are incomplete, or the send
+    fails (the reason is logged).
+    """
+    try:
+        from nio import AsyncClient, RoomSendResponse
+    except ImportError:
+        logger.error("matrix-nio not installed; pip install matrix-nio")
+        return False
+    from html import escape
+
+    homeserver = channel_cfg.get("homeserver", "")
+    access_token = channel_cfg.get("access_token", "")
+    room_id = channel_cfg.get("room_id", "")
+    if not homeserver or not access_token or not room_id:
+        logger.warning("matrix: missing homeserver, access_token, or room_id")
+        return False
+
+    text = f"{notif.title}\n{notif.body}"
+    if notif.url:
+        text += f"\n{notif.url}"
+
+    # Clients render formatted_body as HTML, so characters such as "<" or "&"
+    # in the title/body must be escaped or the message is mangled.
+    title_html = escape(str(notif.title))
+    body_html = escape(str(notif.body)).replace("\n", "<br>")
+    html = f"<strong>{title_html}</strong><br>{body_html}"
+    if notif.url:
+        html += f'<br><a href="{escape(str(notif.url))}">Plugin metrics</a>'
+
+    client = AsyncClient(homeserver)
+    client.access_token = access_token
+    try:
+        content = {
+            "msgtype": "m.text",
+            "body": text,
+            "format": "org.matrix.custom.html",
+            "formatted_body": html,
+        }
+        resp = await client.room_send(room_id, "m.room.message", content)
+        if isinstance(resp, RoomSendResponse):
+            return True
+        logger.error("matrix send failed: %s", resp)
+        return False
+    except Exception as e:
+        logger.error("matrix exception: %s", e)
+        return False
+    finally:
+        # Always release the client's HTTP session.
+        await client.close()
+
+
+# ---------------------------------------------------------------------------
+# Channel dispatcher  (all async — sync drivers run in a thread executor)
+# ---------------------------------------------------------------------------
+
+# Sync drivers kept for `hbd notify` CLI usage (asyncio.run wraps them there).
+# Maps a channel's ``type`` setting to its blocking send function.  The
+# "matrix" and "sms_voipms" types are not listed here: they are coroutines
+# and are called directly by _dispatch_to_channel.
+_DRIVERS = {
+    "pushover": _send_pushover,
+    "email": _send_email,
+    "mattermost": _send_mattermost,
+    "signal": _send_signal,
+}
+
+# Upper bound passed to asyncio.wait_for for every single channel send.
+_TIMEOUT = 15  # seconds per channel send
+
+
+async def _dispatch_to_channel(channel_name: str, channel_cfg: dict, notif: Notification) -> bool:
+    """Deliver *notif* through one configured channel.
+
+    Messages below the channel's ``min_level`` (default "WARNING") are
+    skipped and reported as True; RECOVER messages are never filtered.
+    Async channel types are awaited directly, blocking drivers run in a
+    worker thread, and either way the send is limited to ``_TIMEOUT``
+    seconds.  Returns False for an unknown channel type or a timeout.
+    """
+    severity = notif.level.upper()
+    if severity != "RECOVER":
+        floor = channel_cfg.get("min_level", "WARNING").upper()
+        if _level_value(severity) < _level_value(floor):
+            logger.debug(
+                "channel '%s': skipping level %s (min_level=%s)", channel_name, severity, floor
+            )
+            return True  # filtered intentionally
+
+    kind = channel_cfg.get("type", "")
+    coroutine_drivers = {
+        "matrix": _send_matrix_async,
+        "sms_voipms": _send_sms_voipms_async,
+    }
+    try:
+        if kind in coroutine_drivers:
+            pending = coroutine_drivers[kind](channel_cfg, notif)
+        else:
+            blocking_driver = _DRIVERS.get(kind)
+            if blocking_driver is None:
+                logger.warning("unknown channel type '%s' for channel '%s'", kind, channel_name)
+                return False
+            pending = asyncio.to_thread(blocking_driver, channel_cfg, notif)
+        return await asyncio.wait_for(pending, timeout=_TIMEOUT)
+    except asyncio.TimeoutError:
+        logger.error("channel '%s' timed out after %ds", channel_name, _TIMEOUT)
+        return False
+
+
+# ---------------------------------------------------------------------------
+# Central dispatch function
+# ---------------------------------------------------------------------------
+
+def _build_url(host_name: str) -> str:
+    """Return the alerts-page URL filtered to *host_name*.
+
+    Uses the ``base_url`` config entry (trailing slashes removed) and
+    returns "" when it is unset or empty.  The host name is percent-encoded
+    so characters such as "&", "#" or spaces cannot break the query string.
+    """
+    base_url = (_config.get("base_url") or "").rstrip("/")
+    if not base_url:
+        return ""
+    return f"{base_url}/alerts?filter={urllib.parse.quote(host_name, safe='')}"
+
+
+async def send_notification(host_name: str, notif: Notification) -> dict:
+    """Dispatch *notif* to the channels of the owner and managers of *host_name*.
+
+    The host's owner and managers are looked up, each user's
+    ``notification_channels`` are resolved against the global
+    ``notification_channels`` config, and every distinct channel is tried
+    once.  Nothing is sent when users are disabled, the host is unknown, or
+    it has no owner/managers.
+
+    Channels on which a WARNING/CRITICAL alert was actually delivered are
+    remembered in ``_alerted_channels``.  A later RECOVER is sent to exactly
+    those channels (regardless of their ``min_level``) and the record for
+    the host is then cleared.  A RECOVER for a host with no recorded
+    channels goes through the normal per-user dispatch instead.
+
+    Side effect: fills ``notif.url`` when it is empty.
+
+    Returns a dict of {channel_name: bool} results.
+    """
+    from . import users as users_mod
+    from . import hbdclass
+
+    if not users_mod.users_enabled():
+        return {}
+
+    # Collect recipient usernames: owner + managers
+    host = hbdclass.Host.hosts.get(host_name)
+    if host is None:
+        logger.debug("send_notification: host '%s' not found", host_name)
+        return {}
+
+    recipients: set[str] = set()
+    owner = getattr(host, "owner", None)
+    if owner:
+        recipients.add(owner)
+    # "managers" may be missing or explicitly None.
+    recipients.update(getattr(host, "managers", None) or [])
+
+    if not recipients:
+        logger.debug("send_notification: no owner/managers for '%s'", host_name)
+        return {}
+
+    # Fill url if not already set
+    if not notif.url:
+        notif.url = _build_url(host_name)
+
+    # A bare "notification_channels:" key loads as None.
+    global_channels: dict = _config.get("notification_channels") or {}
+    results: dict = {}
+    level = notif.level.upper()
+    is_alert = level in ("WARNING", "CRITICAL")
+    is_recover = level == "RECOVER"
+
+    # For RECOVER: send to every channel that previously fired an alert for this host,
+    # regardless of that channel's min_level.
+    if is_recover and host_name in _alerted_channels:
+        for channel_name in list(_alerted_channels[host_name]):
+            channel_cfg = global_channels.get(channel_name)
+            if not channel_cfg:
+                continue
+            try:
+                ok = await _dispatch_to_channel(channel_name, channel_cfg, notif)
+                results[channel_name] = ok
+                if ok:
+                    logger.info("recover sent to channel '%s': %s", channel_name, notif.title)
+            except Exception as e:
+                logger.error("error sending recover to channel '%s': %s", channel_name, e)
+                # Report the failure to the caller, as the normal path does.
+                results[channel_name] = False
+        del _alerted_channels[host_name]
+        return results
+
+    for username in recipients:
+        user = users_mod.get_user(username)
+        if user is None:
+            logger.debug("send_notification: user '%s' not found", username)
+            continue
+        for channel_name in user.notification_channels:
+            if channel_name in results:
+                continue  # already tried for another recipient
+            channel_cfg = global_channels.get(channel_name)
+            if not channel_cfg:
+                logger.warning("channel '%s' not defined in notification_channels", channel_name)
+                results[channel_name] = False
+                continue
+            try:
+                ok = await _dispatch_to_channel(channel_name, channel_cfg, notif)
+                results[channel_name] = ok
+                # _dispatch_to_channel also returns True when the channel's
+                # min_level filtered the message out.  That is not a delivery,
+                # so it must not be logged as sent nor make the channel
+                # eligible for a later RECOVER.
+                min_level = channel_cfg.get("min_level", "WARNING").upper()
+                filtered = level != "RECOVER" and _level_value(level) < _level_value(min_level)
+                if not ok:
+                    logger.warning("failed to send notification to channel '%s'", channel_name)
+                elif not filtered:
+                    logger.info("notification sent to channel '%s': %s", channel_name, notif.title)
+                    if is_alert:
+                        _alerted_channels.setdefault(host_name, set()).add(channel_name)
+            except Exception as e:
+                logger.error("error sending to channel '%s': %s", channel_name, e)
+                results[channel_name] = False
+
+    return results
@@ -0,0 +1,142 @@
+"""Gitea OAuth2 support.
+
+Config shape (in ~/.hb.yaml):
+
+    oauth:
+      gitea:
+        url: https://git.example.com
+        client_id: <client-id>
+        client_secret: <client-secret>
+
+Register a Gitea OAuth2 application at:
+  Gitea → Settings → Applications → OAuth2
+Set the redirect URI to:
+  https://<hbd-host>/login/oauth/gitea/callback
+"""
+
+import logging
+import secrets
+import time
+import urllib.parse
+
+import aiohttp
+
+logger = logging.getLogger(__name__)
+
+STATE_TTL = 600  # 10 minutes
+
+# state_token -> expiry timestamp
+_states: dict[str, float] = {}
+
+
+def make_state() -> str:
+    """Create a fresh CSRF state token and remember it for ``STATE_TTL`` seconds.
+
+    Expired tokens are purged first.  Returns the new 64-hex-character token.
+    """
+    _purge_states()
+    state = secrets.token_hex(32)
+    expires_at = time.time() + STATE_TTL
+    _states[state] = expires_at
+    return state
+
+
+def validate_state(state: str) -> bool:
+    """Consume *state* and report whether it was known and still unexpired.
+
+    The token is removed from the store whether or not it has expired, so
+    each token can be validated at most once.
+    """
+    try:
+        deadline = _states.pop(state)
+    except KeyError:
+        return False
+    return deadline > time.time()
+
+
+def _purge_states() -> None:
+    """Drop every CSRF state token whose expiry time has already passed."""
+    cutoff = time.time()
+    # Collect the keys first so the dict is not mutated while iterating it.
+    stale = [token for token, deadline in _states.items() if deadline < cutoff]
+    for token in stale:
+        del _states[token]
+
+
+class OAuthError(Exception):
+    """Raised when the OAuth2 flow fails for any reason.
+
+    Raised by ``authorization_url``, ``exchange_code`` and ``fetch_user`` when
+    Gitea OAuth2 is not configured, and by the latter two when the request to
+    Gitea fails or returns an unusable response.
+    """
+
+
+def _gitea_cfg(config: dict) -> dict:
+    """Return the ``oauth.gitea`` sub-dict of *config*, or {} when unavailable.
+
+    A key that is present but empty in the YAML file (``oauth:`` or
+    ``gitea:`` with nothing below it) loads as None; that is treated the
+    same as a missing key instead of raising AttributeError.
+    """
+    oauth = config.get("oauth") or {}
+    return oauth.get("gitea") or {}
+
+
+def is_enabled(config: dict) -> bool:
+    """Tell whether Gitea OAuth2 is fully configured.
+
+    True only when ``url``, ``client_id`` and ``client_secret`` are all set
+    to non-empty values.
+    """
+    gitea = _gitea_cfg(config)
+    required_keys = ("url", "client_id", "client_secret")
+    return all(gitea.get(key) for key in required_keys)
+
+
+def authorization_url(config: dict, state: str, redirect_uri: str) -> str:
+    """Build the Gitea authorize URL the browser should be redirected to.
+
+    *state* is the CSRF token to round-trip and *redirect_uri* the callback
+    registered with Gitea.  Raises OAuthError when Gitea OAuth2 is not
+    configured.
+    """
+    gitea = _gitea_cfg(config)
+    if not all(gitea.get(key) for key in ("url", "client_id", "client_secret")):
+        raise OAuthError("Gitea OAuth2 is not configured")
+
+    base = gitea["url"].rstrip("/")
+    query = urllib.parse.urlencode([
+        ("client_id", gitea["client_id"]),
+        ("redirect_uri", redirect_uri),
+        ("response_type", "code"),
+        ("scope", "user:email"),
+        ("state", state),
+    ])
+    return f"{base}/login/oauth/authorize?{query}"
+
+
+async def exchange_code(config: dict, code: str, redirect_uri: str) -> str:
+    """Exchange an authorization *code* for a Gitea access token.
+
+    POSTs the client credentials, *code* and *redirect_uri* to Gitea's
+    ``/login/oauth/access_token`` endpoint with a 10 second total timeout.
+
+    Returns the access token string.  Raises OAuthError on any failure:
+    missing configuration, a non-200 reply, a network error, a timeout, an
+    unparsable body, or a reply without ``access_token``.
+    """
+    import asyncio
+
+    g = _gitea_cfg(config)
+    if not (g.get("url") and g.get("client_id") and g.get("client_secret")):
+        raise OAuthError("Gitea OAuth2 is not configured")
+    url = f"{g['url'].rstrip('/')}/login/oauth/access_token"
+    payload = {
+        "client_id": g["client_id"],
+        "client_secret": g["client_secret"],
+        "code": code,
+        "grant_type": "authorization_code",
+        "redirect_uri": redirect_uri,
+    }
+    timeout = aiohttp.ClientTimeout(total=10)
+    try:
+        async with aiohttp.ClientSession(timeout=timeout) as session:
+            async with session.post(url, json=payload, headers={"Accept": "application/json"}) as resp:
+                if resp.status != 200:
+                    text = await resp.text()
+                    raise OAuthError(f"Token exchange failed ({resp.status}): {text}")
+                data = await resp.json()
+    except aiohttp.ClientError as exc:
+        raise OAuthError(f"Token exchange network error: {exc}") from exc
+    except asyncio.TimeoutError as exc:
+        # The total timeout surfaces as asyncio.TimeoutError, which is not
+        # an aiohttp.ClientError.
+        raise OAuthError("Token exchange timed out") from exc
+    except ValueError as exc:
+        # Body was not valid JSON.
+        raise OAuthError(f"Token exchange returned invalid JSON: {exc}") from exc
+
+    # Guard against a JSON reply that is not an object (e.g. a list or null).
+    token = data.get("access_token") if isinstance(data, dict) else None
+    if not token:
+        raise OAuthError(f"No access_token in response: {data}")
+    return token
+
+
+async def fetch_user(config: dict, token: str) -> dict:
+    """Fetch the authenticated user's profile from Gitea.
+
+    Calls ``/api/v1/user`` with *token* and a 10 second total timeout.
+
+    Returns a dict with keys: login, full_name, avatar_url (each "" when
+    absent from the reply).  Raises OAuthError on any failure: missing
+    configuration, a non-200 reply, a network error, a timeout, or a body
+    that is not a JSON object.
+    """
+    import asyncio
+
+    g = _gitea_cfg(config)
+    if not (g.get("url") and g.get("client_id") and g.get("client_secret")):
+        raise OAuthError("Gitea OAuth2 is not configured")
+    url = f"{g['url'].rstrip('/')}/api/v1/user"
+    timeout = aiohttp.ClientTimeout(total=10)
+    try:
+        async with aiohttp.ClientSession(timeout=timeout) as session:
+            async with session.get(url, headers={"Authorization": f"token {token}"}) as resp:
+                if resp.status != 200:
+                    text = await resp.text()
+                    raise OAuthError(f"User fetch failed ({resp.status}): {text}")
+                data = await resp.json()
+    except aiohttp.ClientError as exc:
+        raise OAuthError(f"User fetch network error: {exc}") from exc
+    except asyncio.TimeoutError as exc:
+        # The total timeout surfaces as asyncio.TimeoutError, which is not
+        # an aiohttp.ClientError.
+        raise OAuthError("User fetch timed out") from exc
+    except ValueError as exc:
+        # Body was not valid JSON.
+        raise OAuthError(f"User fetch returned invalid JSON: {exc}") from exc
+
+    if not isinstance(data, dict):
+        raise OAuthError(f"Unexpected user response: {data}")
+    return {
+        "login": data.get("login", ""),
+        "full_name": data.get("full_name", ""),
+        "avatar_url": data.get("avatar_url", ""),
+    }
@@ -0,0 +1,373 @@
+"""Settings descriptor: maps config keys to display metadata.
+
+``get_settings_sections(config)`` returns an ordered list of sections, each
+containing a list of field descriptors.  The template iterates this structure
+generically, so adding editability later is a matter of:
+
+  1. Setting ``"editable": True`` on a field.
+  2. Adding the matching ``<input>``/``<select>`` in the template
+     (guided by ``"type"``).
+  3. Wiring a POST handler in http.py.
+
+Field descriptor keys
+---------------------
+key         str   Config key (for future form POST matching)
+label       str   Human-readable label
+description str   One-line help text shown below the value
+value       any   Sanitized display value (secrets replaced with "•••")
+raw         any   Unformatted config value (None when the field is sensitive)
+type        str   One of: text | number | port | boolean | path | duration |
+                  list | secret | size | select
+editable    bool  Reserved for future use — currently always False
+sensitive   bool  True when the raw value must never be shown
+"""
+
+# Credential field names that should always be masked.
+_SECRET_KEYS = frozenset({
+    "password", "token", "user_key", "api_key", "secret",
+    "smtp_password", "smtp_user", "api_password", "access_token",
+})
+
+# Display names for the channel ``type`` values.  Types missing here fall
+# back to ``type.title()``, which reads poorly for names with underscores
+# (e.g. "Sms_Voipms"), so every supported type is listed.
+_CHANNEL_TYPE_LABELS = {
+    "pushover":   "Pushover",
+    "email":      "E-mail",
+    "signal":     "Signal",
+    "mattermost": "Mattermost",
+    "matrix":     "Matrix",
+    "sms_voipms": "SMS (voip.ms)",
+}
+
+
+def _mask(value):
+    """Hide a sensitive value: "•••" when it is set, "" when it is empty."""
+    return "•••" if value else ""
+
+
+def _fmt_size(n):
+    """Format a byte count as a human-readable string.
+
+    The value is scaled by whole factors of 1024 (fractions are dropped) and
+    suffixed with B, KB, MB, GB or TB.  ``None`` (setting not configured)
+    yields "" rather than the literal text "None"; any other value that
+    cannot be converted to an integer is returned via ``str()`` unchanged.
+    """
+    if n is None:
+        return ""
+    try:
+        n = int(n)
+    except (TypeError, ValueError):
+        return str(n)
+    for unit in ("B", "KB", "MB", "GB"):
+        if n < 1024:
+            return f"{n} {unit}"
+        n //= 1024
+    return f"{n} TB"
+
+
+def _fmt_duration(seconds):
+    """Format seconds into a human-readable duration string.
+
+    Values of a minute or more are rendered from their non-zero day, hour,
+    minute and second components (e.g. ``90000`` -> "1d 1h",
+    ``3661`` -> "1h 1m 1s"); smaller values are shown as plain seconds.
+    ``None`` (setting not configured) yields "" rather than the literal text
+    "None"; any other value that cannot be converted to an integer is
+    returned via ``str()`` unchanged.
+    """
+    if seconds is None:
+        return ""
+    try:
+        s = int(seconds)
+    except (TypeError, ValueError):
+        return str(seconds)
+    if s < 60:
+        # Also covers 0 and negative values, which are shown as-is.
+        return f"{s}s"
+    days, rem = divmod(s, 86400)
+    hours, rem = divmod(rem, 3600)
+    minutes, secs = divmod(rem, 60)
+    components = ((days, "d"), (hours, "h"), (minutes, "m"), (secs, "s"))
+    return " ".join(f"{value}{unit}" for value, unit in components if value)
+
+
+def _sanitize_channel(name, cfg):
+    """Copy a notification channel config, masking every credential field.
+
+    Keys listed in ``_SECRET_KEYS`` get a masked value; all other entries
+    are carried over unchanged.  *name* is accepted but not used.
+    """
+    return {
+        key: (_mask(value) if key in _SECRET_KEYS else value)
+        for key, value in cfg.items()
+    }
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+def get_settings_sections(config: dict, threshold_checker=None) -> list:
+    """Return ordered list of setting sections for the settings page.
+
+    *threshold_checker*, when given, supplies the threshold configurations
+    shown in the "thresholds" section (read from its ``threshold_configs``,
+    ``threshold_raw_configs`` and ``thresholds`` attributes).
+
+    Each section:
+        {
+            "id": str,
+            "title": str,
+            "description": str,
+            "fields": [ field_descriptor, ... ]
+        }
+    Some sections carry one extra list next to "fields": "users",
+    "channels", "hosts" or "threshold_configs".
+
+    Each field_descriptor:
+        {
+            "key":         str,
+            "label":       str,
+            "description": str,
+            "value":       display_value,
+            "raw":         raw_config_value,   # None for sensitive
+            "type":        str,
+            "editable":    bool,
+            "sensitive":   bool,
+        }
+    """
+    def field(key, label, ftype, description="", editable=False, sensitive=False):
+        # Build one field descriptor for the top-level config key *key*.
+        raw = config.get(key)
+        if sensitive:
+            display = _mask(raw)
+            raw_out = None
+        elif ftype == "size":
+            display = _fmt_size(raw)
+            raw_out = raw
+        elif ftype == "duration":
+            display = _fmt_duration(raw)
+            raw_out = raw
+        elif ftype == "boolean":
+            display = bool(raw)
+            raw_out = raw
+        elif ftype == "list":
+            val = raw or []
+            if isinstance(val, list):
+                display = val
+            elif isinstance(val, (tuple, set, frozenset, dict)):
+                display = list(val)
+            else:
+                # A lone scalar (e.g. "dyndomains: example.com") is one item;
+                # list("example.com") would split it into single characters.
+                display = [val]
+            raw_out = display
+        else:
+            display = raw if raw is not None else ""
+            raw_out = raw
+        return {
+            "key": key,
+            "label": label,
+            "description": description,
+            "value": display,
+            "raw": raw_out,
+            "type": ftype,
+            "editable": editable,
+            "sensitive": sensitive,
+        }
+
+    # ---- Notification channels (complex, built separately) ----------------
+    notif_channels = []
+    for ch_name, ch_cfg in (config.get("notification_channels") or {}).items():
+        if not isinstance(ch_cfg, dict):
+            continue
+        ch_type = ch_cfg.get("type", "")
+        fields = []
+        for k, v in ch_cfg.items():
+            if k == "type":
+                continue
+            sensitive = k in _SECRET_KEYS
+            fields.append({
+                "key": k,
+                "label": k.replace("_", " ").title(),
+                # Items are stringified first: YAML lists may hold numbers,
+                # on which a plain ", ".join(v) raises TypeError.
+                "value": _mask(v) if sensitive else (
+                    ", ".join(str(item) for item in v) if isinstance(v, list) else str(v)
+                ),
+                "sensitive": sensitive,
+            })
+        notif_channels.append({
+            "name": ch_name,
+            "type": ch_type,
+            "type_label": _CHANNEL_TYPE_LABELS.get(ch_type, ch_type.title()),
+            "fields": fields,
+        })
+
+    # ---- Users (show metadata only, never password hashes) ----------------
+    users_list = []
+    for username, attrs in (config.get("users") or {}).items():
+        if not isinstance(attrs, dict):
+            continue
+        users_list.append({
+            "username": username,
+            "full_name": attrs.get("full_name", ""),
+            "admin": bool(attrs.get("admin", False)),
+            "avatar": attrs.get("avatar", ""),
+            "notification_channels": attrs.get("notification_channels", []),
+        })
+
+    # ---- Threshold configurations -----------------------------------------
+    def _tc_to_row(tc):
+        # Flatten one threshold config object into a plain dict for display.
+        return {
+            "metric": tc.metric_path,
+            "operator": tc.operator.value,
+            "warning": tc.warning,
+            "critical": tc.critical,
+            "hysteresis": tc.hysteresis,
+            "count": tc.count,
+            "enabled": tc.enabled,
+        }
+
+    threshold_config_list = []
+    if threshold_checker is not None:
+        if threshold_checker.threshold_configs:
+            for cfg_name, cfg_metrics in sorted(threshold_checker.threshold_configs.items()):
+                # For the default config use the merged effective set;
+                # for named overrides use only the explicitly defined metrics
+                # (threshold_raw_configs) so inherited defaults are not repeated.
+                if cfg_name == "default":
+                    display_metrics = cfg_metrics
+                else:
+                    display_metrics = threshold_checker.threshold_raw_configs.get(cfg_name, cfg_metrics)
+                metrics = sorted(
+                    [_tc_to_row(tc) for tc in display_metrics.values()],
+                    key=lambda m: m["metric"],
+                )
+                threshold_config_list.append({"name": cfg_name, "metrics": metrics})
+        elif threshold_checker.thresholds:
+            # No named configs: show the flat threshold set as "default".
+            metrics = sorted(
+                [_tc_to_row(tc) for tc in threshold_checker.thresholds.values()],
+                key=lambda m: m["metric"],
+            )
+            threshold_config_list.append({"name": "default", "metrics": metrics})
+
+    # ---- Hosts summary ----------------------------------------------------
+    hosts_list = []
+    for hname, hcfg in (config.get("hosts") or {}).items():
+        if not isinstance(hcfg, dict):
+            continue
+        hosts_list.append({
+            "name": hname,
+            "watch": bool(hcfg.get("watch", True)),
+            "dyndns": bool(hcfg.get("dyndns", False)),
+            "owner": hcfg.get("owner", ""),
+            "managers": hcfg.get("managers", []),
+            "monitors": hcfg.get("monitors", []),
+            "threshold_config": hcfg.get("threshold_config", ""),
+            "notification_channels": hcfg.get("notification_channels", []),
+        })
+
+    return [
+        {
+            "id": "network",
+            "title": "Network",
+            "description": "Ports and bind addresses for all server sockets.",
+            "fields": [
+                field("hb_port",  "Heartbeat UDP port",  "port",
+                      "UDP port the server listens on for heartbeat datagrams."),
+                field("hbd_host", "HTTP bind address",   "text",
+                      "Interface to bind the HTTP server to. Empty = all interfaces."),
+                field("hbd_port", "HTTP API port",       "port",
+                      "TCP port for the HTTP API and web UI."),
+                field("ws_port",  "WebSocket port",      "port",
+                      "TCP port for the plain WebSocket server."),
+                field("wss_port", "Secure WebSocket port", "port",
+                      "TCP port for WSS (TLS WebSocket). Leave empty to disable."),
+            ],
+        },
+        {
+            "id": "tls",
+            "title": "TLS / WebSocket Security",
+            "description": "Certificate paths used when wss_port is set.",
+            "fields": [
+                field("cert_path", "Certificate directory", "path",
+                      "Directory containing the TLS certificate and key files."),
+                field("wss_pem",   "Certificate file",     "text",
+                      "Filename of the TLS certificate chain (PEM format)."),
+                field("wss_key",   "Key file",             "text",
+                      "Filename of the TLS private key (PEM format)."),
+            ],
+        },
+        {
+            "id": "monitoring",
+            "title": "Monitoring",
+            "description": "Heartbeat timing and alert re-notification behaviour.",
+            "fields": [
+                field("interval",  "Heartbeat interval", "duration",
+                      "Expected time between heartbeat messages from each client."),
+                field("grace",     "Grace multiplier",   "number",
+                      "A host is marked overdue after interval × grace seconds of silence."),
+                field("threshold_renotify_interval", "Re-notify interval", "duration",
+                      "How often to re-send notifications for ongoing threshold alerts."),
+                field("autosave_interval", "Autosave interval", "duration",
+                      "How often the server saves its state to disk."),
+            ],
+        },
+        {
+            "id": "persistence",
+            "title": "Persistence & Logging",
+            "description": "State file and event log settings.",
+            "fields": [
+                field("pickfile", "State file",   "path",
+                      "Path to the pickle file used to persist host state across restarts."),
+                field("logfile",  "Event log",    "path",
+                      "Path to the event log file."),
+            ],
+        },
+        {
+            "id": "journal",
+            "title": "Message Journal",
+            "description": "All received heartbeat and plugin messages are journalled here.",
+            "fields": [
+                field("journal_enabled",     "Enabled",          "boolean",
+                      "Turn journalling on or off."),
+                field("journal_dir",         "Journal directory","path",
+                      "Directory where journal files are written."),
+                field("journal_file",        "Journal filename", "text",
+                      "Base filename for the journal (rotated copies get a numeric suffix)."),
+                field("journal_max_size",    "Max file size",    "size",
+                      "Rotate the journal when it exceeds this size."),
+                field("journal_max_backups", "Backup count",     "number",
+                      "Number of rotated journal files to keep."),
+            ],
+        },
+        {
+            "id": "dns",
+            "title": "Dynamic DNS",
+            "description": "nsupdate-based DNS registration for dynamic hosts.",
+            "fields": [
+                field("nsupdate_bin", "nsupdate binary", "path",
+                      "Full path to the nsupdate executable."),
+                field("dyndomains",   "Dynamic domains", "list",
+                      "DNS zones managed by nsupdate for dynamic hosts."),
+                field("drophosts",    "Drop hosts",      "list",
+                      "Hostnames to silently ignore — no state, no alerts."),
+            ],
+        },
+        {
+            "id": "users",
+            "title": "Users",
+            "description": "Accounts defined in the config file. Password hashes are never shown.",
+            "users": users_list,
+            "fields": [
+                field("default_owner", "Default owner", "text",
+                      "Username that owns hosts with no explicit owner. "
+                      "Falls back to the first admin user."),
+            ],
+        },
+        {
+            "id": "channels",
+            "title": "Notification Channels",
+            "description": "Named notification providers. Credentials are masked.",
+            "channels": notif_channels,
+            "fields": [
+                field("default_notification_channels", "Default channels", "list",
+                      "Channels used when a host does not specify its own."),
+            ],
+        },
+        {
+            "id": "hosts",
+            "title": "Hosts",
+            "description": "Host definitions loaded from the config file.",
+            "hosts": hosts_list,
+            "fields": [],
+        },
+        {
+            "id": "thresholds",
+            "title": "Threshold Configurations",
+            "description": "Named alert threshold sets. Each defines warning/critical levels per metric.",
+            "threshold_configs": threshold_config_list,
+            "fields": [
+                field("default_threshold_config", "Default config", "text",
+                      "Threshold config used for hosts with no explicit mapping."),
+            ],
+        },
+        {
+            "id": "runtime",
+            "title": "Runtime",
+            "description": "Flags set at startup (require restart to change).",
+            "fields": [
+                field("foreground", "Foreground mode", "boolean",
+                      "Run in the foreground instead of daemonising."),
+                field("verbose",    "Verbose logging",  "boolean",
+                      "Enable verbose log output."),
+                field("debug",      "Debug level",      "number",
+                      "0 = off. Higher values increase log verbosity."),
+            ],
+        },
+    ]
@@ -140,3 +140,68 @@
      float: left;
  }

+/* ── Responsive / mobile ── */
+
+/* Mobile layout: applies to viewports up to 640px wide. Lets the page
+   scroll, removes fixed heights and tightens padding. */
+@media (max-width: 640px) {
+  /* Suppress the global transition on mobile to avoid sluggish feel */
+  * { transition: none !important; }
+
+  html, body {
+    overflow: auto;
+    height: auto;
+    font-size: 16px;          /* prevent iOS auto-zoom on inputs */
+  }
+
+  /* Pages that use flex-column full-viewport layout need to relax on mobile */
+  /* NOTE(review): the bare `body` selector below already matches every page,
+     so the attribute selector adds no extra matches; confirm whether it is
+     still needed. */
+  body[style*="height: 100vh"],
+  body {
+    height: auto !important;
+    min-height: 100vh;
+  }
+
+  /* Containers: full width, no fixed heights */
+  .container {
+    max-width: 100% !important;
+    max-height: none !important;
+    overflow: visible !important;
+    padding: 8px !important;
+  }
+
+  /* Log section: fixed reasonable height instead of flex-grow */
+  .log-section {
+    flex: none !important;
+    max-height: 40vh !important;
+    overflow-y: auto !important;
+  }
+
+  /* Table section: allow vertical scroll, cap height */
+  .table-section {
+    max-height: 55vh !important;
+    overflow-y: auto !important;
+    overflow-x: auto !important;
+    padding: 8px !important;
+  }
+
+  /* Slightly larger tap targets in tables */
+  #ntable td, #ntable th {
+    padding: 4px 6px !important;
+    font-size: 0.82em !important;
+  }
+
+  /* Cards on plugin/alerts pages */
+  .host-card, .alert-card, .card {
+    padding: 10px !important;
+    margin-bottom: 8px !important;
+  }
+
+  /* Settings page tables */
+  table { width: 100%; }
+
+  /* Smaller headings to save vertical space */
+  h1 { font-size: 1.2em !important; }
+  h2 { font-size: 1em !important; }
+}
+
+/* Hide the nav-username text on very narrow screens (up to 400px wide);
+   the avatar or initials are enough to identify the user. */
+@media (max-width: 400px) {
+  .nav-username { display: none; }
+}
@@ -0,0 +1,199 @@
+<!DOCTYPE html>
+<html>
+  {% include 'head.html' %}
+
+  <style>
+    html, body { overflow: visible; }
+
+    .container {
+      max-width: 700px;
+      margin: 0 auto;
+    }
+
+    h1 {
+      color: #333;
+      margin-bottom: 4px;
+      font-size: 1.5em;
+    }
+
+    .subtitle {
+      color: #666;
+      margin-bottom: 24px;
+      font-size: 0.9em;
+    }
+
+    .section {
+      background: #fff;
+      border-radius: 8px;
+      box-shadow: 0 1px 6px rgba(0,0,0,0.1);
+      padding: 20px 24px;
+      margin-bottom: 20px;
+    }
+
+    .section h2 {
+      font-size: 1em;
+      font-weight: 700;
+      color: #333;
+      margin: 0 0 16px;
+      padding-bottom: 10px;
+      border-bottom: 1px solid #eee;
+      text-transform: uppercase;
+      letter-spacing: 0.5px;
+    }
+
+    .info-row {
+      display: flex;
+      align-items: baseline;
+      padding: 8px 0;
+      border-bottom: 1px solid #f5f5f5;
+      font-size: 0.9em;
+    }
+    .info-row:last-child { border-bottom: none; }
+
+    .info-label {
+      width: 160px;
+      flex-shrink: 0;
+      color: #666;
+      font-size: 0.88em;
+    }
+
+    .info-value {
+      color: #222;
+      word-break: break-all;
+    }
+
+    .info-value a {
+      color: #0066cc;
+      text-decoration: none;
+    }
+    .info-value a:hover { text-decoration: underline; }
+
+    .version-badge {
+      display: inline-block;
+      padding: 3px 12px;
+      background: #e8f0fe;
+      color: #1a73e8;
+      border-radius: 12px;
+      font-size: 0.85em;
+      font-weight: 600;
+      font-family: monospace;
+    }
+
+    .hb-logo {
+      font-size: 2.5em;
+      font-weight: 700;
+      color: #0066cc;
+      letter-spacing: -1px;
+      margin-bottom: 6px;
+    }
+
+    .hb-tagline {
+      color: #555;
+      font-size: 0.95em;
+    }
+
+    .logo-section {
+      display: flex;
+      align-items: center;
+      gap: 20px;
+      padding: 8px 0 4px;
+    }
+
+    .logo-text { flex: 1; }
+  </style>
+
+  <body>
+    {% include 'nav.html' %}
+
+    <div class="container">
+      <h1>{{ header }}</h1>
+      <p class="subtitle">Heartbeat monitoring system</p>
+
+      <div class="section">
+        <div class="logo-section">
+          <div class="logo-text">
+            <div class="hb-logo">Heartbeat</div>
+            <div class="hb-tagline">Lightweight host monitoring over UDP</div>
+          </div>
+          <span class="version-badge">v{{ hbd_version }}</span>
+        </div>
+      </div>
+
+      <div class="section">
+        <h2>Version</h2>
+        <div class="info-row">
+          <span class="info-label">Server version</span>
+          <span class="info-value">{{ hbd_version }}</span>
+        </div>
+        <div class="info-row">
+          <span class="info-label">Python</span>
+          <span class="info-value">{{ python_version }}</span>
+        </div>
+        <div class="info-row">
+          <span class="info-label">License</span>
+          <span class="info-value">MIT</span>
+        </div>
+      </div>
+
+      <div class="section">
+        <h2>Runtime</h2>
+        <div class="info-row">
+          <span class="info-label">Host</span>
+          <span class="info-value">{{ server_hostname }}</span>
+        </div>
+        <div class="info-row">
+          <span class="info-label">Started</span>
+          <span class="info-value">{{ start_time_str }}</span>
+        </div>
+        <div class="info-row">
+          <span class="info-label">Uptime</span>
+          <span class="info-value" id="uptime-value">{{ uptime_str }}</span>
+        </div>
+        <div class="info-row">
+          <span class="info-label">Hosts monitored</span>
+          <span class="info-value">{{ host_count }}</span>
+        </div>
+      </div>
+
+      <div class="section">
+        <h2>Contact &amp; Source</h2>
+        <div class="info-row">
+          <span class="info-label">Author</span>
+          <span class="info-value">Andreas Wrede</span>
+        </div>
+        <div class="info-row">
+          <span class="info-label">Email</span>
+          <span class="info-value"><a href="mailto:aew@wrede.ca">aew@wrede.ca</a></span>
+        </div>
+        <div class="info-row">
+          <span class="info-label">Repository</span>
+          <span class="info-value"><a href="https://git.wrede.ca/andreas/heartbeat" target="_blank" rel="noopener">git.wrede.ca/andreas/heartbeat</a></span>
+        </div>
+      </div>
+
+    </div>
+
+    <script>
+      (function() {
+        var startEpoch = {{ start_epoch }};
+        var el = document.getElementById('uptime-value');
+        if (!el) return;
+        function fmt(s) {
+          var d = Math.floor(s / 86400);
+          var h = Math.floor((s % 86400) / 3600);
+          var m = Math.floor((s % 3600) / 60);
+          var sec = s % 60;
+          if (d > 0) return d + 'd ' + h + 'h ' + m + 'm';
+          if (h > 0) return h + 'h ' + m + 'm ' + sec + 's';
+          return m + 'm ' + sec + 's';
+        }
+        function tick() {
+          var up = Math.floor(Date.now() / 1000 - startEpoch);
+          el.textContent = fmt(up);
+        }
+        tick();
+        setInterval(tick, 1000);
+      })();
+    </script>
+  </body>
+</html>
@@ -0,0 +1,598 @@
+<!DOCTYPE html>
+<html>
+  {% include 'head.html' %}
+
+  <style>
+
+    html, body {
+      height: auto;
+      overflow-y: auto;
+    }
+
+    .container {
+      max-width: 1400px;
+      margin: 0 auto;
+    }
+
+    h1 { color: #333; margin-bottom: 5px; margin-top: 15px; font-size: 1.5em; }
+
+    .subtitle {
+      color: #666;
+      margin-bottom: 30px;
+    }
+
+    .summary-cards {
+      display: flex;
+      flex-wrap: wrap;
+      gap: 10px;
+      margin-bottom: 16px;
+    }
+
+    .summary-card {
+      background: white;
+      border-radius: 6px;
+      padding: 6px 14px;
+      box-shadow: 0 1px 4px rgba(0,0,0,0.1);
+      display: flex;
+      align-items: center;
+      gap: 8px;
+      border-left: 4px solid #ddd;
+    }
+
+    .summary-card.critical { border-left-color: #ea1e0f; }
+    .summary-card.warning  { border-left-color: #ff9800; }
+    .summary-card.ok       { border-left-color: #4caf50; }
+
+    .summary-number {
+      font-size: 1.4em;
+      font-weight: bold;
+      line-height: 1;
+    }
+
+    .summary-number.critical { color: #ea1e0f; }
+    .summary-number.warning  { color: #ff9800; }
+    .summary-number.ok       { color: #4caf50; }
+
+    .summary-label {
+      color: #666;
+      font-size: 0.85em;
+    }
+
+    .filters {
+      background: white;
+      border-radius: 8px;
+      padding: 15px;
+      margin-bottom: 20px;
+      box-shadow: 0 2px 8px rgba(0,0,0,0.1);
+      display: flex;
+      gap: 15px;
+      align-items: center;
+    }
+
+    .filter-label {
+      font-weight: bold;
+      color: #555;
+    }
+
+    .filter-button {
+      padding: 8px 16px;
+      border: 2px solid #ddd;
+      background: white;
+      border-radius: 20px;
+      cursor: pointer;
+      transition: all 0.2s;
+      font-size: 0.9em;
+    }
+
+    .filter-button:hover {
+      border-color: #2196f3;
+    }
+
+    .filter-button.active {
+      background: #2196f3;
+      color: white;
+      border-color: #2196f3;
+    }
+
+    .filter-input {
+      padding: 7px 12px;
+      border: 2px solid #ddd;
+      border-radius: 20px;
+      font-size: 0.9em;
+      outline: none;
+      width: 200px;
+      transition: border-color 0.2s;
+    }
+
+    .filter-input:focus {
+      border-color: #2196f3;
+    }
+
+    .filter-input.invalid {
+      border-color: #f44336;
+    }
+
+    .alerts-container {
+      background: white;
+      border-radius: 8px;
+      padding: 20px;
+      box-shadow: 0 2px 8px rgba(0,0,0,0.1);
+    }
+
+    .alert-item {
+      border-left: 5px solid #ddd;
+      padding: 15px;
+      margin-bottom: 15px;
+      background: #fafafa;
+      border-radius: 4px;
+      display: flex;
+      justify-content: space-between;
+      align-items: center;
+      transition: all 0.2s;
+    }
+    
+    .alert-item.acknowledged {
+      opacity: 0.8;
+      background: #f0f0f0;
+    }
+
+    .alert-item:hover {
+      box-shadow: 0 2px 8px rgba(0,0,0,0.1);
+      transform: translateX(5px);
+    }
+
+    .alert-item.critical {
+      border-left-color: #f44336;
+      background: #ffebee;
+    }
+
+    .alert-item.warning {
+      border-left-color: #ff9800;
+      background: #fff3e0;
+    }
+
+    .alert-item.unknown {
+      border-left-color: #9e9e9e;
+      background: #f5f5f5;
+    }
+
+    .alert-main {
+      flex: 1;
+    }
+
+    .alert-header {
+      display: flex;
+      align-items: center;
+      gap: 15px;
+      margin-bottom: 8px;
+    }
+
+    .alert-level {
+      padding: 4px 12px;
+      border-radius: 12px;
+      font-size: 0.75em;
+      font-weight: bold;
+      text-transform: uppercase;
+      letter-spacing: 0.5px;
+    }
+
+    .alert-level.critical {
+      background: #f44336;
+      color: white;
+    }
+
+    .alert-level.warning {
+      background: #ff9800;
+      color: white;
+    }
+
+    .alert-level.unknown {
+      background: #9e9e9e;
+      color: white;
+    }
+
+    .alert-hostname {
+      font-weight: bold;
+      color: #0066cc;
+      font-size: 1.1em;
+      text-decoration: none;
+    }
+    .alert-hostname:hover {
+      text-decoration: underline;
+    }
+
+    .alert-metric {
+      color: #0066cc;
+      font-size: 1.1em;
+      font-weight: normal;
+    }
+
+    .alert-details {
+      display: flex;
+      gap: 20px;
+      color: #666;
+      font-size: 0.9em;
+    }
+
+    .alert-value {
+      font-weight: bold;
+      color: #333;
+    }
+
+    .alert-duration {
+      color: #999;
+      font-size: 0.85em;
+    }
+    
+    .alert-actions {
+      display: flex;
+      flex-direction: column;
+      gap: 8px;
+      margin-left: 15px;
+    }
+    
+    .acknowledge-btn {
+      padding: 8px 16px;
+      background: #2196f3;
+      color: white;
+      border: none;
+      border-radius: 4px;
+      cursor: pointer;
+      font-size: 0.85em;
+      transition: all 0.2s;
+      white-space: nowrap;
+    }
+    
+    .acknowledge-btn:hover {
+      background: #1976d2;
+      transform: scale(1.05);
+    }
+    
+    .acknowledge-btn:disabled {
+      background: #ccc;
+      cursor: not-allowed;
+      transform: none;
+    }
+    
+    .acknowledged-badge {
+      padding: 4px 8px;
+      background: #4caf50;
+      color: white;
+      border-radius: 4px;
+      font-size: 0.75em;
+      text-align: center;
+      white-space: nowrap;
+    }
+
+    .no-alerts {
+      text-align: center;
+      padding: 60px 20px;
+      color: #999;
+    }
+
+    .no-alerts-icon {
+      font-size: 4em;
+      margin-bottom: 20px;
+    }
+
+    .loading {
+      text-align: center;
+      padding: 40px;
+      color: #666;
+    }
+
+    .error {
+      background: #ffebee;
+      border-left: 4px solid #f44336;
+      padding: 20px;
+      margin: 20px 0;
+      border-radius: 4px;
+      color: #c62828;
+    }
+
+    .refresh-info {
+      text-align: center;
+      color: #999;
+      font-size: 0.85em;
+      margin-top: 20px;
+      padding-top: 20px;
+      border-top: 1px solid #e0e0e0;
+    }
+
+    .last-update {
+      color: #666;
+      font-size: 0.9em;
+      text-align: right;
+      margin-bottom: 15px;
+    }
+  </style>
+
+  <body>
+    {% include 'nav.html' %}
+
+    <div class="container">
+      <h1>{{ header }}</h1>
+      <p class="subtitle">Real-time monitoring alerts and threshold violations</p>
+
+      <div class="summary-cards" id="summary-cards">
+        <div class="summary-card critical">
+          <div class="summary-label">Critical</div>
+          <div class="summary-number critical" id="critical-count">-</div>
+        </div>
+        <div class="summary-card warning">
+          <div class="summary-label">Warning</div>
+          <div class="summary-number warning" id="warning-count">-</div>
+        </div>
+        <div class="summary-card ok">
+          <div class="summary-label">Total Hosts</div>
+          <div class="summary-number ok" id="host-count">-</div>
+        </div>
+      </div>
+
+      <div class="filters">
+        <span class="filter-label">Show:</span>
+        <button class="filter-button active" onclick="filterAlerts('all')">All</button>
+        <button class="filter-button" onclick="filterAlerts('critical')">Critical Only</button>
+        <button class="filter-button" onclick="filterAlerts('warning')">Warning Only</button>
+        <input id="host-filter" class="filter-input" type="text" placeholder="host filter (regex)" oninput="onHostFilterInput(this)">
+      </div>
+
+      <div class="alerts-container">
+        <div class="last-update">Last updated: <span id="last-update-time">Never</span></div>
+        <div id="alerts-list">
+          <div class="loading">Loading alerts...</div>
+        </div>
+        <div class="refresh-info">
+          Auto-refreshing every 15 seconds
+        </div>
+      </div>
+    </div>
+
+    <script>
+      let currentFilter = 'all';
+      let allAlerts = [];
+      let hostFilterRe = null;
+
+      async function loadAlerts() {
+        try {
+          const response = await fetch('/api/0/alerts');
+          if (!response.ok) {
+            throw new Error(`HTTP ${response.status}`);
+          }
+          
+          const data = await response.json();
+          allAlerts = data.alerts;
+          
+          // Update summary cards
+          document.getElementById('critical-count').textContent = data.summary.critical || 0;
+          document.getElementById('warning-count').textContent = data.summary.warning || 0;
+          document.getElementById('host-count').textContent = data.host_count || 0;
+          
+          // Update last update time
+          document.getElementById('last-update-time').textContent = new Date().toLocaleTimeString();
+          
+          // Render alerts
+          renderAlerts(allAlerts);
+          
+        } catch (error) {
+          document.getElementById('alerts-list').innerHTML = 
+            `<div class="error">Failed to load alerts: ${error.message}</div>`;
+        }
+      }
+
+      function renderAlerts(alerts) {
+        const container = document.getElementById('alerts-list');
+        
+        // Filter alerts based on current filter
+        let filteredAlerts = alerts;
+        if (currentFilter !== 'all') {
+          filteredAlerts = filteredAlerts.filter(alert =>
+            alert.level.toLowerCase() === currentFilter
+          );
+        }
+        if (hostFilterRe) {
+          filteredAlerts = filteredAlerts.filter(alert => hostFilterRe.test(alert.hostname));
+        }
+        
+        if (filteredAlerts.length === 0) {
+          if (currentFilter === 'all' && alerts.length === 0) {
+            container.innerHTML = `
+              <div class="no-alerts">
+                <div class="no-alerts-icon">✓</div>
+                <h2>All Systems Normal</h2>
+                <p>No active alerts at this time</p>
+              </div>
+            `;
+          } else {
+            container.innerHTML = `
+              <div class="no-alerts">
+                <p>No ${currentFilter} alerts</p>
+              </div>
+            `;
+          }
+          return;
+        }
+        
+        let html = '';
+        for (const alert of filteredAlerts) {
+          html += renderAlert(alert);
+        }
+        container.innerHTML = html;
+      }
+
+      function renderAlert(alert) {
+        const level = alert.level.toLowerCase();
+        const duration = getDuration(alert.since);
+        const acknowledged = alert.acknowledged || false;
+        
+        // Use formatted message if available, otherwise build from individual fields
+        let valueText = `Value: <span class="alert-value">${formatValue(alert.last_value)}</span>`;
+        if (alert.formatted_message) {
+          valueText += ` <span class="threshold-info">${alert.formatted_message}</span>`;
+        } else if (alert.threshold_value !== undefined && alert.threshold_value !== null && alert.operator) {
+          valueText += ` <span class="threshold-info">(threshold: ${alert.operator} ${formatValue(alert.threshold_value)})</span>`;
+        }
+        if (alert.recovery_threshold !== undefined && alert.recovery_threshold !== null) {
+          const recOp = (alert.operator === '>' || alert.operator === '>=') ? '<' : '>';
+          valueText += ` <span class="threshold-info" style="color:#888">(recovers ${recOp} ${formatValue(alert.recovery_threshold)})</span>`;
+        }
+        
+        // Build actions section
+        let actionsHtml = '';
+        if (acknowledged) {
+          actionsHtml = `
+            <div class="alert-actions">
+              <div class="acknowledged-badge">✓ Acknowledged</div>
+            </div>
+          `;
+        } else {
+          actionsHtml = `
+            <div class="alert-actions">
+              <button class="acknowledge-btn" onclick="acknowledgeAlert('${alert.hostname}', '${alert.metric_path}', event)">
+                Acknowledge
+              </button>
+            </div>
+          `;
+        }
+        
+        return `
+          <div class="alert-item ${level} ${acknowledged ? 'acknowledged' : ''}">
+            <div class="alert-main">
+              <div class="alert-header">
+                <span class="alert-level ${level}">${alert.level}</span>
+                <a class="alert-hostname" href="/plugins#${alert.hostname}">${alert.hostname}</a>
+                <span class="alert-metric">${(alert.metric_path.includes('.') ? alert.metric_path.slice(alert.metric_path.indexOf('.') + 1) : alert.metric_path).replace(/_status_code$/, '')}</span>
+              </div>
+              <div class="alert-details">
+                <span>${valueText}</span>
+                <span class="alert-duration">Active for ${duration}</span>
+              </div>
+            </div>
+            ${actionsHtml}
+          </div>
+        `;
+      }
+
+      function formatValue(value) {
+        if (typeof value === 'number') {
+          if (value > 1000) {
+            return value.toLocaleString();
+          }
+          return value.toFixed(2);
+        }
+        return value;
+      }
+
+      function getDuration(timestamp) {
+        const now = Date.now() / 1000;
+        const seconds = Math.floor(now - timestamp);
+        
+        if (seconds < 60) {
+          return `${seconds}s`;
+        } else if (seconds < 3600) {
+          return `${Math.floor(seconds / 60)}m`;
+        } else if (seconds < 86400) {
+          const hours = Math.floor(seconds / 3600);
+          const minutes = Math.floor((seconds % 3600) / 60);
+          return `${hours}h ${minutes}m`;
+        } else {
+          const days = Math.floor(seconds / 86400);
+          const hours = Math.floor((seconds % 86400) / 3600);
+          return `${days}d ${hours}h`;
+        }
+      }
+
+      function filterAlerts(filter) {
+        currentFilter = filter;
+        
+        // Update active button
+        document.querySelectorAll('.filter-button').forEach(btn => {
+          btn.classList.remove('active');
+        });
+        event.target.classList.add('active');
+        
+        // Re-render with new filter
+        renderAlerts(allAlerts);
+      }
+      
+      async function acknowledgeAlert(hostname, metricPath, event) {
+        // Prevent event bubbling
+        if (event) {
+          event.stopPropagation();
+        }
+        
+        // Disable the button
+        const button = event.target;
+        button.disabled = true;
+        button.textContent = 'Acknowledging...';
+        
+        try {
+          const response = await fetch('/api/0/alerts/acknowledge', {
+            method: 'POST',
+            headers: {
+              'Content-Type': 'application/json',
+            },
+            body: JSON.stringify({
+              hostname: hostname,
+              metric_path: metricPath,
+            }),
+          });
+          
+          if (!response.ok) {
+            throw new Error(`HTTP ${response.status}`);
+          }
+          
+          const result = await response.json();
+          
+          // Update the alert in our local data
+          const alert = allAlerts.find(a => a.hostname === hostname && a.metric_path === metricPath);
+          if (alert) {
+            alert.acknowledged = true;
+            alert.acknowledged_at = result.acknowledged_at;
+          }
+          
+          // Re-render alerts
+          renderAlerts(allAlerts);
+          
+        } catch (error) {
+          alert(`Failed to acknowledge alert: ${error.message}`);
+          button.disabled = false;
+          button.textContent = 'Acknowledge';
+        }
+      }
+
+      function onHostFilterInput(input) {
+        const val = input.value.trim();
+        if (!val) {
+          hostFilterRe = null;
+          input.classList.remove('invalid');
+        } else {
+          try {
+            hostFilterRe = new RegExp(val, 'i');
+            input.classList.remove('invalid');
+          } catch (_) {
+            hostFilterRe = null;
+            input.classList.add('invalid');
+          }
+        }
+        renderAlerts(allAlerts);
+      }
+
+      // Page bootstrap: start polling, apply a filter from the URL, then load once.
+
+      // Auto-refresh every 15 seconds
+      setInterval(loadAlerts, 15000);
+
+      // Initialise filter from URL query string (?filter=...)
+      // The value is placed in the host-filter input, so it is interpreted as a
+      // host regex, not as an alert level. onHostFilterInput() re-renders at once,
+      // i.e. against the still-empty allAlerts, before the initial load below.
+      (function () {
+        const param = new URLSearchParams(window.location.search).get('filter');
+        if (param) {
+          const input = document.getElementById('host-filter');
+          input.value = param;
+          onHostFilterInput(input);
+        }
+      })();
+
+      // Initial load
+      loadAlerts();
+    </script>
+  </body>
+</html>
@@ -1,5 +1,5 @@
 <footer>
 <div id="copyright">
-    &copy;2002-2021 <A HREF="mailto:andreas@wrede.ca">Andreas Wrede</A> All Rights Reserved.</p>
+    &copy;2002-2026 <A HREF="mailto:andreas@wrede.ca">Andreas Wrede</A> All Rights Reserved.
    </div>
 </footer>
@@ -0,0 +1,287 @@
+<head>
+    <meta http-equiv="content-type" content="text/html; charset=utf-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <link rel="stylesheet" href="/static/style.css" type="text/css" />
+    <link rel="icon" href="/static/images/favicon.ico" sizes="32x32" />
+    <title>{{ title }}</title>
+    {% if extra_scripts %}<script src="{{ extra_scripts }}"></script>{% endif %}
+    <style>
+      /* ── Reset / shared baseline ── */
+      *, *::before, *::after { box-sizing: border-box; }
+      html {
+        font-family: 'Segoe UI', system-ui, -apple-system, sans-serif;
+        font-size: 14px;
+      }
+      body {
+        margin: 0;
+        padding: 10px;
+        padding-top: 60px;
+        background: #f5f5f5;
+      }
+      h1 { font-size: 1.5em; color: #333; margin: 0 0 5px; }
+      h2 { font-size: 1.1em; color: #333; margin: 0 0 8px; }
+      p  { margin: 0; }
+
+      /* Navigation bar — shared across all pages */
+      .nav {
+        position: fixed;
+        top: 0;
+        left: 0;
+        right: 0;
+        z-index: 200;
+        background: #fff;
+        padding: 6px 12px;
+        box-shadow: 0 2px 4px rgba(0,0,0,.1);
+        display: flex;
+        align-items: center;
+        justify-content: space-between;
+        flex-wrap: wrap;
+        gap: 8px;
+      }
+      .nav-links { display: flex; align-items: center; flex-wrap: wrap; gap: 4px; }
+      .nav a {
+        margin-right: 20px;
+        text-decoration: none;
+        color: #0066cc;
+        font-weight: 500;
+        font-size: 0.9em;
+      }
+      .nav a:hover { text-decoration: underline; }
+      .nav a.active { color: #333; font-weight: bold; }
+      .nav-user {
+        display: flex;
+        align-items: center;
+        gap: 8px;
+        text-decoration: none;
+        color: #333;
+        font-size: 0.9em;
+        font-weight: 500;
+        padding: 4px 8px;
+        border-radius: 20px;
+        transition: background 0.15s;
+      }
+      .nav-user:hover { background: #f0f4ff; text-decoration: none; }
+      .nav-username {
+        max-width: 0;
+        overflow: hidden;
+        white-space: nowrap;
+        opacity: 0;
+        transition: max-width 0.2s ease, opacity 0.2s ease;
+      }
+      .nav-user:hover .nav-username {
+        max-width: 160px;
+        opacity: 1;
+      }
+      .nav-avatar {
+        width: 28px; height: 28px;
+        border-radius: 50%;
+        object-fit: cover;
+        flex-shrink: 0;
+      }
+      .nav-initials {
+        width: 28px; height: 28px;
+        border-radius: 50%;
+        background: #0066cc;
+        color: #fff;
+        display: flex;
+        align-items: center;
+        justify-content: center;
+        font-size: 0.75em;
+        font-weight: 700;
+        flex-shrink: 0;
+      }
+
+      /* ── Mobile nav: hamburger toggle ── */
+      .nav-hamburger {
+        display: none;
+        flex-direction: column;
+        justify-content: space-between;
+        width: 26px; height: 20px;
+        cursor: pointer;
+        flex-shrink: 0;
+        background: none;
+        border: none;
+        padding: 0;
+      }
+      .nav-hamburger span {
+        display: block;
+        height: 3px;
+        background: #555;
+        border-radius: 2px;
+      }
+
+      @media (max-width: 640px) {
+        .nav-hamburger { display: flex; }
+        .nav-links {
+          display: none;
+          width: 100%;
+          flex-direction: column;
+          align-items: flex-start;
+          padding-top: 8px;
+          border-top: 1px solid #eee;
+          order: 3;
+        }
+        .nav-links.nav-open { display: flex; }
+        .nav-links a { margin-right: 0; padding: 6px 0; font-size: 1em; }
+      }
+
+      /* Swiss railway clock — nav */
+      .nav-pie {
+        flex-shrink: 0;
+        line-height: 0;
+        margin-left: auto;
+        padding: 4px 4px 4px 0;
+      }
+      #alert-pie { display: block; cursor: default; }
+      .nav-clock {
+        flex-shrink: 0;
+        line-height: 0;
+        padding: 4px 4px 4px 0;
+        cursor: pointer;
+      }
+      #swiss-clock { display: block; }
+
+      /* Swiss railway clock — full-page overlay */
+      #clock-overlay {
+        display: none;
+        position: fixed;
+        inset: 0;
+        z-index: 9999;
+        background: #1a1a1a;
+        align-items: center;
+        justify-content: center;
+        cursor: pointer;
+      }
+      #clock-overlay.visible { display: flex; }
+      #swiss-clock-overlay { display: block; }
+    </style>
+    <script>
+    /* ── Swiss Federal Railway (SBB) clock ── */
+
+    /* Draw one frame of the clock onto any canvas element.
+       The browser's local time is read on every call. All dimensions are
+       derived from canvas.width, so the canvas is assumed to be square —
+       TODO confirm for every caller. */
+    function drawSwissClock(canvas) {
+      var SIZE = canvas.width;
+      var R = SIZE / 2;
+      var ctx = canvas.getContext('2d');
+      var now = new Date();
+      var h  = now.getHours() % 12;
+      var m  = now.getMinutes();
+      var s  = now.getSeconds();
+      var ms = now.getMilliseconds();
+
+      /* Seconds hand idles ~1.5 s at 12 before advancing (SBB behaviour):
+         it sweeps the full circle in 58.5 s, then stays at angle 0. */
+      var sFrac = s + ms / 1000;
+      var sAngle = sFrac >= 58.5 ? 0 : (sFrac / 58.5) * Math.PI * 2;
+
+      ctx.clearRect(0, 0, SIZE, SIZE);
+
+      /* face */
+      ctx.beginPath();
+      ctx.arc(R, R, R - 1, 0, Math.PI * 2);
+      ctx.fillStyle = '#fff';
+      ctx.fill();
+      ctx.strokeStyle = '#333';
+      ctx.lineWidth = SIZE * 0.018;
+      ctx.stroke();
+
+      /* tick marks: 60 marks, every fifth one (hours) longer and thicker;
+         the -PI/2 offset puts index 0 at the 12 o'clock position */
+      for (var i = 0; i < 60; i++) {
+        var a = (i / 60) * Math.PI * 2 - Math.PI / 2;
+        var isHour = (i % 5 === 0);
+        ctx.beginPath();
+        ctx.moveTo(R + Math.cos(a) * (isHour ? R * 0.72 : R * 0.88),
+                   R + Math.sin(a) * (isHour ? R * 0.72 : R * 0.88));
+        ctx.lineTo(R + Math.cos(a) * R * 0.94,
+                   R + Math.sin(a) * R * 0.94);
+        ctx.strokeStyle = '#222';
+        ctx.lineWidth = isHour ? SIZE * 0.027 : SIZE * 0.011;
+        ctx.lineCap = 'butt';
+        ctx.stroke();
+      }
+
+      /* hands: draw a straight hand through the centre, rotated by `angle`;
+         `tip` and `tail` are distances from the centre along the hand
+         (a negative tail extends behind the pivot) */
+      function hand(angle, tip, tail, width, color) {
+        ctx.save();
+        ctx.translate(R, R);
+        ctx.rotate(angle);
+        ctx.beginPath();
+        ctx.moveTo(tail, 0);
+        ctx.lineTo(tip,  0);
+        ctx.strokeStyle = color;
+        ctx.lineWidth = width;
+        ctx.lineCap = 'square';
+        ctx.stroke();
+        ctx.restore();
+      }
+
+      /* NOTE(review): the minute hand already shows m + 1 while the seconds
+         hand idles (from 58.5 s on), whereas the hour hand below still uses m
+         — confirm this early jump is intended. */
+      hand((sFrac >= 58.5 ? m + 1 : m) / 60 * Math.PI * 2 - Math.PI / 2,
+           R * 0.88, -R * 0.12, SIZE * 0.027, '#222');           /* minute */
+      hand((h + m / 60) / 12 * Math.PI * 2 - Math.PI / 2,
+           R * 0.58, -R * 0.12, SIZE * 0.039, '#222');           /* hour   */
+      hand(sAngle - Math.PI / 2, R * 0.78, -R * 0.22,
+           SIZE * 0.013, '#e00');                                 /* second tail+tip */
+
+      /* round dot at tip of second hand */
+      var dotR = SIZE * 0.028;
+      ctx.save();
+      ctx.translate(R, R);
+      ctx.rotate(sAngle - Math.PI / 2);
+      ctx.beginPath();
+      ctx.arc(R * 0.78, 0, dotR, 0, Math.PI * 2);
+      ctx.fillStyle = '#e00';
+      ctx.fill();
+      ctx.restore();
+
+      /* centre cap, drawn last so it covers the pivot of all hands */
+      ctx.beginPath();
+      ctx.arc(R, R, R * 0.04, 0, Math.PI * 2);
+      ctx.fillStyle = '#222';
+      ctx.fill();
+    }
+
+    /* Resize the overlay canvas to fit the viewport, keeping it square. */
+    function resizeOverlayClock() {
+      var oc = document.getElementById('swiss-clock-overlay');
+      if (!oc) return;
+      var size = Math.min(window.innerWidth, window.innerHeight) * 0.88;
+      size = Math.floor(size);
+      oc.width  = size;
+      oc.height = size;
+    }
+
+    /* Main tick — redraws both nav clock and (if visible) overlay clock. */
+    function clockTick() {
+      var nav = document.getElementById('swiss-clock');
+      if (nav) drawSwissClock(nav);
+      var overlay = document.getElementById('clock-overlay');
+      if (overlay && overlay.classList.contains('visible')) {
+        var oc = document.getElementById('swiss-clock-overlay');
+        if (oc) drawSwissClock(oc);
+      }
+      var delay = 100 - (Date.now() % 100);
+      setTimeout(clockTick, delay);
+    }
+
+    document.addEventListener('DOMContentLoaded', function() {
+      /* Start the shared tick loop */
+      clockTick();
+
+      /* Overlay toggle — clicking the nav clock opens it */
+      var navClock = document.querySelector('.nav-clock');
+      var overlay  = document.getElementById('clock-overlay');
+      if (navClock && overlay) {
+        navClock.addEventListener('click', function() {
+          resizeOverlayClock();
+          overlay.classList.add('visible');
+        });
+        overlay.addEventListener('click', function() {
+          overlay.classList.remove('visible');
+        });
+        window.addEventListener('resize', function() {
+          if (overlay.classList.contains('visible')) resizeOverlayClock();
+        });
+      }
+    });
+    </script>
+    <!-- Root-relative like the other /static assets above, so it also resolves on nested routes. -->
+    <script src="/static/sorttable.js"></script>
+</head>
@@ -0,0 +1,596 @@
+<!DOCTYPE html>
+<html>
+  {% include 'head.html' %}
+
+  <style>
+    body {
+      display: flex;
+      flex-direction: column;
+      height: 100vh;
+      overflow: hidden;
+    }
+
+    .container {
+      flex: 1;
+      min-height: 0;
+      max-width: 1600px;
+      width: 100%;
+      margin: 0 auto;
+      display: flex;
+      flex-direction: column;
+      gap: 15px;
+      overflow: hidden;
+    }
+
+    h1 {
+      color: #333;
+      margin-bottom: 5px;
+      margin-top: 15px;
+      font-size: 1.5em;
+    }
+
+    h2 {
+      color: #333;
+      margin-bottom: 10px;
+      font-size: 1.2em;
+      padding: 10px 15px;
+      background: white;
+      border-radius: 6px;
+      box-shadow: 0 1px 4px rgba(0,0,0,0.1);
+    }
+
+    .subtitle {
+      color: #666;
+      margin-bottom: 15px;
+      font-size: 0.9em;
+    }
+
+    .content {
+      display: flex;
+      flex-direction: column;
+      gap: 15px;
+    }
+
+    .table-section {
+      background: white;
+      border-radius: 6px;
+      padding: 15px;
+      box-shadow: 0 1px 4px rgba(0,0,0,0.1);
+      overflow-x: auto;
+      overflow-y: auto;
+      max-height: 60vh;
+    }
+
+    .log-section {
+      flex: 1;
+      min-height: 0;
+      background: white;
+      border-radius: 6px;
+      padding: 15px;
+      box-shadow: 0 1px 4px rgba(0,0,0,0.1);
+      overflow-y: auto;
+    }
+
+    /*
+     * Small-screen overrides. These must stay AFTER the base rules above:
+     * the selectors have the same specificity, so the later rule wins, and
+     * placing this block first would let the base rules cancel it out.
+     */
+    @media (max-width: 640px) {
+      body {
+        height: auto;
+        min-height: 100vh;
+        overflow: auto;
+        flex-direction: column;
+      }
+      .container {
+        max-height: none;
+        overflow: visible;
+      }
+      .table-section {
+        max-height: 55vh;
+      }
+      .log-section {
+        flex: none;
+        max-height: 40vh;
+      }
+    }
+
+    #ntable {
+      border-collapse: collapse;
+      width: 100%;
+      font-size: 0.9em;
+    }
+
+    #ntable td,
+    #ntable th {
+      border: 1px solid #e0e0e0;
+      text-align: left;
+      padding: 2px 4px;
+      white-space: nowrap;
+    }
+
+    #ntable tr:nth-child(even) {
+      background-color: #fafafa;
+    }
+
+    #ntable tr:hover {
+      background-color: #e3f2fd;
+    }
+
+    #ntable tbody tr.row-warning {
+      background-color: #fff8c5;
+    }
+
+    #ntable tbody tr.row-critical {
+      background-color: #fde8e8;
+    }
+
+    #ntable tbody tr.row-warning:hover {
+      background-color: #fff0a0;
+    }
+
+    #ntable tbody tr.row-critical:hover {
+      background-color: #f9c8c8;
+    }
+
+    #ntable th {
+      padding: 6px 8px;
+      background-color: #2196f3;
+      color: white;
+      font-weight: 600;
+      position: sticky;
+      top: 0;
+      z-index: 10;
+    }
+
+    #ntable
+      th:not(.sorttable_sorted):not(.sorttable_sorted_reverse):not(.sorttable_nosort):after {
+      content: " ⇅";
+      opacity: 0.5;
+    }
+    
+    /* Alert count column styling */
+    #ntable td.alert-warning {
+      color: #ff9800;
+      font-weight: bold;
+      text-align: center;
+    }
+    
+    #ntable td.alert-critical {
+      color: #f44336;
+      font-weight: bold;
+      text-align: center;
+    }
+
+    /* Scrollbar styling */
+    .log-section::-webkit-scrollbar {
+      width: 8px;
+    }
+
+    .log-section::-webkit-scrollbar-track {
+      background: #f1f1f1;
+      border-radius: 4px;
+    }
+
+    .log-section::-webkit-scrollbar-thumb {
+      background: #888;
+      border-radius: 4px;
+    }
+
+    .log-section::-webkit-scrollbar-thumb:hover {
+      background: #555;
+    }
+
+    /* Message styling */
+    #messages {
+      font-size: 0.85em;
+      line-height: 1.0;
+    }
+
+    #messages .log-entry {
+      padding: 5px 0;
+      border-bottom: 1px solid #f0f0f0;
+      display: flex;
+      gap: 0.5em;
+      align-items: baseline;
+    }
+
+    .log-ts { color: #888; white-space: nowrap; }
+    .log-level { font-weight: bold; min-width: 6em; }
+    .log-host { font-weight: 600; }
+    .log-service { color: #888; }
+
+    .log-warning .log-level  { color: #b8860b; }
+    .log-critical .log-level { color: #c00; }
+    .log-recover .log-level  { color: #2a7a2a; }
+    .log-info .log-level     { color: #555; }
+
+    /* Modal for connection status messages */
+    .connection-modal {
+      display: none;
+      position: fixed;
+      z-index: 1000;
+      left: 0;
+      top: 0;
+      width: 100%;
+      height: 100%;
+      background-color: rgba(0, 0, 0, 0.5);
+    }
+
+    .connection-modal.show {
+      display: flex;
+      justify-content: center;
+      align-items: center;
+    }
+
+    .connection-modal-content {
+      background-color: white;
+      padding: 30px 40px;
+      border-radius: 8px;
+      text-align: center;
+      box-shadow: 0 4px 20px rgba(0, 0, 0, 0.3);
+      min-width: 300px;
+    }
+
+    .connection-modal-content p {
+      margin: 0;
+      font-size: 16px;
+      color: #333;
+    }
+
+    /* State indicators */
+    .state-up {
+      color: #4caf50;
+      font-weight: 600;
+    }
+
+    .state-down {
+      color: #f44336;
+      font-weight: 700;
+    }
+
+    .state-overdue {
+      color: #ff9800;
+      font-weight: 700;
+    }
+    #ntable a.host-link { color: inherit; text-decoration: none; }
+    #ntable a.host-link:hover { text-decoration: underline; }
+  </style>
+  <script type="text/javascript">
+    /* Count of WebSocket messages handled (incremented in the onmessage handler). */
+    var cnt = 0;
+    /* Host table element; placeholder value until setup() looks up #ntable. */
+    var nTable = document;
+    /* Maps a host name to its table row; rebuilt by setup(). */
+    var name_idx = {};
+    /* NOTE(review): not referenced by the code visible in this script — confirm before removing. */
+    var c = 0;
+    /* Rendered from the hbd_version template variable; compared with each host's hbc_version. */
+    var HBD_VERSION = "{{ hbd_version }}";
+
+    function hostNameHtml(data) {
+      var rawName = data.raw_name || data.name.replace(/<[^>]+>/g, '').replace('*', '').trim();
+      var nameHtml = data.name;
+      if (!data.hbc_version || data.hbc_version !== HBD_VERSION) {
+        nameHtml += ' 🥀';
+      }
+      var display = data.dyn ? '<b>' + nameHtml + '</b>' : nameHtml;
+      return '<a class="host-link" href="/plugins#' + encodeURIComponent(rawName) + '">' + display + '</a>';
+    }
+
+    function setup() {
+      name_idx = {};
+      nTable = document.getElementById("ntable");
+      for (var i = 0, row; (row = nTable.rows[i]); i++) {
+        if (i == 0) continue;
+        var cell = nTable.rows[i].cells[0];
+        var name = cell.dataset.name || cell.innerText.replace(/\s*🥀\s*$/, '').trim();
+        name_idx[name] = nTable.rows[i];
+      }
+    }
+
+    function updateRowAlert(row, data) {
+      var criticalUnacked = data.alert_critical_unacked || 0;
+      var criticalAcked = data.alert_critical_acked || 0;
+      var warningUnacked = data.alert_warning_unacked || 0;
+      var warningAcked = data.alert_warning_acked || 0;
+      row.classList.remove('row-warning', 'row-critical');
+      if (criticalUnacked > 0 || criticalAcked > 0) {
+        row.classList.add('row-critical');
+      } else if (warningUnacked > 0 || warningAcked > 0) {
+        row.classList.add('row-warning');
+      }
+    }
+
+    function createRow(data) {
+      var row = document.createElement("tr");
+      var c_name = document.createElement("td");
+      var c_warning = document.createElement("td");
+      c_warning.style.textAlign = "center";
+      c_warning.style.color = "#ff9800";
+      c_warning.style.fontWeight = "bold";
+      var c_critical = document.createElement("td");
+      c_critical.style.textAlign = "center";
+      c_critical.style.color = "#f44336";
+      c_critical.style.fontWeight = "bold";
+      var c_ipv4addr = document.createElement("td");
+      var c_ipv4state = document.createElement("td");
+      var c_ipv4latency = document.createElement("td");
+      c_ipv4latency.style.textAlign = "right";
+      var c_ipv4statets = document.createElement("td");
+      c_ipv4statets.style.textAlign = "right";
+      var c_ipv6addr = document.createElement("td");
+      var c_ipv6state = document.createElement("td");
+      var c_ipv6latency = document.createElement("td");
+      c_ipv6latency.style.textAlign = "right";
+      var c_ipv6statets = document.createElement("td");
+      c_ipv6statets.style.textAlign = "right";
+      row.appendChild(c_name);
+      row.appendChild(c_warning);
+      row.appendChild(c_critical);
+      row.appendChild(c_ipv4addr);
+      row.appendChild(c_ipv4state);
+      row.appendChild(c_ipv4latency);
+      row.appendChild(c_ipv4statets);
+      row.appendChild(c_ipv6addr);
+      row.appendChild(c_ipv6state);
+      row.appendChild(c_ipv6latency);
+      row.appendChild(c_ipv6statets);
+      c_name.dataset.name = data.name;
+      c_name.innerHTML = hostNameHtml(data);
+      
+      // Set alert counts in "x/y" format (unacked/acked)
+      var warningUnacked = data.alert_warning_unacked || 0;
+      var warningAcked = data.alert_warning_acked || 0;
+      var criticalUnacked = data.alert_critical_unacked || 0;
+      var criticalAcked = data.alert_critical_acked || 0;
+      
+      if (warningUnacked > 0 || warningAcked > 0) {
+        c_warning.innerHTML = warningAcked > 0 ? warningUnacked + "/" + warningAcked : warningUnacked;
+      } else {
+        c_warning.innerHTML = "";
+      }
+      
+      if (criticalUnacked > 0 || criticalAcked > 0) {
+        c_critical.innerHTML = criticalAcked > 0 ? criticalUnacked + "/" + criticalAcked : criticalUnacked;
+      } else {
+        c_critical.innerHTML = "";
+      }
+      
+      c_ipv4addr.innerHTML = data.connections[0].addr;
+      c_ipv4state.innerHTML = data.connections[0].state;
+      if (data.connections.length > 1) {
+        c_ipv6addr.innerHTML = data.connections[1].addr;
+        c_ipv6state.innerHTML = data.connections[1].state;
+      }
+      var table = document.getElementById("ntablebody"); // find table to append to
+      table.appendChild(row); // append row to table
+      name_idx[c_name] = row;
+      updateRowAlert(row, data);
+    }
+
+    function formatTS(ts) {
+      const now = new Date();
+      const d = new Date(ts * 1000);
+
+      const pad = n => String(n).padStart(2, '0');
+      const timeStr = `${pad(d.getHours())}:${pad(d.getMinutes())}:${pad(d.getSeconds())}`;
+
+      // Same calendar day → show time only
+      if (d.toDateString() === now.toDateString()) {
+        return timeStr;
+      }
+
+      // Within 8 days → show "-X d hh:mm:ss"
+      const todayStart = new Date(now.getFullYear(), now.getMonth(), now.getDate());
+      const dStart = new Date(d.getFullYear(), d.getMonth(), d.getDate());
+      const diffDays = Math.round((todayStart - dStart) / 86400000);
+      if (diffDays < 8) {
+        return `-${diffDays}d ${timeStr}`;
+      }
+
+      // Older → date only
+      return `${d.getFullYear()}-${pad(d.getMonth() + 1)}-${pad(d.getDate())}`;
+    }
+
+    function update_table(data) {
+      if (!(data.name in name_idx)) {
+        createRow(data);
+        setup();
+      }
+      
+      // Update name cell (version indicator)
+      var nameCell = name_idx[data.name].cells[0];
+      nameCell.dataset.name = data.name;
+      nameCell.innerHTML = hostNameHtml(data);
+
+      // Update warning and critical counts in "x/y" format (unacked/acked)
+      var warningUnacked = data.alert_warning_unacked || 0;
+      var warningAcked = data.alert_warning_acked || 0;
+      var criticalUnacked = data.alert_critical_unacked || 0;
+      var criticalAcked = data.alert_critical_acked || 0;
+      
+      if (warningUnacked > 0 || warningAcked > 0) {
+        name_idx[data.name].cells[1].innerHTML = warningAcked > 0 ? warningUnacked + "/" + warningAcked : warningUnacked;
+      } else {
+        name_idx[data.name].cells[1].innerHTML = "";
+      }
+      
+      if (criticalUnacked > 0 || criticalAcked > 0) {
+        name_idx[data.name].cells[2].innerHTML = criticalAcked > 0 ? criticalUnacked + "/" + criticalAcked : criticalUnacked;
+      } else {
+        name_idx[data.name].cells[2].innerHTML = "";
+      }
+
+      for  (var i = 0; i < data.connections.length; i++) {
+        // Offset by 2 for the warning/critical count columns
+        name_idx[data.name].cells[3 + i * 4].innerHTML = data.connections[i].addr;
+        name_idx[data.name].cells[6 + i * 4].innerHTML = formatTS(
+          data.connections[i].statetime
+        );
+        if (data.connections[i].state == "up") {
+          state = '<span class="state-up">up</span>';
+          latency = String(Math.round(Number.parseFloat(data.connections[i].rtts[0])));
+        } else {
+          if (data.connections[i].state == "unknown") {
+            state = "";
+            latency = "";
+            name_idx[data.name].cells[3 + i * 4].innerHTML = "";
+            name_idx[data.name].cells[6 + i * 4].innerHTML = "";
+          } else if (data.connections[i].state == "down") {
+            state = '<span class="state-down">down</span>';
+            latency = "-";
+          } else if (data.connections[i].state == "overdue") {
+            state = '<span class="state-overdue">overdue</span>';
+            latency = "-";
+          } else {
+            state = "<b>" + data.connections[i].state + "</b>";
+            latency = "-";
+          }
+        }
+        name_idx[data.name].cells[4 + i * 4].innerHTML = state;
+        name_idx[data.name].cells[5 + i * 4].innerHTML = latency;
+      }
+      updateRowAlert(name_idx[data.name], data);
+    }
+
+    function WS_Connect() {
+      if ("WebSocket" in window) {
+        //N.B: subprotocol field causes chrome to error 1006
+        var ws_hbd = new WebSocket("{{heartbeat_ws_url}}", /* "hdb" */ );
+
+        ws_hbd.onopen = function () {
+          // Web Socket is connected, send data using send()
+          console.log("ws connect {{heartbeat_ws_url}}");
+          // Hide modal window if visible
+          var modal = document.getElementById("connectionModal");
+          if (modal) {
+            modal.classList.remove("show");
+          }
+          ws_hbd.send("heartbeat_web");
+        };
+
+        ws_hbd.onerror = function (event) {
+          console.log(event);
+        };
+
+        ws_hbd.onmessage = function (event) {
+          /*      console.log(event.data); */
+          var state = JSON.parse(event.data);
+          /* console.log("State: " + state.type); */
+          if (state.type == "host") {
+            update_table(state.data);
+          } else if (state.type == "message") {
+            var msgs = document.getElementById("messages");
+            var msg = state.data;
+            var _d = new Date(msg.ts * 1000);
+            function _p(n) { return n < 10 ? '0' + n : '' + n; }
+            var ts_str = _d.getFullYear() + '-' + _p(_d.getMonth()+1) + '-' + _p(_d.getDate())
+                       + ' ' + _p(_d.getHours()) + ':' + _p(_d.getMinutes()) + ':' + _p(_d.getSeconds());
+            var lvl = (msg.level || "INFO").toLowerCase();
+            var html = '<div class="log-entry log-' + lvl + '">';
+            html += '<span class="log-ts">' + ts_str + '</span>';
+            html += '<span class="log-level">' + (msg.level || "") + '</span>';
+            if (msg.host) html += '<span class="log-host">' + msg.host + '</span>';
+            if (msg.service) html += '<span class="log-service">' + msg.service + '</span>';
+            html += '<span class="log-msg">' + msg.message + '</span>';
+            html += '</div>';
+            msgs.insertAdjacentHTML("afterbegin", html);
+          }
+          cnt++;
+        };
+
+        ws_hbd.onclose = function (event) {
+          /*     console.log(event); */
+          console.log("Connection is closed, reopening");
+          // Show modal window
+          var modal = document.getElementById("connectionModal");
+          if (modal) {
+            modal.classList.add("show");
+          }
+          setTimeout(function () {
+            WS_Connect();
+          }, 3000);
+        };
+      } else {
+        // The browser doesn't support WebSocket
+        console.log("WebSocket NOT supported by your Browser!");
+      }
+    }
+    WS_Connect();
+  </script>
+  <body>
+    {% include 'nav.html' %}
+
+    {% include 'menu.html' %}
+
+    <div class="container">
+      <div>
+        <h1>{{ header }}</h1>
+        <p class="subtitle">Real-time host monitoring and event log</p>
+      </div>
+      
+      <div class="table-section">
+        <table id="ntable" class="sortable">
+          <thead>
+            <tr>
+              <th>Name</th>
+              <th style="text-align: center" title="Warning Alerts">⚠️</th>
+              <th style="text-align: center" title="Critical Alerts">🔴</th>
+              <th>IPv4 Addr</th>
+              <th>State</th>
+              <th style="text-align: right">Latency</th>
+              <th style="text-align: right">Last State</th>
+              <th>IPv6 Addr</th>
+              <th>State</th>
+              <th style="text-align: right">Latency</th>
+              <th style="text-align: right">Last State</th>
+            </tr>
+          </thead>
+          <tbody id="ntablebody">
+            {% for host in hosts %}
+            <tr class="{% if host.alert_critical_unacked > 0 or host.alert_critical_acked > 0 %}row-critical{% elif host.alert_warning_unacked > 0 or host.alert_warning_acked > 0 %}row-warning{% endif %}">
+              <td data-name="{{ host.name }}"><a class="host-link" href="/plugins#{{ host.raw_name | urlencode }}">{{ host.name }}{% if not host.hbc_version or host.hbc_version != hbd_version %} 🥀{% endif %}</a></td>
+              <td style="text-align: center; color: #ff9800; font-weight: bold;">
+                {%- set warning_unacked = host.alert_warning_unacked -%}
+                {%- set warning_acked = host.alert_warning_acked -%}
+                {%- if warning_unacked > 0 or warning_acked > 0 -%}
+                  {{ warning_unacked }}{% if warning_acked > 0 %}/{{ warning_acked }}{% endif %}
+                {%- endif -%}
+              </td>
+              <td style="text-align: center; color: #f44336; font-weight: bold;">
+                {%- set critical_unacked = host.alert_critical_unacked -%}
+                {%- set critical_acked = host.alert_critical_acked -%}
+                {%- if critical_unacked > 0 or critical_acked > 0 -%}
+                  {{ critical_unacked }}{% if critical_acked > 0 %}/{{ critical_acked }}{% endif %}
+                {%- endif -%}
+              </td>
+              {% for conn in host.connections %}
+                <td>{{ conn.addr if conn.addr else '' }}</td>
+                <td>{{ conn.state if conn.state else '' }}</td>
+                <td style="text-align: right">{{ conn.latency if conn.latency else '' }}</td>
+                <td style="text-align: right">{{ conn.last_state_ts if conn.last_state_ts else '' }}</td>
+              {% endfor %}
+              {% if host.connections|length == 0 %}
+                <td></td><td></td><td></td><td></td>
+                <td></td><td></td><td></td><td></td>
+              {% elif host.connections|length == 1 %}
+                <td></td><td></td><td></td><td></td>
+              {% endif %}
+            </tr>
+            {% endfor %}
+          </tbody>
+        </table>
+      </div>
+      
+      <div class="log-section">
+        <h2>Log of Events</h2>
+        <div id="messages"></div>
+      </div>
+    </div>
+    
+    {% include 'foot.html' %}
+    
+    <!-- Connection status modal -->
+    <div id="connectionModal" class="connection-modal">
+      <div class="connection-modal-content">
+        <p>⚠️ Connection is closed, reopening...</p>
+      </div>
+    </div>
+    
+    <script>
+      setup();
+    </script>
+  </body>
+</html>
@@ -0,0 +1,2 @@
+<!-- <label for="drawer-toggle" id="drawer-toggle-label"></label>
+<header>{{ header }}</header> -->
@@ -0,0 +1,96 @@
+<div class="nav">
+  <button class="nav-hamburger" id="nav-hamburger-btn" aria-label="Menu" aria-expanded="false">
+    <span></span><span></span><span></span>
+  </button>
+  <div class="nav-links" id="nav-links">
+    <a href="/live"{% if active_page == "live" %} class="active"{% endif %}>Live Dashboard</a>
+    <a href="/plugins"{% if active_page == "plugins" %} class="active"{% endif %}>Host Overview</a>
+    <a href="/alerts"{% if active_page == "alerts" %} class="active"{% endif %}>Alerts</a>
+    {% if current_user and current_user.admin %}
+    <a href="/settings"{% if active_page == "settings" %} class="active"{% endif %}>Settings</a>
+    {% endif %}
+    <a href="/about"{% if active_page == "about" %} class="active"{% endif %}>About</a>
+  </div>
+  <div class="nav-pie" title="Host alert status">
+    <canvas id="alert-pie" width="44" height="44"></canvas>
+  </div>
+  <div class="nav-clock" title="Click for full-screen clock">
+    <canvas id="swiss-clock" width="44" height="44"></canvas>
+  </div>
+  {% if current_user %}
+  <a href="/profile" class="nav-user{% if active_page == 'profile' %} active{% endif %}" title="{{ current_user.full_name or current_user.username }}">
+    {% if current_user.avatar %}
+    <img class="nav-avatar" src="{{ current_user.avatar_url }}" alt="{{ current_user.full_name or current_user.username }}">
+    {% else %}
+    <span class="nav-initials">{{ (current_user.full_name or current_user.username)[:1] | upper }}</span>
+    {% endif %}
+    <span class="nav-username">{{ current_user.full_name or current_user.username }}</span>
+  </a>
+  {% endif %}
+</div>
+
+<!-- Full-page clock overlay (click anywhere to dismiss) -->
+<div id="clock-overlay">
+  <canvas id="swiss-clock-overlay" width="400" height="400"></canvas>
+</div>
+
+<script>
+  (function() {
+    var btn = document.getElementById('nav-hamburger-btn');
+    var links = document.getElementById('nav-links');
+    if (btn && links) {
+      btn.addEventListener('click', function() {
+        var open = links.classList.toggle('nav-open');
+        btn.setAttribute('aria-expanded', open ? 'true' : 'false');
+      });
+    }
+  })();
+
+  function drawAlertPie(critical, warning, ok) {
+    var canvas = document.getElementById('alert-pie');
+    if (!canvas) return;
+    var ctx = canvas.getContext('2d');
+    var SIZE = canvas.width;
+    var R = SIZE / 2;
+    ctx.clearRect(0, 0, SIZE, SIZE);
+    var total = critical + warning + ok;
+    if (total === 0) {
+      ctx.beginPath();
+      ctx.arc(R, R, R - 1, 0, Math.PI * 2);
+      ctx.fillStyle = '#ccc';
+      ctx.fill();
+      return;
+    }
+    var slices = [
+      { value: critical, color: '#e53935' },
+      { value: warning,  color: '#ffb300' },
+      { value: ok,       color: '#43a047' }
+    ];
+    var start = -Math.PI / 2;
+    slices.forEach(function(s) {
+      if (s.value === 0) return;
+      var sweep = (s.value / total) * Math.PI * 2;
+      ctx.beginPath();
+      ctx.moveTo(R, R);
+      ctx.arc(R, R, R - 1, start, start + sweep);
+      ctx.closePath();
+      ctx.fillStyle = s.color;
+      ctx.fill();
+      start += sweep;
+    });
+  }
+
+  function updateAlertPie() {
+    fetch('/api/0/alert_summary').then(function(r) {
+      if (!r.ok) return;
+      return r.json();
+    }).then(function(d) {
+      if (d) drawAlertPie(d.critical || 0, d.warning || 0, d.ok || 0);
+    }).catch(function() {});
+  }
+
+  document.addEventListener('DOMContentLoaded', function() {
+    updateAlertPie();
+    setInterval(updateAlertPie, 30000);
+  });
+</script>
@@ -0,0 +1,330 @@
+<!DOCTYPE html>
+<html>
+  {% include 'head.html' %}
+
+  <style>
+    html, body { overflow: visible; }
+
+    .container {
+      max-width: 900px;
+      margin: 0 auto;
+    }
+
+    h1 {
+      color: #333;
+      margin-bottom: 4px;
+      font-size: 1.5em;
+    }
+
+    .subtitle {
+      color: #666;
+      margin-bottom: 24px;
+      font-size: 0.9em;
+    }
+
+    /* ---- Profile card ---- */
+    .profile-card {
+      background: #fff;
+      border-radius: 8px;
+      box-shadow: 0 1px 6px rgba(0,0,0,0.1);
+      padding: 28px 32px;
+      margin-bottom: 24px;
+      display: flex;
+      align-items: center;
+      gap: 28px;
+    }
+
+    .avatar-large {
+      width: 80px;
+      height: 80px;
+      border-radius: 50%;
+      object-fit: cover;
+      flex-shrink: 0;
+      box-shadow: 0 2px 8px rgba(0,0,0,0.15);
+    }
+
+    .avatar-initials-large {
+      width: 80px;
+      height: 80px;
+      border-radius: 50%;
+      background: #0066cc;
+      color: #fff;
+      display: flex;
+      align-items: center;
+      justify-content: center;
+      font-size: 2em;
+      font-weight: 700;
+      flex-shrink: 0;
+      box-shadow: 0 2px 8px rgba(0,0,0,0.15);
+    }
+
+    .profile-info { flex: 1; }
+
+    .profile-name {
+      font-size: 1.4em;
+      font-weight: 700;
+      color: #222;
+      margin-bottom: 4px;
+    }
+
+    .profile-username {
+      font-size: 0.9em;
+      color: #666;
+      margin-bottom: 10px;
+    }
+
+    .badge {
+      display: inline-block;
+      padding: 2px 10px;
+      border-radius: 12px;
+      font-size: 0.78em;
+      font-weight: 600;
+      text-transform: uppercase;
+      letter-spacing: 0.4px;
+    }
+
+    .badge-admin { background: #e8f0fe; color: #1a73e8; }
+    .badge-user  { background: #f1f3f4; color: #555; }
+
+    .profile-logout {
+      margin-top: 14px;
+    }
+
+    .btn-logout {
+      display: inline-block;
+      padding: 6px 16px;
+      border-radius: 4px;
+      background: #f44336;
+      color: #fff;
+      font-size: 0.85em;
+      font-weight: 500;
+      text-decoration: none;
+      transition: background 0.15s;
+    }
+    .btn-logout:hover { background: #d32f2f; text-decoration: none; }
+
+    /* ---- Section cards ---- */
+    .section {
+      background: #fff;
+      border-radius: 8px;
+      box-shadow: 0 1px 6px rgba(0,0,0,0.1);
+      padding: 20px 24px;
+      margin-bottom: 20px;
+    }
+
+    .section h2 {
+      font-size: 1em;
+      font-weight: 700;
+      color: #333;
+      margin: 0 0 16px;
+      padding-bottom: 10px;
+      border-bottom: 1px solid #eee;
+      text-transform: uppercase;
+      letter-spacing: 0.5px;
+    }
+
+    /* ---- Settings rows ---- */
+    .settings-row {
+      display: flex;
+      align-items: baseline;
+      padding: 8px 0;
+      border-bottom: 1px solid #f5f5f5;
+      font-size: 0.9em;
+    }
+    .settings-row:last-child { border-bottom: none; }
+
+    .settings-label {
+      width: 180px;
+      flex-shrink: 0;
+      color: #666;
+      font-size: 0.88em;
+    }
+
+    .settings-value { color: #222; }
+
+    .settings-empty { color: #aaa; font-style: italic; }
+
+    /* ---- Host lists ---- */
+    .host-grid {
+      display: flex;
+      flex-wrap: wrap;
+      gap: 8px;
+    }
+
+    .host-chip {
+      display: inline-flex;
+      align-items: center;
+      gap: 6px;
+      padding: 4px 12px;
+      border-radius: 16px;
+      font-size: 0.85em;
+      font-weight: 500;
+      text-decoration: none;
+    }
+
+    .host-chip.owner   { background: #e8f5e9; color: #2e7d32; }
+    .host-chip.manager { background: #e3f2fd; color: #1565c0; }
+    .host-chip.monitor { background: #f3e5f5; color: #6a1b9a; }
+
+    .host-chip-dot {
+      width: 7px; height: 7px; border-radius: 50%;
+    }
+    .owner   .host-chip-dot { background: #2e7d32; }
+    .manager .host-chip-dot { background: #1565c0; }
+    .monitor .host-chip-dot { background: #6a1b9a; }
+
+    .no-hosts {
+      color: #aaa;
+      font-size: 0.9em;
+      font-style: italic;
+    }
+
+    /* ---- Notification channels ---- */
+    .channel-row {
+      display: flex;
+      align-items: center;
+      gap: 10px;
+      padding: 6px 0;
+      border-bottom: 1px solid #f5f5f5;
+      font-size: 0.9em;
+    }
+    .channel-row:last-child { border-bottom: none; }
+
+    .channel-type {
+      display: inline-block;
+      padding: 2px 8px;
+      border-radius: 10px;
+      font-size: 0.78em;
+      font-weight: 600;
+      text-transform: uppercase;
+      background: #f1f3f4;
+      color: #555;
+      min-width: 70px;
+      text-align: center;
+    }
+
+    .channel-name { color: #333; }
+  </style>
+
+  <body>
+    {% include 'nav.html' %}
+
+    <div class="container">
+      <h1>{{ header }}</h1>
+      <p class="subtitle">Your account settings and host access</p>
+
+      <!-- Profile card -->
+      <div class="profile-card">
+        {% if current_user and current_user.avatar %}
+        <img class="avatar-large" src="{{ current_user.avatar_url }}" alt="">
+        {% else %}
+        <div class="avatar-initials-large">
+          {{ ((current_user.full_name if current_user else '') or (current_user.username if current_user else '?'))[:1] | upper }}
+        </div>
+        {% endif %}
+
+        <div class="profile-info">
+          <div class="profile-name">{{ current_user.full_name if current_user and current_user.full_name else (current_user.username if current_user else '—') }}</div>
+          <div class="profile-username">@{{ current_user.username if current_user else '—' }}</div>
+          {% if current_user and current_user.admin %}
+          <span class="badge badge-admin">Admin</span>
+          {% else %}
+          <span class="badge badge-user">User</span>
+          {% endif %}
+          <div class="profile-logout">
+            <a href="/logout" class="btn-logout">Sign out</a>
+          </div>
+        </div>
+      </div>
+
+      <!-- Account settings -->
+      <div class="section">
+        <h2>Account</h2>
+        <div class="settings-row">
+          <span class="settings-label">Username</span>
+          <span class="settings-value">{{ current_user.username if current_user else '—' }}</span>
+        </div>
+        <div class="settings-row">
+          <span class="settings-label">Full name</span>
+          {% if current_user and current_user.full_name %}
+          <span class="settings-value">{{ current_user.full_name }}</span>
+          {% else %}
+          <span class="settings-empty">Not set</span>
+          {% endif %}
+        </div>
+        <div class="settings-row">
+          <span class="settings-label">Role</span>
+          <span class="settings-value">{{ 'Administrator' if current_user and current_user.admin else 'User' }}</span>
+        </div>
+        <div class="settings-row">
+          <span class="settings-label">Avatar</span>
+          {% if current_user and current_user.avatar %}
+          <span class="settings-value" style="word-break:break-all;">{{ current_user.avatar }}</span>
+          {% else %}
+          <span class="settings-empty">Not set (initials used)</span>
+          {% endif %}
+        </div>
+      </div>
+
+      <!-- Notification channels -->
+      <div class="section">
+        <h2>Notification Channels</h2>
+        {% if notification_channels %}
+        {% for ch in notification_channels %}
+        <div class="channel-row">
+          <span class="channel-type">{{ ch.type }}</span>
+          <span class="channel-name">{{ ch.name }}</span>
+        </div>
+        {% endfor %}
+        {% else %}
+        <span class="no-hosts">No personal notification channels configured.</span>
+        {% endif %}
+      </div>
+
+      <!-- Host access -->
+      <div class="section">
+        <h2>Host Access</h2>
+
+        <div class="settings-row" style="align-items: flex-start; padding-bottom: 14px;">
+          <span class="settings-label" style="padding-top: 2px;">Owner</span>
+          <div class="host-grid">
+            {% if owned_hosts %}
+            {% for h in owned_hosts %}
+            <span class="host-chip owner"><span class="host-chip-dot"></span>{{ h }}</span>
+            {% endfor %}
+            {% else %}
+            <span class="no-hosts">None</span>
+            {% endif %}
+          </div>
+        </div>
+
+        <div class="settings-row" style="align-items: flex-start; padding-bottom: 14px;">
+          <span class="settings-label" style="padding-top: 2px;">Manager</span>
+          <div class="host-grid">
+            {% if managed_hosts %}
+            {% for h in managed_hosts %}
+            <span class="host-chip manager"><span class="host-chip-dot"></span>{{ h }}</span>
+            {% endfor %}
+            {% else %}
+            <span class="no-hosts">None</span>
+            {% endif %}
+          </div>
+        </div>
+
+        <div class="settings-row" style="align-items: flex-start; padding-bottom: 4px;">
+          <span class="settings-label" style="padding-top: 2px;">Monitor</span>
+          <div class="host-grid">
+            {% if monitored_hosts %}
+            {% for h in monitored_hosts %}
+            <span class="host-chip monitor"><span class="host-chip-dot"></span>{{ h }}</span>
+            {% endfor %}
+            {% else %}
+            <span class="no-hosts">None</span>
+            {% endif %}
+          </div>
+        </div>
+      </div>
+
+    </div>
+  </body>
+</html>
@@ -0,0 +1,544 @@
+<!DOCTYPE html>
+<html>
+  {% include 'head.html' %}
+
+  <style>
+    html, body { overflow: visible; }
+
+    .container {
+      max-width: 960px;
+    }
+
+    h1 { color: #333; margin-bottom: 5px; margin-top: 15px; font-size: 1.5em; }
+    .subtitle { color: #666; margin-bottom: 24px; font-size: 0.9em; }
+
+    /* ---- Sidebar + content layout ---- */
+    .settings-layout {
+      display: flex;
+      gap: 24px;
+      align-items: flex-start;
+    }
+
+    .settings-sidebar {
+      width: 180px;
+      flex-shrink: 0;
+      position: sticky;
+      top: 60px;
+    }
+
+    .sidebar-nav a {
+      display: block;
+      padding: 6px 10px;
+      border-radius: 4px;
+      text-decoration: none;
+      font-size: 0.85em;
+      color: #444;
+      margin-bottom: 2px;
+      transition: background 0.1s, color 0.1s;
+    }
+    .sidebar-nav a:hover { background: #e8eaf6; color: #1a237e; }
+    .sidebar-nav a.active { background: #e3f2fd; color: #0066cc; font-weight: 600; }
+
+    .settings-main { flex: 1; min-width: 0; }
+
+    /* ---- Section card ---- */
+    .section {
+      background: #fff;
+      border-radius: 8px;
+      box-shadow: 0 1px 4px rgba(0,0,0,.08);
+      margin-bottom: 24px;
+      overflow: hidden;
+    }
+
+    .section-header {
+      padding: 14px 20px 12px;
+      border-bottom: 1px solid #eee;
+    }
+
+    .section-title {
+      font-size: 0.95em;
+      font-weight: 700;
+      color: #222;
+      text-transform: uppercase;
+      letter-spacing: 0.5px;
+      margin: 0 0 3px;
+    }
+
+    .section-desc {
+      font-size: 0.82em;
+      color: #888;
+      margin: 0;
+    }
+
+    /* ---- Field rows ---- */
+    .field-row {
+      display: flex;
+      align-items: baseline;
+      padding: 10px 20px;
+      border-bottom: 1px solid #f5f5f5;
+      gap: 16px;
+    }
+    .field-row:last-child { border-bottom: none; }
+
+    .field-label {
+      width: 200px;
+      flex-shrink: 0;
+      font-size: 0.88em;
+      font-weight: 500;
+      color: #444;
+    }
+
+    .field-body { flex: 1; min-width: 0; }
+
+    .field-value {
+      font-size: 0.9em;
+      color: #222;
+      word-break: break-all;
+    }
+
+    .field-desc {
+      font-size: 0.78em;
+      color: #999;
+      margin-top: 2px;
+    }
+
+    /* ---- Value type renderers ---- */
+    .val-boolean {
+      display: inline-block;
+      padding: 2px 9px;
+      border-radius: 10px;
+      font-size: 0.8em;
+      font-weight: 600;
+    }
+    .val-boolean.on  { background: #e8f5e9; color: #2e7d32; }
+    .val-boolean.off { background: #fce4ec; color: #c62828; }
+
+    .val-masked {
+      font-family: monospace;
+      color: #bbb;
+      letter-spacing: 2px;
+    }
+
+    .val-list { display: flex; flex-wrap: wrap; gap: 5px; }
+    .val-tag {
+      display: inline-block;
+      padding: 2px 9px;
+      background: #e8eaf6;
+      color: #283593;
+      border-radius: 10px;
+      font-size: 0.8em;
+    }
+    .val-empty { color: #ccc; font-style: italic; font-size: 0.88em; }
+
+    /* ---- Users table ---- */
+    .mini-table {
+      width: 100%;
+      border-collapse: collapse;
+      font-size: 0.875em;
+    }
+    .mini-table th {
+      background: #f5f5f5;
+      padding: 7px 12px;
+      text-align: left;
+      font-weight: 600;
+      color: #555;
+      font-size: 0.82em;
+      text-transform: uppercase;
+      letter-spacing: 0.4px;
+      border-bottom: 1px solid #e0e0e0;
+    }
+    .mini-table td {
+      padding: 7px 12px;
+      border-bottom: 1px solid #f0f0f0;
+      color: #333;
+      vertical-align: middle;
+    }
+    .mini-table tbody tr:last-child td { border-bottom: none; }
+    .mini-table tbody tr:hover { background: #fafafa; }
+
+    .badge {
+      display: inline-block;
+      padding: 1px 8px;
+      border-radius: 10px;
+      font-size: 0.75em;
+      font-weight: 600;
+    }
+    .badge-admin { background: #e8f0fe; color: #1a73e8; }
+    .badge-user  { background: #f1f3f4; color: #666; }
+
+    /* ---- Notification channels ---- */
+    .channel-card {
+      border: 1px solid #e8eaf6;
+      border-radius: 6px;
+      margin: 12px 20px;
+      overflow: hidden;
+    }
+
+    .channel-header {
+      display: flex;
+      align-items: center;
+      gap: 10px;
+      padding: 9px 14px;
+      background: #f8f9ff;
+      border-bottom: 1px solid #e8eaf6;
+    }
+
+    .channel-name-text { font-weight: 600; font-size: 0.9em; color: #222; }
+
+    .ch-type-badge {
+      padding: 2px 8px;
+      border-radius: 8px;
+      font-size: 0.75em;
+      font-weight: 600;
+      background: #e8eaf6;
+      color: #3949ab;
+    }
+
+    .channel-fields { padding: 6px 0; }
+
+    .channel-field {
+      display: flex;
+      padding: 5px 14px;
+      font-size: 0.85em;
+      border-bottom: 1px solid #f5f5f5;
+      gap: 12px;
+    }
+    .channel-field:last-child { border-bottom: none; }
+    .channel-field-label { width: 130px; flex-shrink: 0; color: #777; }
+    .channel-field-value { color: #333; word-break: break-all; }
+
+    /* ---- Mobile: collapsible sidebar ---- */
+    .sidebar-toggle {
+      display: none;
+      width: 100%;
+      padding: 8px 12px;
+      background: #e8eaf6;
+      border: none;
+      border-radius: 6px;
+      font-size: 0.9em;
+      font-weight: 600;
+      color: #283593;
+      cursor: pointer;
+      text-align: left;
+      margin-bottom: 16px;
+    }
+    .sidebar-toggle::after { content: ' ▾'; float: right; }
+    .sidebar-toggle.open::after { content: ' ▴'; }
+
+    @media (max-width: 640px) {
+      .sidebar-toggle { display: block; }
+
+      .settings-layout { flex-direction: column; gap: 0; }
+
+      .settings-sidebar {
+        width: 100%;
+        position: static;
+        margin-bottom: 0;
+      }
+
+      .sidebar-nav {
+        display: none;
+        background: white;
+        border-radius: 6px;
+        box-shadow: 0 1px 4px rgba(0,0,0,.1);
+        margin-bottom: 16px;
+        padding: 4px 0;
+      }
+      .sidebar-nav.open { display: block; }
+      .sidebar-nav a { padding: 10px 16px; font-size: 1em; }
+
+      .field-row { flex-direction: column; gap: 4px; }
+      .field-label { width: 100%; font-size: 0.82em; color: #888; }
+    }
+    /* ---- Hosts table: boolean indicator cells ---- */
+    .host-bool { text-align: center; }
+    .dot-yes { color: #2e7d32; font-size: 1.1em; }
+    .dot-no  { color: #ddd;    font-size: 1.1em; }
+
+    /* ---- Threshold configurations ---- */
+    .thresh-config { margin: 12px 20px 20px; }
+    .thresh-config-name {
+      font-weight: 600; font-size: 0.9em; color: #1a237e;
+      margin-bottom: 6px;
+    }
+    .mini-table .warn  { color: #e65100; font-weight: 600; }
+    .mini-table .crit  { color: #b71c1c; font-weight: 600; }
+    .mini-table .dim   { color: #aaa; }
+    .mini-table .metric-path { font-family: monospace; font-size: 0.88em; }
+  </style>
+
+  <body>
+    {% include 'nav.html' %}
+
+    <div class="container">
+      <h1>Settings</h1>
+      <p class="subtitle">Current server configuration — read from the config file at startup.</p>
+
+      <div class="settings-layout">
+
+        <!-- Sidebar navigation -->
+        <nav class="settings-sidebar">
+          <button class="sidebar-toggle" id="sidebar-toggle" aria-expanded="false">Sections</button>
+          <div class="sidebar-nav" id="sidebar-nav">
+            {% for section in sections %}
+            <a href="#{{ section.id }}" onclick="closeSidebar()">{{ section.title }}</a>
+            {% endfor %}
+          </div>
+        </nav>
+
+        <!-- Main content -->
+        <div class="settings-main">
+          {% for section in sections %}
+          <div class="section" id="{{ section.id }}">
+            <div class="section-header">
+              <p class="section-title">{{ section.title }}</p>
+              {% if section.description %}<p class="section-desc">{{ section.description }}</p>{% endif %}
+            </div>
+
+            {# ---- Standard field rows ---- #}
+            {% for f in section.fields %}
+            <div class="field-row">
+              <div class="field-label">{{ f.label }}</div>
+              <div class="field-body">
+                {% if f.sensitive %}
+                  <div class="field-value"><span class="val-masked">••••••••</span></div>
+                {% elif f.type == "boolean" %}
+                  <div class="field-value">
+                    <span class="val-boolean {{ 'on' if f.value else 'off' }}">
+                      {{ 'Enabled' if f.value else 'Disabled' }}
+                    </span>
+                  </div>
+                {% elif f.type == "list" %}
+                  <div class="field-value">
+                    {% if f.value %}
+                    <span class="val-list">
+                      {% for item in f.value %}<span class="val-tag">{{ item }}</span>{% endfor %}
+                    </span>
+                    {% else %}
+                    <span class="val-empty">None</span>
+                    {% endif %}
+                  </div>
+                {% elif f.value is none or f.value == "" %}
+                  <div class="field-value"><span class="val-empty">Not set</span></div>
+                {% else %}
+                  <div class="field-value">{{ f.value }}</div>
+                {% endif %}
+                {% if f.description %}
+                <div class="field-desc">{{ f.description }}</div>
+                {% endif %}
+              </div>
+            </div>
+            {% endfor %}
+
+            {# ---- Users section ---- #}
+            {% if section.id == "users" and section.users %}
+            <div style="padding: 0 0 4px;">
+              <table class="mini-table">
+                <thead>
+                  <tr>
+                    <th>Username</th>
+                    <th>Full Name</th>
+                    <th>Role</th>
+                    <th>Avatar</th>
+                    <th>Channels</th>
+                  </tr>
+                </thead>
+                <tbody>
+                  {% for u in section.users %}
+                  <tr>
+                    <td><strong>{{ u.username }}</strong></td>
+                    <td>{{ u.full_name or '—' }}</td>
+                    <td>
+                      {% if u.admin %}
+                      <span class="badge badge-admin">Admin</span>
+                      {% else %}
+                      <span class="badge badge-user">User</span>
+                      {% endif %}
+                    </td>
+                    <td style="font-size:0.8em; color:#888;">
+                      {% if u.avatar %}{{ u.avatar }}{% else %}—{% endif %}
+                    </td>
+                    <td>
+                      {% if u.notification_channels %}
+                      <span class="val-list">
+                        {% for ch in u.notification_channels %}
+                        <span class="val-tag">{{ ch }}</span>
+                        {% endfor %}
+                      </span>
+                      {% else %}—{% endif %}
+                    </td>
+                  </tr>
+                  {% endfor %}
+                </tbody>
+              </table>
+            </div>
+            {% endif %}
+
+            {# ---- Notification channels section ---- #}
+            {% if section.id == "channels" %}
+            {% for ch in section.channels %}
+            <div class="channel-card">
+              <div class="channel-header">
+                <span class="channel-name-text">{{ ch.name }}</span>
+                <span class="ch-type-badge">{{ ch.type_label }}</span>
+              </div>
+              <div class="channel-fields">
+                {% for cf in ch.fields %}
+                <div class="channel-field">
+                  <span class="channel-field-label">{{ cf.label }}</span>
+                  <span class="channel-field-value">
+                    {% if cf.sensitive %}
+                    <span class="val-masked">••••••••</span>
+                    {% elif cf.value is iterable and cf.value is not string %}
+                    {{ cf.value | join(', ') }}
+                    {% else %}
+                    {{ cf.value }}
+                    {% endif %}
+                  </span>
+                </div>
+                {% endfor %}
+              </div>
+            </div>
+            {% endfor %}
+            {% if not section.channels %}
+            <div class="field-row"><span class="val-empty">No notification channels configured.</span></div>
+            {% endif %}
+            {% endif %}
+
+            {# ---- Threshold configurations section ---- #}
+            {% if section.id == "thresholds" %}
+            {% if section.threshold_configs %}
+            {% for tc in section.threshold_configs %}
+            <div class="thresh-config">
+              <div class="thresh-config-name">{{ tc.name }}</div>
+              {% if tc.metrics %}
+              <div style="overflow-x: auto;">
+                <table class="mini-table">
+                  <thead>
+                    <tr>
+                      <th>Metric</th>
+                      <th>Op</th>
+                      <th>Warning</th>
+                      <th>Critical</th>
+                      <th>Hysteresis</th>
+                      <th>Count</th>
+                    </tr>
+                  </thead>
+                  <tbody>
+                    {% for m in tc.metrics %}
+                    <tr {% if not m.enabled %} style="opacity:0.45"{% endif %}>
+                      <td class="metric-path">{{ m.metric }}</td>
+                      <td>{{ m.operator or '>' }}</td>
+                      <td class="warn">{{ m.warning if m.warning is not none else '—' }}</td>
+                      <td class="crit">{{ m.critical if m.critical is not none else '—' }}</td>
+                      <td class="dim">{{ '%.0f%%' % (m.hysteresis * 100) if m.hysteresis else '—' }}</td>
+                      <td class="dim">{{ m.count }}</td>
+                    </tr>
+                    {% endfor %}
+                  </tbody>
+                </table>
+              </div>
+              {% else %}
+              <span class="val-empty">No thresholds defined.</span>
+              {% endif %}
+            </div>
+            {% endfor %}
+            {% else %}
+            <div class="field-row"><span class="val-empty">No threshold configurations defined.</span></div>
+            {% endif %}
+            {% endif %}
+
+            {# ---- Hosts section ---- #}
+            {% if section.id == "hosts" %}
+            {% if section.hosts %}
+            <div style="overflow-x: auto;">
+              <table class="mini-table">
+                <thead>
+                  <tr>
+                    <th>Host</th>
+                    <th>Watch</th>
+                    <th>DynDNS</th>
+                    <th>Owner</th>
+                    <th>Threshold config</th>
+                    <th>Channels</th>
+                  </tr>
+                </thead>
+                <tbody>
+                  {% for h in section.hosts %}
+                  <tr>
+                    <td><strong>{{ h.name }}</strong></td>
+                    <td class="host-bool">
+                      <span class="{{ 'dot-yes' if h.watch else 'dot-no' }}">●</span>
+                    </td>
+                    <td class="host-bool">
+                      <span class="{{ 'dot-yes' if h.dyndns else 'dot-no' }}">●</span>
+                    </td>
+                    <td>{{ h.owner or '—' }}</td>
+                    <td>{{ h.threshold_config or '—' }}</td>
+                    <td>
+                      {% if h.notification_channels %}
+                      <span class="val-list">
+                        {% for ch in h.notification_channels %}
+                        <span class="val-tag">{{ ch }}</span>
+                        {% endfor %}
+                      </span>
+                      {% else %}—{% endif %}
+                    </td>
+                  </tr>
+                  {% endfor %}
+                </tbody>
+              </table>
+            </div>
+            {% else %}
+            <div class="field-row"><span class="val-empty">No hosts defined in config.</span></div>
+            {% endif %}
+            {% endif %}
+
+          </div>{# /section #}
+          {% endfor %}
+        </div>{# /settings-main #}
+      </div>{# /settings-layout #}
+    </div>{# /container #}
+
+    <script>
+      // Scroll-spy: highlight the sidebar link of the section currently in view.
+      const sections = document.querySelectorAll('.section');
+      const navLinks = document.querySelectorAll('.sidebar-nav a');
+
+      // Guarded so that a browser without IntersectionObserver still reaches the
+      // sidebar-toggle wiring below instead of aborting on a ReferenceError.
+      if ('IntersectionObserver' in window) {
+        const markActive = entries => {
+          entries.forEach(entry => {
+            if (!entry.isIntersecting) { return; }
+            const target = '#' + entry.target.id;
+            navLinks.forEach(a => {
+              a.classList.toggle('active', a.getAttribute('href') === target);
+            });
+          });
+        };
+
+        // A section activates once 25% of it is visible.  A section taller than
+        // four viewports can never reach that ratio, so for tall sections the
+        // threshold is lowered to "a quarter of the viewport is covered".
+        // Sections no taller than the viewport keep the plain 0.25 threshold.
+        sections.forEach(s => {
+          const height = Math.max(s.offsetHeight, 1);
+          const ratio = Math.min(0.25, (window.innerHeight * 0.25) / height);
+          new IntersectionObserver(markActive, { threshold: ratio }).observe(s);
+        });
+      }
+
+      // Collapsible sidebar on mobile
+      var sidebarToggle = document.getElementById('sidebar-toggle');
+      var sidebarNav = document.getElementById('sidebar-nav');
+      if (sidebarToggle && sidebarNav) {
+        sidebarToggle.addEventListener('click', function() {
+          // classList.toggle returns the new state; mirror it on the button
+          // class and on aria-expanded so assistive tech sees the change.
+          var open = sidebarNav.classList.toggle('open');
+          sidebarToggle.classList.toggle('open', open);
+          sidebarToggle.setAttribute('aria-expanded', open ? 'true' : 'false');
+        });
+      }
+    </script>
+    <script>
+      // Collapse the mobile section menu; called from the inline onclick of
+      // every sidebar link so the menu closes after a section is chosen.
+      function closeSidebar() {
+        const nav = document.getElementById('sidebar-nav');
+        const toggle = document.getElementById('sidebar-toggle');
+        if (nav) nav.classList.remove('open');
+        if (!toggle) return;
+        toggle.classList.remove('open');
+        toggle.setAttribute('aria-expanded', 'false');
+      }
+    </script>
+  </body>
+</html>
@@ -0,0 +1,527 @@
+"""UDP listener and datagram processing."""
+
+import asyncio
+import socket
+import struct
+import time
+import zlib
+import logging
+
+from platform import system as platform_system
+
+from ..common.proto import stodict, oldmtodict
+from ..common.utils import dur
+from . import notify as notify_mod
+
+logger = logging.getLogger(__name__)
+eventlog = notify_mod.eventlog
+
+# SO_TIMESTAMP: kernel attaches a struct timeval to each received datagram.
+# Supported on Linux, FreeBSD, and macOS.  The constants are hard-coded here
+# because Python's socket module does not expose them on all platforms.
+#
+# Two values are needed per platform:
+#   _SO_TIMESTAMP  - option name passed to setsockopt() to enable timestamps
+#   _SCM_TIMESTAMP - cmsg_type of the ancillary message carrying the timestamp
+# Linux defines SCM_TIMESTAMP as SO_TIMESTAMP.  macOS and FreeBSD use 0x0400
+# for the socket option but 0x02 for the control-message type.
+platform = platform_system()
+if platform == "Darwin":
+    _SO_TIMESTAMP = 0x0400
+    _SCM_TIMESTAMP = 0x02
+elif platform == "Linux":
+    _SO_TIMESTAMP = 29  # asm-generic value (SO_TIMESTAMP_OLD)
+    _SCM_TIMESTAMP = _SO_TIMESTAMP
+elif platform == "FreeBSD":
+    _SO_TIMESTAMP = 0x0400  # 32 would be SO_BROADCAST on FreeBSD
+    _SCM_TIMESTAMP = 0x02
+else:
+    logger.warning("SO_TIMESTAMP may not be supported on this platform (%s)", platform)
+    _SO_TIMESTAMP = None
+    _SCM_TIMESTAMP = None
+
+# struct timeval is (tv_sec, tv_usec).  tv_usec is a 32-bit int on macOS and a
+# native C long on the other supported platforms.
+_TIMEVAL = struct.Struct('@li' if platform == "Darwin" else '@ll')
+
+
+def enable_kernel_timestamps(sock) -> bool:
+    """Try to enable SO_TIMESTAMP on *sock*.
+
+    Returns True if the kernel will supply receive timestamps, False otherwise
+    (unsupported platform, older kernel, or insufficient permissions).
+    """
+    if _SO_TIMESTAMP is None:
+        # Unknown platform: there is no option number to pass, and handing
+        # None to setsockopt() would raise TypeError rather than OSError.
+        return False
+    try:
+        sock.setsockopt(socket.SOL_SOCKET, _SO_TIMESTAMP, 1)
+    except OSError:
+        return False
+    return True
+
+
+def _extract_kernel_ts(ancdata) -> float | None:
+    """Parse recvmsg ancillary data and return the kernel receive time.
+
+    *ancdata* is the list of (cmsg_level, cmsg_type, cmsg_data) tuples returned
+    by socket.recvmsg().  Returns seconds as a float, or None if no timestamp
+    control message is present (or the platform is unsupported).
+    """
+    if _SCM_TIMESTAMP is None:
+        return None
+    for cmsg_level, cmsg_type, cmsg_data in ancdata:
+        if cmsg_level != socket.SOL_SOCKET or cmsg_type != _SCM_TIMESTAMP:
+            continue
+        # Ignore truncated control messages instead of failing in unpack.
+        if len(cmsg_data) >= _TIMEVAL.size:
+            sec, usec = _TIMEVAL.unpack_from(cmsg_data)
+            return sec + usec * 1e-6
+    return None
+
+
+class RecvmsgTransport:
+    """Minimal transport for the add_reader/recvmsg receive path.
+
+    Offers the sendto() and close() methods that callers use on asyncio's
+    DatagramTransport, so reply code works the same whichever receive path
+    is active.
+    """
+
+    def __init__(self, loop, sock):
+        self._loop, self._sock = loop, sock
+
+    def sendto(self, data, addr):
+        """Send *data* to *addr*; failures are logged at debug level only."""
+        try:
+            self._sock.sendto(data, addr)
+        except Exception as e:
+            logger.debug("sendto failed: %s", e)
+
+    def close(self):
+        """Unregister the reader and close the socket, ignoring errors."""
+        # Each teardown step is attempted independently so that a failure in
+        # one (e.g. the reader was never registered) does not skip the next.
+        teardown = (
+            lambda: self._loop.remove_reader(self._sock.fileno()),
+            lambda: self._sock.close(),
+        )
+        for step in teardown:
+            try:
+                step()
+            except Exception:
+                pass
+
+
+def make_recvmsg_reader(sock, handler, transport):
+    """Build a zero-argument callback for loop.add_reader().
+
+    Every invocation reads a single datagram with recvmsg() so the ancillary
+    data (and with it the kernel receive timestamp) is available.  The
+    timestamp passed on is None when no timestamp control message was found.
+
+    handler(msg, addr, transport, kernel_ts) is called only for datagrams
+    that parse to a non-empty message.
+    """
+    bufsize = 65536
+    ancbufsize = 128  # room for one struct timeval control message
+
+    def _read():
+        # Receive errors: nothing to read is silent, anything else is logged.
+        try:
+            payload, anc, _flags, peer = sock.recvmsg(bufsize, ancbufsize)
+        except BlockingIOError:
+            return
+        except OSError as e:
+            logger.warning("recvmsg error: %s", e)
+            return
+
+        # Parsing/handler errors must not propagate into the event loop.
+        try:
+            kernel_ts = _extract_kernel_ts(anc)
+            msg = parse_message(payload)
+            if not msg:
+                return
+            handler(msg, peer, transport, kernel_ts)
+        except Exception:
+            logger.exception("Error processing datagram from %s", peer)
+
+    return _read
+
+
+class EchoServerProtocol(asyncio.DatagramProtocol):
+    """Datagram protocol that parses each packet and forwards it to a handler."""
+
+    def __init__(self, config=None, handler=None):
+        super().__init__()
+        self.handler = handler
+        self.config = config or {}
+
+    def connection_made(self, transport):
+        # Kept so the handler can be given a way to send replies.
+        self.transport = transport
+        logger.info("UDP Server listening...")
+
+    def datagram_received(self, data, addr):
+        logger.debug("Received from %s", addr)
+        try:
+            msg = parse_message(data)
+            callback = self.handler
+            if not callback:
+                return
+            # The handler is an application-supplied callable; it receives
+            # the transport so it can send replies (ACKs/commands).
+            callback(msg, addr, self.transport)
+        except Exception:
+            logger.exception("Error while processing datagram from %s", addr)
+
+
+def parse_message(data: bytes):
+    """Decode a raw datagram into a message dict.
+
+    The current decoder (stodict) is tried first.  If its result is empty,
+    the legacy decoder (oldmtodict) is used instead, for compatibility with
+    older clients.
+    """
+    return stodict(data) or oldmtodict(data)
+
+
+def dicttos(ID, d):
+    """Encode mapping *d* as an outgoing packet tagged with *ID*.
+
+    Entries become ``key=value`` pairs joined by ``;`` (floats are written
+    with five decimals).  The joined text is zlib-compressed at level 6 and
+    prefixed with ``!<ID>:``.  Returns the packet as bytes.
+    """
+    fields = [
+        ("%s=%0.5f" if isinstance(value, float) else "%s=%s") % (key, value)
+        for key, value in d.items()
+    ]
+    body = zlib.compress(";".join(fields).encode(), 6)
+    return ("!" + ID + ":").encode() + body
+
+
+DROPOVERDUE = 7 * 24 * 3600  # seconds before an overdue host becomes UNKNOWN
+
+
+def _set_connectivity_alert(host, afam, level_name):
+    """Set or clear the ``connectivity.<afam>`` entry in host.alert_states.
+
+    level_name is "CRITICAL", "WARNING", or "OK"; a name that AlertLevel does
+    not define is treated as OK.  OK removes the entry so that recovered
+    hosts don't clutter the Alerts Dashboard.
+    """
+    from .threshold import AlertState, AlertLevel
+    key = f"connectivity.{afam}"
+    states = host.alert_states
+    level = getattr(AlertLevel, level_name, AlertLevel.OK)
+
+    if level == AlertLevel.OK:
+        states.pop(key, None)
+        return
+
+    # Create the state object on first use, then record the new level.
+    if key in states:
+        state = states[key]
+    else:
+        state = states[key] = AlertState(key)
+    state.update(level, level_name)
+
+
+# Strong references to fire-and-forget tasks.  The event loop keeps only weak
+# references to tasks, so a task nobody else references can be garbage
+# collected before it finishes.
+_background_tasks = set()
+
+
+def _spawn_background(coro):
+    """Schedule *coro* as a task and keep it referenced until it completes."""
+    task = asyncio.create_task(coro)
+    _background_tasks.add(task)
+    task.add_done_callback(_background_tasks.discard)
+    return task
+
+
+def _make_timer_callbacks(uname, host, ctx):
+    """Return (on_overdue, on_unknown) async callbacks for connection timer logic.
+
+    uname -- short host name used in event-log entries and notifications
+    host  -- host object whose alert_states and stateinfo() are used
+    ctx   -- runtime context; "msg_to_websockets", "threshold_checker" and
+             "config" are read from it
+
+    Captured values are bound at call time so callbacks are safe to use in loops.
+    """
+    msg_to_websockets = ctx.get("msg_to_websockets")
+    threshold_checker = ctx.get("threshold_checker")
+    cfg = ctx.get("config", {})
+
+    async def on_unknown(connection):
+        """Move *connection* to UNKNOWN and publish the host state."""
+        connection.newstate(connection.__class__.UNKNOWN, connection.lastbeat)
+        # Keep connectivity alert active when host transitions to unknown
+        if msg_to_websockets:
+            msg_to_websockets("host", host.stateinfo())
+
+    async def on_overdue(connection):
+        """Mark an UP *connection* as OVERDUE, raise alerts, arm the drop timer."""
+        # Only an UP connection can become overdue.
+        if connection.getstate() != connection.__class__.UP:
+            return
+        now = time.time()
+        connection.newstate(connection.__class__.OVERDUE, now, cfg.get("grace", 2))
+        msg = f"{connection.afam} overdue"
+        eventlog(uname, "CRITICAL", msg)
+        if host.watched:
+            # Fire-and-forget, but referenced so the task cannot be collected
+            # before the notification has been sent.
+            _spawn_background(notify_mod.send_notification(
+                uname,
+                notify_mod.Notification(title=f"[CRITICAL] {uname}", body=msg, level="CRITICAL"),
+            ))
+        # Track in alert_states so the Alerts Dashboard shows this
+        _set_connectivity_alert(host, connection.afam, "CRITICAL")
+        if threshold_checker:
+            # An infinite rtt value is fed to the checker for the "rtt" metric.
+            threshold_checker.check_value(
+                host_name=uname,
+                metric_path="rtt",
+                value=float("inf"),
+                alert_states=host.alert_states,
+            )
+        if msg_to_websockets:
+            msg_to_websockets("host", host.stateinfo())
+        # After DROPOVERDUE more seconds the connection is moved to UNKNOWN.
+        connection.reset_overdue_timer(DROPOVERDUE, on_unknown)
+
+    return on_overdue, on_unknown
+
+
+def restore_connection_timers(hbdclass, ctx):
+    """Re-arm connection timers for every loaded host after a pickle restore.
+
+    UP connections (on hosts with a positive interval) get a new overdue timer
+    derived from the time since their last beat, with extra startup slack.
+    OVERDUE connections get their UNKNOWN drop timer back, or are moved to
+    UNKNOWN at once when the drop window has already passed.  DOWN
+    connections are left alone.
+    """
+    started = time.time()
+    grace = ctx.get("config", {}).get("grace", 2)
+    Connection = hbdclass.Connection
+
+    count = 0
+    for uname, host in list(hbdclass.Host.hosts.items()):
+        interval = host.interval
+        for afam, conn in list(host.connections.items()):
+            state = conn.getstate()
+            if state == Connection.DOWN:
+                continue
+
+            on_overdue, on_unknown = _make_timer_callbacks(uname, host, ctx)
+
+            if state == Connection.UP and interval > 0:
+                since_beat = started - conn.lastbeat
+                # Hosts get at least one full (interval + grace) after startup,
+                # so a host that was silent while hbd was down is not flagged
+                # overdue before it has had a chance to check in.
+                startup_grace = interval + grace
+                remaining = max(startup_grace, 2 * startup_grace - since_beat)
+                conn.reset_overdue_timer(remaining, on_overdue)
+                logger.debug(
+                    "Restored UP timer %s/%s: %.0fs remaining (elapsed %.0fs, startup grace %.0fs)",
+                    uname, afam, remaining, since_beat, startup_grace,
+                )
+                count += 1
+                continue
+
+            if state != Connection.OVERDUE:
+                continue
+
+            overdue_for = started - conn.statetime
+            remaining = DROPOVERDUE - overdue_for
+            if remaining > 1:
+                conn.reset_overdue_timer(remaining, on_unknown)
+                logger.debug(
+                    "Restored OVERDUE timer %s/%s: %.0fs remaining",
+                    uname, afam, remaining,
+                )
+            else:
+                # Drop window already passed: mark UNKNOWN immediately
+                conn.newstate(Connection.UNKNOWN, conn.lastbeat)
+                logger.info(
+                    "Marking %s/%s UNKNOWN (overdue %.1f days)",
+                    uname, afam, overdue_for / 86400,
+                )
+            count += 1
+
+    logger.info("Restored timers for %d connection(s)", count)
+
+
+def handle_datagram(msg: dict, addr, transport, ctx: dict):
+    """Handle a parsed datagram message.
+
+    ctx is a dictionary with runtime dependencies:
+      - config: dict of configuration
+      - hbdclass: module providing Host/Connection classes
+      - log: callable(loghost, message)
+      - msg_to_websockets: callable(typ, data)
+      - msg_journal: MessageJournal instance for logging all messages
+      - DEBUG, verbose
+    """
+    if not msg:
+        return
+    now = ctx.get("recv_ts") or time.time()
+    
+    # Log message to journal
+    msg_journal = ctx.get("msg_journal")
+    if msg_journal:
+        # Create async task to log message (non-blocking)
+        import asyncio
+        try:
+            loop = asyncio.get_event_loop()
+            loop.create_task(msg_journal.log_message(msg, addr, now))
+        except Exception as e:
+            logger.debug(f"Failed to log message to journal: {e}")
+    
+    cfg = ctx.get("config", {})
+    hbdcls = ctx.get("hbdclass")
+    msg_to_websockets = ctx.get("msg_to_websockets")
+    DEBUG = ctx.get("DEBUG", 0)
+    verbose = ctx.get("verbose", False)
+
+    # normalize addr (ip, port)
+    ip = addr[0] if isinstance(addr, (list, tuple)) else addr
+    name = msg.get("name", "unknown")
+    from ..common.utils import shortname
+    from . import config as config_mod
+
+    uname = shortname(name)
+
+    if uname not in hbdcls.Host.hosts:
+        host = hbdcls.Host(uname)
+        # Use new config function to check dyndns
+        dyndnshosts = config_mod.get_dyndnshosts(cfg)
+        host.dyn = uname in dyndnshosts
+        # Apply user-access settings from config
+        access = config_mod.get_host_access(cfg, uname)
+        host.apply_access(access["owner"], access["managers"], access["monitors"])
+        logger.info("New host signed on: %s (dyn=%s, access=%s)", uname, host.dyn, access)
+        newh = True
+    else:
+        host = hbdcls.Host.hosts[uname]
+        newh = False
+    
+    cid = msg.get("id", 0)
+    try:
+        rtt = float(msg.get("rtt"))
+    except TypeError:
+        rtt = None
+
+    if msg.get("ID") == "HTB":
+        host.doesack = msg.get("acks", -1)
+        # send ACK back; ask client to resend plugin info when we have none yet
+        rmsg = {"time": time.time()}
+        if not host.plugin_data:
+            rmsg["request_update"] = 1
+        opkt = dicttos("ACK", rmsg)
+        try:
+            transport.sendto(opkt, addr)
+        except Exception as e:
+            if DEBUG > 0:
+                print(("cannot send ack: %s" % e))
+
+    elif msg.get("ID") == "PLG":
+        # Handle plugin data message
+        plugin_name = msg.get("plugin")
+        if plugin_name:
+            # Extract plugin fields, dropping protocol metadata fields
+            plugin_data = {k: v for k, v in msg.items()
+                           if k not in ("ID", "plugin", "id", "name")}
+            # Store plugin data with timestamp
+            host.add_plugin_data(plugin_name, plugin_data, timestamp=now)
+
+            # If os_info reports an owner and none is configured server-side, apply it
+            if plugin_name == "os_info":
+                config_owner =  config_mod.get_host_access(cfg, uname).get("owner")
+                default_owner = config_mod.get_default_owner(cfg)
+                inferred_owner = plugin_data.get("owner", config_owner or default_owner)    
+                host.owner = inferred_owner
+                logger.info(f"owner for {uname} is '{host.owner}")
+            if DEBUG > 1:
+                print(f"Stored plugin data for {uname}: {plugin_name}")
+            
+            # Check thresholds if checker is available
+            threshold_checker = ctx.get("threshold_checker")
+            if threshold_checker:
+                try:
+                    state_changes = threshold_checker.check_plugin_data(
+                        host_name=uname,
+                        plugin_name=plugin_name,
+                        data=plugin_data,
+                        alert_states=host.alert_states,
+                    )
+                    if DEBUG > 1 and state_changes:
+                        print(f"Threshold state changes for {uname}: {state_changes}")
+                except Exception as e:
+                    logger.error(f"Error checking thresholds for {uname}.{plugin_name}: {e}")
+            
+            # Notify websockets of plugin update
+            if msg_to_websockets:
+                try:
+                    msg_to_websockets("plugin", {
+                        "host": uname,
+                        "plugin": plugin_name,
+                        "data": plugin_data,
+                        "timestamp": now
+                    })
+                except Exception:
+                    pass
+
+    try:
+        conn, res = host.conndata(cid, ip, rtt, now)
+    except Exception as e:
+        if DEBUG > 0:
+            print("conndata failed: %s" % e)
+        return
+
+    if res:
+        eventlog(uname, "WARNING", res)
+        if host.watched:
+            asyncio.create_task(notify_mod.send_notification(
+                uname,
+                notify_mod.Notification(title=f"[WARNING] {uname}", body=res, level="WARNING"),
+            ))
+
+    interval = int(msg.get("interval", 0) or 0)
+    shutdown = msg.get("shutdown", 0)
+    service = msg.get("service", "unknown")
+    message = msg.get("msg", None)
+    boot = msg.get("boot", 0)
+
+    if boot:
+        eventlog(uname, "INFO", "booted")
+        if host.watched:
+            asyncio.create_task(notify_mod.send_notification(
+                uname,
+                notify_mod.Notification(title=f"[INFO] {uname}", body=f"{host.name} booted", level="INFO"),
+            ))
+    if message:
+        eventlog(uname, "INFO", "msg: %s" % message, service=service)
+
+    if conn.getstate() != hbdcls.Connection.UP:
+        lasts = conn.state
+        d = conn.newstate(hbdcls.Connection.UP, now)
+        # Clear connectivity alert now that the host is back up
+        _set_connectivity_alert(host, conn.afam, "OK")
+        # Don't log/notify RECOVER for a brand-new host seen for the first time —
+        # it was never down, it just hasn't been seen before.
+        if not newh:
+            if d == 0 or lasts == "unknown":
+                m = "%s is up" % (conn.afam)
+            elif d < 4:
+                # Transient blip (likely client restart) — skip log and notification
+                m = None
+            else:
+                m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d))
+            if m:
+                eventlog(uname, "RECOVER", m)
+                if host.watched:
+                    asyncio.create_task(notify_mod.send_notification(
+                        uname,
+                        notify_mod.Notification(title=f"[RECOVER] {uname}", body=m, level="RECOVER"),
+                    ))
+
+    if boot or newh:
+        host.upcount = host.doesack
+    else:
+        host.upcount += 1
+
+    if shutdown:
+        m = "%s shutdown" % conn.afam
+        eventlog(uname, "INFO", m)
+        if host.watched:
+            asyncio.create_task(notify_mod.send_notification(
+                uname,
+                notify_mod.Notification(title=f"[INFO] {uname}", body=m, level="INFO"),
+            ))
+        conn.newstate(hbdcls.Connection.DOWN, now)
+        _set_connectivity_alert(host, conn.afam, "CRITICAL")
+
+    if interval > 0:
+        host.interval = interval
+
+    # Timer-based reachability monitoring
+    # Reset overdue timer on every heartbeat
+    if interval > 0 and conn.getstate() != hbdcls.Connection.DOWN:
+        grace = cfg.get("grace", 2)
+        timeout_seconds = interval + grace
+        on_overdue, _ = _make_timer_callbacks(uname, host, ctx)
+        conn.reset_overdue_timer(timeout_seconds, on_overdue)
+    
+    # Check RTT thresholds using the threshold checker
+    threshold_checker = ctx.get("threshold_checker")
+    if threshold_checker and rtt and rtt > 0:
+        # Metric path for RTT is simply "rtt"
+        metric_path = "rtt"
+        
+        # Check against configured thresholds (handles alerts, notifications, etc.)
+        threshold_checker.check_value(
+            host_name=uname,
+            metric_path=metric_path,
+            value=rtt,
+            alert_states=host.alert_states
+        )
+
+    # send any commands we have queued
+    while len(host.cmds):
+        op, rmsg = host.cmds[0]
+        if op == "CMD":
+            del host.cmds[0]
+            eventlog(uname, "INFO", "command sent")
+        elif op == "UPD":
+            del host.cmds[0]
+            eventlog(uname, "INFO", "update initiated")
+        opkt = dicttos(op, rmsg)
+        try:
+            transport.sendto(opkt, addr)
+        except Exception as e:
+            if DEBUG > 0:
+                print(("cannot send cmd/update: %s" % e))
+
+    if msg_to_websockets:
+        try:
+            msg_to_websockets("host", host.stateinfo())
+        except Exception as e:
+            if DEBUG > 0:
+                print(("cannot send websocket message: %s" % e))
@@ -0,0 +1,271 @@
+"""User management: loading, authentication, and session tracking.
+
+Users are defined in the config file under the ``users`` key:
+
+    users:
+      alice:
+        full_name: Alice Smith
+        avatar: /path/to/avatar.png   # file path, URL, or base64 data URI
+        password: pbkdf2:sha256:...   # generated with: hbd passwd
+        admin: true                   # optional server-level admin
+        notification_channels: [pushover_standard]
+
+Roles are assigned per-host:
+
+    hosts:
+      webserver01:
+        owner: alice
+        managers: [bob]
+        monitors: [carol]
+
+If no users are defined the server runs in unauthenticated mode (backwards
+compatible).  When users are defined every API call must carry a valid session
+token in an ``Authorization: Bearer <token>`` or ``X-Auth-Token`` header,
+obtained via ``POST /api/0/auth/login``.
+"""
+
+import hashlib
+import hmac
+import logging
+import secrets
+import time
+
+logger = logging.getLogger(__name__)
+
+# Session lifetime in seconds (24 hours).
+SESSION_TTL = 86400
+
+# Global session store: token -> {"username": str, "expires": float, "created": float}
+_sessions: dict = {}
+
+
+# ---------------------------------------------------------------------------
+# User class
+# ---------------------------------------------------------------------------
+
+class User:
+    """An account known to the server.
+
+    Carries display metadata (full name, avatar), the stored password hash
+    (empty when the account has no password), the server-level admin flag and
+    the list of notification channel names attached to the account.
+    """
+
+    def __init__(
+        self,
+        username: str,
+        full_name: str = "",
+        avatar: str = "",
+        password_hash: str = "",
+        admin: bool = False,
+        notification_channels: list | None = None,
+    ):
+        self.username = username
+        self.full_name = full_name
+        self.avatar = avatar
+        self.password_hash = password_hash
+        self.admin = admin
+        # A missing or empty channel list is replaced by a fresh empty list.
+        self.notification_channels: list = (
+            notification_channels if notification_channels else []
+        )
+
+    def check_password(self, password: str) -> bool:
+        """Return True only when a hash is stored and *password* matches it."""
+        if self.password_hash:
+            return _verify_password(password, self.password_hash)
+        return False
+
+    def avatar_is_local(self) -> bool:
+        """Tell whether the avatar is a local filesystem path (begins with '/')."""
+        if not self.avatar:
+            return False
+        return bool(self.avatar.startswith("/"))
+
+    def avatar_url(self) -> str:
+        """Return the value to place in an <img src> attribute.
+
+        A local file path is replaced by the /api/0/users/{username}/avatar
+        endpoint; anything else (external URL, data URI, empty string) is
+        handed back unchanged.
+        """
+        return (
+            f"/api/0/users/{self.username}/avatar"
+            if self.avatar_is_local()
+            else self.avatar
+        )
+
+    def to_dict(self) -> dict:
+        """Serialise the account for API output; the password hash is left out."""
+        public = {
+            field: getattr(self, field)
+            for field in ("username", "full_name", "avatar")
+        }
+        public["avatar_url"] = self.avatar_url()
+        public["admin"] = self.admin
+        public["notification_channels"] = self.notification_channels
+        return public
+
+
+# ---------------------------------------------------------------------------
+# Password hashing  (PBKDF2-HMAC-SHA256, stdlib only)
+# ---------------------------------------------------------------------------
+
+def hash_password(password: str) -> str:
+    """Derive a storable hash string from *password*.
+
+    The result has the form ``pbkdf2:sha256:<iterations>:<salt>:<hex-digest>``
+    and is what belongs in the ``password`` field of a user in the config
+    file.  It can be produced from a shell with::
+
+        python -c "from hbd.server.users import hash_password; print(hash_password('secret'))"
+
+    or through the CLI::
+
+        hbd passwd
+    """
+    rounds = 260_000
+    # The hex text of the random salt (not its raw bytes) is what gets fed to
+    # PBKDF2 and what is stored in the result.
+    salt_hex = secrets.token_hex(16)
+    digest_hex = hashlib.pbkdf2_hmac(
+        "sha256",
+        password.encode("utf-8"),
+        salt_hex.encode("utf-8"),
+        rounds,
+    ).hex()
+    return ":".join(("pbkdf2", "sha256", str(rounds), salt_hex, digest_hex))
+
+
+def _verify_password(password: str, stored_hash: str) -> bool:
+    """Check *password* against a ``pbkdf2:sha256:<n>:<salt>:<hex>`` hash.
+
+    Returns False for a mismatch, for a hash in any other format, and for
+    any error raised while parsing or recomputing the digest.
+    """
+    try:
+        fields = stored_hash.split(":")
+        well_formed = (
+            len(fields) == 5 and fields[0] == "pbkdf2" and fields[1] == "sha256"
+        )
+        if not well_formed:
+            return False
+        rounds = int(fields[2])
+        salt, expected_hex = fields[3], fields[4]
+        candidate_hex = hashlib.pbkdf2_hmac(
+            "sha256",
+            password.encode("utf-8"),
+            salt.encode("utf-8"),
+            rounds,
+        ).hex()
+        # Constant-time comparison of the two hex digests.
+        return hmac.compare_digest(candidate_hex, expected_hex)
+    except Exception:
+        return False
+
+
+# ---------------------------------------------------------------------------
+# Global user registry
+# ---------------------------------------------------------------------------
+
+# username -> User
+users: dict = {}
+
+
+def load_users(config: dict) -> dict:
+    """Populate the global user registry from *config*.
+
+    Called once at startup and again on SIGHUP config reload.
+
+    Users listed under ``config["users"]`` replace the registry.  Accounts
+    already in the registry that have no password hash (OAuth-provisioned)
+    and are not named in the config are carried over unchanged.
+
+    A ``users`` value that is not a mapping is treated as "no configured
+    users"; unless it is simply absent/null this is logged as a warning,
+    because it leaves the server without any configured account.
+
+    Returns the new ``users`` dict.
+    """
+    global users
+    old_users = dict(users)  # snapshot before rebuild
+
+    users_cfg = config.get("users", {})
+    if not isinstance(users_cfg, dict):
+        # A bare ``users:`` key loads as None and just means "none defined".
+        # Any other shape is a config mistake that would otherwise drop every
+        # configured account without a trace.
+        if users_cfg is not None:
+            logger.warning(
+                "Ignoring 'users' config: expected a mapping, got %s",
+                type(users_cfg).__name__,
+            )
+        users_cfg = {}
+
+    result: dict = {}
+    for username, attrs in users_cfg.items():
+        if not isinstance(attrs, dict):
+            logger.warning("Skipping user %r: expected a mapping", username)
+            continue
+        result[username] = User(
+            username=username,
+            full_name=attrs.get("full_name", ""),
+            avatar=attrs.get("avatar", ""),
+            password_hash=attrs.get("password", ""),
+            admin=bool(attrs.get("admin", False)),
+            notification_channels=attrs.get("notification_channels", []),
+        )
+    configured = len(result)
+
+    # Preserve OAuth-provisioned users (password_hash == "") that aren't in config.
+    for username, existing_user in old_users.items():
+        if not existing_user.password_hash and username not in result:
+            result[username] = existing_user
+
+    users = result
+    logger.info("Loaded %d user(s) from config", configured)
+    preserved = len(result) - configured
+    if preserved:
+        logger.info("Preserved %d password-less user(s) not in config", preserved)
+    return users
+
+
+def users_enabled() -> bool:
+    """Report whether the registry holds any user (auth-required mode)."""
+    return len(users) > 0
+
+
+def get_user(username: str) -> "User | None":
+    """Look up *username* in the registry; None when there is no such user."""
+    try:
+        return users[username]
+    except KeyError:
+        return None
+
+
+def authenticate(username: str, password: str) -> "User | None":
+    """Check a username/password pair against the registry.
+
+    Returns the matching User when the name is known and the password
+    verifies; returns None in every other case.
+    """
+    account = users.get(username)
+    if not account:
+        return None
+    return account if account.check_password(password) else None
+
+
+def provision_oauth_user(username: str, full_name: str, avatar: str) -> "User":
+    """Register or refresh a user that arrived through an OAuth2 provider.
+
+    An already-known user (for instance one defined in config with a
+    password) only has its display name and avatar overwritten, and only by
+    non-empty values; everything else on the account is left alone.  An
+    unknown username gets a new account with no password_hash, so it can
+    only sign in through OAuth.
+    """
+    known = users.get(username)
+    if known is not None:
+        # Refresh display data, never blanking out what is already there.
+        if full_name:
+            known.full_name = full_name
+        if avatar:
+            known.avatar = avatar
+        return known
+
+    created = User(username=username, full_name=full_name, avatar=avatar)
+    users[username] = created
+    logger.info("Provisioned OAuth user %r", username)
+    return created
+
+
+# ---------------------------------------------------------------------------
+# Session management
+# ---------------------------------------------------------------------------
+
+def create_session(username: str) -> str:
+    """Create a new session for *username* and return the opaque token.
+
+    Expired sessions are purged first.  The token is 32 random bytes in hex;
+    the stored record holds the username plus ``created`` and ``expires``
+    timestamps exactly ``SESSION_TTL`` seconds apart.
+    """
+    _purge_expired_sessions()
+    token = secrets.token_hex(32)
+    # Read the clock once so that expires - created is exactly SESSION_TTL.
+    now = time.time()
+    _sessions[token] = {
+        "username": username,
+        "expires": now + SESSION_TTL,
+        "created": now,
+    }
+    return token
+
+
+def get_session_user(token: str) -> "User | None":
+    """Resolve a session *token* to its User.
+
+    Returns None for an empty token, an unknown token, or an expired one (an
+    expired session is removed from the store as a side effect).  Otherwise
+    returns whatever ``get_user`` yields for the session's username.
+    """
+    record = _sessions.get(token) if token else None
+    if not record:
+        return None
+    if time.time() > record["expires"]:
+        # Lazily evict the stale entry.
+        del _sessions[token]
+        return None
+    return get_user(record["username"])
+
+
+def delete_session(token: str) -> None:
+    """Drop the session stored under *token* (logout); unknown tokens are ignored."""
+    if token in _sessions:
+        del _sessions[token]
+
+
+def _purge_expired_sessions() -> None:
+    """Remove every session whose ``expires`` timestamp lies in the past."""
+    cutoff = time.time()
+    # Collect first, delete afterwards, so the store is not modified while
+    # it is being scanned.
+    stale = []
+    for token, record in list(_sessions.items()):
+        if record["expires"] < cutoff:
+            stale.append(token)
+    for token in stale:
+        del _sessions[token]
+
+
+def save_sessions() -> dict:
+    """Purge expired sessions, then return a shallow copy of the store for pickling."""
+    _purge_expired_sessions()
+    return {token: record for token, record in _sessions.items()}
+
+
+def load_sessions(snapshot: dict) -> None:
+    """Restore sessions from a pickled snapshot, dropping any that have expired.
+
+    The snapshot is untrusted on-disk state, so it is validated rather than
+    assumed well-formed: a snapshot that is not a dict restores nothing, and
+    an entry is kept only when it is a dict with a ``username`` and a numeric
+    ``expires`` that is still in the future.
+    """
+    global _sessions
+    now = time.time()
+    restored: dict = {}
+    if isinstance(snapshot, dict):
+        for token, record in snapshot.items():
+            if not isinstance(record, dict) or "username" not in record:
+                continue
+            expires = record.get("expires", 0)
+            if isinstance(expires, (int, float)) and expires > now:
+                restored[token] = record
+    elif snapshot is not None:
+        logger.warning(
+            "Ignoring session snapshot: expected a dict, got %s",
+            type(snapshot).__name__,
+        )
+    _sessions = restored
+    logger.debug("Restored %d session(s) from pickle", len(_sessions))
@@ -0,0 +1,158 @@
+"""WebSocket handler and broadcast helpers for hbd.
+
+WebSocket connections are served through the regular HTTP port via the
+/ws route registered in http.py (aiohttp WebSocketResponse upgrade).
+The separate standalone WebSocket server on ws_port is no longer used.
+"""
+
+import asyncio
+import json
+import logging
+from typing import Callable, Iterable, Optional
+from . import data
+
+logger = logging.getLogger(__name__)
+
+# Map of WebSocket → User object (or None when auth is disabled)
+_connections: dict = {}
+_loop: Optional[asyncio.AbstractEventLoop] = None
+_get_hosts: Optional[Callable[[], Iterable]] = None
+_verbose: bool = False
+
+
+def setup(
+    loop: asyncio.AbstractEventLoop,
+    get_hosts: Optional[Callable[[], Iterable]] = None,
+    verbose: bool = False,
+):
+    """Store the event loop, the initial-state callback and the verbose flag.
+
+    Meant to be called once from _run_async before the HTTP server starts;
+    the values are kept in module-level state used by the handler and by
+    broadcast().
+    """
+    global _loop, _get_hosts, _verbose
+    _loop, _get_hosts, _verbose = loop, get_hosts, verbose
+
+
+def _user_can_see_host(user, host_name: str) -> bool:
+    """Decide whether *user* may receive updates about *host_name*.
+
+    Everything is visible when there is no user object, when no users are
+    configured, or when the user is an admin.  Otherwise the host must be
+    known and its ``is_manager`` check must accept the username.
+    """
+    from . import hbdclass, users as users_mod
+
+    unrestricted = user is None or not users_mod.users_enabled() or user.admin
+    if unrestricted:
+        return True
+
+    target = hbdclass.Host.hosts.get(host_name)
+    if target is None:
+        # Unknown hosts are hidden from non-admin users.
+        return False
+    return target.is_manager(user.username)
+
+
+def _get_token(request) -> str:
+    """Pull the session token out of *request* (mirrors logic in http.py).
+
+    Sources are tried in order: an ``Authorization: Bearer`` header (value
+    stripped of surrounding whitespace), the ``X-Auth-Token`` header, then
+    the ``hbd_session`` cookie.  Returns "" when none of them carries a token.
+    """
+    headers = request.headers
+    authorization = headers.get("Authorization", "")
+    if authorization.startswith("Bearer "):
+        # A Bearer header wins even if the token after it is empty.
+        return authorization[len("Bearer "):].strip()
+    return headers.get("X-Auth-Token", "") or request.cookies.get("hbd_session", "")
+
+
+async def handler(request):
+    """aiohttp WebSocket upgrade handler — register as GET /ws.
+
+    Resolves the caller's session token to a user, upgrades the request to a
+    WebSocket, replays the current host states and recent messages that the
+    user may see, then keeps the connection open (incoming frames are
+    ignored) until the peer closes it or an error occurs.
+
+    Raises:
+        aiohttp.web.HTTPUnauthorized: users are configured but the request
+            carries no valid session token.
+    """
+    from aiohttp import web, WSMsgType
+    from . import users as users_mod
+
+    token = _get_token(request)
+    user = users_mod.get_session_user(token) if token else None
+
+    # Fail closed.  A None user is what the visibility checks take to mean
+    # "authentication is disabled", so when users are configured an
+    # unauthenticated client must be rejected here — before the upgrade —
+    # or it would be sent every host.
+    if user is None and users_mod.users_enabled():
+        raise web.HTTPUnauthorized(text="authentication required")
+
+    ws = web.WebSocketResponse()
+    await ws.prepare(request)
+
+    _connections[ws] = user
+    remote = request.remote
+    logger.info("WebSocket connected from %s", remote)
+
+    try:
+        # Send current host state, filtered to hosts this user may see
+        if _get_hosts:
+            try:
+                for h in list(_get_hosts()):
+                    host_name = h.get("raw_name") or h.get("name", "")
+                    if _user_can_see_host(user, host_name):
+                        await ws.send_str(json.dumps({"type": "host", "data": h}))
+            except Exception as e:
+                logger.error("Error sending initial hosts: %s", e)
+
+        # Send recent messages, filtered to hosts this user may see.
+        # Messages without a host name are sent to everyone.
+        if data.msgs:
+            try:
+                for m in data.msgs:
+                    host_name = m.get("host") if isinstance(m, dict) else None
+                    if not host_name or _user_can_see_host(user, host_name):
+                        await ws.send_str(json.dumps({"type": "message", "data": m}))
+            except Exception as e:
+                logger.error("Error sending initial messages: %s", e)
+
+        # Keep connection open, ignore incoming frames
+        async for msg in ws:
+            if msg.type == WSMsgType.TEXT:
+                if _verbose:
+                    logger.debug("ws recv from %s: %s", remote, msg.data)
+            elif msg.type in (WSMsgType.ERROR, WSMsgType.CLOSE):
+                break
+
+    except Exception as e:
+        logger.exception("WebSocket handler error from %s: %s", remote, e)
+    finally:
+        # Always deregister so broadcast() stops targeting this socket.
+        _connections.pop(ws, None)
+        logger.info("WebSocket disconnected from %s", remote)
+
+    return ws
+
+
+def broadcast(typ: str, payload) -> bool:
+    """Thread-safe broadcast to all connected WebSocket clients.
+
+    For "host", "plugin" and "message" payloads that are dicts naming a
+    host, the message is only sent to clients whose user passes
+    ``_user_can_see_host`` for that host.  Everything else — other message
+    types, non-dict payloads, payloads without a host name — is sent to all
+    clients.
+
+    Can be called from any thread; schedules sends on the event loop.
+    Sockets that are closed or fail on send are dropped from the registry.
+
+    Returns False (sending nothing) if no loop has been registered yet or
+    the registered loop is already closed, True once the send is scheduled.
+    """
+    # run_coroutine_threadsafe raises on a closed loop, so check up front.
+    if not _loop or _loop.is_closed():
+        return False
+
+    # Determine the host name for access-filtered message types.  Only dict
+    # payloads can carry one.
+    host_name: Optional[str] = None
+    if isinstance(payload, dict):
+        if typ in ("host", "plugin"):
+            host_name = (
+                payload.get("raw_name") or payload.get("host") or payload.get("name")
+            )
+        elif typ == "message":
+            host_name = payload.get("host")
+
+    # Serialise once; every recipient gets the same text frame.
+    jmsg = json.dumps({"type": typ, "data": payload})
+
+    async def _send_all():
+        dead = set()
+        for ws, user in list(_connections.items()):
+            try:
+                if ws.closed:
+                    dead.add(ws)
+                    continue
+                if host_name is not None and not _user_can_see_host(user, host_name):
+                    continue
+                await ws.send_str(jmsg)
+            except Exception:
+                # A failed send marks the socket for removal.
+                dead.add(ws)
+        for ws in dead:
+            _connections.pop(ws, None)
+
+    asyncio.run_coroutine_threadsafe(_send_all(), _loop)
+    return True
+
+
+def connection_count() -> int:
+    """Return how many WebSocket connections are currently registered."""
+    active = len(_connections)
+    return active
@@ -1,7 +0,0 @@
-<head>
-    <meta http-equiv="content-type" content="text/html; charset=utf-8" />
-    <link rel="stylesheet" href="/static/style.css" type="text/css" />
-    <link rel="icon" href="/static/images/favicon.ico" sizes="32x32" />
-    <title>{{ title }}</title>
-    <script src="{{ extra_scripts }}"></script>
-</head>
@@ -1,281 +0,0 @@
-<!DOCTYPE html>
-<html>
-  {% include 'head.html' %}
-
-  <style>
-    .content {
-      display: flex;  
-      flex-direction: column;
-    }
-
-    .table {
-    /*  flex: 1; */
-      flex-grow: none;
-    }
-
-    .log {
-      flex: 2;
-      flex-grow: 1;
-
-    }
-
-    #ntable {
-      border-collapse: collapse;
-      font-size: 95%;
-      /*  width: 100%; */
-    }
-
-    #ntable td,
-    #ntable th {
-      border: 1px solid #ddd;
-      text-align: left;
-      padding: 0px;
-    }
-
-    #ntable tr:nth-child(even) {
-      background-color: #f2f2f2;
-    }
-
-    #ntable tr:hover {
-      background-color: #ddd;
-    }
-
-    #ntable th {
-      padding-top: 12px;
-      padding-bottom: 12px;
-      background-color: #9d9d9d;
-      color: white;
-    }
-
-    #ntable
-      th:not(.sorttable_sorted):not(.sorttable_sorted_reverse):not(.sorttable_nosort):after {
-      content: " \2195";
-    }
-
-    /* Modal for connection status messages */
-    .connection-modal {
-      display: none;
-      position: fixed;
-      z-index: 1000;
-      left: 0;
-      top: 0;
-      width: 100%;
-      height: 100%;
-      background-color: rgba(0, 0, 0, 0.4);
-    }
-
-    .connection-modal.show {
-      display: flex;
-      justify-content: center;
-      align-items: center;
-    }
-
-    .connection-modal-content {
-      background-color: #f9f9f9;
-      padding: 20px;
-      border: 1px solid #888;
-      border-radius: 5px;
-      text-align: center;
-      box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
-      min-width: 300px;
-    }
-
-    .connection-modal-content p {
-      margin: 10px 0;
-      font-size: 16px;
-      color: #333;
-    }
-  </style>
-  <script type="text/javascript">
-    var cnt = 0;
-    var nTable = document;
-    var name_idx = {};
-    var c = 0;
-
-    function setup() {
-      name_idx = {};
-      nTable = document.getElementById("ntable");
-      for (var i = 0, row; (row = nTable.rows[i]); i++) {
-        if (i == 0) continue;
-        name = nTable.rows[i].cells[0].innerText;
-        name_idx[name] = nTable.rows[i];
-        /* console.log("name_Id[" + name + "]: " + name_idx[name].innerText); */
-      }
-    }
-
-    function createRow(data) {
-      var row = document.createElement("tr");
-      var c_name = document.createElement("td");
-      var c_ver = document.createElement("td");
-      var c_ipv4addr = document.createElement("td");
-      var c_ipv4state = document.createElement("td");
-      var c_ipv4latency = document.createElement("td");
-      c_ipv4latency.style.textAlign = "right";
-      var c_ipv4statets = document.createElement("td");
-      c_ipv4statets.style.textAlign = "right";
-      var c_ipv6addr = document.createElement("td");
-      var c_ipv6state = document.createElement("td");
-      var c_ipv6latency = document.createElement("td");
-      c_ipv6latency.style.textAlign = "right";
-      var c_ipv6statets = document.createElement("td");
-      c_ipv6statets.style.textAlign = "right";
-      row.appendChild(c_name);
-      row.appendChild(c_ver);
-      row.appendChild(c_ipv4addr);
-      row.appendChild(c_ipv4state);
-      row.appendChild(c_ipv4latency);
-      row.appendChild(c_ipv4statets);
-      row.appendChild(c_ipv6addr);
-      row.appendChild(c_ipv6state);
-      row.appendChild(c_ipv6latency);
-      row.appendChild(c_ipv6statets);
-      if (data.dyn) {
-        c_name.innerHTML = "<b>" + data.name + "</b>";
-      } else {
-        c_name.innerHTML = data.name;
-      }
-      c_ver.innerHTML = data.cver;
-      c_ipv4addr.innerHTML = data.connections[0].addr;
-      c_ipv4state.innerHTML = data.connections[0].state;
-      if (data.connections.length > 1) {
-        c_ipv6addr.innerHTML = data.connections[1].addr;
-        c_ipv6state.innerHTML = data.connections[1].state;
-      }
-      var table = document.getElementById("ntablebody"); // find table to append to
-      table.appendChild(row); // append row to table
-      name_idx[c_name] = row;
-    }
-
-    function formatTS(ts) {
-      const milliseconds = ts * 1000;
-      const dateObject = new Date(milliseconds);
-      return dateObject.toLocaleString("de-DE");
-    }
-
-    function update_table(data) {
-      if (!(data.name in name_idx)) {
-        createRow(data);
-        setup();
-      }
-
-      for  (var i = 0; i < data.connections.length; i++) {
-        name_idx[data.name].cells[2 + i * 4].innerHTML = data.connections[i].addr;
-        name_idx[data.name].cells[5 + i * 4].innerHTML = formatTS(
-          data.connections[i].statetime
-        );
-        if (data.connections[i].state == "up") {
-          state = "up";
-          latency = Number.parseFloat(data.connections[i].rtts[0]).toFixed(2);
-        } else {
-          if (data.connections[i].state == "unknown") {
-            state = "";
-            latency = "";
-            name_idx[data.name].cells[2 + i * 4].innerHTML = "";
-            name_idx[data.name].cells[5 + i * 4].innerHTML = "";
-          } else {
-            state = "<b>" + data.connections[i].state + "</b>";
-            latency = "-";
-          }
-        }
-        name_idx[data.name].cells[3 + i * 4].innerHTML = state;
-        name_idx[data.name].cells[4 + i * 4].innerHTML = latency;
-      }
-    }
-
-    function WS_Connect() {
-      if ("WebSocket" in window) {
-        //N.B: subprotocol field causes chrome to error 1006
-        var ws_hbd = new WebSocket("{{heartbeat_ws_url}}", /* "hdb" */ );
-
-        ws_hbd.onopen = function () {
-          // Web Socket is connected, send data using send()
-          console.log("ws connect {{heartbeat_ws_url}}");
-          // Hide modal window if visible
-          var modal = document.getElementById("connectionModal");
-          if (modal) {
-            modal.classList.remove("show");
-          }
-          ws_hbd.send("heartbeat_web");
-        };
-
-        ws_hbd.onerror = function (event) {
-          console.log(event);
-        };
-
-        ws_hbd.onmessage = function (event) {
-          /*      console.log(event.data); */
-          var state = JSON.parse(event.data);
-          /* console.log("State: " + state.type); */
-          if (state.type == "host") {
-            update_table(state.data);
-          } else if (state.type == "message") {
-            var msgs = document.getElementById("messages");
-            msgs.insertAdjacentHTML("afterbegin", state.data + "<br>");
-          }
-          cnt++;
-        };
-
-        ws_hbd.onclose = function (event) {
-          /*     console.log(event); */
-          console.log("Connection is closed, reopening");
-          // Show modal window
-          var modal = document.getElementById("connectionModal");
-          if (modal) {
-            modal.classList.add("show");
-          }
-          setTimeout(function () {
-            WS_Connect();
-          }, 3000);
-        };
-      } else {
-        // The browser doesn't support WebSocket
-        console.log("WebSocket NOT supported by your Browser!");
-      }
-    }
-    WS_Connect();
-  </script>
-  <body>
-    {% include 'menu.html' %}
-
-    <div id="content" class="content" style="overflow: hidden">
-      <div id="table" class="table" style="overflow: hidden">
-        <!--  <h2>{{title}}</h2> -->
-        <table id="ntable" class="sortable">
-          <thead>
-            <tr>
-              <th>Name</th>
-              <th>Ver</th>
-              <th>IPv4 Addr</th>
-              <th>State</th>
-              <th style="text-align: right">Latencey</th>
-              <th style="text-align: right">Last State</th>
-              <th>IPv6 Addr</th>
-              <th>State</th>
-              <th style="text-align: right">Latencey</th>
-              <th style="text-align: right">Last State</th>
-            </tr>
-          </thead>
-          <tbody id="ntablebody"></tbody>
-        </table>
-      </div>
-      <div id="log" class="log" style="overflow: auto;">
-        <h2>Log of Events</h2>
-        <div id="messages">
-
-        </div>
-      </div>
-    </div>
-    {% include 'foot.html' %}
-    
-    <!-- Connection status modal -->
-    <div id="connectionModal" class="connection-modal">
-      <div class="connection-modal-content">
-        <p>⚠️ Connection is closed, reopening...</p>
-      </div>
-    </div>
-    
-    <script>
-      setup();
-    </script>
-  </body>
-</html>
@@ -1,3 +0,0 @@
-<label for="drawer-toggle" id="drawer-toggle-label"></label>
-<header>{{ header }}</header>
-
@@ -1,220 +0,0 @@
-"""UDP listener and datagram processing."""
-
-import asyncio
-import zlib
-import logging
-
-from .proto import stodict, oldmtodict
-from hbd.utils import dur
-
-logger = logging.getLogger(__name__)
-
-
-class EchoServerProtocol(asyncio.DatagramProtocol):
-    def __init__(self, config=None, handler=None):
-        super().__init__()
-        self.config = config or {}
-        self.handler = handler
-
-    def connection_made(self, transport):
-        self.transport = transport
-        logger.info("UDP Server listening...")
-
-    def datagram_received(self, data, addr):
-        logger.debug("Received from %s", addr)
-        try:
-            msg = parse_message(data)
-            if self.handler:
-                # handler can be a callable provided by the application
-                # pass the transport so handlers can send replies (ACKs/commands)
-                self.handler(msg, addr, self.transport)
-        except Exception:
-            logger.exception("Error while processing datagram from %s", addr)
-
-
-def parse_message(data: bytes):
-    """Parse a raw datagram into a message dict.
-
-    Uses the protocol decoding helpers and falls back to old format when
-    decoding returns an empty dict (compat with older clients).
-    """
-    msg = stodict(data)
-    if not msg:
-        # fallback to old format
-        msg = oldmtodict(data)
-    return msg
-
-
-def dicttos(ID, d, compress=False):
-    s = []
-    for k in d:
-        if isinstance(d[k], float):
-            s.append("%s=%0.5f" % (k, d[k]))
-        else:
-            s.append("%s=%s" % (k, d[k]))
-    pk = ";".join(s)
-    if compress:
-        zpk = zlib.compress(pk.encode(), 6)
-        ID = "!" + ID + ":"
-        opk = ID.encode() + zpk
-    else:
-        zpk = pk
-        opk = ID + ":" + zpk
-    return opk
-
-
-def handle_datagram(msg: dict, addr, transport, ctx: dict):
-    """Handle a parsed datagram message.
-
-    ctx is a dictionary with runtime dependencies:
-      - config: dict of configuration
-      - hbdclass: module providing Host/Connection classes
-      - log: callable(loghost, message)
-      - pushmsg: callable(message)
-      - msg_to_websockets: callable(typ, data)
-      - DEBUG, verbose
-    """
-    if not msg:
-        return
-    now = __import__("time").time()
-    cfg = ctx.get("config", {})
-    hbdcls = ctx.get("hbdclass")
-    log = ctx.get("log")
-    pushmsg = ctx.get("pushmsg")
-    msg_to_websockets = ctx.get("msg_to_websockets")
-    DEBUG = ctx.get("DEBUG", 0)
-    verbose = ctx.get("verbose", False)
-
-    # normalize addr (ip, port)
-    ip = addr[0] if isinstance(addr, (list, tuple)) else addr
-    name = msg.get("name", "unknown")
-    from hbd.utils import shortname
-
-    uname = shortname(name)
-
-    if uname not in hbdcls.Host.hosts:
-        host = hbdcls.Host(uname)
-        host.dyn = uname in cfg.get("dyndnshosts", [])
-        if verbose:
-            print(("XX: New host, num now %s" % (len(hbdcls.Host.hosts))))
-        newh = True
-    else:
-        host = hbdcls.Host.hosts[uname]
-        newh = False
-
-    cid = msg.get("id", 0)
-    try:
-        rtt = float(msg.get("rtt", None))
-    except Exception:
-        rtt = None
-
-    if msg.get("ID") == "HTB":
-        host.doesack = msg.get("acks", -1)
-    host.setcver(msg.get("ver", 0))
-
-    try:
-        conn, res = host.conndata(cid, ip, rtt, now)
-    except Exception as e:
-        if DEBUG > 0:
-            print("conndata failed: %s" % e)
-        return
-
-    if res:
-        if log:
-            log(uname, res)
-        if uname in cfg.get("watchhosts", []):
-            if pushmsg:
-                pushmsg("%s %s" % (host.name, res))
-
-    interval = int(msg.get("interval", 0) or 0)
-    shutdown = msg.get("shutdown", 0)
-    service = msg.get("service", "unknown")
-    message = msg.get("msg", None)
-    boot = msg.get("boot", 0)
-
-    if boot:
-        if log:
-            log(uname, "booted")
-        if uname in cfg.get("watchhosts", []):
-            m = "%s booted" % (host.name)
-            if pushmsg:
-                pushmsg(m)
-    if message:
-        if log:
-            log(uname, "msg: %s" % message, service=service)
-        if uname in cfg.get("watchhosts", []):
-            if pushmsg:
-                pushmsg(message)
-
-    if conn.getstate() != hbdcls.Connection.UP:
-        lasts = conn.state
-        d = conn.newstate(hbdcls.Connection.UP, now)
-        m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d))
-        if log:
-            log(uname, m)
-        if uname in cfg.get("watchhosts", []):
-            if pushmsg:
-                pushmsg("%s %s is back" % (uname, conn.afam))
-
-    if boot or newh:
-        host.upcount = host.doesack
-    else:
-        host.upcount += 1
-
-    if shutdown:
-        if log:
-            log(uname, "%s shutdown" % conn.afam)
-        if uname in cfg.get("watchhosts", []):
-            if pushmsg:
-                pushmsg("%s %s shutdown" % (uname, conn.afam))
-        conn.newstate(hbdcls.Connection.DOWN, now)
-
-    if interval > 0:
-        host.interval = interval
-
-    # send ACK back
-    rmsg = {"time": __import__("time").time()}
-    if host.cver < 1:
-        opkt = b"ACK"
-    else:
-        opkt = dicttos("ACK", rmsg, host.cver > 1)
-    try:
-        transport.sendto(opkt, addr)
-    except Exception as e:
-        if DEBUG > 0:
-            print(("cannot send ack: %s" % e))
-
-    # send any commands we have queued
-    while len(host.cmds):
-        op, rmsg = host.cmds[0]
-        if op == "CMD":
-            del host.cmds[0]
-            if log:
-                log(uname, "command sent")
-            if host.cver < 1:
-                rmsg = rmsg["cmd"]
-        elif op == "UPD":
-            del host.cmds[0]
-            if log:
-                log(uname, "update initiated")
-            if host.cver < 1:
-                if log:
-                    log(uname, " ver 0 does not support UPD")
-                continue
-        if host.cver < 1:
-            opkt = rmsg if isinstance(rmsg, (bytes, str)) else str(rmsg)
-            if isinstance(opkt, str):
-                opkt = opkt.encode()
-        else:
-            opkt = dicttos(op, rmsg, True)
-        try:
-            transport.sendto(opkt, addr)
-        except Exception as e:
-            if DEBUG > 0:
-                print(("cannot send cmd/update: %s" % e))
-
-    if msg_to_websockets:
-        try:
-            msg_to_websockets("host", host.stateinfo())
-        except Exception:
-            pass
@@ -1,143 +0,0 @@
-"""WebSocket server and broadcast helpers for hbd.
-
-Provides an asyncio-based WebSocket server and a thread-safe broadcast
-function that other threads or synchronous code can call.
-"""
-
-import asyncio
-import json
-import logging
-from typing import Callable, Iterable, Optional
-
-import websockets
-
-logger = logging.getLogger(__name__)
-
-_connections = set()
-_loop: Optional[asyncio.AbstractEventLoop] = None
-_get_hosts: Optional[Callable[[], Iterable]] = None
-_get_msgs: Optional[Callable[[], Iterable]] = None
-_verbose = False
-
-
-async def _handler(websocket, path=None):
-    _connections.add(websocket)
-    remote_address = websocket.remote_address
-    if path is None:
-        path = getattr(websocket, "path", None)
-    if _verbose:
-        logger.info("DBG ws_serve: %s: %s", remote_address, path)
-    try:
-        # send initial hosts
-        if _get_hosts:
-            for h in _get_hosts():
-                jmsg = json.dumps({"type": "host", "data": h})
-                await websocket.send(jmsg)
-        # send recent messages
-        if _get_msgs:
-            for m in list(_get_msgs())[-100:]:
-                jmsg = json.dumps({"type": "message", "data": m})
-                await websocket.send(jmsg)
-
-        # keep connection open until client disconnects
-        async for _ in websocket:
-            # we don't expect meaningful incoming messages besides the initial
-            # client 'hello' that some clients send; ignore for now
-            if _verbose:
-                logger.debug("received ws data: %s", _)
-
-    except (
-        websockets.exceptions.ConnectionClosedOK,
-        websockets.exceptions.ConnectionClosedError,
-    ) as e:
-        if _verbose:
-            logger.info("ws closed: %r", e)
-    except Exception as e:
-        logger.exception("ws handler exception: %s", e)
-    finally:
-        try:
-            _connections.remove(websocket)
-        except KeyError:
-            pass
-        await websocket.wait_closed()
-
-
-async def start(
-    host: str,
-    ws_port: int,
-    wss_port: Optional[int] = None,
-    ssl_context=None,
-    get_hosts: Optional[Callable] = None,
-    get_msgs: Optional[Callable] = None,
-    verbose: bool = False,
-):
-    """Start WebSocket servers and block until cancelled.
-
-    This is intended to be awaited inside the main asyncio event loop.
-    If `wss_port` and `ssl_context` are provided, a WSS server will also be
-    started.
-    """
-    global _loop, _get_hosts, _get_msgs, _verbose
-    _loop = asyncio.get_running_loop()
-    _get_hosts = get_hosts
-    _get_msgs = get_msgs
-    _verbose = verbose
-
-    servers = []
-    # plain WebSocket
-    websockets_logger = logging.getLogger("websockets.server")
-    websockets_logger.setLevel(logging.DEBUG if verbose else logging.INFO)
-    # regular WebSocket
-    ws_server = websockets.serve(_handler, host, ws_port)  # , subprotocols=["hbd"])
-    servers.append(ws_server)
-    # secure WebSocket (optional)
-    if wss_port and ssl_context:
-        wss_server = websockets.serve(
-            _handler, host, wss_port, ssl=ssl_context
-        )  # , subprotocols=["hbd"])
-        servers.append(wss_server)
-
-    # await starting of all servers
-    for srv in servers:
-        await srv
-
-    if _verbose:
-        logger.info(
-            "WebSocket server(s) started on port %s (wss %s)", ws_port, wss_port
-        )
-
-    # block forever (until loop is stopped or cancelled)
-    await asyncio.Future()
-
-
-def broadcast(typ: str, data) -> bool:
-    """Thread-safe broadcast helper.
-
-    Schedules coroutine(s) on the running loop to send message to all
-    connected websockets. Returns False if server was not running.
-    """
-    if not _loop:
-        return False
-    jmsg = json.dumps({"type": typ, "data": data})
-    to_close = []
-    for ws in list(_connections):
-        if ws.state != websockets.protocol.State.OPEN:
-            to_close.append(ws)
-            continue
-        try:
-            asyncio.run_coroutine_threadsafe(ws.send(jmsg), _loop)
-        except Exception:
-            to_close.append(ws)
-            logger.debug("ws.send exception: closed")
-    for ws in to_close:
-        try:
-            asyncio.run_coroutine_threadsafe(ws.wait_closed(), _loop)
-        except Exception:
-            pass
-        if ws in _connections:
-            _connections.remove(ws)
-    return True
-
-
-def connection_count() -> int:
-    return len(_connections)
@@ -1,380 +0,0 @@
-"""
-host and connection class shared between hbd and
-the websit's heartbeat.py
-
-"""
-
-import time
-import json
-import copy
-import queue
-
-num = 0
-
-MAXRTTS = 10
-
-DEBUG = 2
-
-
-def log(host, m):
-    if DEBUG:
-        print("class log: %s %s" % (host, m))
-
-
-class Connection:
-    # map of addrs to names
-
-    htab = {}
-    UNKNOWN = "unknown"
-    UP = "up"
-    DOWN = "down"
-    OVERDUE = "overdue"
-
-    def __init__(self, host, cid, addr, afam):
-        self.host = host
-        self.cid = cid
-        if addr[0:7] == "::ffff:":
-            addr = addr[7:]
-        self.addr = addr
-        self.afam = afam
-        self.rtts = [0]
-        self.lastbeat = time.time()
-        self.statetime = self.lastbeat
-        self.deltastatetime = "computed"
-        self.state = Connection.UNKNOWN
-
-        if host:
-            Connection.htab[addr] = self.host.name
-            if self.host.isDynDns():
-                log(self.host.name, "dns update %s" % self.addr)
-                Host.dnsQ.put((self.host.name, self.addr))
-
-    def registerDns(self):
-        Host.dnsQ.put((self.host.name, self.addr))
-
-    def clearstate(self):
-        d = {}
-        d["addr"] = ""
-        d["rtt"] = ""
-        d["lastbeat"] = ""
-        d["state"] = ""
-        d["statetime"] = ""
-        d["deltastatetime"] = ""
-        d["rttstate"] = ""
-        return d
-
-    def statedict(self, Null=False):
-        d = self.clearstate()
-        now = time.time()
-        if not Null:
-            d["addr"] = self.addr
-            if self.rtts[-1]:
-                d["rtt"] = "%0.1f" % self.rtts[-1]
-            elif self.state == Connection.UNKNOWN:
-                d["rtt"] = ""
-            else:
-                d["rtt"] = "?"
-            d["lastbeat"] = self.lastbeat
-            if self.state == Connection.OVERDUE:
-                d["state"] = "<b>%s</b>" % self.state
-            else:
-                d["state"] = self.state
-            if self.state == Connection.UP:
-                d["rttstate"] = d["rtt"]
-            elif self.state == Connection.OVERDUE:
-                d["rttstate"] = ""
-            else:
-                d["rttstate"] = d["state"]
-            d["statetime"] = time.strftime(
-                "%Y-%m-%d %H:%M:%S", time.localtime(self.statetime)
-            )
-            delta = now - self.statetime
-
-            if self.state == Connection.UNKNOWN:
-                d["deltastatetime"] = ""
-            elif delta > 86400:
-                # 				d['deltastatetime'] = time.strftime("%d %H:%M:%S", time.gmtime(delta))
-                d["deltastatetime"] = "%0.1f days" % (delta / 86400.0)
-            elif delta > 3600:
-                # 				d['deltastatetime'] = time.strftime("%H:%M:%S", time.gmtime(delta))
-                d["deltastatetime"] = time.strftime("%k:%M hrs", time.gmtime(delta))
-            # 				d['deltastatetime'] = "%0.1f hrs" % (delta / 3600.)
-            elif delta > 60:
-                # 				d['deltastatetime'] = time.strftime("%M:%S", time.gmtime(delta))
-                d["deltastatetime"] = time.strftime("%M:%S mins", time.gmtime(delta))
-            # 				d['deltastatetime'] = "%0.1f mins" % (delta / 60.)
-            else:
-                # 				d['deltastatetime'] = time.strftime("%S", time.gmtime(delta))
-                d["deltastatetime"] = "%i secs" % (delta)
-        if self.state == Connection.UNKNOWN and now - self.lastbeat > 86400 * 10:
-            d = self.clearstate()
-
-        return d
-
-    def headerdict(self, afam):
-        d = {}
-        d["addr"] = "%s Addr" % afam
-        d["rtt"] = "Latencey"
-        d["lastbeat"] = "Last Contact"
-        d["state"] = "State"
-        d["statetime"] = "Last State"
-        d["rttstate"] = "Reach"
-        d["deltastatetime"] = "Last State"
-        return d
-
-    def jsons(self):
-        return json.dumps(self.__dict__)
-
-    # set new state, return number of secs in previous state
-    def newstate(self, state, now, when=0):
-        self.state = state
-        delta = now - when
-        s = delta - self.statetime
-        self.statetime = delta
-        return s
-
-    def getstate(self):
-        return self.state
-
-    def newaddr(self, addr, rtt, now):
-        self.lastbeat = now
-        self.rtts.append(rtt)
-        if len(self.rtts) > MAXRTTS:
-            del self.rtts[0]
-
-        if self.addr == addr:
-            r = None
-        else:
-            r = "changed from %s to %s" % (self.addr, addr)
-            try:
-                del Connection.htab[self.addr]
-            except:
-                pass
-            self.addr = addr
-            Connection.htab[addr] = self.host.name
-            if self.host.isDynDns():
-                Host.dnsQ.put((self.host.name, self.addr))
-        return r
-
-
-#
-class Host:
-    # Table of Hosts
-    hosts = {}
-    dnsQ = queue.Queue()
-
-    def __init__(self, name):
-        global num
-        self.name = name
-        if name:
-            num += 1
-            Host.hosts[name] = self
-        self.num = num
-        self.dyn = False
-        self.watched = False
-        self.upcount = 0
-        self.interval = 0
-        self.doesack = -1
-        self.cmds = []
-        self.cver = 0
-        self.connections = {}
-        self.hdwcounts = [[0, 0], [0, 0], [0, 0]]
-
-    def statedict(self):
-        d = {}
-        d["name"] = self.name
-        if self.dyn:
-            d["name"] += "*"
-        if self.watched:
-            d["name"] = "<b>%s</b>" % d["name"]
-        d["dyn"] = str(self.dyn)
-        d["ver"] = str(self.cver)
-        d["num"] = self.num
-        for c in ["IPv4", "IPv6"]:
-            if c in self.connections:
-                cs = self.connections[c].statedict()
-            else:
-                cs = ubConnection.statedict(True)
-            for csv in cs:
-                d["%s.%s" % (c, csv)] = cs[csv]
-
-        return d
-
-    def headerdict(self):
-        d = {}
-        d["name"] = "Name"
-        d["dyn"] = "Dyn"
-        d["ver"] = "Ver"
-        d["num"] = "??"
-        for c in ["IPv4", "IPv6"]:
-            cs = ubConnection.headerdict(c)
-            for csv in cs:
-                d["%s.%s" % (c, csv)] = cs[csv]
-        return d
-
-    def registerDns(self):
-        for af in self.connections:
-            self.connections[af].registerDns()
-
-    def stateinfo(self):
-        ddict = {}
-        for d in self.__dict__:
-            if d == "connections":
-                cl = []
-                for c in self.connections:
-                    # dirty ugly hack: fix conn to host backpointer
-                    cld = copy.deepcopy(self.connections[c].__dict__)
-                    cld["host"] = cld["host"].name
-                    cl.append(cld)
-                ddict[d] = cl
-            else:
-                ddict[d] = self.__dict__[d]
-        return ddict
-
-    def jsons(self):
-        return json.dumps(self.stateinfo())
-
-    def setcver(self, cver):
-        self.cver = cver
-
-    def isDynDns(self):
-        return self.dyn
-
-    def isIPv4(self, addr):
-        if isinstance(addr, tuple):
-            return addr[0].find(".") > 0
-        else:
-            return addr.find(".") > 0
-
-    def conndata(self, cid, addr, rtt, now):
-        if addr[0:7] == "::ffff:":
-            addr = addr[7:]
-        if self.isIPv4(addr):
-            afam = "IPv4"
-        else:
-            afam = "IPv6"
-
-        if afam not in self.connections:
-            self.connections[afam] = Connection(self, cid, addr, afam)
-
-        conn = self.connections[afam]
-        res = conn.newaddr(addr, rtt, now)
-        return conn, res
-
-    # called when reloading class from pickle, add new fields here
-    def fixup(self):
-        for c in ["IPv4", "IPv6"]:
-            if c in self.connections:
-                addr = self.connections[c].addr
-                if addr[0:7] == "::ffff:":
-                    addr = addr[7:]
-                self.connections[c].addr = addr
-
-        pass
-
-    # def dispstate(self):
-    #    if self.state in ["down", "overdue"]:
-    #        state = "<b>%s</b>" % self.state
-    #    elif self.state in ["up", "UP"]:
-    #        state = ""
-    #        for x in list(self.connections.keys()):
-    #            try:
-    #                state += " %5.1f" % (self.connections[x].rtts[-1])
-    #            except:
-    #                state += " %5s" % (self.connections[x].rtts[-1])
-    #    elif self.state in ["unknown", "UNKNOWN"]:
-    #        state = ""
-    #    else:
-    #        state = "%s" % self.state
-    #    return state
-
-    def dispstats(self):
-        if self.doesack != -1:
-            if self.upcount > 0:
-                # 				return "(%0.1f%%) %s %s %s " % ((self.doesack * 100.0) / self.upcount, self.doesack, self.upcount, self.hdwcounts)
-                r = ""
-                for v in range(3):
-                    a, u = self.hdwcounts[v]
-                    if (self.upcount - u) != 0:
-                        vs = "%0.0f" % (
-                            100.0 - (((self.doesack - a) * 100.0) / (self.upcount - u))
-                        )
-                        if vs == "0":
-                            vs = ""
-                    else:
-                        vs = "-"
-                    r += '<td align="right">%s</td>' % vs
-                return r
-            else:
-                return "<td>(%s)</td><td></td><td></td>" % (self.doesack)
-        return '<td align="right">N/A</td><td></td<td></td>>'
-
-    hostfields_long = [
-        "name",
-        "IPv4.addr",
-        "IPv4.state",
-        ("IPv4.rtt", 'style="text-align: right;"'),
-        ("IPv4.statetime", 'style="text-align: right;"'),
-        "IPv6.addr",
-        "IPv6.state",
-        ("IPv6.rtt", 'style="text-align: right;"'),
-        ("IPv6.statetime", 'style="text-align: right;"'),
-        "ver",
-    ]
-
-    hostfields_short = [
-        "name",
-        ("IPv4.rttstate", 'style="text-align: right;"'),
-        ("IPv4.deltastatetime", 'style="text-align: right;"'),
-        ("IPv6.rttstate", 'style="text-align: right;"'),
-        ("IPv6.deltastatetime", 'style="text-align: right;"'),
-    ]
-
-    def gene(self, tag, v, attrib=None):
-        if attrib:
-            a = " %s" % attrib
-        else:
-            a = ""
-        return "<%s%s>%s</%s>" % (tag, a, v, tag)
-
-    def htmltable(self, tag, hd, short):
-        if short:
-            hostfields = Host.hostfields_short
-        else:
-            hostfields = Host.hostfields_long
-        h = []
-        for f in hostfields:
-            if isinstance(f, tuple):
-                h.append(self.gene(tag, hd[f[0]], f[1]))
-            else:
-                h.append(self.gene(tag, hd[f]))
-        return self.gene("tr", "\n".join(h))
-
-    def buildhosttable(self, short=False):
-        if DEBUG > 1:
-            print("DBG buildhosttable: start")
-        res = []
-        res.append('<table id="ntable" class="sortable">')
-        res.append(ubHost.htmltable("th", ubHost.headerdict(), short))
-        hosts_sorted = list(Host.hosts.keys())
-        if len(hosts_sorted):
-            hosts_sorted.sort()
-            for h in hosts_sorted:
-                res.append(ubHost.htmltable("td", Host.hosts[h].statedict(), short))
-        res.append("</table>")
-        if DEBUG > 1:
-            print("DBG buildhosttable: %s" % res)
-        return res
-
-    def buildmsgtable(self, msgs):
-        res = []
-        le = max(40 - len(Host.hosts), 3)
-        res.append("<h4>Log of Events</h4>")
-        for m in msgs[len(msgs) - le:]:
-            res.append("%s<BR>" % m)
-        return res
-
-
-# create fake "unbound objects", remove in Python 3.0
-ubHost = Host(None)
-ubConnection = Connection(None, "", "", "")
@@ -0,0 +1,4 @@
+#!/bin/sh
+
+#echo "OK - all is well"
+echo "WARNING - 12 apps require update: calendar->6.2.2 ,call_summary_bot->3.3.0 ,collectives->4.2.0 ,contacts->8.3.7 ,forum->0.36.0 ,mail->5.7.6 ,news->28.1.0 ,notes->4.13.1 ,notify_push->1.3.1 ,ownershiptransfer->1.4.0 ,richdocuments->9.0.5 ,spreed->22.0.10"
@@ -4,26 +4,45 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "hbd"
-version = "5.0.4"
-description = "Heartbeat daemon (hbd) — receive heartbeats and act on them"
+version = "5.2.6"
+description = "Heartbeat monitoring system — client (hbc) and server (hbd)"
 readme = "README.md"
 requires-python = ">=3.11"
 license = "MIT" 
-keywords = ["heartbeat", "monitoring", "dns", "websocket"]
+keywords = ["heartbeat", "monitoring", "dns", "websocket", "system-monitoring"]
 authors = [
  { name = "heartbeat contributors" }
 ]

+# Core dependencies (required for both client and server)
 dependencies = [
-  "websockets>=13.2",
-  "mattermostdriver>=7.3.0",
  "PyYAML>=6.0",
-  "aiohttp>=3.11",
-  "Jinja2>=3.1.6",
-  "fastapi>=0.128.0",
 ]

 [project.optional-dependencies]
+# Client-only dependencies (hbc - system monitoring client)
+client = [
+  "psutil>=5.9.0",
+]
+
+# Server-only dependencies (hbd - heartbeat daemon/server)
+server = [
+  "websockets>=13.2",
+  "mattermostdriver>=7.3.0",
+  "aiohttp>=3.11",
+  "Jinja2>=3.1.6",
+  "matrix-nio>=0.24",
+]
+
+# Minimal client — hbc_mini only, no external dependencies
+mini = []
+
+# Install both client and server
+all = [
+  "hbd[client,server]",
+]
+
+# Development dependencies
 dev = [
  "pytest>=7.0",
  "pytest-cov>=4.0",
@@ -35,15 +54,19 @@ dev = [
 ]

 [project.scripts]
-hbd = "hbd.cli:main"
-hbc = "hbd.hbc:main"
+hbd = "hbd.server.cli:main"
+hbc = "hbd.client.main:main"
+
+[tool.setuptools]
+script-files = ["scripts/hb_install.sh", "scripts/hbc_mini.py"]

 [tool.setuptools.packages.find]
 where = ["."]
 include = ["hbd*"]

 [tool.setuptools.package-data]
-"hbd" = ["*.yaml", "static/*", "static/*/*", "templates/*"]
+"hbd.server" = ["*.yaml", "static/*", "static/*/*", "templates/*"]
+"hbd.client" = ["*.yaml"]


 [tool.black]
@@ -3,11 +3,15 @@
 set -e
 uv version --bump patch 
 VER=$(uv  version  --short)
-sed -i ""  "s/__version__ = \"[0-9.]*\"\(.*\)$/__version__ = \"$VER\"\1/" hbd/__init__.py
+sed -i".bak"  "s/__version__ = \"[0-9.]*\"\(.*\)$/__version__ = \"$VER\"\1/" hbd/__init__.py
+sed -i".bak"  "s/__version__ = \"[0-9.]*\"\(.*\)$/__version__ = \"$VER\"\1/" scripts/hbc_mini.py

 # commit pyproject.toml
-git commit -m "version $VER" pyproject.toml hbd/__init__.py
+git commit -m "version $VER" pyproject.toml hbd/__init__.py scripts/hbc_mini.py
 git push 
 # tag version
 git tag -a v$VER -m "Version $VER"
 git push --tags
+
+rm hbd/__init__.py.bak
+rm scripts/hbc_mini.py.bak
@@ -0,0 +1,2 @@
+hbc_mini
+hbc_mini_dbg
@@ -0,0 +1,21 @@
+CC      ?= cc
+CFLAGS  = -O2 -Wall -Wextra -std=c11
+LDFLAGS = -lz -lpthread -lm
+TARGET  = hbc_mini
+SRC     = hbc_mini.c
+
+# FreeBSD/NetBSD keep zlib in base; no extra flags needed.
+# On some NetBSD installs pthreads may need -lpthread from pkgsrc.
+
+.PHONY: all clean debug
+
+all: $(TARGET)
+
+$(TARGET): $(SRC)
+	$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
+
+debug: $(SRC)
+	$(CC) -g -fsanitize=address,undefined -o $(TARGET)_dbg $< $(LDFLAGS)
+
+clean:
+	rm -f $(TARGET) $(TARGET)_dbg
@@ -0,0 +1,390 @@
+#!/usr/bin/env python3
+"""
+Demo script for HTTP API endpoints.
+Tests and demonstrates the plugin data and alert APIs.
+"""
+
+import requests
+import json
+import sys
+from datetime import datetime
+from time import sleep
+
+BASE_URL = "http://localhost:50004"
+
+def print_section(title):
+    """Print a formatted section header."""
+    print(f"\n{'=' * 70}")
+    print(f"  {title}")
+    print('=' * 70)
+
+def format_timestamp(timestamp):
+    """Convert Unix timestamp to readable format."""
+    return datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')
+
+def format_duration(seconds):
+    """Format duration in human-readable format."""
+    if seconds < 60:
+        return f"{int(seconds)}s"
+    elif seconds < 3600:
+        minutes = int(seconds / 60)
+        secs = int(seconds % 60)
+        return f"{minutes}m {secs}s"
+    elif seconds < 86400:
+        hours = int(seconds / 3600)
+        minutes = int((seconds % 3600) / 60)
+        return f"{hours}h {minutes}m"
+    else:
+        days = int(seconds / 86400)
+        hours = int((seconds % 86400) / 3600)
+        return f"{days}d {hours}h"
+
+def test_hosts_api():
+    """Test GET /api/0/hosts endpoint."""
+    print_section("1. List All Monitored Hosts")
+    
+    try:
+        response = requests.get(f"{BASE_URL}/api/0/hosts", timeout=5)
+        response.raise_for_status()
+        hosts = response.json()
+        
+        print(f"Found {len(hosts)} hosts:\n")
+        for host in hosts:
+            name = host.get('name', 'unknown')
+            dyn = host.get('dyn', False)
+            conn_count = len(host.get('connections', []))
+            
+            print(f"  • {name}")
+            print(f"    - Protocol: IPv{ver}")
+            print(f"    - Dynamic: {dyn}")
+            print(f"    - Connections: {conn_count}")
+        
+        return hosts
+    
+    except requests.RequestException as e:
+        print(f"❌ Error: {e}")
+        return []
+
+def test_host_plugins_api(hostname):
+    """Test GET /api/0/hosts/{hostname}/plugins endpoint."""
+    print_section(f"2. Get All Plugins for Host: {hostname}")
+    
+    try:
+        response = requests.get(f"{BASE_URL}/api/0/hosts/{hostname}/plugins", timeout=5)
+        response.raise_for_status()
+        data = response.json()
+        
+        plugins = data.get('plugins', {})
+        print(f"Found {len(plugins)} plugins:\n")
+        
+        for plugin_name, plugin_data in plugins.items():
+            timestamp = plugin_data.get('timestamp', 0)
+            sample_count = plugin_data.get('sample_count', 0)
+            metrics = plugin_data.get('data', {})
+            
+            print(f"  📦 {plugin_name}")
+            print(f"     Last update: {format_timestamp(timestamp)}")
+            print(f"     Samples: {sample_count}")
+            print(f"     Metrics: {len(metrics)}")
+            
+            # Show first few metrics
+            for i, (metric, value) in enumerate(metrics.items()):
+                if i < 3:  # Show only first 3 metrics
+                    if isinstance(value, float):
+                        print(f"       - {metric}: {value:.2f}")
+                    elif isinstance(value, dict):
+                        print(f"       - {metric}: [nested data, {len(value)} keys]")
+                    else:
+                        print(f"       - {metric}: {value}")
+            
+            if len(metrics) > 3:
+                print(f"       ... and {len(metrics) - 3} more")
+            print()
+        
+        return list(plugins.keys())
+    
+    except requests.RequestException as e:
+        print(f"❌ Error: {e}")
+        return []
+
+def test_plugin_detail_api(hostname, plugin_name, limit=5):
+    """Test GET /api/0/hosts/{hostname}/plugins/{plugin_name} endpoint."""
+    print_section(f"3. Get Detailed Data: {hostname}/{plugin_name}")
+    
+    try:
+        url = f"{BASE_URL}/api/0/hosts/{hostname}/plugins/{plugin_name}"
+        params = {'limit': limit}
+        response = requests.get(url, params=params, timeout=5)
+        response.raise_for_status()
+        data = response.json()
+        
+        samples = data.get('samples', [])
+        print(f"Retrieved {len(samples)} samples (limit={limit}):\n")
+        
+        for i, sample in enumerate(samples):
+            timestamp = sample.get('timestamp', 0)
+            metrics = sample.get('data', {})
+            
+            print(f"  [{i+1}] {format_timestamp(timestamp)}")
+            for metric, value in sorted(metrics.items())[:5]:  # Show first 5 metrics
+                if isinstance(value, float):
+                    print(f"      {metric}: {value:.2f}")
+                elif isinstance(value, dict):
+                    print(f"      {metric}: [nested: {len(value)} keys]")
+                else:
+                    print(f"      {metric}: {value}")
+            print()
+        
+        return samples
+    
+    except requests.RequestException as e:
+        print(f"❌ Error: {e}")
+        return []
+
+def test_host_alerts_api(hostname):
+    """Test GET /api/0/hosts/{hostname}/alerts endpoint."""
+    print_section(f"4. Get Alerts for Host: {hostname}")
+    
+    try:
+        response = requests.get(f"{BASE_URL}/api/0/hosts/{hostname}/alerts", timeout=5)
+        response.raise_for_status()
+        data = response.json()
+        
+        alerts = data.get('alerts', [])
+        summary = data.get('summary', {})
+        
+        print(f"Summary:")
+        print(f"  ✓ OK: {summary.get('ok', 0)}")
+        print(f"  ⚠️ Warning: {summary.get('warning', 0)}")
+        print(f"  🔴 Critical: {summary.get('critical', 0)}")
+        print(f"  ❓ Unknown: {summary.get('unknown', 0)}")
+        print()
+        
+        # Show non-OK alerts
+        active_alerts = [a for a in alerts if a.get('level') != 'OK']
+        if active_alerts:
+            print(f"Active Alerts ({len(active_alerts)}):")
+            for alert in active_alerts:
+                metric = alert.get('metric_path', 'unknown')
+                level = alert.get('level', 'UNKNOWN')
+                value = alert.get('last_value', 0)
+                since = alert.get('since', 0)
+                duration = datetime.now().timestamp() - since
+                
+                icon = '⚠️' if level == 'WARNING' else '🔴'
+                print(f"  {icon} {metric}")
+                print(f"     Level: {level}")
+                print(f"     Value: {value:.2f}" if isinstance(value, float) else f"     Value: {value}")
+                print(f"     Duration: {format_duration(duration)}")
+                print()
+        else:
+            print("✓ No active alerts - all systems normal!")
+        
+        return data
+    
+    except requests.RequestException as e:
+        print(f"❌ Error: {e}")
+        return {}
+
+def test_all_alerts_api():
+    """Test GET /api/0/alerts endpoint."""
+    print_section("5. Get All Active Alerts Across All Hosts")
+    
+    try:
+        response = requests.get(f"{BASE_URL}/api/0/alerts", timeout=5)
+        response.raise_for_status()
+        data = response.json()
+        
+        alerts = data.get('alerts', [])
+        summary = data.get('summary', {})
+        host_count = data.get('host_count', 0)
+        
+        print(f"Monitoring {host_count} hosts")
+        print(f"Active Alerts: {summary.get('total', 0)}")
+        print(f"  🔴 Critical: {summary.get('critical', 0)}")
+        print(f"  ⚠️ Warning: {summary.get('warning', 0)}")
+        print()
+        
+        if alerts:
+            print("Alert Details:")
+            for alert in alerts:
+                hostname = alert.get('hostname', 'unknown')
+                metric = alert.get('metric_path', 'unknown')
+                level = alert.get('level', 'UNKNOWN')
+                value = alert.get('last_value', 0)
+                since = alert.get('since', 0)
+                duration = datetime.now().timestamp() - since
+                notification_count = alert.get('notification_count', 0)
+                
+                icon = '⚠️' if level == 'WARNING' else '🔴'
+                print(f"  {icon} {hostname} / {metric}")
+                print(f"     Level: {level}")
+                print(f"     Value: {value:.2f}" if isinstance(value, float) else f"     Value: {value}")
+                print(f"     Duration: {format_duration(duration)}")
+                print(f"     Notifications: {notification_count}")
+                print()
+        else:
+            print("✅ All systems normal - no active alerts!")
+        
+        return data
+    
+    except requests.RequestException as e:
+        print(f"❌ Error: {e}")
+        return {}
+
+def test_messages_api():
+    """Test GET /api/0/messages endpoint."""
+    print_section("6. Get Recent Messages")
+    
+    try:
+        response = requests.get(f"{BASE_URL}/api/0/messages", timeout=5)
+        response.raise_for_status()
+        messages = response.json()
+        
+        print(f"Last {len(messages)} messages:\n")
+        for msg in messages[-5:]:  # Show last 5
+            timestamp = msg.get('time', 0)
+            host = msg.get('host', 'unknown')
+            text = msg.get('msg', '')
+            
+            print(f"  [{format_timestamp(timestamp)}] {host}: {text}")
+        
+        return messages
+    
+    except requests.RequestException as e:
+        print(f"❌ Error: {e}")
+        return []
+
+def test_error_handling():
+    """Test API error handling."""
+    print_section("7. Error Handling Tests")
+    
+    # Test non-existent host
+    print("Testing non-existent host...")
+    try:
+        response = requests.get(f"{BASE_URL}/api/0/hosts/nonexistenthost/plugins", timeout=5)
+        if response.status_code == 404:
+            error_data = response.json()
+            print(f"  ✓ Correctly returned 404: {error_data.get('error', 'No error message')}")
+        else:
+            print(f"  ⚠️ Unexpected status code: {response.status_code}")
+    except Exception as e:
+        print(f"  ❌ Error: {e}")
+    
+    # Test non-existent plugin
+    print("\nTesting non-existent plugin...")
+    try:
+        # Get first host
+        hosts = requests.get(f"{BASE_URL}/api/0/hosts", timeout=5).json()
+        if hosts:
+            hostname = hosts[0]['name']
+            response = requests.get(
+                f"{BASE_URL}/api/0/hosts/{hostname}/plugins/nonexistentplugin",
+                timeout=5
+            )
+            if response.status_code == 404:
+                error_data = response.json()
+                print(f"  ✓ Correctly returned 404: {error_data.get('error', 'No error message')}")
+            else:
+                print(f"  ⚠️ Unexpected status code: {response.status_code}")
+    except Exception as e:
+        print(f"  ❌ Error: {e}")
+
+def demo_monitoring_loop():
+    """Demonstrate continuous monitoring."""
+    print_section("8. Continuous Monitoring Demo (5 iterations)")
+    
+    print("Monitoring alerts every 3 seconds (Ctrl+C to stop)...\n")
+    
+    try:
+        for i in range(5):
+            response = requests.get(f"{BASE_URL}/api/0/alerts", timeout=5)
+            response.raise_for_status()
+            data = response.json()
+            
+            summary = data.get('summary', {})
+            critical = summary.get('critical', 0)
+            warning = summary.get('warning', 0)
+            
+            timestamp = datetime.now().strftime('%H:%M:%S')
+            status = "🔴 CRITICAL" if critical > 0 else "⚠️ WARNING" if warning > 0 else "✅ OK"
+            
+            print(f"[{timestamp}] {status} - Critical: {critical}, Warning: {warning}")
+            
+            if i < 4:  # Don't sleep after last iteration
+                sleep(3)
+    
+    except KeyboardInterrupt:
+        print("\n\nMonitoring stopped by user")
+    except Exception as e:
+        print(f"\n❌ Error: {e}")
+
+def main():
+    """Run all API tests."""
+    print("""
+╔══════════════════════════════════════════════════════════════╗
+║         Heartbeat Daemon HTTP API Demo & Test Suite         ║
+╚══════════════════════════════════════════════════════════════╝
+""")
+    
+    print(f"Testing API at: {BASE_URL}")
+    print(f"Ensure the heartbeat daemon is running!")
+    
+    # Test basic connectivity
+    try:
+        response = requests.get(f"{BASE_URL}/api/0/hosts", timeout=2)
+        response.raise_for_status()
+        print("✅ API is reachable\n")
+    except Exception as e:
+        print(f"❌ Cannot connect to API: {e}")
+        print("\nPlease ensure:")
+        print("  1. Heartbeat daemon is running")
+        print("  2. HTTP server is enabled in configuration")
+        print(f"  3. Server is listening on port {BASE_URL.split(':')[-1]}")
+        sys.exit(1)
+    
+    # Run test suite
+    hosts = test_hosts_api()
+    
+    if not hosts:
+        print("\n⚠️ No hosts found. Ensure clients are sending heartbeats.")
+        return
+    
+    # Pick first host for detailed testing
+    hostname = hosts[0].get('name', '')
+    
+    if hostname:
+        plugins = test_host_plugins_api(hostname)
+        
+        if plugins:
+            # Test detailed plugin data
+            test_plugin_detail_api(hostname, plugins[0], limit=3)
+        
+        # Test alert endpoints
+        test_host_alerts_api(hostname)
+    
+    # Test global endpoints
+    test_all_alerts_api()
+    test_messages_api()
+    
+    # Test error handling
+    test_error_handling()
+    
+    # Continuous monitoring demo
+    demo_monitoring_loop()
+    
+    print_section("Test Suite Complete")
+    print("""
+Next Steps:
+  • View the web UI at http://localhost:50004/live
+  • Check plugin metrics at http://localhost:50004/plugins
+  • Monitor alerts at http://localhost:50004/alerts
+  • Read API documentation: docs/HTTP_API.md
+""")
+
+if __name__ == '__main__':
+    try:
+        main()
+    except KeyboardInterrupt:
+        print("\n\nDemo interrupted by user")
+        sys.exit(0)
@@ -0,0 +1,115 @@
+#!/bin/sh
+
+# Helper script to install the heartbeat tools. By default, it will only
+# install the heartbeat client, hbc. The server is installed when the arg 'server' is passed 
+# to the script. The script will install the heartbeat tools in a python 
+# virtual environment in ~/venvs/hbd. The hbd and hbc commands will be
+# installed from the wheel and symlinked to ~/bin/hbd and ~/bin/hbc,
+# respectively. If the virtual environment already exists, it will be
+# reused. The script will also remove any existing symlinks for hbd and hbc
+# in ~/bin before creating new ones.
+
+set -e
+what=$1
+on_ha=0
+where=""
+venv=""
+[ "$2" = "HA" ] && on_ha=1
+[ -z "$what" ] && what="client"
+
+if [ -d /homeassistant ]; then  # if running from HA command line
+    echo "HA, running \"docker exec homeassistant /config/bin/hb_install.sh $@\""
+    docker exec homeassistant /config/bin/hb_install.sh $@ HA
+    rc=$?
+    if [ $rc -ne 0 ]; then
+        echo "Failed to install heartbeat in HA, please check the logs for more details"
+        exit 1
+    fi
+    exit 0
+fi
+
+if [ $on_ha -eq 1 ] || [ -r /.dockerenv ] && [ -d /config/bin ]; then
+    # Installing under docker on Home Assistant OS, using /config/bin for executables and /config/venvs for virtual environments 
+    echo "Home Assistant OS detected, installing under docker"
+    where="/config/bin"
+    venv="/config/venvs"
+else
+    if [ ! -d $HOME/.local/bin ] && [ ! -d $HOME/bin ]; then
+        echo "No suitable bin directory found in PATH, please add either $HOME/.local/bin or $HOME/bin to your PATH"
+        exit 1
+    fi
+    for where in $HOME/bin $HOME/.local/bin notset ; do
+        if echo ":$PATH:" | grep -q ":$where:" ; then
+            break
+        fi
+    done
+    if [ "$where" = "notset" ]; then
+        echo "No suitable bin directory found in PATH, please add either $HOME/.local/bin or $HOME/bin to your PATH"
+        exit 1
+    fi
+    if [ "$what" = "mini" ]; then
+        venv=""
+    else
+        venv="$HOME/venvs"
+    fi
+fi
+echo "Installing $what to $where"
+if [ ! -z "$venv" ]; then
+    echo "Using virtual environment at $venv/hbd"
+fi
+
+if [ "$venv" != "" ] && [ ! -d  $venv/hbd ]; then
+    arg=""
+    have_pip=$(python3 -c "import pip" 2>/dev/null &> /dev/null && echo "Installed" || echo "Not Installed")
+    if [ "$have_pip" = "Not Installed" ]; then
+        # some systems do not have pip installed by default, so we need to fetch get-pip.py and install pip
+        echo "pip is not installed, fetching get-pip.py and installing pip"
+        arg="--without-pip"
+    fi
+    mkdir -p $venv
+    have_venv=$(python3 -c "import venv" 2>/dev/null &> /dev/null && echo "Installed" || echo "Not Installed")
+    if [ "$have_venv" = "Not Installed" ]; then
+        if [ "$have_pip" = "Not Installed" ]; then
+            echo "python has no venv, and no pip to install virtualenv, cannot continue"
+            exit 1
+        fi
+        echo "python venv module not found, installing virtualenv"
+        python3 -m pip install --user virtualenv
+        python3 -m virtualenv $venv/hbd --system-site-packages $arg
+    else
+        python3 -m venv $venv/hbd --system-site-packages $arg
+    fi
+    . $venv/hbd/bin/activate
+    if [ -n "$arg" ]; then  
+        curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && python3 get-pip.py
+    fi
+    deactivate
+fi
+
+if [ ! -z "$venv" ]; then
+    . $venv/hbd/bin/activate
+fi
+if [ "$what" = "mini" ]; then
+    curl -s -o $where/hbc_mini https://git.wrede.ca/andreas/heartbeat/raw/branch/master/scripts/hbc_mini.py
+    chmod +x $where/hbc_mini
+else
+    python3 -mpip install --upgrade --index-url https://git.wrede.ca/api/packages/andreas/pypi/simple/ --extra-index-url https://pypi.org/simple hbd[$what]
+fi
+
+if [ ! -z "$venv" ]; then
+    echo "linking executables to $where"
+    if [ "$what" = "server" ]; then
+        rm -f $where/hbd
+        ln -sf $(which hbd) $where/hbd
+    elif [ "$what" = "client" ]; then
+        rm -f $where/hbc
+        ln -sf $(which hbc) $where/hbc
+    fi
+    rm -f $where/hb_install.sh
+    ln -sf $(which hb_install.sh) $where/hb_install.sh
+fi
+echo "Installation complete. To upgrade, run the following:"
+echo "    $where/hb_install.sh $what"
+echo "To install on another machine, run the following obtain the install script and run it:"
+echo "from https://git.wrede.ca/andreas/heartbeat/raw/branch/master/scripts/hb_install.sh"
+echo "and then run sh hb_install.sh [mini|client]"
@@ -1,15 +0,0 @@
-#!/bin/sh
-
-# install hbd/hbc from wheel and create symlinks for hbd and hbc in ~/bin
-
-set -e
-if [ ! -d  ~/venvs/hbd ]; then
-    mkdir -p ~/venvs
-    python3 -m venv ~/venvs/hbd --system-site-packages
-fi
-. ~/venvs/hbd/bin/activate
-pip install 'git+ssh://git@git.wrede.ca/andreas/heartbeat.git'
-rm -f ~/bin/hbd
-rm -f ~/bin/hbc
-ln -sf $(which hbd) ~/bin/hbd
-ln -sf $(which hbc) ~/bin/hbc
@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+"""
+Test all plugins together.
+"""
+
+import asyncio
+import logging
+from pathlib import Path
+
+# Setup path
+import sys
+sys.path.insert(0, str(Path(__file__).parent))
+
+from hbd.plugin import PluginRegistry, PluginLoader
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(name)s: %(message)s"
+)
+
+async def test_all_plugins():
+    """Test loading all plugins."""
+    print("=" * 70)
+    print("Testing All Plugins")
+    print("=" * 70)
+    
+    # Create registry and loader
+    registry = PluginRegistry()
+    loader = PluginLoader(registry)
+    
+    # Configuration for plugins
+    config = {
+        "cpu_monitor": {
+            "interval": 30,
+            "per_core": False
+        },
+        "nagios_runner": {
+            "interval": 60,
+            "commands": [
+                {
+                    "name": "test_ok",
+                    "command": "echo 'OK - test passed | metric=100%;;;0;100'"
+                },
+                {
+                    "name": "test_warning",
+                    "command": "echo 'WARNING - test result | value=85%;80;90;0;100' && exit 1"
+                }
+            ]
+        }
+    }
+    
+    # Load plugins
+    plugin_dir = Path(__file__).parent / "hbd" / "plugins"
+    print(f"\n1. Loading plugins from: {plugin_dir}")
+    
+    count = await loader.load_from_directory(plugin_dir, config)
+    print(f"   ✓ Loaded {count} plugins")
+    
+    # List loaded plugins
+    print(f"\n2. Loaded plugins:")
+    for plugin in registry.get_all():
+        print(f"   - {plugin.name} v{plugin.version}")
+        print(f"     Type: {plugin.__class__.__name__}")
+        print(f"     Interval: {plugin.interval}s")
+        print(f"     Description: {plugin.description}")
+    
+    # Test collection for each plugin
+    print(f"\n3. Testing data collection:")
+    for plugin in registry.get_all():
+        print(f"\n   {plugin.name}:")
+        try:
+            data = await plugin.collect()
+            print(f"     ✓ Collected {len(data)} fields")
+            
+            # Show sample of data
+            sample_count = min(5, len(data))
+            for key, value in list(data.items())[:sample_count]:
+                value_str = str(value)
+                if len(value_str) > 50:
+                    value_str = value_str[:47] + "..."
+                print(f"       {key}: {value_str}")
+            
+            if len(data) > sample_count:
+                print(f"       ... and {len(data) - sample_count} more fields")
+                
+        except Exception as e:
+            print(f"     ✗ Error: {e}")
+    
+    # Cleanup
+    print(f"\n4. Cleanup...")
+    await loader.unload_all()
+    print(f"   ✓ All plugins unloaded")
+    
+    print(f"\n" + "=" * 70)
+    print(f"Successfully tested {count} plugins!")
+    print("=" * 70)
+
+if __name__ == "__main__":
+    asyncio.run(test_all_plugins())
@@ -0,0 +1,160 @@
+#!/usr/bin/env python3
+"""
+Test script for all monitoring plugins.
+
+Tests all available plugins including the new ones:
+- memory_monitor
+- disk_monitor
+- network_monitor
+- filesystem_info
+"""
+
+import asyncio
+import sys
+import os
+import logging
+
+# Add parent directory to path so we can import hbd
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from hbd.plugin import PluginLoader
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+
+logger = logging.getLogger(__name__)
+
+
+def format_bytes(bytes_val):
+    """Format bytes into human readable format."""
+    for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
+        if bytes_val < 1024.0:
+            return f"{bytes_val:.2f} {unit}"
+        bytes_val /= 1024.0
+    return f"{bytes_val:.2f} PB"
+
+
+def print_plugin_data(plugin_name, data, indent=2):
+    """Pretty print plugin data."""
+    prefix = " " * indent
+    
+    if isinstance(data, dict):
+        for key, value in data.items():
+            if isinstance(value, dict):
+                print(f"{prefix}{key}:")
+                print_plugin_data(plugin_name, value, indent + 2)
+            elif isinstance(value, list):
+                print(f"{prefix}{key}: [{len(value)} items]")
+                if len(value) <= 5:  # Only show small lists
+                    for item in value:
+                        if isinstance(item, dict):
+                            print_plugin_data(plugin_name, item, indent + 2)
+                        else:
+                            print(f"{prefix}  - {item}")
+            else:
+                # Format output based on key name for better readability
+                if '_bytes' in key or key.endswith('_sent') or key.endswith('_recv') or 'memory_' in key or 'swap_' in key:
+                    if isinstance(value, (int, float)) and value > 1024:
+                        print(f"{prefix}{key}: {format_bytes(value)} ({value:,})")
+                    else:
+                        print(f"{prefix}{key}: {value}")
+                elif 'percent' in key:
+                    print(f"{prefix}{key}: {value:.1f}%")
+                elif isinstance(value, float):
+                    print(f"{prefix}{key}: {value:.2f}")
+                elif isinstance(value, int) and value > 1000:
+                    print(f"{prefix}{key}: {value:,}")
+                else:
+                    print(f"{prefix}{key}: {value}")
+    else:
+        print(f"{prefix}{data}")
+
+
+async def main():
+    """Main test function."""
+    print("="*60)
+    print("Plugin System Test Suite")
+    print("="*60)
+    
+    # Load all available plugins using the plugin loader
+    from hbd.plugin import PluginRegistry, PluginLoader
+    from pathlib import Path
+    
+    registry = PluginRegistry()
+    loader = PluginLoader(registry)
+    
+    plugin_dir = Path(__file__).parent / "hbd" / "plugins"
+    if not plugin_dir.exists():
+        print(f"✗ Plugin directory not found: {plugin_dir}")
+        return 1
+    
+    # Load plugins from directory
+    count = await loader.load_from_directory(plugin_dir, {})
+    
+    print(f"\nLoaded {count} plugins:")
+    plugins = registry.get_all()
+    for plugin in plugins:
+        print(f"  - {plugin.name}: {plugin.__class__.__doc__.split('.')[0] if plugin.__class__.__doc__ else 'No description'}")
+    
+    # Test each plugin
+    results = {}
+    for plugin in plugins:
+        # Skip nagios_runner as it needs specific configuration
+        if plugin.name == 'nagios_runner':
+            print(f"\n{'='*60}")
+            print(f"Skipping: {plugin.name} (requires specific configuration)")
+            print(f"{'='*60}")
+            results[plugin.name] = True  # Mark as success since it loaded OK
+            continue
+        
+        print(f"\n{'='*60}")
+        print(f"Testing: {plugin.name}")
+        print(f"{'='*60}")
+        
+        try:
+            # Collect data
+            data = await plugin.collect()
+            if data:
+                if 'error' in data:
+                    print(f"✗ Collection error: {data['error']}")
+                    results[plugin.name] = False
+                else:
+                    print(f"✓ Data collected: {len(data)} top-level fields")
+                    print_plugin_data(plugin.name, data)
+                    results[plugin.name] = True
+            else:
+                print(f"⚠ No data collected")
+                results[plugin.name] = False
+        except Exception as e:
+            print(f"✗ Failed to collect data: {e}")
+            import traceback
+            traceback.print_exc()
+            results[plugin.name] = False
+    
+    # Summary
+    print(f"\n{'='*60}")
+    print("Test Summary")
+    print(f"{'='*60}")
+    
+    success_count = sum(1 for v in results.values() if v)
+    total_count = len(results)
+    
+    print(f"\nResults: {success_count}/{total_count} plugins successful")
+    for name, success in results.items():
+        status = "✓" if success else "✗"
+        print(f"  {status} {name}")
+    
+    if success_count == total_count:
+        print("\n🎉 All plugins passed!")
+        return 0
+    else:
+        print(f"\n⚠ {total_count - success_count} plugin(s) failed")
+        return 1
+
+
+if __name__ == '__main__':
+    exit_code = asyncio.run(main())
+    sys.exit(exit_code)
--- a/Show More
+++ b/Show More