diff --git a/.claude/settings.json b/.claude/settings.json deleted file mode 100644 index 681df41..0000000 --- a/.claude/settings.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "permissions": { - "allow": [ - "Edit(*)", - "Bash(pytest *)", - "Bash(python *)", - "Bash(python3 *)", - "Bash(.venv/bin/pytest *)", - "Bash(npm *)", - "Bash(git *)", - "Bash(ls *)", - "Bash(cat *)", - "Bash(grep *)", - "Bash(find *)", - "Bash(mkdir *)", - "Bash(touch *)", - "Bash(uv *)" - ] - } -} diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml index 5a9b4c2..8155a4b 100644 --- a/.gitea/workflows/release.yml +++ b/.gitea/workflows/release.yml @@ -10,6 +10,8 @@ jobs: steps: - name: Checkout code uses: actions/checkout@v4 + with: + fetch-depth: 0 - name: Set up Python run: | @@ -18,22 +20,38 @@ jobs: - name: Install build tools run: | - python3 -m pip install --upgrade pip - python3 -m pip install build twine + python3 -m venv .venv + .venv/bin/pip install --upgrade pip + .venv/bin/pip install build twine - name: Build package - run: python3 -m build + run: .venv/bin/python -m build - name: Extract version from tag id: get_version run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_OUTPUT - + - name: Generate changelog + id: changelog + run: | + PREV_TAG=$(git tag --sort=-version:refname | grep -m 1 -v "^${GITHUB_REF#refs/tags/}$") + if [ -n "$PREV_TAG" ]; then + CHANGELOG=$(git log --pretty=format:"- %s" "${PREV_TAG}..HEAD") + else + CHANGELOG="Initial release" + fi + # Write multiline to output + { + echo "CHANGELOG<<EOF" + echo "$CHANGELOG" + echo "EOF" + } >> $GITHUB_OUTPUT + - name: Upload to Gitea PyPI registry env: TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} run: | - python3 -m twine upload --repository-url https://git.wrede.ca/api/packages/andreas/pypi dist/* + .venv/bin/python3 -m twine upload --repository-url https://git.wrede.ca/api/packages/andreas/pypi dist/* - name: Create release uses: actions/gitea-release-action@v1 @@ -42,4 +60,4 @@ jobs: dist/*.whl 
dist/*.tar.gz title: "Release ${{ steps.get_version.outputs.VERSION }}" - body: "Release version ${{ steps.get_version.outputs.VERSION }}" + body: "${{ steps.changelog.outputs.CHANGELOG }}" diff --git a/.gitignore b/.gitignore index be4e25d..d1220de 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ __pycache__/ *.pyo .flake8 .venv/ +.continue/ test/ build/ dist/ diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..d4823a3 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,445 @@ +# Changelog + +All notable changes to this project are documented here, organized by release. + +## [5.3.9] + +### Added +- auto-update CHANGELOG and README in bumpminor.sh + +--- + +## [5.3.8] + +### Added +- Wiki home page with overview and getting started guide + +### Fixed +- Release workflow: use `GITHUB_REF`/`GITHUB_OUTPUT` (Gitea Actions uses GitHub-compatible variable names) +- Release workflow: replace `head -1` with `grep -m 1` to avoid SIGPIPE (exit 141) in changelog step + +--- + +## [5.3.7] + +### Added +- Dark mode with light/dark/auto theme setting +- UNKNOWN level filter in Log of Events +- Per-metric grace period input in threshold settings +- Replace Dynamic DNS YAML editor with a web form +- Sort hosts, thresholds, and channels alphabetically on settings page +- Suppress alerts for unwatched hosts + +### Fixed +- Preserve log message order when replaying history on connect + +--- + +## [5.3.6] + +### Added +- MIT license + +### Fixed +- Correct ZFS pool status threshold operator and add per-metric grace +- Normalize email and domain fields +- Move dependencies back under `[project]` in pyproject.toml + +--- + +## [5.3.4] + +### Fixed +- Run full reload after HTTP config publish, not just `config.reload()` + +--- + +## [5.3.3] + +### Added +- Replace YAML threshold editor with a form-based UI +- Replace multi-select fields with dual-panel picker on settings page +- Nav bar button to publish pending config changes +- Host, level, and message filters 
in Log of Events + +### Fixed +- Remove container max-width; stop stretching inputs on settings page + +### Removed +- Legacy `dyndnshosts`/`drophosts` config keys + +--- + +## [5.3.2] + +### Added +- Retry DNS resolution indefinitely; add `-4`/`-6` address-family flags to `hbc` and `hbc_mini` +- Replace YAML hosts editor with form-based CRUD table +- Replace YAML notification channel editor with form-based UI + +### Fixed +- Support list-valued `threshold_config` in hosts table +- Derive hosts threshold config list from config file keys +- Replace channel checkboxes in Users table with multi-select +- Support plugin-level `enabled: false` in threshold config +- Always populate glance strip for all hosts on page load +- Fetch host info on initial page load + +--- + +## [5.3.1] + +### Added +- Host info section in Host Overview (fetched and rendered on card expand) +- `GET /api/0/hosts/{hostname}/info` endpoint +- Show suffix-matched metric coverage in host info threshold table +- Move `hbc_version` and `hbc_type` out of `os_info` into the host info section + +### Fixed +- Correct `THRESHOLD_DEFAULTS` metric keys and add missing defaults + +--- + +## [5.3.0] + +### Added +- Profile page self-service: change identity, password, and notification channels +- Settings page editor with form sections, YAML editors, stage/publish/rollback workflow +- Config read API: `GET /api/0/config`, `/section/{name}`, `/backups` +- Config write API: `POST /api/0/config`, `POST /api/0/config/rollback` +- `configio` module for comment-preserving YAML round-trip writes +- Multi-provider OAuth2 login page and generic provider routes +- Log login/logout events to the event log with auth source + +### Fixed +- ZFS monitor alerts dropped on restart with wildcard pool thresholds +- Preserve OAuth users across config reload +- Config API error handling, consistent 403 messages, deduplicated key lists +- Validate password body type; coerce `notification_channels` to strings in profile API +- 
Preserve OAuth `client_secret` on roundtrip; harden rollback path validation + +--- + +## [5.2.6] + +### Added +- Alerts host-filter field with URL query parameter and notify URL +- Optional logo on Gitea OAuth login button + +### Fixed +- Show human-readable duration in re-notification messages + +--- + +## [5.2.5] + +### Added +- Alert CRITICAL on degraded or suspended ZFS pools (ONLINE=OK, DEGRADED=WARNING, all else=CRITICAL) +- Sign in with Gitea button on login page with OAuth2 redirect/callback routes +- OAuth2 CSRF state management +- Host owner shown in glance strip for admin users +- C port of `hbc_mini` (single-file client in `scripts/c/`) + +### Fixed +- Use `base_url` config for OAuth redirect URI to handle reverse proxy deployments +- Preserve OAuth users across config reload +- Escape HTML in login page error display + +--- + +## [5.2.4] + +### Added +- `hbc`/`hbc_mini`: `owner` config field included in `os_info`; server applies to host record +- Server requests InfoPlugin refresh when a host has no plugin data +- Event log stores structured dicts; filter by user + +### Fixed +- Strip `_status_code` suffix from displayed metric names in threshold alerts +- Use plain URL in Mattermost plugin metrics link +- Fall back to `default_owner` when `os_info` has no owner + +--- + +## [5.2.3] + +### Added +- `hbc`/`hbc_mini`: log name and version at startup +- Show metric name inline with hostname in alerts and notifications + +### Fixed +- Send shutdown message only if a boot message was previously sent; suppress both on restart + +--- + +## [5.2.2] + +### Fixed +- Retry connection on network error instead of permanently dropping it +- Silence `aiohttp.access` log; strip plugin prefix in alerts UI + +--- + +## [5.2.1] + +### Fixed +- Threshold and logging improvements + +--- + +## [5.2.0] + +### Added +- `nagios` operator for direct exit-code severity mapping + +### Fixed +- Always show `THRESHOLD_DEFAULTS` in Settings threshold config + +--- + +## [5.1.21] + 
+### Added +- `nagios_runner` improvements and alerts page fixes + +--- + +## [5.1.20] + +### Added +- Generic threshold matching for `nagios_runner` with `{check_name}` display support + +### Fixed +- Reduce default hysteresis from 10% to 2% +- Show recovery threshold in alerts UI + +--- + +## [5.1.19] + +### Added +- Exclude ZFS ARC from `memory_percent` +- Add `uptime_seconds` to `cpu_monitor` + +### Fixed +- Send boot/shutdown message on the first open connection, not blindly on the first in list + +--- + +## [5.1.18] + +### Added +- Fetch-based Update/Delete buttons with toast notifications on Host Overview + +### Fixed +- Settings thresholds show correct per-config metrics; miscellaneous `hbc` fixes + +--- + +## [5.1.17] + +### Added +- Owner Update/Delete buttons on Host Overview; purge stale alerts on reload +- Retry `AsyncConnection.open()` indefinitely; drop IPv6 only on early startup failure +- Alert pie chart in the nav bar + +### Fixed +- Make Alerts page scrollable + +--- + +## [5.1.16] + +### Added +- Generic `ping_monitor` thresholds; round RTT to nearest ms + +--- + +## [5.1.15] + +### Added +- Link hostnames in Live Dashboard to Host Overview +- Threshold Configurations section on settings page + +### Fixed +- Suppress notifications on alert de-escalation (e.g. CRITICAL→WARNING) +- Suppress recover messages for down durations under 4 seconds + +--- + +## [5.1.14] + +### Added +- ZFS pool renderer in Host Overview + +--- + +## [5.1.13] + +### Added +- ZFS monitor plugin +- Host-level watch flag to suppress notifications +- Filter Live Dashboard and Host Overview by owner/manager +- Composable `threshold_config` list for per-host threshold layering +- Restart on SIGHUP in `hbc` and `hbc_mini` + +### Fixed +- Mask `api_password` and `access_token` in settings page + +--- + +## [5.1.12] + +Internal release — no user-visible changes. 
+ +--- + +## [5.1.11] + +### Fixed +- Install under Docker +- Clean up install script + +--- + +## [5.1.10] + +### Fixed +- Synchronize version in `hbc_mini` +- Install script no longer overwrites itself + +--- + +## [5.1.9] + +### Added +- Install `hbc_mini` via package or install script + +--- + +## [5.1.8] + +### Added +- Track `hbc` type and version + +### Fixed +- Nav bar position + +--- + +## [5.1.7] + +### Added +- `hbc_mini`: single-file heartbeat client + +### Fixed +- Drop dead connections on protocol error + +--- + +## [5.1.6] + +### Fixed +- Simplify event log usage; fix argument handling + +--- + +## [5.1.5] + +### Added +- Update `hbc` via `hb_install.sh` instead of code patching + +--- + +## [5.1.4] + +### Added +- Redesign Plugin Metrics page as Host Overview + +--- + +## [5.1.3] + +### Added +- Validate absolute command paths at `nagios_runner` init +- Async subprocess in `nagios_runner` with stderr capture and signal handling +- `skip_reason` field on `Plugin`; surface in `PluginLoader` init messaging + +### Fixed +- Use `shlex.split()` for `nagios_runner` path validation to handle quoted paths +- Reconfigure logging to syslog after `daemonize()` + +--- + +## [5.1.2] + +### Fixed +- Plugin config lookup shadowed by `CLIENT_DEFAULTS` plugins key +- Apply grace period to all threshold alerts before logging/notifying +- RECOVER routing: use consistent level name and route via alerted channel +- Early reminder notifications and lost recovery notifications +- Non-alerting of overdue hosts + +### Added +- Swiss clock widget in the UI + +--- + +## [5.1.1] + +### Added +- SMS and Matrix notification channels +- CLI commands `stop`, `restart`, and `reload` for `hbd` +- WebSocket endpoint at `http://.../ws` +- Mobile HTML pages + +### Fixed +- Profile not updating +- Sortable columns in tables + +--- + +## [5.1.0] + +### Added +- Ping monitor plugin +- Persist state to pickle file; restart timers on server restart +- SIGHUP config reload for `hbd` +- 
Renotify on CRITICAL only; persistent user sessions +- RTT count threshold + +### Fixed +- Bogus notification on new clients +- Show "overdue" in alerts instead of null + +--- + +## [5.0.12] + +### Added +- User management and settings page + +--- + +## [5.0.10] + +### Added +- Publish package to Gitea PyPI registry + +--- + +## [5.0.9] + +### Added +- Use `SO_TIMESTAMP` for RTT measurement (Linux, FreeBSD, macOS) +- Persist state to pickle file; restart timers on restart + +--- + +## [5.0.6] + +### Added +- Major codebase refactoring: restructured into client/server components +- Per-client threshold configuration +- Display and acknowledge alerts in the UI +- Proper `hbc` termination; `hbd` config reloadable at runtime diff --git a/Home.md b/Home.md new file mode 100644 index 0000000..09ce3cf --- /dev/null +++ b/Home.md @@ -0,0 +1,210 @@ +# Heartbeat + +Heartbeat is a lightweight host monitoring system built around a simple idea: each machine you want to monitor runs a small client (`hbc`) that sends a UDP "heartbeat" packet to a central server (`hbd`) on a regular interval. If a heartbeat stops arriving, you get notified. Alongside reachability, clients can ship system metrics — CPU, memory, disk, network — and the server will alert you when any of those cross a threshold. + +## How it works + +``` + [ monitored host ] [ your server ] + ┌─────────────┐ UDP 50003 ┌────────────────────────┐ + │ hbc │ ────────────> │ hbd │ + │ │ │ host state tracking │ + │ plugins: │ <──────────── │ threshold alerting │ + │ cpu, mem, │ ACK / CMD │ notifications │ + │ disk, ... │ │ web dashboard + API │ + └─────────────┘ └────────────────────────┘ +``` + +- **hbd** — the server daemon. Tracks which hosts are alive, evaluates metric thresholds, fires notifications, serves the web dashboard and REST API. +- **hbc** — the client. Sends heartbeats and plugin data over UDP. Runs on any Linux/BSD/macOS host. 
+- **hbc_mini** — a zero-dependency single-file alternative (`hbc_mini.py` or `hbc_mini.c`) for hosts where you can't install Python packages. + +Notifications can go to Pushover, email, Mattermost, Matrix, Signal, or VoIP.ms SMS. The dashboard shows host connectivity, RTT graphs, active alerts, and per-host plugin metrics in real time via WebSocket. + +--- + +## Getting started + +This tutorial sets up a server on one machine and a client on a second machine. You'll end up with a working dashboard and your first host being monitored. + +### 1. Install the server + +On the machine that will run `hbd`: + +```bash +git clone https://git.wrede.ca/andreas/heartbeat.git +cd heartbeat +python3 -m venv .venv +source .venv/bin/activate +pip install . +``` + +Verify the install: + +```bash +hbd --help +``` + +### 2. Create a server config + +Create `~/.hb.yaml`: + +```yaml +hb_port: 50003 # UDP port — clients send heartbeats here +hbd_port: 50004 # HTTP port — web dashboard and API +ws_port: 50005 # WebSocket port — live dashboard updates + +interval: 20 # Expected heartbeat interval (seconds) +grace: 2 # Seconds of slack before a host is considered overdue + +pickfile: ~/.hb.pick +pidfile: ~/.hb.pid +logfile: ~/.hb.log +``` + +That's enough to get started. No hosts, no users, no notifications needed yet — the server will accept any client that connects. + +### 3. Start the server + +```bash +hbd serve -c ~/.hb.yaml -f -v +``` + +`-f` keeps it in the foreground so you can watch the log. You should see: + +``` +Heartbeat daemon starting on UDP :50003, HTTP :50004, WS :50005 +``` + +Open `http://your-server:50004/live` in a browser. The dashboard is empty for now. + +### 4. 
Install the client on a host to monitor + +On the machine you want to monitor (must be able to reach the server on UDP 50003): + +```bash +pip install hbd # or: copy scripts/hbc_mini.py if you can't install packages +``` + +#### Quick start — no config file + +```bash +hbc your-server.example.com +``` + +Within a few seconds the server log will show the host checking in, and it will appear on the dashboard. + +#### With a config file + +Create `~/.hbc.yaml` on the client host: + +```yaml +hb_port: 50003 +interval: 10 # Send a heartbeat every 10 seconds + +plugins: + cpu_monitor: + interval: 60 + memory_monitor: + interval: 60 + disk_monitor: + interval: 60 +``` + +Then start the client: + +```bash +hbc -c ~/.hbc.yaml your-server.example.com +``` + +Send a boot message at startup so the server logs when the host came up: + +```bash +hbc -b -c ~/.hbc.yaml your-server.example.com +``` + +Run as a daemon (logs go to syslog): + +```bash +hbc -d -b -c ~/.hbc.yaml your-server.example.com +``` + +### 5. View the dashboard + +Open `http://your-server:50004/live`. You'll see the monitored host, its last heartbeat time, and RTT. Click the host name to see plugin metrics. + +Navigate to `/plugins/` for CPU, memory, and disk graphs. + +### 6. Add a notification channel (optional) + +Edit `~/.hb.yaml` on the server: + +```yaml +notification_channels: + pushover_ops: + type: pushover + token: YOUR_APP_TOKEN + user: YOUR_USER_KEY + +users: + alice: + password: pbkdf2:sha256:... # generate: hbd passwd alice + admin: true + notification_channels: [pushover_ops] + +default_owner: alice +``` + +Generate the password hash: + +```bash +hbd passwd alice +``` + +Paste the output into the config, then reload: + +```bash +hbd reload +``` + +Test the channel: + +```bash +hbd notify +``` + +### 7. 
Set a threshold alert (optional) + +Add to `~/.hb.yaml`: + +```yaml +thresholds: + cpu_monitor: + cpu_percent: + warning: 80.0 + critical: 90.0 + disk_monitor: + partitions: + /: + percent: + warning: 80.0 + critical: 90.0 +``` + +Reload: `hbd reload`. The server will now alert when a monitored host crosses these values. + +--- + +## What's next + +| Topic | Where to look | +|---|---| +| Full server config reference | [README — Server](https://git.wrede.ca/andreas/heartbeat/src/branch/master/README.md#server-hbd) | +| Client options and all plugins | [README — Client](https://git.wrede.ca/andreas/heartbeat/src/branch/master/README.md#client-hbc) | +| Threshold alerting details | [THRESHOLD_ALERTING.md](https://git.wrede.ca/andreas/heartbeat/src/branch/master/docs/THRESHOLD_ALERTING.md) | +| Notification channels | [NOTIFICATIONS.md](https://git.wrede.ca/andreas/heartbeat/src/branch/master/docs/NOTIFICATIONS.md) | +| User accounts and roles | [USERS.md](https://git.wrede.ca/andreas/heartbeat/src/branch/master/docs/USERS.md) | +| Writing a custom plugin | [PLUGIN_DEVELOPMENT.md](https://git.wrede.ca/andreas/heartbeat/src/branch/master/docs/PLUGIN_DEVELOPMENT.md) | +| Nagios check integration | [NAGIOS_INTEGRATION.md](https://git.wrede.ca/andreas/heartbeat/src/branch/master/docs/NAGIOS_INTEGRATION.md) | +| REST API | [HTTP_API.md](https://git.wrede.ca/andreas/heartbeat/src/branch/master/docs/HTTP_API.md) | +| Zero-dependency client | [README — hbc_mini](https://git.wrede.ca/andreas/heartbeat/src/branch/master/README.md#hbc_mini--zero-dependency-client) | diff --git a/README.md b/README.md index 86d973b..27bd27b 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ A lightweight UDP-based host monitoring system. 
Monitored hosts run a client (`h └────────────────────┘ └────────────────────────────┘ ``` -**Package:** `hbd` v5.3.4 +**Package:** `hbd` v5.3.9 **Python:** 3.11+ ### Subpackages diff --git a/docs/DARK_MODE.md b/docs/DARK_MODE.md new file mode 100644 index 0000000..04ed5b3 --- /dev/null +++ b/docs/DARK_MODE.md @@ -0,0 +1,66 @@ +# Dark Mode + +Every page in the Heartbeat web UI supports light mode, dark mode, and automatic (follows the OS/browser setting). Each user picks their preference independently; it is stored in the browser and takes effect immediately without a page reload. + +--- + +## Choosing a theme + +Open your profile page (`/profile`) and scroll to the **Appearance** section. Click one of the three buttons: + +| Button | Behaviour | +|--------|-----------| +| **Auto** | Follows the OS or browser dark-mode preference. Updates live if the system setting changes. | +| **Light** | Always light, regardless of system setting. | +| **Dark** | Always dark, regardless of system setting. | + +The preference is stored in `localStorage` under the key `hbd_theme` and applies to the current browser only. Clearing browser storage resets it to **Auto**. + +--- + +## Implementation notes + +### No flash of unstyled content + +A small synchronous `{% endif %} +