Compare commits
306 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 76e11b92f2 | |||
| d39c0da5fe | |||
| 832b9d04d8 | |||
| 44d5f15a67 | |||
| 37b8e35a26 | |||
| fa317a3b78 | |||
| 8729fe7038 | |||
| f4231dd5f3 | |||
| c47576637f | |||
| 2b9523ec28 | |||
| 610ad0af30 | |||
| 69b5b410ed | |||
| 8b2b0fd9d0 | |||
| 756b2323be | |||
| 6e7156b42d | |||
| 928035df50 | |||
| 0f90be659e | |||
| 4160e34a96 | |||
| 6430d2ddf3 | |||
| 4b87a90e76 | |||
| 450814daca | |||
| e7786ac5da | |||
| fed71d97d6 | |||
| ba96da9622 | |||
| 7f17ddc2ff | |||
| 7750c5a303 | |||
| e58530df7d | |||
| fe7143759c | |||
| 236b40cfe4 | |||
| 4e5bafd26c | |||
| 817ae064af | |||
| a00282913b | |||
| d699a29fa9 | |||
| 4ce7eacfdd | |||
| 1cefc2676e | |||
| 668a135e53 | |||
| 59e256a042 | |||
| 708508157f | |||
| f67fa9baff | |||
| 588eb2a792 | |||
| b907343e36 | |||
| e50a3996ae | |||
| e1056a0365 | |||
| 1dbe0f8e64 | |||
| 12e8812070 | |||
| 9b5d8ac9b1 | |||
| 500d256d76 | |||
| a7a45bf8c3 | |||
| 3e9b052f71 | |||
| 7444262985 | |||
| 3401cc0dbb | |||
| ab0132a38d | |||
| 9e389736f8 | |||
| b64a2a9313 | |||
| a52744a448 | |||
| 5e2b04b811 | |||
| 8e07b09d7e | |||
| 653e018e4f | |||
| c7326da7d9 | |||
| 0426a75d8c | |||
| 539f25d877 | |||
| 3e3099fc6d | |||
| c9f15a3f1c | |||
| 6e396ad760 | |||
| 2800de0b4a | |||
| 15f7e6a64d | |||
| 9768d13b88 | |||
| 8640d731aa | |||
| de81751e59 | |||
| 60c692cefc | |||
| 9a0baf3c78 | |||
| 55bdb9593a | |||
| 2009626fb4 | |||
| 18769afd37 | |||
| 31db5cf35e | |||
| 326f53f23d | |||
| 4f9bc8c868 | |||
| 259b4a3594 | |||
| 8646f68957 | |||
| a4a6c1e3d9 | |||
| 0e8250362e | |||
| 2f5da9fc5e | |||
| 87aeec5999 | |||
| f24500a6b5 | |||
| a7bb183222 | |||
| 8207cd7b5f | |||
| 11f1eefa8c | |||
| 62f496e9f8 | |||
| aef9e7769b | |||
| 58c2b9d996 | |||
| 2e8bcb630d | |||
| 338711181b | |||
| 43487f17e7 | |||
| 40205bf5c7 | |||
| b95f1a5bb7 | |||
| 12f7eb722b | |||
| 217bba1b76 | |||
| 967e05ed74 | |||
| c20245b0ab | |||
| b9db0c552e | |||
| 05045bafa2 | |||
| 39f1b5de30 | |||
| b06de6fdd3 | |||
| 940d0af35e | |||
| d6d31aa2e3 | |||
| 76edfe7577 | |||
| d190029728 | |||
| b8307e7a9d | |||
| a2fdf091f5 | |||
| 1914e6f28e | |||
| 82cbce9615 | |||
| dbb779b013 | |||
| ca908ee967 | |||
| 73c697b6c5 | |||
| 3e2357380b | |||
| cc4a103bae | |||
| 53fb10fdf5 | |||
| 2df2ad18c9 | |||
| b81a0d2a6c | |||
| 1a19088cfe | |||
| 172f6e950f | |||
| 4349ae217a | |||
| b3aa7b585f | |||
| 88a3c09b51 | |||
| 0504402a8a | |||
| ca58c18802 | |||
| 1ddc4b8132 | |||
| 5e1720ed32 | |||
| 77f127fe60 | |||
| 54fbd8d73d | |||
| 7ab17e26e2 | |||
| 28f5fa951c | |||
| 37f1c58969 | |||
| f006077a71 | |||
| d9fc8d632f | |||
| f640574e4f | |||
| 9a19424279 | |||
| ca8ba84e65 | |||
| f3d08d1c9e | |||
| 1e4263b793 | |||
| e931acb9f5 | |||
| 018409e71d | |||
| 1824f637b4 | |||
| a534c06b26 | |||
| d7b5c97a4e | |||
| ae447ac4a6 | |||
| d44ce3d124 | |||
| b1985d0eb2 | |||
| de778f680f | |||
| d7b368c7c6 | |||
| e790663f9f | |||
| 475319e248 | |||
| ca5ef384a8 | |||
| c93dbdc0f4 | |||
| 3a546a1e5c | |||
| 74c89d098c | |||
| 3301dbfe34 | |||
| d00d903e7d | |||
| babb5d61aa | |||
| 11d1c718b3 | |||
| a99b6b54c7 | |||
| 8da3d550eb | |||
| a76d0fc840 | |||
| 94cbb31c48 | |||
| ae60844a8a | |||
| 49fa310361 | |||
| 28e2180f7b | |||
| ce0590f015 | |||
| f50acca509 | |||
| 72fc82b91f | |||
| 46f8c32c0b | |||
| 691f62aa69 | |||
| cffc9805f9 | |||
| 917d6a401b | |||
| 2bd3a9beb6 | |||
| 5523c60866 | |||
| ab37ac7194 | |||
| f811a19d80 | |||
| 6239825f43 | |||
| b56245bb23 | |||
| 331c4e804d | |||
| 9fd945a481 | |||
| 26df08eeff | |||
| 5819dd6b25 | |||
| 6fb67f8615 | |||
| e70ae6f176 | |||
| a77f6d380c | |||
| 6aae2a1dab | |||
| 85ee0e1040 | |||
| c4f09e9ced | |||
| 64710fd4cd | |||
| 1f5e7465a3 | |||
| b290b21e23 | |||
| 65c4267847 | |||
| 462a445235 | |||
| 368e178f93 | |||
| 6905bf266a | |||
| b6dcce4f35 | |||
| e6436fc236 | |||
| c5ce41762e | |||
| 26ca0c095f | |||
| 1eecd67594 | |||
| caf3c2c0ac | |||
| 9af4006097 | |||
| ddf7067d13 | |||
| 505353a8a8 | |||
| 0402d33c71 | |||
| 7d8ca5d8db | |||
| 56037a036d | |||
| 65ceb31d8d | |||
| 1c9b6c1ca9 | |||
| d7e6b478e1 | |||
| 535dbda47d | |||
| c9567dddae | |||
| b5963badd6 | |||
| a76a39b4a0 | |||
| 94e1597978 | |||
| c9c2ed772f | |||
| aeb78dcb8e | |||
| 77b337e4dd | |||
| 293461f3f6 | |||
| c70a4807dc | |||
| 1a470e7cfa | |||
| 990c658e65 | |||
| b78d6ac0fe | |||
| afd5060f59 | |||
| f61f7aebc2 | |||
| 5c382d2b8d | |||
| 35bba451f5 | |||
| 80edfba0c0 | |||
| 6bc8de192e | |||
| 2d8166d04a | |||
| ab33d81b30 | |||
| 2c0328f36d | |||
| fb8e27825d | |||
| 1366c69cdc | |||
| d0c8c186f4 | |||
| 19f7c8312e | |||
| 24b0e362fb | |||
| 3a030548c0 | |||
| 094cb7ed9d | |||
| 0199ca4693 | |||
| 75344ebbbd | |||
| 7f049a4e26 | |||
| 6559f5462c | |||
| 6556d35f97 | |||
| dec96a0da6 | |||
| 8d3de01117 | |||
| 5bedf026b1 | |||
| daf5277507 | |||
| ee3b72878f | |||
| 6217f7a124 | |||
| 2468386f24 | |||
| 2015195112 | |||
| 3426185383 | |||
| 9eedbafe97 | |||
| a5f31c5cb5 | |||
| 2f72cf0118 | |||
| c56e77c2c1 | |||
| e9aa7a6f8b | |||
| a75a8a4087 | |||
| ba27d2e300 | |||
| 381e37efce | |||
| 97dfc08f4d | |||
| d281ac5a70 | |||
| 812bbf8555 | |||
| e6b7a1aa27 | |||
| 90f47ad018 | |||
| cc458e8972 | |||
| 79bf00abfd | |||
| d77277857f | |||
| 3232239a85 | |||
| 014781de5e | |||
| 68b1c65384 | |||
| e8bb553349 | |||
| e4ecb8723f | |||
| 5edbaacf81 | |||
| 8421f472f2 | |||
| 51f9bdc2b5 | |||
| 02bc42fbf0 | |||
| 832a8b0bda | |||
| 57c4b86430 | |||
| 43fad7beed | |||
| 8dd002d159 | |||
| 2373b55d8b | |||
| 81530636ec | |||
| 190199b36d | |||
| 73aa89f8f4 | |||
| 941f3ea4b0 | |||
| c5770006f7 | |||
| 84c1aef51f | |||
| 460d2be9e9 | |||
| 090d341244 | |||
| 079e84f729 | |||
| dd23d9d163 | |||
| ad7178ebcb | |||
| 0543266c92 | |||
| 7e2038ecac | |||
| 75e41eafc4 | |||
| 73b9d05357 | |||
| 9d81f96f31 | |||
| d2e1c7a629 | |||
| 83d5ead471 | |||
| d339133981 | |||
| 7be129ad40 | |||
| 179048e565 |
@@ -0,0 +1,20 @@
|
||||
{
|
||||
"permissions": {
|
||||
"allow": [
|
||||
"Edit(*)",
|
||||
"Bash(pytest *)",
|
||||
"Bash(python *)",
|
||||
"Bash(python3 *)",
|
||||
"Bash(.venv/bin/pytest *)",
|
||||
"Bash(npm *)",
|
||||
"Bash(git *)",
|
||||
"Bash(ls *)",
|
||||
"Bash(cat *)",
|
||||
"Bash(grep *)",
|
||||
"Bash(find *)",
|
||||
"Bash(mkdir *)",
|
||||
"Bash(touch *)",
|
||||
"Bash(uv *)"
|
||||
]
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,63 @@
|
||||
name: Release
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- 'v*'
|
||||
|
||||
jobs:
|
||||
release:
|
||||
runs-on: FreeBSD
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Set up Python
|
||||
run: |
|
||||
python3 --version
|
||||
python3 -m ensurepip --upgrade
|
||||
|
||||
- name: Install build tools
|
||||
run: |
|
||||
python3 -m venv .venv
|
||||
.venv/bin/pip install --upgrade pip
|
||||
.venv/bin/pip install build twine
|
||||
|
||||
- name: Build package
|
||||
run: .venv/bin/python -m build
|
||||
|
||||
- name: Extract version from tag
|
||||
id: get_version
|
||||
# Strip the "refs/tags/v" prefix from the pushed ref and publish the remainder
# as the VERSION step output; the redirect target is quoted so a runner path
# containing whitespace cannot split it.
run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> "$GITHUB_OUTPUT"
|
||||
- name: Generate changelog
|
||||
id: changelog
|
||||
run: |
|
||||
# Pick the highest version tag other than the one being released.
# -F -x compare the tag as a literal whole line (dots are not regex wildcards).
# `|| true` keeps an empty result (first release, no other tag) from aborting
# the step when the runner shell uses `-e -o pipefail`; the empty value is
# handled by the `if [ -n "$PREV_TAG" ]` check that follows.
PREV_TAG=$(git tag --sort=-version:refname | grep -Fxv -- "${GITHUB_REF#refs/tags/}" | head -1 || true)
|
||||
if [ -n "$PREV_TAG" ]; then
|
||||
CHANGELOG=$(git log --pretty=format:"- %s" "${PREV_TAG}..HEAD")
|
||||
else
|
||||
CHANGELOG="Initial release"
|
||||
fi
|
||||
# Write multiline to output
|
||||
{
|
||||
echo "CHANGELOG<<EOF"
|
||||
echo "$CHANGELOG"
|
||||
echo "EOF"
|
||||
} >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Upload to Gitea PyPI registry
|
||||
env:
|
||||
TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
|
||||
TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
|
||||
run: |
|
||||
.venv/bin/python3 -m twine upload --repository-url https://git.wrede.ca/api/packages/andreas/pypi dist/*
|
||||
|
||||
- name: Create release
|
||||
uses: actions/gitea-release-action@v1
|
||||
with:
|
||||
files: |
|
||||
dist/*.whl
|
||||
dist/*.tar.gz
|
||||
title: "Release ${{ steps.get_version.outputs.VERSION }}"
|
||||
body: "${{ steps.changelog.outputs.CHANGELOG }}"
|
||||
@@ -10,3 +10,8 @@ build/
|
||||
dist/
|
||||
*.egg-info/
|
||||
ssl/
|
||||
uv.lock
|
||||
.hb.yaml
|
||||
.superpowers/
|
||||
rndc-key
|
||||
docs/superpowers/
|
||||
|
||||
@@ -1,44 +0,0 @@
|
||||
#name: "w02"
|
||||
hb_port: 50003
|
||||
hbd_host: ''
|
||||
#logfile: "/home/andreas/public_html/messages/andreas"
|
||||
logfile: "/Users/andreas/public_html/messages/andreas"
|
||||
logfmt: "msg"
|
||||
grace: 40
|
||||
interval: 10
|
||||
watchhosts:
|
||||
# "localhost":
|
||||
# "haschloss" :
|
||||
# "cotgate":
|
||||
"wentworth":
|
||||
notify: +4915123456789
|
||||
src: "signal"
|
||||
"y":
|
||||
notify: +4915123456789
|
||||
src: "signal"
|
||||
"winter":
|
||||
notify: +14168226179
|
||||
src: "signal"
|
||||
dyndnshosts: {"haschloss", "wayback", "wertvoll", "weekend", "cotgate", "rvgate", "draper", "eris"}
|
||||
drophosts: {"unknown", "wookie15", "wort"}
|
||||
nsupdate_bin: "/usr/local/bin/nsupdate"
|
||||
pushover_token: "ac7NLX2rPjXFareeDgLpXNoDf4iFmf"
|
||||
pushover_user: "uDhH33UjQQDYtNzJb1ThRiWb9ingGK"
|
||||
pushsrv: "pushover"
|
||||
|
||||
dyndomains: {"wrede.org"}
|
||||
toemail: ["aew.hbd.notify@wrede.ca"]
|
||||
fromemail: "aew.hbd@wrede.ca"
|
||||
smtpserver: "smtp.fastmail.com"
|
||||
smtpuser: "andreas@wrede.ca"
|
||||
smtppassword: "r8psra6wj6gcakkp"
|
||||
smtpport: 587
|
||||
|
||||
ws_port: 50005
|
||||
wss_port: 50006
|
||||
cert_path: "/usr/local/etc/letsencrypt/live/hbd.wrede.ca/"
|
||||
cert_path: "ssl/"
|
||||
# CERT_PATH = "./test/"
|
||||
wss_pem: "fullchain.pem"
|
||||
wss_key: "privkey.pem"
|
||||
|
||||
Vendored
+7
-6
@@ -4,12 +4,13 @@
|
||||
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
||||
"version": "0.2.0",
|
||||
"configurations": [
|
||||
|
||||
{
|
||||
"name": "Python: Run hbd (module)",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "hbd.cli",
|
||||
"args": ["-c", ".hb.yaml", "-f", "-v", "-x", "-x", "-x"],
|
||||
"module": "hbd.server.cli",
|
||||
"args": ["-c", "~/.hb.yaml", "-f", "-v"],
|
||||
"cwd": "${workspaceFolder}",
|
||||
"env": {
|
||||
"PYTHONPATH": "${workspaceFolder}"
|
||||
@@ -28,14 +29,14 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Python: Run hbd with debugpy (listen)",
|
||||
"name": "Python: Run hbc (module)",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "debugpy",
|
||||
"args": ["--listen", "5678", "--wait-for-client", "-m", "hbd.cli", "-c", ".hb.yaml", "-f", "-v"],
|
||||
"module": "hbd.client.main",
|
||||
"args": ["-c", "~/.hbc.yaml", "-v", "winter"],
|
||||
"cwd": "${workspaceFolder}",
|
||||
"env": { "PYTHONPATH": "${workspaceFolder}" },
|
||||
"console": "integratedTerminal",
|
||||
"justMyCode": false
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
Vendored
+4
-1
@@ -2,5 +2,8 @@
|
||||
"python.pythonPath": "/usr/bin/python3",
|
||||
"python.linting.enabled": true,
|
||||
"python.formatting.provider": "black",
|
||||
"python.linting.flake8Enabled": true
|
||||
"python.linting.flake8Enabled": true,
|
||||
"chat.tools.terminal.autoApprove": {
|
||||
"mv": true
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,4 @@
|
||||
1. Don't assume. Don't hide confusion. Surface tradeoffs.
|
||||
2. Minimum code that solves the problem. Nothing speculative.
|
||||
3. Touch only what you must. Clean up only your own mess.
|
||||
4. Define success criteria. Loop until verified.
|
||||
@@ -0,0 +1,210 @@
|
||||
# Heartbeat
|
||||
|
||||
Heartbeat is a lightweight host monitoring system built around a simple idea: each machine you want to monitor runs a small client (`hbc`) that sends a UDP "heartbeat" packet to a central server (`hbd`) on a regular interval. If a heartbeat stops arriving, you get notified. Alongside reachability, clients can ship system metrics — CPU, memory, disk, network — and the server will alert you when any of those cross a threshold.
|
||||
|
||||
## How it works
|
||||
|
||||
```
|
||||
[ monitored host ] [ your server ]
|
||||
┌─────────────┐ UDP 50003 ┌────────────────────────┐
|
||||
│ hbc │ ────────────> │ hbd │
|
||||
│ │ │ host state tracking │
|
||||
│ plugins: │ <──────────── │ threshold alerting │
|
||||
│ cpu, mem, │ ACK / CMD │ notifications │
|
||||
│ disk, ... │ │ web dashboard + API │
|
||||
└─────────────┘ └────────────────────────┘
|
||||
```
|
||||
|
||||
- **hbd** — the server daemon. Tracks which hosts are alive, evaluates metric thresholds, fires notifications, serves the web dashboard and REST API.
|
||||
- **hbc** — the client. Sends heartbeats and plugin data over UDP. Runs on any Linux/BSD/macOS host.
|
||||
- **hbc_mini** — a zero-dependency single-file alternative (`hbc_mini.py` or `hbc_mini.c`) for hosts where you can't install Python packages.
|
||||
|
||||
Notifications can go to Pushover, email, Mattermost, Matrix, Signal, or VoIP.ms SMS. The dashboard shows host connectivity, RTT graphs, active alerts, and per-host plugin metrics in real time via WebSocket.
|
||||
|
||||
---
|
||||
|
||||
## Getting started
|
||||
|
||||
This tutorial sets up a server on one machine and a client on a second machine. You'll end up with a working dashboard and your first host being monitored.
|
||||
|
||||
### 1. Install the server
|
||||
|
||||
On the machine that will run `hbd`:
|
||||
|
||||
```bash
|
||||
git clone https://git.wrede.ca/andreas/heartbeat.git
|
||||
cd heartbeat
|
||||
python3 -m venv .venv
|
||||
source .venv/bin/activate
|
||||
pip install .
|
||||
```
|
||||
|
||||
Verify the install:
|
||||
|
||||
```bash
|
||||
hbd --help
|
||||
```
|
||||
|
||||
### 2. Create a server config
|
||||
|
||||
Create `~/.hb.yaml`:
|
||||
|
||||
```yaml
|
||||
hb_port: 50003 # UDP port — clients send heartbeats here
|
||||
hbd_port: 50004 # HTTP port — web dashboard and API
|
||||
ws_port: 50005 # WebSocket port — live dashboard updates
|
||||
|
||||
interval: 20 # Expected heartbeat interval (seconds)
|
||||
grace: 2 # Seconds of slack before a host is considered overdue
|
||||
|
||||
pickfile: ~/.hb.pick
|
||||
pidfile: ~/.hb.pid
|
||||
logfile: ~/.hb.log
|
||||
```
|
||||
|
||||
That's enough to get started. No hosts, no users, no notifications needed yet — the server will accept any client that connects.
|
||||
|
||||
### 3. Start the server
|
||||
|
||||
```bash
|
||||
hbd serve -c ~/.hb.yaml -f -v
|
||||
```
|
||||
|
||||
`-f` keeps it in the foreground so you can watch the log. You should see:
|
||||
|
||||
```
|
||||
Heartbeat daemon starting on UDP :50003, HTTP :50004, WS :50005
|
||||
```
|
||||
|
||||
Open `http://your-server:50004/live` in a browser. The dashboard is empty for now.
|
||||
|
||||
### 4. Install the client on a host to monitor
|
||||
|
||||
On the machine you want to monitor (must be able to reach the server on UDP 50003):
|
||||
|
||||
```bash
|
||||
pip install hbd # or: copy scripts/hbc_mini.py if you can't install packages
|
||||
```
|
||||
|
||||
#### Quick start — no config file
|
||||
|
||||
```bash
|
||||
hbc your-server.example.com
|
||||
```
|
||||
|
||||
Within a few seconds the server log will show the host checking in, and it will appear on the dashboard.
|
||||
|
||||
#### With a config file
|
||||
|
||||
Create `~/.hbc.yaml` on the client host:
|
||||
|
||||
```yaml
|
||||
hb_port: 50003
|
||||
interval: 10 # Send a heartbeat every 10 seconds
|
||||
|
||||
plugins:
|
||||
cpu_monitor:
|
||||
interval: 60
|
||||
memory_monitor:
|
||||
interval: 60
|
||||
disk_monitor:
|
||||
interval: 60
|
||||
```
|
||||
|
||||
Then start the client:
|
||||
|
||||
```bash
|
||||
hbc -c ~/.hbc.yaml your-server.example.com
|
||||
```
|
||||
|
||||
Send a boot message at startup so the server logs when the host came up:
|
||||
|
||||
```bash
|
||||
hbc -b -c ~/.hbc.yaml your-server.example.com
|
||||
```
|
||||
|
||||
Run as a daemon (logs go to syslog):
|
||||
|
||||
```bash
|
||||
hbc -d -b -c ~/.hbc.yaml your-server.example.com
|
||||
```
|
||||
|
||||
### 5. View the dashboard
|
||||
|
||||
Open `http://your-server:50004/live`. You'll see the monitored host, its last heartbeat time, and RTT. Click the host name to see plugin metrics.
|
||||
|
||||
Navigate to `/plugins/<hostname>` for CPU, memory, and disk graphs.
|
||||
|
||||
### 6. Add a notification channel (optional)
|
||||
|
||||
Edit `~/.hb.yaml` on the server:
|
||||
|
||||
```yaml
|
||||
notification_channels:
|
||||
pushover_ops:
|
||||
type: pushover
|
||||
token: YOUR_APP_TOKEN
|
||||
user: YOUR_USER_KEY
|
||||
|
||||
users:
|
||||
alice:
|
||||
password: pbkdf2:sha256:... # generate: hbd passwd alice
|
||||
admin: true
|
||||
notification_channels: [pushover_ops]
|
||||
|
||||
default_owner: alice
|
||||
```
|
||||
|
||||
Generate the password hash:
|
||||
|
||||
```bash
|
||||
hbd passwd alice
|
||||
```
|
||||
|
||||
Paste the output into the config, then reload:
|
||||
|
||||
```bash
|
||||
hbd reload
|
||||
```
|
||||
|
||||
Test the channel:
|
||||
|
||||
```bash
|
||||
hbd notify
|
||||
```
|
||||
|
||||
### 7. Set a threshold alert (optional)
|
||||
|
||||
Add to `~/.hb.yaml`:
|
||||
|
||||
```yaml
|
||||
thresholds:
|
||||
cpu_monitor:
|
||||
cpu_percent:
|
||||
warning: 80.0
|
||||
critical: 90.0
|
||||
disk_monitor:
|
||||
partitions:
|
||||
/:
|
||||
percent:
|
||||
warning: 80.0
|
||||
critical: 90.0
|
||||
```
|
||||
|
||||
Reload: `hbd reload`. The server will now alert when a monitored host crosses these values.
|
||||
|
||||
---
|
||||
|
||||
## What's next
|
||||
|
||||
| Topic | Where to look |
|
||||
|---|---|
|
||||
| Full server config reference | [README — Server](https://git.wrede.ca/andreas/heartbeat/src/branch/master/README.md#server-hbd) |
|
||||
| Client options and all plugins | [README — Client](https://git.wrede.ca/andreas/heartbeat/src/branch/master/README.md#client-hbc) |
|
||||
| Threshold alerting details | [THRESHOLD_ALERTING.md](https://git.wrede.ca/andreas/heartbeat/src/branch/master/docs/THRESHOLD_ALERTING.md) |
|
||||
| Notification channels | [NOTIFICATIONS.md](https://git.wrede.ca/andreas/heartbeat/src/branch/master/docs/NOTIFICATIONS.md) |
|
||||
| User accounts and roles | [USERS.md](https://git.wrede.ca/andreas/heartbeat/src/branch/master/docs/USERS.md) |
|
||||
| Writing a custom plugin | [PLUGIN_DEVELOPMENT.md](https://git.wrede.ca/andreas/heartbeat/src/branch/master/docs/PLUGIN_DEVELOPMENT.md) |
|
||||
| Nagios check integration | [NAGIOS_INTEGRATION.md](https://git.wrede.ca/andreas/heartbeat/src/branch/master/docs/NAGIOS_INTEGRATION.md) |
|
||||
| REST API | [HTTP_API.md](https://git.wrede.ca/andreas/heartbeat/src/branch/master/docs/HTTP_API.md) |
|
||||
| Zero-dependency client | [README — hbc_mini](https://git.wrede.ca/andreas/heartbeat/src/branch/master/README.md#hbc_mini--zero-dependency-client) |
|
||||
+21
@@ -0,0 +1,21 @@
|
||||
# MIT License
|
||||
|
||||
Copyright (c) 2002 - 2026 Andreas Wrede
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
@@ -1,192 +1,755 @@
|
||||
# Heartbeat Daemon (hbd) ✅
|
||||
# Heartbeat Daemon (hbd)
|
||||
|
||||
A lightweight daemon that listens for UDP heartbeat messages and acts on them: keeps host state, optionally updates DNS records via `nsupdate`, forwards messages to WebSocket clients, and sends notifications (email, Pushover, Mattermost, Signal). It is a refactor of a previously monolithic script into a modular Python package (`hbd`).
|
||||
A lightweight UDP-based host monitoring system. Monitored hosts run a client (`hbc`) that sends periodic heartbeat packets and system metrics to a central server (`hbd`). The server tracks host reachability, evaluates metric thresholds, sends notifications, and serves a web dashboard.
|
||||
|
||||
---
|
||||
|
||||
## 📌 Features
|
||||
## Architecture
|
||||
|
||||
- Receive and parse heartbeat datagrams (text or zlib-compressed) ✅
|
||||
- Maintain host state and detect up/down transitions ✅
|
||||
- Queue DNS updates via `nsupdate` and run them in a background thread ✅
|
||||
- WebSocket API for live updates (hosts & messages) ✅
|
||||
- Notification pipeline (email, Pushover, Mattermost, Signal) ✅
|
||||
- Modular codebase suitable for unit testing and CI ✅
|
||||
```
|
||||
[ host running hbc ] [ server running hbd ]
|
||||
┌────────────────────┐ ┌────────────────────────────┐
|
||||
│ heartbeat client │ UDP 50003 │ heartbeat daemon │
|
||||
│ │ ──────────> │ │
|
||||
│ plugins: │ HTB / PLG │ host state tracking │
|
||||
│ - cpu_monitor │ │ threshold evaluation │
|
||||
│ - memory_monitor │ <────────── │ DNS updates (nsupdate) │
|
||||
│ - disk_monitor │ ACK/CMD/UPD │ notifications │
|
||||
│ - nagios_runner │ │ web dashboard + REST API │
|
||||
│ - ... │ │ WebSocket live updates │
|
||||
└────────────────────┘ └────────────────────────────┘
|
||||
```
|
||||
|
||||
**Package:** `hbd` v5.3.4
|
||||
**Python:** 3.11+
|
||||
|
||||
### Subpackages
|
||||
|
||||
| Package | Purpose |
|
||||
|---|---|
|
||||
| `hbd.common` | Protocol encoding/decoding, shared utilities |
|
||||
| `hbd.server` | The `hbd` daemon |
|
||||
| `hbd.client` | The `hbc` client |
|
||||
|
||||
---
|
||||
|
||||
## ⚙️ Quickstart
|
||||
## Installation
|
||||
|
||||
Prerequisites:
|
||||
|
||||
- Python 3.10+ (project uses language features from recent Python)
|
||||
- `nsupdate` (for DNS updates) if using dynamic DNS
|
||||
|
||||
Install dependencies (recommended into a venv):
|
||||
|
||||
This project now declares its dependencies in `pyproject.toml`. Instead
|
||||
of the old `requirements.txt` flow, install the package into a virtualenv
|
||||
using `pip`:
|
||||
|
||||
See `scripts/install.sh` for a way to install.
|
||||
|
||||
Run the daemon (example):
|
||||
Dependencies are declared in `pyproject.toml`. Install into a virtualenv:
|
||||
|
||||
```bash
|
||||
# run with default config lookup (~/.hb.yaml)
|
||||
hbd -c .hb.yaml -f -v
|
||||
# Server + client
|
||||
pip install .
|
||||
|
||||
# Using the install script
|
||||
scripts/hb_install.sh
|
||||
```
|
||||
|
||||
You can also run it directly via the package entrypoint after installation:
|
||||
**Entry points:**
|
||||
- `hbd` — server (`hbd.server.cli:main`)
|
||||
- `hbc` — client (`hbd.client.main:main`)
|
||||
|
||||
```bash
|
||||
python -m hbd.cli -c /path/to/config.yaml
|
||||
```
|
||||
**Runtime dependencies:**
|
||||
|
||||
## 🐞 Debugging in VS Code
|
||||
|
||||
This repository includes a ready-to-use `.vscode/launch.json` with configurations to run or attach the VS Code debugger to `hbd`.
|
||||
|
||||
- Ensure the **Python** extension is installed and select the project `.venv` as the interpreter (bottom-left of VS Code).
|
||||
- Use **F5** and pick one of these configurations from the Run view:
|
||||
- **Python: Run hbd (module)** — runs `hbd.cli` as a module and sets `PYTHONPATH` to the workspace root (recommended).
|
||||
- **Python: Run hbd with debugpy (listen)** — launches `debugpy` and `hbd` together; useful when you want the process to listen for a debugger.
|
||||
- **Python: Attach (localhost:5678)** — attach the debugger to a running process started with `debugpy`.
|
||||
|
||||
To start `hbd` manually and wait for the debugger to attach, run:
|
||||
|
||||
```bash
|
||||
PYTHONPATH=. python -m debugpy --listen 5678 --wait-for-client -m hbd.cli -c .hb.yaml -f -v
|
||||
```
|
||||
|
||||
Set breakpoints in modules such as `hbd/udp.py`, `hbd/dns.py`, or `hbd/server.py`, and use the **Attach** configuration to connect. Use `justMyCode: false` if you need to step into third-party code.
|
||||
| Component | Packages |
|
||||
|---|---|
|
||||
| Both | PyYAML ≥6.0 |
|
||||
| Client | psutil ≥5.9.0 |
|
||||
| Server | aiohttp ≥3.11, websockets ≥13.2, Jinja2 ≥3.1.6, ruamel.yaml ≥0.18, mattermostdriver ≥7.3.0, matrix-nio ≥0.24 |
|
||||
|
||||
---
|
||||
|
||||
## 🛠 Configuration
|
||||
## Server (`hbd`)
|
||||
|
||||
`hbd` reads YAML configuration (optional). If `PyYAML` is not installed, built-in defaults are used. Example configuration keys (see `hbd/config.py`):
|
||||
### Starting the server
|
||||
|
||||
- `hb_port`: UDP port to listen for heartbeats (default: 50003)
|
||||
- `hbd_port`: internal control port (default: 50004)
|
||||
- `hbd_host`: bind address for HTTP/WSS
|
||||
- `pickfile`: path for persisted state
|
||||
- `logfile`: path to log file
|
||||
- `logfmt`: `text` or `msg`
|
||||
- `pushsrv`: push service (`pushover`|`mattermost`|`all`)
|
||||
- `interval` / `grace`: heartbeat timing configuration
|
||||
- `dyndomains`: list of dyndomains to update via `nsupdate`
|
||||
- `nsupdate_bin`: path to nsupdate binary
|
||||
- `ws_port`: port for plain WebSocket connections (default: 50005)
|
||||
- `wss_port`: port for secure WebSocket (WSS) connections (default: none).
|
||||
If set, `hbd` will attempt to serve WSS on this port when `wss_pem` and
|
||||
`wss_key` SSL files are available under `cert_path` (see below).
|
||||
- `cert_path`: directory where TLS certificate and key are looked up (default: /usr/local/etc/ssl/)
|
||||
- `wss_pem`: filename for the certificate chain (default: fullchain.pem)
|
||||
- `wss_key`: filename for the private key (default: privkey.pem)
|
||||
```bash
|
||||
# Foreground, verbose, with config file
|
||||
hbd serve -c /etc/hb.yaml -f -v
|
||||
|
||||
Example `.hb.yaml` (minimal):
|
||||
# As a module
|
||||
python -m hbd.server.cli serve -c /etc/hb.yaml
|
||||
```
|
||||
|
||||
### CLI subcommands
|
||||
|
||||
| Command | Description |
|
||||
|---|---|
|
||||
| `hbd serve` | Start the daemon (default) |
|
||||
| `hbd passwd <username>` | Generate a password hash for config |
|
||||
| `hbd notify` | Test notification channels |
|
||||
| `hbd stop` | Stop a running daemon |
|
||||
| `hbd reload` | Reload config (send SIGHUP) |
|
||||
| `hbd restart` | Restart daemon |
|
||||
|
||||
### Configuration (`~/.hb.yaml`)
|
||||
|
||||
```yaml
|
||||
hbd_host: 0.0.0.0
|
||||
hbd_port: 50004
|
||||
# Network
|
||||
hb_port: 50003 # UDP port for heartbeat messages
|
||||
hbd_port: 50004 # HTTP API / web UI port
|
||||
hbd_host: "" # Bind address (empty = all interfaces)
|
||||
ws_port: 50005 # WebSocket port (plain)
|
||||
wss_port: ~ # WebSocket port (TLS; requires cert_path/wss_pem/wss_key)
|
||||
|
||||
# Timing
|
||||
interval: 20 # Expected heartbeat interval (seconds)
|
||||
grace: 2 # Extra seconds before declaring a host overdue
|
||||
|
||||
# Persistence
|
||||
pickfile: ~/.hb.pick # Host state persistence
|
||||
pidfile: ~/.hb.pid
|
||||
logfile: ~/.hb.log
|
||||
|
||||
# Message journal
|
||||
journal_enabled: true
|
||||
journal_dir: /var/log/heartbeat
|
||||
journal_file: messages.journal
|
||||
journal_max_size: 104857600 # 100 MB
|
||||
journal_max_backups: 10
|
||||
|
||||
# DNS
|
||||
nsupdate_bin: /usr/bin/nsupdate
|
||||
dyndomains:
|
||||
- example.com
|
||||
nsupdate_bin: /usr/bin/nsupdate
|
||||
pushsrv: pushover
|
||||
|
||||
# Threshold alert re-notification interval (seconds)
|
||||
threshold_renotify_interval: 3600
|
||||
|
||||
# Notification channels
|
||||
notification_channels:
|
||||
pushover_ops:
|
||||
type: pushover
|
||||
token: YOUR_APP_TOKEN
|
||||
user: YOUR_USER_KEY
|
||||
email_ops:
|
||||
type: email
|
||||
smtp_server: smtp.example.com
|
||||
port: 587
|
||||
user: alerts@example.com
|
||||
password: secret
|
||||
recipients: [ops@example.com]
|
||||
|
||||
# Users
|
||||
users:
|
||||
alice:
|
||||
full_name: Alice Smith
|
||||
password: pbkdf2:sha256:... # generate with: hbd passwd alice
|
||||
admin: true
|
||||
notification_channels: [pushover_ops]
|
||||
bob:
|
||||
password: pbkdf2:sha256:...
|
||||
notification_channels: [email_ops]
|
||||
|
||||
default_owner: alice
|
||||
|
||||
# Hosts
|
||||
hosts:
|
||||
webserver01:
|
||||
dyndns: true # Update DNS when address changes
|
||||
owner: alice
|
||||
managers: [bob]
|
||||
monitors: []
|
||||
database01:
|
||||
watch: false # Suppress all notifications for this host
|
||||
```
|
||||
|
||||
> Tip: `config.DEFAULTS` in `hbd/config.py` contains the canonical defaults and accepted configuration keys.
|
||||
Send SIGHUP (or `hbd reload`) to reload configuration without restarting. Changes to ports, certificates, pickle path, and journal path require a full restart.
|
||||
|
||||
### Persistence
|
||||
|
||||
Host state (reachability, plugin data, alert states) is saved to `pickfile` every 5 minutes and on clean shutdown. The server loads this state on startup.
|
||||
|
||||
---
|
||||
|
||||
## 🔧 Architecture & Modules
|
||||
## Client (`hbc`)
|
||||
|
||||
- `hbd.proto` — serialization/deserialization of heartbeat messages (supports compressed payloads)
|
||||
- `hbd.udp` — UDP parsing and `handle_datagram` implementation (main state machine)
|
||||
- `hbd.dns` — `create_nsupdate_payload`, `nsupdate`, and an asyncio DNS worker (`start_dns_worker`).
|
||||
The DNS worker now runs as an `asyncio` task and the package exposes a
|
||||
small thread-safe bridge so legacy synchronous code can `put()` updates
|
||||
into the queue; there is no longer a permanently-blocking background
|
||||
`threading.Thread`.
|
||||
- `hbd.notify` — email and push notification helpers
|
||||
- `hbd.ws` — WebSocket server and thread-safe broadcast helpers
|
||||
- `hbd.http` — HTTP handler factory for the status UI/API
|
||||
- `hbd.utils` — small utility helpers (`shortname`, `dur`, `initlog`)
|
||||
- `hbd.cli` — CLI entrypoint and argument parsing
|
||||
- `hbd.server` — async orchestration to run UDP/HTTP/WSS components
|
||||
|
||||
This modular layout makes the code easier to test and maintain.
|
||||
|
||||
**Runtime & Shutdown**
|
||||
|
||||
- The main runtime is asyncio-based. Services (UDP listener, HTTP server, WebSocket server, monitor, and DNS worker) run as asyncio tasks.
|
||||
- On SIGINT/SIGTERM the server triggers a graceful shutdown: it cancels active tasks, signals the DNS worker via a sentinel, and cleans up resources before exit.
|
||||
- The DNS update worker is implemented as an `asyncio` task; synchronous producers can still enqueue DNS updates via a small thread-safe bridge available at `hbd.hbdclass.Host.dnsQ`.
|
||||
|
||||
**Templates & Static Files**
|
||||
|
||||
- Template files are located under `hbd/templates` by default. The HTTP server resolves templates relative to the `hbd` package but the path can be overridden with the `templates_dir` config key.
|
||||
- Static assets (CSS/JS/images) are served from `hbd/static` via the `/static/<path>` HTTP route. Place your static files in that directory or configure the HTTP server as needed.
|
||||
|
||||
---
|
||||
|
||||
## 🧪 Testing & Dev
|
||||
|
||||
Tests are implemented using `unittest`; some additional tests rely on `pytest`. To run the tests locally without installing anything beyond the dev requirements:
|
||||
### Usage
|
||||
|
||||
```bash
|
||||
# Basic — send heartbeats to a server
|
||||
hbc your-server.example.com
|
||||
|
||||
# Multiple servers
|
||||
hbc server1.example.com server2.example.com
|
||||
|
||||
# With config file, running as a daemon
|
||||
hbc -d -c /etc/hbc.yaml your-server.example.com
|
||||
|
||||
# Send a boot message, then heartbeat normally
|
||||
hbc -b your-server.example.com
|
||||
|
||||
# One-off message
|
||||
hbc -m "maintenance starting" your-server.example.com
|
||||
|
||||
# Force IPv4 or IPv6 only
|
||||
hbc -4 your-server.example.com
|
||||
hbc -6 your-server.example.com
|
||||
```
|
||||
|
||||
### Options
|
||||
|
||||
| Flag | Description |
|
||||
|---|---|
|
||||
| `-b`, `--boot` | Send a boot message at startup |
|
||||
| `-c`, `--config FILE` | Config file path (default: `~/.hbc.yaml`) |
|
||||
| `-d`, `--daemon` | Daemonize (logs go to syslog) |
|
||||
| `-m`, `--message TEXT` | Send a one-off message and exit |
|
||||
| `-n`, `--name NAME` | Override reported hostname |
|
||||
| `-v`, `--verbose` | Verbose output |
|
||||
| `-x`, `--debug` | Debug level (repeatable) |
|
||||
| `-4` / `-6` | Restrict to IPv4 or IPv6 |
|
||||
|
||||
### Configuration (`~/.hbc.yaml`)
|
||||
|
||||
```yaml
|
||||
hb_port: 50003 # Server UDP port
|
||||
interval: 10 # Heartbeat interval (seconds)
|
||||
owner: alice # Optional: claim ownership of this host
|
||||
|
||||
plugins:
|
||||
cpu_monitor:
|
||||
interval: 300 # Override collection interval
|
||||
per_core: true # Report per-core CPU usage
|
||||
memory_monitor:
|
||||
interval: 300
|
||||
disk_monitor:
|
||||
interval: 300
|
||||
network_monitor:
|
||||
interval: 300
|
||||
ping_monitor:
|
||||
interval: 60
|
||||
hosts: [8.8.8.8, 192.168.1.1]
|
||||
nagios_runner:
|
||||
interval: 300
|
||||
commands:
|
||||
- name: check_load
|
||||
command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
|
||||
- name: check_disk_root
|
||||
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
|
||||
zfs_monitor:
|
||||
interval: 300
|
||||
```
|
||||
|
||||
### Connection behaviour
|
||||
|
||||
- The client sends heartbeats over UDP to each server address resolved from the hostname (IPv4 and IPv6).
|
||||
- If a connection fails to open at startup, the client retries it: an IPv6 connection is dropped after 3 consecutive failures, whereas an IPv4 connection is retried indefinitely.
|
||||
- In daemon mode (`-d`), all log output goes to syslog (`LOG_DAEMON` facility).
|
||||
|
||||
---
|
||||
|
||||
## UDP Protocol
|
||||
|
||||
All messages are zlib-compressed key=value pairs with an ID prefix.
|
||||
|
||||
```
|
||||
!<ID>: <zlib-compressed payload>
|
||||
```
|
||||
|
||||
Payload format: `key=value;key=value;...`
|
||||
|
||||
| Message | Direction | Purpose |
|
||||
|---|---|---|
|
||||
| `HTB` | client → server | Heartbeat (name, timestamp, RTT, acks, interval) |
|
||||
| `PLG` | client → server | Plugin data (plugin name + metrics) |
|
||||
| `ACK` | server → client | Acknowledgment |
|
||||
| `CMD` | server → client | Execute a shell command on the client |
|
||||
| `UPD` | server → client | Trigger self-update via `hb_install.sh` |
|
||||
|
||||
Value encoding:
|
||||
- Floats: 5 decimal places
|
||||
- Lists/dicts: JSON prefixed with `@`
|
||||
- Booleans: `1` / `0`
|
||||
|
||||
RTT is measured using kernel SO_TIMESTAMP when available (Linux, macOS, FreeBSD), falling back to application-layer timing.
|
||||
|
||||
---
|
||||
|
||||
## Plugin System
|
||||
|
||||
Plugins run on the client and collect system metrics that are sent to the server as `PLG` messages.
|
||||
|
||||
### Plugin types
|
||||
|
||||
| Type | `interval` | When collected |
|
||||
|---|---|---|
|
||||
| `InfoPlugin` | 0 | Once at startup; re-collected on server request |
|
||||
| `MonitorPlugin` | 30 (default) | Periodically on the configured interval |
|
||||
|
||||
### Built-in plugins
|
||||
|
||||
| Plugin | Type | Data collected |
|
||||
|---|---|---|
|
||||
| `os_info` | Info | OS, kernel, distro, architecture, Python version, hbc version |
|
||||
| `cpu_monitor` | Monitor | cpu_percent, per-core usage, load averages, process count, frequency |
|
||||
| `memory_monitor` | Monitor | RAM and swap usage (ZFS ARC-aware) |
|
||||
| `disk_monitor` | Monitor | Per-partition usage, disk I/O stats |
|
||||
| `network_monitor` | Monitor | Per-interface byte/packet counts, connection count |
|
||||
| `ping_monitor` | Monitor | RTT, packet loss, jitter per configured host |
|
||||
| `filesystem_info` | Info | Mounted filesystems (excludes pseudo filesystems) |
|
||||
| `nagios_runner` | Monitor | Output of configured Nagios-compatible check commands |
|
||||
| `zfs_monitor` | Monitor | ZFS pool health, capacity, fragmentation, dedup ratio, I/O |
|
||||
|
||||
### Custom plugins
|
||||
|
||||
Create a `.py` file in `hbd/client/plugins/`:
|
||||
|
||||
```python
|
||||
from hbd.client.plugin import MonitorPlugin
|
||||
|
||||
class MyPlugin(MonitorPlugin):
|
||||
name = "my_plugin"
|
||||
interval = 60
|
||||
|
||||
async def collect(self):
|
||||
return {"my_metric": 42}
|
||||
```
|
||||
|
||||
`initialize()` is called once at load time; return `False` to disable the plugin (e.g., if a required binary is missing).
|
||||
|
||||
### Nagios integration
|
||||
|
||||
The `nagios_runner` plugin executes any Nagios-compatible check binary:
|
||||
|
||||
```yaml
|
||||
plugins:
|
||||
nagios_runner:
|
||||
commands:
|
||||
- name: check_http
|
||||
command: /usr/lib/nagios/plugins/check_http -H example.com
|
||||
```
|
||||
|
||||
- Commands are validated (absolute paths, executable) at startup.
|
||||
- Exit codes map to OK / WARNING / CRITICAL / UNKNOWN.
|
||||
- Performance data fields are extracted and stored individually.
|
||||
- The `nagios` threshold operator maps exit codes directly to alert levels (see Threshold Alerting).
|
||||
|
||||
---
|
||||
|
||||
## Threshold Alerting
|
||||
|
||||
The server evaluates plugin metrics against configurable thresholds and fires notifications on state changes.
|
||||
|
||||
### Configuration
|
||||
|
||||
```yaml
|
||||
thresholds:
|
||||
cpu_monitor:
|
||||
cpu_percent:
|
||||
warning: 80.0
|
||||
critical: 90.0
|
||||
operator: ">" # >, >=, <, <=, ==, != (default: >)
|
||||
hysteresis: 0.1 # 10%: recover at 81 when critical=90
|
||||
count: 1 # Require N consecutive breaches before alerting
|
||||
display: "CPU {cpu_percent}% (threshold: {op_symbol}{threshold_value})"
|
||||
|
||||
memory_monitor:
|
||||
percent:
|
||||
warning: 85.0
|
||||
critical: 95.0
|
||||
|
||||
disk_monitor:
|
||||
partitions:
|
||||
/:
|
||||
percent:
|
||||
warning: 80.0
|
||||
critical: 90.0
|
||||
free_gb:
|
||||
warning: 10.0
|
||||
critical: 5.0
|
||||
operator: "<"
|
||||
|
||||
nagios_runner:
|
||||
status_code:
|
||||
operator: "nagios" # 0=OK 1=WARNING 2=CRITICAL 3=UNKNOWN
|
||||
display: "{check_name}: {output}"
|
||||
```
|
||||
|
||||
### Per-host threshold profiles
|
||||
|
||||
Named profiles let different hosts use different thresholds. A single name or a list is accepted; lists are applied left-to-right.
|
||||
|
||||
```yaml
|
||||
threshold_configs:
|
||||
default:
|
||||
thresholds:
|
||||
cpu_monitor:
|
||||
cpu_percent: {warning: 80, critical: 90}
|
||||
|
||||
tight_cpu:
|
||||
thresholds:
|
||||
cpu_monitor:
|
||||
cpu_percent: {warning: 60, critical: 75}
|
||||
|
||||
hosts:
|
||||
web-01:
|
||||
threshold_config: default
|
||||
db-01:
|
||||
threshold_config: [default, tight_cpu]
|
||||
```
|
||||
|
||||
### Alert states
|
||||
|
||||
| State | Meaning |
|
||||
|---|---|
|
||||
| OK | Metric within normal range |
|
||||
| WARNING | Metric crossed warning threshold |
|
||||
| CRITICAL | Metric crossed critical threshold |
|
||||
| UNKNOWN | Cannot determine (e.g. Nagios exit code 3) |
|
||||
|
||||
Notifications are sent on state transitions (OK → WARNING, WARNING → CRITICAL, CRITICAL → OK). De-escalations (CRITICAL → WARNING) do not trigger a notification. Ongoing alerts generate a re-notification every `threshold_renotify_interval` seconds (default: 3600). Alerts can be acknowledged via the web UI or API to suppress re-notifications.
|
||||
|
||||
### RTT thresholds
|
||||
|
||||
The server measures heartbeat round-trip time and supports RTT thresholds using the same format:
|
||||
|
||||
```yaml
|
||||
thresholds:
|
||||
rtt:
|
||||
webserver01:
|
||||
warning: 100.0 # ms
|
||||
critical: 500.0
|
||||
```
|
||||
|
||||
### Generic threshold matching
|
||||
|
||||
When a metric has no exact threshold entry, the server strips leading segments and retries. This allows one entry to cover all Nagios checks:
|
||||
|
||||
```
|
||||
nagios_runner.check_disk_root_status_code → no match
|
||||
nagios_runner.disk_root_status_code → no match
|
||||
nagios_runner.root_status_code → no match
|
||||
nagios_runner.status_code → matched ✓
|
||||
```
|
||||
|
||||
The stripped prefix (`check_disk_root`) is available as `{check_name}` in the `display` template.
|
||||
|
||||
### Display template variables
|
||||
|
||||
| Variable | Description |
|
||||
|---|---|
|
||||
| `{value}` | Current metric value |
|
||||
| `{threshold_value}` | Threshold that was crossed |
|
||||
| `{op_symbol}` | Comparison operator |
|
||||
| `{check_name}` | Prefix stripped by generic matching |
|
||||
| `{metric_name}` | Full field name |
|
||||
| `{output}` | Nagios check output text |
|
||||
| `{status}` | Nagios status name (OK/WARNING/CRITICAL/UNKNOWN) |
|
||||
| any plugin field | Any field present in the plugin's data |
|
||||
|
||||
---
|
||||
|
||||
## Notification Channels
|
||||
|
||||
Notifications are dispatched to the host's owner, managers, and monitors. Each user specifies which channels to use.
|
||||
|
||||
### Supported channel types
|
||||
|
||||
| Type | Required fields |
|
||||
|---|---|
|
||||
| `pushover` | `token`, `user` |
|
||||
| `email` | `smtp_server`, `recipients`, `sender`, `user`, `password`, `port` |
|
||||
| `mattermost` | `webhook_url`, `channel` |
|
||||
| `matrix` | `homeserver`, `user`, `password`, `room_id` |
|
||||
| `signal` | `phone_number`, `recipient` |
|
||||
| `sms_voipms` | `api_key`, `recipient` |
|
||||
|
||||
Each channel can set a `min_level` (`WARNING` or `CRITICAL`) to filter out alerts below that severity.
|
||||
|
||||
Recovery notifications are only sent to channels that received the original alert.
|
||||
|
||||
---
|
||||
|
||||
## Web Dashboard & HTTP API
|
||||
|
||||
The server exposes a web UI and REST API on `hbd_port` (default 50004).
|
||||
|
||||
### Web pages
|
||||
|
||||
| Path | Description |
|
||||
|---|---|
|
||||
| `/login` | Login form (shown automatically when auth is configured) |
|
||||
| `/live` | Real-time host connectivity, RTT, and message stream |
|
||||
| `/plugins/<host>` | Per-host plugin metrics |
|
||||
| `/alerts` | Active alerts with severity filtering |
|
||||
| `/settings` | Server config, users, notification channels, thresholds |
|
||||
|
||||
Live views use WebSocket connections for real-time updates.
|
||||
|
||||
Non-admin users see only hosts where they have a role (monitor, manager, or owner). Admins see all hosts.
|
||||
|
||||
### REST API
|
||||
|
||||
All endpoints are under `/api/0/`. When authentication is configured, include a session token:
|
||||
|
||||
```bash
|
||||
# Log in, get a token
|
||||
TOKEN=$(curl -s -X POST http://localhost:50004/api/0/auth/login \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{"username":"alice","password":"secret"}' | jq -r .token)
|
||||
|
||||
# Use the token
|
||||
curl -H "Authorization: Bearer $TOKEN" http://localhost:50004/api/0/hosts
|
||||
```
|
||||
|
||||
| Method | Endpoint | Description |
|
||||
|---|---|---|
|
||||
| GET | `/api/0/hosts` | All visible hosts |
|
||||
| GET | `/api/0/alerts` | All active alerts |
|
||||
| GET | `/api/0/alert_summary` | Count of ok/warning/critical |
|
||||
| GET | `/api/0/messages` | Last 30 messages |
|
||||
| GET | `/api/0/hosts/{host}/plugins` | All plugin data for host |
|
||||
| GET | `/api/0/hosts/{host}/plugins/{plugin}?limit=N` | Plugin samples |
|
||||
| GET | `/api/0/hosts/{host}/alerts` | Alert states for host |
|
||||
| GET | `/api/0/hosts/{host}/access` | Access roles |
|
||||
| PUT | `/api/0/hosts/{host}/access` | Update access roles |
|
||||
| GET | `/api/0/hosts/{host}/info` | Host info (hbc version, thresholds) |
|
||||
| POST | `/api/0/alerts/acknowledge` | Acknowledge alert |
|
||||
| GET | `/api/0/users` | All users (admin only) |
|
||||
| GET | `/api/0/users/me` | Current user profile |
|
||||
| PUT | `/api/0/users/me` | Update own profile |
|
||||
| POST | `/api/0/auth/login` | Create session |
|
||||
| POST | `/api/0/auth/logout` | Destroy session |
|
||||
| GET | `/api/0/config` | Server config (secrets redacted) |
|
||||
| POST | `/api/0/config` | Update config |
|
||||
| GET | `/api/0/config/backups` | List config backups |
|
||||
| POST | `/api/0/config/rollback` | Roll back to previous config |
|
||||
| GET | `/api/0/notification_channels` | List channels |
|
||||
| POST | `/api/0/notification_channels` | Create channel |
|
||||
| PUT | `/api/0/notification_channels/{name}` | Update channel |
|
||||
| DELETE | `/api/0/notification_channels/{name}` | Delete channel |
|
||||
|
||||
---
|
||||
|
||||
## User Management & Authentication
|
||||
|
||||
When no `users:` block is in config, the server runs unauthenticated — all existing behaviour is preserved.
|
||||
|
||||
### Roles
|
||||
|
||||
| Role | Capabilities |
|
||||
|---|---|
|
||||
| monitor | View status, plugin data, alerts |
|
||||
| manager | monitor + queue commands, trigger DNS, queue upgrades |
|
||||
| owner | manager + drop host, transfer ownership, update access |
|
||||
| admin | Owner-level on all hosts + access to server config and users |
|
||||
|
||||
### Setup
|
||||
|
||||
```yaml
|
||||
users:
|
||||
alice:
|
||||
full_name: Alice Smith
|
||||
password: pbkdf2:sha256:... # hbd passwd alice
|
||||
admin: true
|
||||
notification_channels: [pushover_ops]
|
||||
|
||||
default_owner: alice # Owns any host with no explicit owner
|
||||
|
||||
hosts:
|
||||
webserver01:
|
||||
owner: alice
|
||||
managers: [bob]
|
||||
monitors: [carol]
|
||||
```
|
||||
|
||||
Password hashing uses PBKDF2-HMAC-SHA256 (260,000 iterations). Sessions expire after 24 hours.
|
||||
|
||||
OAuth2 login (Gitea) is supported:
|
||||
|
||||
```yaml
|
||||
oauth:
|
||||
gitea:
|
||||
url: https://git.example.com
|
||||
client_id: xxx
|
||||
client_secret: yyy
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Dynamic DNS
|
||||
|
||||
When `dyndns: true` is set on a host and `dyndomains` is configured, the server updates DNS via `nsupdate` whenever the host's source address changes.
|
||||
|
||||
```yaml
|
||||
nsupdate_bin: /usr/bin/nsupdate
|
||||
dyndomains:
|
||||
- example.com
|
||||
|
||||
hosts:
|
||||
webserver01:
|
||||
dyndns: true
|
||||
```
|
||||
|
||||
DNS updates run asynchronously in a background worker.
|
||||
|
||||
---
|
||||
|
||||
## Message Journal
|
||||
|
||||
All received messages are logged in JSONL format with automatic size-based rotation.
|
||||
|
||||
```yaml
|
||||
journal_enabled: true
|
||||
journal_dir: /var/log/heartbeat
|
||||
journal_file: messages.journal
|
||||
journal_max_size: 104857600 # 100 MB
|
||||
journal_max_backups: 10
|
||||
```
|
||||
|
||||
Example entry:
|
||||
|
||||
```json
|
||||
{"timestamp":1711234567.123,"datetime":"2026-03-28T12:34:56","source_ip":"192.168.1.100","source_port":50003,"message":{"ID":"HTB","name":"webserver01","interval":10}}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## `hbc_mini` — Zero-dependency client
|
||||
|
||||
`scripts/hbc_mini.py` is a single-file client requiring only Python 3.8+ and no external packages. Copy it to any host and run directly.
|
||||
|
||||
```bash
|
||||
python3 hbc_mini.py your-server.example.com
|
||||
python3 hbc_mini.py -d your-server.example.com # daemon mode
|
||||
python3 hbc_mini.py -b your-server.example.com # send boot message
|
||||
```
|
||||
|
||||
Config: `~/.hbc.json` (JSON format, same keys as `~/.hbc.yaml`).
|
||||
|
||||
**Available plugins:**
|
||||
|
||||
| Plugin | Platform |
|
||||
|---|---|
|
||||
| `os_info` | All |
|
||||
| `ping_monitor` | All |
|
||||
| `nagios_runner` | All (not Windows) |
|
||||
| `cpu_monitor` | Linux (`/proc/stat`; no per-core, no frequency) |
|
||||
| `memory_monitor` | Linux (`/proc/meminfo`) |
|
||||
| `disk_monitor` | Linux, macOS, BSD (`df -P`) |
|
||||
| `network_monitor` | Linux (`/proc/net/dev`) |
|
||||
|
||||
Compared with the full `hbc`, `hbc_mini` does not provide: YAML config, the `filesystem_info` plugin, the `zfs_monitor` plugin, or IPv6 early-fail protection.
|
||||
|
||||
---
|
||||
|
||||
## `hbc_mini.c` — C client
|
||||
|
||||
`scripts/c/hbc_mini.c` is a single-file C port of `hbc_mini.py`. It has no runtime dependencies beyond libc, zlib, pthreads, and libm, and runs on Linux, FreeBSD, NetBSD, and DragonFly BSD.
|
||||
|
||||
### Build
|
||||
|
||||
```bash
|
||||
cc -O2 -o hbc_mini scripts/c/hbc_mini.c -lz -lpthread -lm
|
||||
```
|
||||
|
||||
### Usage
|
||||
|
||||
The CLI is identical to `hbc_mini.py`:
|
||||
|
||||
```bash
|
||||
./hbc_mini your-server.example.com
|
||||
./hbc_mini -d your-server.example.com # daemon mode (logs to syslog)
|
||||
./hbc_mini -b your-server.example.com # send boot message
|
||||
./hbc_mini -m "note" your-server.example.com # send one-shot message
|
||||
./hbc_mini -4 your-server.example.com # IPv4 only
|
||||
./hbc_mini -6 your-server.example.com # IPv6 only
|
||||
```
|
||||
|
||||
Config: `~/.hbc.json` (JSON, same keys as the Python version).
|
||||
|
||||
### Architecture
|
||||
|
||||
The C client uses two threads:
|
||||
|
||||
- **Main thread** — heartbeat sender loop + `select()`-based receive loop (1 s timeout). Sends `HTB` at the configured interval, receives `ACK`/`CMD` messages, and re-sends `os_info` on server request.
|
||||
- **Monitor thread** — all periodic plugins in a single thread with a 1-second sleep loop. Each plugin has its own next-run timestamp tracked independently.
|
||||
|
||||
SIGHUP causes the process to restart itself via `execv()`. SIGTERM/SIGINT trigger a clean shutdown (a shutdown heartbeat is sent if `-b` was used).
|
||||
|
||||
### Available plugins
|
||||
|
||||
| Plugin | Platform | Data source |
|
||||
|---|---|---|
|
||||
| `os_info` | Linux, FreeBSD, NetBSD, DragonFly | `uname(2)`, `/etc/os-release`, `kern.osrelease` sysctl |
|
||||
| `cpu_monitor` | Linux | `/proc/stat` |
|
||||
| `cpu_monitor` | FreeBSD, DragonFly, NetBSD | `kern.cp_time` sysctl |
|
||||
| `memory_monitor` | Linux | `/proc/meminfo` (ZFS ARC-aware) |
|
||||
| `memory_monitor` | FreeBSD, DragonFly | `vm.stats.vm.*` sysctl |
|
||||
| `memory_monitor` | NetBSD | `VM_UVMEXP` sysctl |
|
||||
| `disk_monitor` | All | `df -P` subprocess |
|
||||
| `network_monitor` | Linux | `/proc/net/dev` |
|
||||
| `network_monitor` | FreeBSD, NetBSD, DragonFly | `getifaddrs()` + `AF_LINK` |
|
||||
| `ping_monitor` | All | `ping` subprocess |
|
||||
| `nagios_runner` | All | `popen()` subprocess |
|
||||
|
||||
`cpu_monitor` reports: `cpu_percent`, `cpu_user`, `cpu_system`, `cpu_idle`, `cpu_iowait` (Linux only), load averages, `cpu_core_count`, `uptime_seconds`.
|
||||
|
||||
`memory_monitor` reports: `memory_total`, `memory_used`, `memory_available`, `memory_free`, `memory_percent`, and swap fields when swap is present.
|
||||
|
||||
`network_monitor` reports per-interface cumulative `bytes_recv`/`bytes_sent` and interval deltas. The loopback interface (`lo`) is skipped by default; this is configurable:
|
||||
|
||||
```json
|
||||
{
|
||||
"plugins": {
|
||||
"network_monitor": {
|
||||
"skip_interfaces": ["lo", "docker0"]
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
`disk_monitor` reports per-mount `total`, `used`, `free`, `percent`. An optional mount filter restricts reporting to specific paths:
|
||||
|
||||
```json
|
||||
{
|
||||
"plugins": {
|
||||
"disk_monitor": {
|
||||
"mounts": ["/", "/data"]
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Differences from `hbc_mini.py`
|
||||
|
||||
- No `filesystem_info` or `zfs_monitor` plugins
|
||||
- `UPD` (self-update) messages are logged but not acted on
|
||||
- No IPv6 early-fail protection
|
||||
- Config is JSON only (`~/.hbc.json`), no YAML
|
||||
|
||||
---
|
||||
|
||||
## Development
|
||||
|
||||
### Running tests
|
||||
|
||||
```bash
|
||||
# with project root on PYTHONPATH
|
||||
PYTHONPATH=. python -m unittest discover -v
|
||||
# or with pytest if installed
|
||||
# or
|
||||
pytest -q
|
||||
```
|
||||
|
||||
Developer tooling included:
|
||||
|
||||
- `pyproject.toml` — project metadata and dependencies
|
||||
- `tox.ini` — convenience wrappers for running tests, lint, and mypy
|
||||
|
||||
To run linters and type checks locally:
|
||||
### Linting and type checking
|
||||
|
||||
```bash
|
||||
# after installing dev deps
|
||||
tox -e lint
|
||||
tox -e mypy
|
||||
```
|
||||
|
||||
---
|
||||
### Debugging in VS Code
|
||||
|
||||
## 🚀 Running in production
|
||||
A `.vscode/launch.json` is included with configurations for running and attaching the debugger. Select the project `.venv` as the Python interpreter, then use F5.
|
||||
|
||||
- Use your system service manager (systemd, launchd, etc.) to run `hbd` in the background.
|
||||
- Ensure `nsupdate` and necessary credentials are available for dynamic DNS updates.
|
||||
- Configure TLS for WSS if you enable secure websockets.
|
||||
To start with debugpy and wait for attach:
|
||||
|
||||
> Note: The project contains a small example for obtaining DNS-verified certs (certbot with RFC2136) — see earlier commit history or ask me to re-add the example to this README if you want it documented here.
|
||||
```bash
|
||||
PYTHONPATH=. python -m debugpy --listen 5678 --wait-for-client -m hbd.server.cli serve -c .hb.yaml -f -v
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🤝 Contributing
|
||||
## License
|
||||
|
||||
Contributions welcome! Please:
|
||||
|
||||
1. Open an issue to discuss larger changes.
|
||||
2. Create a topic branch and a clear PR.
|
||||
3. Add tests for new features and run linters.
|
||||
4. Keep changes focused and documented.
|
||||
|
||||
---
|
||||
|
||||
## 📜 License
|
||||
|
||||
This repository is licensed under the MIT license. See `LICENSE` for details.
|
||||
|
||||
---
|
||||
|
||||
If you'd like, I can also:
|
||||
|
||||
- add a **GitHub Actions** workflow that runs tests and lint on push/PR 🔁
|
||||
- add a `CONTRIBUTING.md` template for PRs and code style 💬
|
||||
|
||||
Which one should I do next? ✨
|
||||
MIT. See `LICENSE` for details.
|
||||
|
||||
@@ -0,0 +1,291 @@
|
||||
# Configuration Reload
|
||||
|
||||
The heartbeat daemon (hbd) supports runtime configuration reloading without requiring a full restart. This allows you to update certain configuration settings while the service continues running.
|
||||
|
||||
## How to Reload Configuration
|
||||
|
||||
Send a SIGHUP signal to the running hbd process:
|
||||
|
||||
```bash
|
||||
# Find the process ID
|
||||
ps aux | grep hbd
|
||||
|
||||
# Or use pidof/pgrep
|
||||
pidof hbd
|
||||
pgrep -f hbd
|
||||
|
||||
# Send SIGHUP signal
|
||||
kill -HUP <pid>
|
||||
|
||||
# Or if using systemd
|
||||
systemctl reload heartbeat
|
||||
```
|
||||
|
||||
## What Can Be Reloaded
|
||||
|
||||
The following configuration sections can be reloaded without restarting:
|
||||
|
||||
### ✅ Fully Reloadable
|
||||
|
||||
- **Notification Channels** (`notification_channels`)
|
||||
- Add, remove, or modify notification channel definitions
|
||||
- Update tokens, API keys, SMTP credentials
|
||||
- Change recipient lists
|
||||
|
||||
- **Threshold Configurations** (`threshold_configs`)
|
||||
- Modify warning and critical thresholds
|
||||
- Add or remove threshold rules
|
||||
- Change operators and hysteresis values
|
||||
- Update display formats
|
||||
|
||||
- **Host Configuration** (`hosts`)
|
||||
- Change watch status
|
||||
- Update notification channel assignments
|
||||
- Modify threshold config assignments
|
||||
- Change dyndns status
|
||||
|
||||
- **Host Lists**
|
||||
- `watchhosts` - hosts to monitor
|
||||
- `dyndnshosts` - hosts with dynamic DNS
|
||||
- `drophosts` - hosts to ignore
|
||||
|
||||
- **Runtime Settings**
|
||||
- `grace` - grace period multiplier
|
||||
- `interval` - expected heartbeat interval
|
||||
- `threshold_renotify_interval` - re-notification interval
|
||||
- `debug` - debug level
|
||||
- `verbose` - verbose output
|
||||
|
||||
- **DNS Settings**
|
||||
- `dyndomains` - dynamic DNS domains
|
||||
- `nsupdate_bin` - nsupdate binary path
|
||||
- `rndc_key` - RNDC key path
|
||||
|
||||
### ⚠️ Requires Restart
|
||||
|
||||
The following settings **cannot** be reloaded and require a service restart:
|
||||
|
||||
- **Network Ports**
|
||||
- `hb_port` - UDP heartbeat port
|
||||
- `hbd_port` - HTTP API port
|
||||
- `ws_port` - WebSocket port
|
||||
- `wss_port` - Secure WebSocket port
|
||||
|
||||
- **SSL/TLS Settings**
|
||||
- `cert_path` - SSL certificate path
|
||||
- `wss_pem` - SSL certificate file
|
||||
- `wss_key` - SSL key file
|
||||
|
||||
- **Persistence**
|
||||
- `pickfile` - Pickle file path
|
||||
|
||||
- **Logging**
|
||||
- `logfile` - Log file path
|
||||
|
||||
- **Journal Settings**
|
||||
- `journal_enabled` - Enable/disable journaling
|
||||
- `journal_dir` - Journal directory
|
||||
- `journal_file` - Journal filename
|
||||
- `journal_max_size` - Maximum journal size
|
||||
- `journal_max_backups` - Number of backup files
|
||||
|
||||
## Reload Process
|
||||
|
||||
When a SIGHUP signal is received:
|
||||
|
||||
1. **Configuration File Loading**
|
||||
- The config file is re-read from disk
|
||||
- YAML parsing is performed
|
||||
- Validation checks are run
|
||||
|
||||
2. **Component Updates**
|
||||
- Notification system is updated with new channel definitions
|
||||
- Threshold checker reloads all threshold configurations
|
||||
- Alert states are preserved to maintain hysteresis
|
||||
|
||||
3. **Error Handling**
|
||||
- If reload fails, the previous configuration is kept
|
||||
- Error messages are logged
|
||||
- Service continues running with old configuration
|
||||
|
||||
4. **Logging**
|
||||
- Reload start and completion are logged
|
||||
- Each component reports its reload status
|
||||
- Total number of thresholds is reported
|
||||
|
||||
## Example Reload Session
|
||||
|
||||
```bash
|
||||
# Terminal 1: Watch the logs
|
||||
tail -f /var/log/heartbeat.log
|
||||
|
||||
# Terminal 2: Edit configuration
|
||||
vim /path/to/.hb.yaml
|
||||
|
||||
# Make changes to notification channels or thresholds
|
||||
# Save the file
|
||||
|
||||
# Terminal 3: Trigger reload
|
||||
kill -HUP $(pgrep -f hbd)
|
||||
|
||||
# Terminal 1: See reload messages
|
||||
2026-04-01 12:34:56 INFO: Received SIGHUP, initiating config reload...
|
||||
2026-04-01 12:34:56 INFO: ============================================================
|
||||
2026-04-01 12:34:56 INFO: Starting configuration reload...
|
||||
2026-04-01 12:34:56 INFO: ============================================================
|
||||
2026-04-01 12:34:56 INFO: Configuration reloaded from /path/to/.hb.yaml
|
||||
2026-04-01 12:34:56 INFO: Notification configuration reloaded
|
||||
2026-04-01 12:34:56 INFO: Reloading threshold configuration...
|
||||
2026-04-01 12:34:56 INFO: Threshold configuration reloaded: 42 total thresholds
|
||||
2026-04-01 12:34:56 INFO: ============================================================
|
||||
2026-04-01 12:34:56 INFO: Configuration reload completed successfully
|
||||
2026-04-01 12:34:56 INFO: ============================================================
|
||||
```
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
### 1. Update Notification Credentials
|
||||
|
||||
If you need to rotate API keys or update SMTP passwords:
|
||||
|
||||
```yaml
|
||||
notification_channels:
|
||||
pushover_standard:
|
||||
type: pushover
|
||||
token: new-token-here # Updated
|
||||
user: new-user-key-here # Updated
|
||||
```
|
||||
|
||||
Just edit the config file and send SIGHUP - no restart needed.
|
||||
|
||||
### 2. Adjust Threshold Values
|
||||
|
||||
Fine-tune alerting thresholds based on observed behavior:
|
||||
|
||||
```yaml
|
||||
threshold_configs:
|
||||
default:
|
||||
thresholds:
|
||||
cpu_monitor:
|
||||
cpu_percent:
|
||||
warning: 85.0 # Increased from 80.0
|
||||
critical: 95.0 # Increased from 90.0
|
||||
```
|
||||
|
||||
Send SIGHUP to apply the new thresholds immediately.
|
||||
|
||||
### 3. Add New Notification Channels
|
||||
|
||||
Add a new notification destination:
|
||||
|
||||
```yaml
|
||||
notification_channels:
|
||||
email_oncall:
|
||||
type: email
|
||||
recipients: [oncall@example.com]
|
||||
sender: alerts@example.com
|
||||
smtp_server: smtp.example.com
|
||||
|
||||
hosts:
|
||||
critical_server:
|
||||
threshold_config: default
|
||||
watch: true
|
||||
notification_channels: [pushover_standard, email_oncall] # Added
|
||||
```
|
||||
|
||||
The new channel becomes active immediately after SIGHUP.
|
||||
|
||||
### 4. Update Watch List
|
||||
|
||||
Start or stop monitoring hosts without restart:
|
||||
|
||||
```yaml
|
||||
hosts:
|
||||
new_server:
|
||||
threshold_config: default
|
||||
watch: true # Start watching
|
||||
notification_channels: [pushover_standard]
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Test Configuration Before Reload**
|
||||
- Validate YAML syntax before sending SIGHUP
|
||||
- Check for typos in channel names
|
||||
- Verify threshold values are reasonable
|
||||
|
||||
2. **Monitor Reload Logs**
|
||||
- Always check logs after reload to confirm success
|
||||
- Look for error messages if reload fails
|
||||
- Verify expected number of thresholds loaded
|
||||
|
||||
3. **Backup Before Changes**
|
||||
- Keep a backup of working configuration
|
||||
- Use version control (git) for config files
|
||||
- Document why changes were made
|
||||
|
||||
4. **Gradual Rollout**
|
||||
- Test changes on development server first
|
||||
- Apply to one production server at a time
|
||||
- Verify behavior before applying everywhere
|
||||
|
||||
5. **Plan for Restart-Required Changes**
|
||||
- Schedule downtime for port or SSL changes
|
||||
- Use blue-green deployment if possible
|
||||
- Keep service downtime minimal
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Reload Doesn't Apply Changes
|
||||
|
||||
**Check:**
|
||||
- Is the config file path correct?
|
||||
- Did you save the file after editing?
|
||||
- Are there YAML syntax errors?
|
||||
- Check the logs for error messages
|
||||
|
||||
**Solution:**
|
||||
```bash
|
||||
# Validate YAML syntax
|
||||
python -c "import yaml; yaml.safe_load(open('.hb.yaml'))"
|
||||
|
||||
# Check file modification time
|
||||
ls -l .hb.yaml
|
||||
|
||||
# View logs
|
||||
journalctl -u heartbeat -f
|
||||
```
|
||||
|
||||
### Partial Configuration Applied
|
||||
|
||||
**Cause:** Some sections reloaded, others didn't.
|
||||
|
||||
**Solution:** Check logs to see which components failed. Common issues:
|
||||
- Invalid channel type
|
||||
- Missing required threshold fields
|
||||
- Invalid host references
|
||||
|
||||
### Service Becomes Unresponsive
|
||||
|
||||
**Cause:** Malformed configuration caused an exception.
|
||||
|
||||
**Solution:**
|
||||
1. Revert to backup configuration
|
||||
2. Send SIGHUP again to reload the good config
|
||||
3. If service is completely stuck, restart it
|
||||
|
||||
## Implementation Details
|
||||
|
||||
The reload mechanism uses:
|
||||
|
||||
- **Signal Handling**: SIGHUP triggers reload event
|
||||
- **Async-Safe Reloading**: Configuration is loaded asynchronously
|
||||
- **Component Coordination**: All affected components are updated atomically
|
||||
- **State Preservation**: Alert states and hysteresis information are maintained
|
||||
- **Error Recovery**: Failed reloads don't affect running configuration
|
||||
|
||||
## See Also
|
||||
|
||||
- [NOTIFICATIONS.md](NOTIFICATIONS.md) - Notification channel configuration
|
||||
- [THRESHOLD_ALERTING.md](THRESHOLD_ALERTING.md) - Threshold configuration details
|
||||
- Configuration examples in `hbd/config_*.yaml`
|
||||
@@ -0,0 +1,66 @@
|
||||
# Dark Mode
|
||||
|
||||
Every page in the Heartbeat web UI supports light mode, dark mode, and automatic (follows the OS/browser setting). Each user picks their preference independently; it is stored in the browser and takes effect immediately without a page reload.
|
||||
|
||||
---
|
||||
|
||||
## Choosing a theme
|
||||
|
||||
Open your profile page (`/profile`) and scroll to the **Appearance** section. Click one of the three buttons:
|
||||
|
||||
| Button | Behaviour |
|
||||
|--------|-----------|
|
||||
| **Auto** | Follows the OS or browser dark-mode preference. Updates live if the system setting changes. |
|
||||
| **Light** | Always light, regardless of system setting. |
|
||||
| **Dark** | Always dark, regardless of system setting. |
|
||||
|
||||
The preference is stored in `localStorage` under the key `hbd_theme` and applies to the current browser only. Clearing browser storage resets it to **Auto**.
|
||||
|
||||
---
|
||||
|
||||
## Implementation notes
|
||||
|
||||
### No flash of unstyled content
|
||||
|
||||
A small synchronous `<script>` runs at the very top of `<head>`, before any CSS is parsed, and sets `data-theme="dark"` on `<html>` when the stored preference (or the system setting in auto mode) calls for dark. Because it runs before paint, there is no visible flicker on page load.
|
||||
|
||||
### CSS custom properties
|
||||
|
||||
All colours are expressed as CSS custom properties defined in `head.html`:
|
||||
|
||||
```
|
||||
:root — light-mode values (default)
|
||||
html[data-theme="dark"] — dark-mode overrides
|
||||
```
|
||||
|
||||
Key variables:
|
||||
|
||||
| Variable | Purpose |
|
||||
|----------|---------|
|
||||
| `--bg` | Page background |
|
||||
| `--surface` | Card / panel background |
|
||||
| `--surface-2` / `--surface-3` | Slightly lighter/darker surfaces (table rows, hover states) |
|
||||
| `--text` / `--text-sec` / `--text-muted` | Primary, secondary, muted text |
|
||||
| `--border` / `--border-2`…`4` | Border shades from prominent to faint |
|
||||
| `--link` | Hyperlink and interactive-element colour |
|
||||
| `--nav-bg` | Navigation bar background |
|
||||
| `--input-bg` / `--input-border` | Form control colours |
|
||||
| `--shadow` / `--shadow-sm` | Box-shadow alphas |
|
||||
|
||||
A single global rule in `head.html` themes all `<input>`, `<select>`, and `<textarea>` elements across every page at once:
|
||||
|
||||
```css
|
||||
html[data-theme="dark"] input:not([type=checkbox]):not([type=radio]),
|
||||
html[data-theme="dark"] select,
|
||||
html[data-theme="dark"] textarea { … }
|
||||
```
|
||||
|
||||
Each page template adds its own `html[data-theme="dark"]` block for page-specific elements (cards, tables, badges, etc.).
|
||||
|
||||
### Auto-mode live updates
|
||||
|
||||
A `matchMedia` change listener in `head.html` updates `data-theme` whenever the OS preference changes, so users in **Auto** mode see the theme switch without reloading.
|
||||
|
||||
### Semantic colours are unchanged
|
||||
|
||||
Alert colours (red for critical, orange for warning, green for ok) and status indicators are intentionally left as fixed values — they are semantic signals, not surface colours, and look correct on both light and dark backgrounds.
|
||||
@@ -0,0 +1,738 @@
|
||||
# HTTP API and Web UI Documentation
|
||||
|
||||
## Overview
|
||||
|
||||
The Heartbeat Daemon provides a comprehensive HTTP API and web-based UI for monitoring plugin data and alert states. The API follows RESTful conventions and returns JSON responses.
|
||||
|
||||
## Base URL
|
||||
|
||||
All API endpoints are relative to the server base URL:
|
||||
```
|
||||
http://your-server:50004
|
||||
```
|
||||
|
||||
Default port is `50004` (configurable via `hbd_port` in configuration).
|
||||
|
||||
---
|
||||
|
||||
## Authentication
|
||||
|
||||
When [user accounts are configured](USERS.md), every request must be authenticated.
|
||||
|
||||
- **Browser requests** to HTML pages are redirected to `/login` automatically. JavaScript `fetch()` calls on the dashboards send the session cookie automatically — no JS changes are needed.
|
||||
- **API / programmatic requests** must include the token in an `Authorization: Bearer <token>` header or an `X-Auth-Token` header.
|
||||
|
||||
Unauthenticated API requests receive `401 Unauthorized`. When no users are configured the server runs in unauthenticated mode and all endpoints are open.
|
||||
|
||||
### Login
|
||||
|
||||
```bash
|
||||
TOKEN=$(curl -s -X POST http://localhost:50004/api/0/auth/login \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{"username":"alice","password":"secret"}' | jq -r .token)
|
||||
|
||||
curl -H "Authorization: Bearer $TOKEN" http://localhost:50004/api/0/hosts
|
||||
```
|
||||
|
||||
See [User Management](USERS.md) for full authentication documentation.
|
||||
|
||||
---
|
||||
|
||||
## API Endpoints
|
||||
|
||||
### Authentication
|
||||
|
||||
| Method | Path | Description | Auth required |
|
||||
|--------|------|-------------|---------------|
|
||||
| `POST` | `/api/0/auth/login` | Obtain session token | No |
|
||||
| `POST` | `/api/0/auth/logout` | Invalidate session | Token |
|
||||
|
||||
### Users
|
||||
|
||||
| Method | Path | Description | Role |
|
||||
|--------|------|-------------|------|
|
||||
| `GET` | `/api/0/users` | List all users | Admin |
|
||||
| `GET` | `/api/0/users/me` | Own profile | Authenticated |
|
||||
| `PUT` | `/api/0/users/me` | Update own profile | Authenticated |
|
||||
|
||||
### Notification Channels
|
||||
|
||||
| Method | Path | Description | Role |
|
||||
|--------|------|-------------|------|
|
||||
| `GET` | `/api/0/notification_channel_types` | Channel type schemas | Authenticated |
|
||||
| `GET` | `/api/0/notification_channels` | List visible channels | Authenticated |
|
||||
| `POST` | `/api/0/notification_channels` | Create a channel | Authenticated |
|
||||
| `PUT` | `/api/0/notification_channels/{name}` | Update a channel | Owner or Admin |
|
||||
| `DELETE` | `/api/0/notification_channels/{name}` | Delete a channel | Owner or Admin |
|
||||
|
||||
### Host Management
|
||||
|
||||
#### GET /api/0/hosts
|
||||
Get list of all monitored hosts with their state information. When auth is enabled, only hosts the caller has at least **monitor** access to are returned.
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
[
|
||||
{
|
||||
"name": "webserver01",
|
||||
"dyn": false,
|
||||
"owner": "alice",
|
||||
"managers": ["bob"],
|
||||
"monitors": ["carol"],
|
||||
"connections": [...]
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
#### GET /api/0/messages
|
||||
Get recent heartbeat messages (last 30).
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
[
|
||||
{
|
||||
"time": 1711234567.123,
|
||||
"host": "webserver01",
|
||||
"msg": "heartbeat received"
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Plugin Data Endpoints
|
||||
|
||||
#### GET /api/0/hosts/{hostname}/plugins
|
||||
Get all plugin data for a specific host.
|
||||
|
||||
**Parameters:**
|
||||
- `hostname` (path): Name of the host
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"hostname": "webserver01",
|
||||
"plugins": {
|
||||
"cpu_monitor": {
|
||||
"timestamp": 1711234567.123,
|
||||
"data": {
|
||||
"cpu_percent": 45.2,
|
||||
"load_1min": 2.5,
|
||||
"load_5min": 2.1,
|
||||
"load_15min": 1.8
|
||||
},
|
||||
"sample_count": 100
|
||||
},
|
||||
"memory_monitor": {
|
||||
"timestamp": 1711234568.456,
|
||||
"data": {
|
||||
"percent": 65.4,
|
||||
"available_mb": 4096,
|
||||
"total_mb": 16384
|
||||
},
|
||||
"sample_count": 100
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Example:**
|
||||
```bash
|
||||
curl http://localhost:50004/api/0/hosts/webserver01/plugins
|
||||
```
|
||||
|
||||
#### GET /api/0/hosts/{hostname}/plugins/{plugin_name}
|
||||
Get detailed historical data for a specific plugin.
|
||||
|
||||
**Parameters:**
|
||||
- `hostname` (path): Name of the host
|
||||
- `plugin_name` (path): Name of the plugin
|
||||
- `limit` (query, optional): Number of recent samples to return (default: 10)
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"hostname": "webserver01",
|
||||
"plugin": "cpu_monitor",
|
||||
"samples": [
|
||||
{
|
||||
"timestamp": 1711234567.123,
|
||||
"data": {
|
||||
"cpu_percent": 45.2,
|
||||
"load_1min": 2.5
|
||||
}
|
||||
},
|
||||
{
|
||||
"timestamp": 1711234267.123,
|
||||
"data": {
|
||||
"cpu_percent": 42.1,
|
||||
"load_1min": 2.3
|
||||
}
|
||||
}
|
||||
],
|
||||
"sample_count": 2
|
||||
}
|
||||
```
|
||||
|
||||
**Examples:**
|
||||
```bash
|
||||
# Get last 1 sample (most recent)
|
||||
curl 'http://localhost:50004/api/0/hosts/webserver01/plugins/cpu_monitor?limit=1'
|
||||
|
||||
# Get last 50 samples
|
||||
curl 'http://localhost:50004/api/0/hosts/webserver01/plugins/memory_monitor?limit=50'
|
||||
|
||||
# Get disk monitor data
|
||||
curl http://localhost:50004/api/0/hosts/database01/plugins/disk_monitor
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Host Access
|
||||
|
||||
#### GET /api/0/hosts/{hostname}/access
|
||||
Get owner/managers/monitors for a host. Requires **monitor** role or higher.
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"owner": "alice",
|
||||
"managers": ["bob"],
|
||||
"monitors": ["carol"]
|
||||
}
|
||||
```
|
||||
|
||||
#### PUT /api/0/hosts/{hostname}/access
|
||||
Update owner/managers/monitors. Requires **owner** role or admin.
|
||||
|
||||
**Request body** (all fields optional):
|
||||
```json
|
||||
{ "owner": "bob", "managers": ["carol"], "monitors": [] }
|
||||
```
|
||||
|
||||
Changes take effect immediately but are not written back to the config file. Update the config file and send `SIGHUP` to make them permanent.
|
||||
|
||||
---
|
||||
|
||||
---
|
||||
|
||||
### Notification Channel Endpoints
|
||||
|
||||
Channels are visible to all users by default. Channels marked `private: true` are only visible to their owner. Admins see all channels.
|
||||
|
||||
#### GET /api/0/notification_channel_types
|
||||
Return the schema for every supported notifier type. Used by the web UI to dynamically render the channel creation form.
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"pushover": {
|
||||
"label": "Pushover",
|
||||
"fields": [
|
||||
{"key": "token", "label": "App token", "type": "secret", "required": true},
|
||||
{"key": "user", "label": "User key", "type": "secret", "required": true},
|
||||
{"key": "sound", "label": "Sound", "type": "text", "required": false}
|
||||
]
|
||||
},
|
||||
"email": { "label": "E-mail", "fields": [ ... ] },
|
||||
...
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
#### GET /api/0/notification_channels
|
||||
List channels visible to the current user (public channels + own private channels). Admins receive all channels.
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
[
|
||||
{
|
||||
"name": "pushover_ops",
|
||||
"type": "pushover",
|
||||
"type_label": "Pushover",
|
||||
"owner": null,
|
||||
"private": false,
|
||||
"min_level": "WARNING",
|
||||
"fields": [
|
||||
{"key": "token", "label": "App token", "value": "•••", "sensitive": true},
|
||||
{"key": "user", "label": "User key", "value": "•••", "sensitive": true}
|
||||
]
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
Sensitive fields (`type: "secret"`) are always returned as `"•••"`.
|
||||
|
||||
---
|
||||
|
||||
#### POST /api/0/notification_channels
|
||||
Create a new channel. The creating user becomes the channel's `owner`.
|
||||
|
||||
**Request body:**
|
||||
```json
|
||||
{
|
||||
"name": "my_pushover",
|
||||
"type": "pushover",
|
||||
"token": "app-token",
|
||||
"user": "user-key",
|
||||
"min_level": "WARNING",
|
||||
"private": true
|
||||
}
|
||||
```
|
||||
|
||||
**Response:** `{"ok": true, "name": "my_pushover"}`
|
||||
|
||||
**Status codes:** `200 OK`, `400` (missing required field or unknown type), `409` (name already exists)
|
||||
|
||||
---
|
||||
|
||||
#### PUT /api/0/notification_channels/{name}
|
||||
Update an existing channel. Only the channel owner or an admin may update it.
|
||||
|
||||
Secret fields sent as `"•••"` are preserved from the existing config (same pattern as OAuth secrets in the admin config editor).
|
||||
|
||||
**Request body:** same shape as POST, `name` ignored (taken from URL).
|
||||
|
||||
**Response:** `{"ok": true}`
|
||||
|
||||
**Status codes:** `200 OK`, `403 Forbidden`, `404 Not Found`
|
||||
|
||||
---
|
||||
|
||||
#### DELETE /api/0/notification_channels/{name}
|
||||
Delete a channel. Only the channel owner or an admin may delete it.
|
||||
|
||||
**Response:** `{"ok": true}`
|
||||
|
||||
**Status codes:** `200 OK`, `403 Forbidden`, `404 Not Found`
|
||||
|
||||
---
|
||||
|
||||
### Alert Endpoints
|
||||
|
||||
#### GET /api/0/hosts/{hostname}/alerts
|
||||
Get alert states for a specific host.
|
||||
|
||||
**Parameters:**
|
||||
- `hostname` (path): Name of the host
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"hostname": "webserver01",
|
||||
"alerts": [
|
||||
{
|
||||
"metric_path": "cpu_monitor.cpu_percent",
|
||||
"level": "WARNING",
|
||||
"since": 1711234000.0,
|
||||
"last_value": 85.5,
|
||||
"last_check": 1711234567.123,
|
||||
"notification_count": 2
|
||||
},
|
||||
{
|
||||
"metric_path": "disk_monitor./.percent",
|
||||
"level": "OK",
|
||||
"since": 1711230000.0,
|
||||
"last_value": 65.0,
|
||||
"last_check": 1711234567.123,
|
||||
"notification_count": 0
|
||||
}
|
||||
],
|
||||
"summary": {
|
||||
"ok": 15,
|
||||
"warning": 1,
|
||||
"critical": 0,
|
||||
"unknown": 0
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Example:**
|
||||
```bash
|
||||
curl http://localhost:50004/api/0/hosts/webserver01/alerts
|
||||
```
|
||||
|
||||
#### GET /api/0/alerts
|
||||
Get all active alerts across all monitored hosts.
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"alerts": [
|
||||
{
|
||||
"hostname": "webserver01",
|
||||
"metric_path": "cpu_monitor.cpu_percent",
|
||||
"level": "CRITICAL",
|
||||
"since": 1711234000.0,
|
||||
"last_value": 95.5,
|
||||
"last_check": 1711234567.123,
|
||||
"notification_count": 3
|
||||
},
|
||||
{
|
||||
"hostname": "database01",
|
||||
"metric_path": "memory_monitor.percent",
|
||||
"level": "WARNING",
|
||||
"since": 1711233000.0,
|
||||
"last_value": 88.2,
|
||||
"last_check": 1711234567.123,
|
||||
"notification_count": 1
|
||||
}
|
||||
],
|
||||
"summary": {
|
||||
"critical": 1,
|
||||
"warning": 1,
|
||||
"unknown": 0,
|
||||
"total": 2
|
||||
},
|
||||
"host_count": 5
|
||||
}
|
||||
```
|
||||
|
||||
**Example:**
|
||||
```bash
|
||||
curl http://localhost:50004/api/0/alerts | jq .
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Web UI Pages
|
||||
|
||||
### Login
|
||||
**URL:** `/login`
|
||||
|
||||
Shown automatically when a browser request is made without a valid session (when users are configured). After successful login the browser is redirected to the originally requested page.
|
||||
|
||||
### Logout
|
||||
**URL:** `/logout`
|
||||
|
||||
Clears the session cookie and redirects to `/login`.
|
||||
|
||||
### Live Dashboard
|
||||
**URL:** `/live`
|
||||
|
||||
Real-time dashboard showing:
|
||||
- Host connection states
|
||||
- IPv4/IPv6 connectivity
|
||||
- Latency metrics
|
||||
- Recent messages
|
||||
|
||||
**Features:**
|
||||
- WebSocket-powered live updates
|
||||
- Sortable columns
|
||||
- Color-coded status indicators
|
||||
|
||||
### Plugin Metrics
|
||||
**URL:** `/plugins`
|
||||
|
||||
Interactive visualization of plugin metrics:
|
||||
- Select host and plugin from dropdown
|
||||
- View current metric values
|
||||
- Automatic refresh every 30 seconds
|
||||
- Support for nested metrics (e.g., per-partition disk stats)
|
||||
|
||||
**Features:**
|
||||
- Card-based metric display
|
||||
- Unit formatting (%, MB, GB)
|
||||
- Nested object visualization
|
||||
- Auto-refresh
|
||||
|
||||
**Examples of available data:**
|
||||
- CPU usage, load average, frequency
|
||||
- Memory usage, available memory, swap
|
||||
- Disk usage per partition, I/O statistics
|
||||
- Network interface statistics, connection counts
|
||||
- Custom plugin data
|
||||
|
||||
### Alerts Dashboard
|
||||
**URL:** `/alerts`
|
||||
|
||||
Comprehensive alert monitoring:
|
||||
- Summary cards (Critical, Warning, Total Hosts)
|
||||
- Filter by severity (All, Critical, Warning)
|
||||
- Alert details with duration
|
||||
- Auto-refresh every 15 seconds
|
||||
|
||||
**Features:**
|
||||
- Color-coded alert levels
|
||||
- Duration tracking
|
||||
- Filterable list
|
||||
- Real-time updates
|
||||
- Summary statistics
|
||||
|
||||
---
|
||||
|
||||
## Integration Examples
|
||||
|
||||
### Monitoring Script
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# Check for critical alerts and send notification
|
||||
|
||||
# Log in first (when auth is configured)
|
||||
TOKEN=$(curl -s -X POST http://localhost:50004/api/0/auth/login \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{"username":"monitor","password":"secret"}' | jq -r .token)
|
||||
AUTH=(-H "Authorization: Bearer $TOKEN")
|
||||
|
||||
RESPONSE=$(curl -s "${AUTH[@]}" http://localhost:50004/api/0/alerts)
|
||||
CRITICAL_COUNT=$(echo "$RESPONSE" | jq '.summary.critical')
|
||||
|
||||
if [ "$CRITICAL_COUNT" -gt 0 ]; then
|
||||
echo "CRITICAL: $CRITICAL_COUNT critical alerts detected!"
|
||||
echo "$RESPONSE" | jq '.alerts[] | select(.level=="CRITICAL")'
|
||||
# Send notification
|
||||
# mail -s "Critical Alerts" admin@example.com < alert_details.txt
|
||||
fi
|
||||
```
|
||||
|
||||
### Python Client
|
||||
|
||||
```python
|
||||
import requests
|
||||
import json
|
||||
|
||||
BASE = 'http://localhost:50004'
|
||||
|
||||
# Log in (skip if auth not configured)
|
||||
resp = requests.post(f'{BASE}/api/0/auth/login',
|
||||
json={"username": "alice", "password": "secret"})
|
||||
token = resp.json().get("token")
|
||||
headers = {"Authorization": f"Bearer {token}"} if token else {}
|
||||
|
||||
# Get all plugin data for a host
|
||||
response = requests.get(f'{BASE}/api/0/hosts/webserver01/plugins', headers=headers)
|
||||
data = response.json()
|
||||
|
||||
print(f"Host: {data['hostname']}")
|
||||
print(f"Plugins: {', '.join(data['plugins'].keys())}")
|
||||
|
||||
for plugin, info in data['plugins'].items():
|
||||
print(f"\n{plugin}:")
|
||||
for metric, value in info['data'].items():
|
||||
print(f" {metric}: {value}")
|
||||
|
||||
# Check for alerts
|
||||
response = requests.get(f'{BASE}/api/0/alerts', headers=headers)
|
||||
alerts = response.json()
|
||||
|
||||
if alerts['summary']['critical'] > 0:
|
||||
print(f"\n⚠️ {alerts['summary']['critical']} CRITICAL ALERTS!")
|
||||
for alert in alerts['alerts']:
|
||||
if alert['level'] == 'CRITICAL':
|
||||
print(f" - {alert['hostname']}: {alert['metric_path']} = {alert['last_value']}")
|
||||
```
|
||||
|
||||
### Grafana Integration
|
||||
|
||||
The API endpoints can be used with Grafana's JSON datasource plugin:
|
||||
|
||||
1. Install the SimpleJSON datasource plugin
|
||||
2. Configure datasource URL: `http://your-server:50004`
|
||||
3. Create queries:
|
||||
- Metrics: `/api/0/hosts/webserver01/plugins/cpu_monitor?limit=100`
|
||||
- Alerts: `/api/0/alerts`
|
||||
|
||||
### Prometheus Integration
|
||||
|
||||
Export metrics in Prometheus format (future enhancement):
|
||||
|
||||
```python
|
||||
# Example prometheus exporter
|
||||
from prometheus_client import Gauge, generate_latest
|
||||
import requests
|
||||
|
||||
cpu_usage = Gauge('heartbeat_cpu_percent', 'CPU usage percentage', ['hostname'])
|
||||
memory_usage = Gauge('heartbeat_memory_percent', 'Memory usage percentage', ['hostname'])
|
||||
|
||||
def collect_metrics():
|
||||
hosts = requests.get('http://localhost:50004/api/0/hosts').json()
|
||||
for host in hosts:
|
||||
hostname = host['name']
|
||||
plugins = requests.get(f'http://localhost:50004/api/0/hosts/{hostname}/plugins').json()
|
||||
|
||||
if 'cpu_monitor' in plugins['plugins']:
|
||||
cpu_data = plugins['plugins']['cpu_monitor']['data']
|
||||
cpu_usage.labels(hostname=hostname).set(cpu_data.get('cpu_percent', 0))
|
||||
|
||||
if 'memory_monitor' in plugins['plugins']:
|
||||
mem_data = plugins['plugins']['memory_monitor']['data']
|
||||
memory_usage.labels(hostname=hostname).set(mem_data.get('percent', 0))
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Response Formats
|
||||
|
||||
### Success Response
|
||||
All successful API calls return HTTP 200 with JSON body:
|
||||
```json
|
||||
{
|
||||
"field": "value",
|
||||
...
|
||||
}
|
||||
```
|
||||
|
||||
### Error Response
|
||||
API errors return appropriate HTTP status codes with JSON:
|
||||
```json
|
||||
{
|
||||
"error": "Host 'unknown-host' not found"
|
||||
}
|
||||
```
|
||||
|
||||
**Common Status Codes:**
|
||||
- `200 OK` - Success
|
||||
- `400 Bad Request` - Invalid parameters
|
||||
- `401 Unauthorized` - Missing or invalid session token
|
||||
- `403 Forbidden` - Authenticated but insufficient role
|
||||
- `404 Not Found` - Resource not found
|
||||
- `500 Internal Server Error` - Server error
|
||||
|
||||
---
|
||||
|
||||
## WebSocket API
|
||||
|
||||
For real-time updates, connect to the WebSocket endpoint:
|
||||
|
||||
**URL:** `ws://your-server:50005/hbd` (or `wss://` for secure)
|
||||
|
||||
**Messages:**
|
||||
```json
|
||||
{
|
||||
"type": "host",
|
||||
"data": {
|
||||
"name": "webserver01",
|
||||
"state": "UP"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "plugin",
|
||||
"data": {
|
||||
"host": "webserver01",
|
||||
"plugin": "cpu_monitor",
|
||||
"data": {...},
|
||||
"timestamp": 1711234567.123
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Configuration
|
||||
|
||||
### Enable HTTP Server
|
||||
|
||||
```yaml
|
||||
# In your hbd configuration file
|
||||
hbd_host: "" # Listen on all interfaces
|
||||
hbd_port: 50004 # HTTP port
|
||||
ws_port: 50005 # WebSocket port (optional)
|
||||
# wss_port: 50006 # Secure WebSocket (requires SSL)
|
||||
```
|
||||
|
||||
### SSL/TLS Configuration
|
||||
|
||||
For secure WebSocket connections:
|
||||
|
||||
```yaml
|
||||
wss_port: 50006
|
||||
cert_path: /etc/heartbeat/certs/
|
||||
wss_pem: server.pem
|
||||
wss_key: server.key
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Rate Limiting
|
||||
|
||||
The API currently does not implement rate limiting. For production use, consider:
|
||||
|
||||
- Placing behind a reverse proxy (nginx, Apache)
|
||||
- Using API gateway for rate limiting
|
||||
- Implementing caching for frequently accessed endpoints
|
||||
|
||||
---
|
||||
|
||||
## CORS Support
|
||||
|
||||
By default, CORS is not enabled. To enable for web applications:
|
||||
|
||||
```python
|
||||
# In http.py, add CORS middleware
|
||||
import aiohttp_cors
|
||||
|
||||
app = web.Application()
|
||||
cors = aiohttp_cors.setup(app)
|
||||
|
||||
# Configure CORS for all routes
|
||||
for route in list(app.router.routes()):
|
||||
cors.add(route, {
|
||||
"*": aiohttp_cors.ResourceOptions(
|
||||
allow_credentials=True,
|
||||
expose_headers="*",
|
||||
allow_headers="*",
|
||||
)
|
||||
})
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Performance Considerations
|
||||
|
||||
### Caching
|
||||
- Plugin data is cached in memory (last 100 samples per plugin)
|
||||
- No database queries required
|
||||
- Responses are fast (<10ms typical)
|
||||
|
||||
### Scalability
|
||||
- Each host stores its own data independently
|
||||
- Memory usage: ~1KB per host + ~1KB per plugin sample
|
||||
- For 100 hosts with 5 plugins: ~50MB memory
|
||||
|
||||
### Best Practices
|
||||
1. Use `limit` parameter to control response size
|
||||
2. Cache responses on client side when appropriate
|
||||
3. Use WebSocket for real-time updates instead of polling
|
||||
4. Consider pagination for large deployments (future enhancement)
|
||||
|
||||
---
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### API Returns 401
|
||||
- Auth is configured — include `Authorization: Bearer <token>` header
|
||||
- Token may have expired (24 h TTL) — log in again
|
||||
|
||||
### API Returns 403
|
||||
- Authenticated user lacks the required role for this host/action
|
||||
- Check host's `owner`, `managers`, `monitors` config
|
||||
|
||||
### API Returns 404
|
||||
- Verify hostname in URL matches actual host name
|
||||
- Check host is sending heartbeats: `curl http://localhost:50004/api/0/hosts`
|
||||
|
||||
### No Plugin Data
|
||||
- Verify client is configured with plugins
|
||||
- Check client logs for plugin errors
|
||||
- Ensure plugins are sending data (check journal logs)
|
||||
|
||||
### Empty Alerts
|
||||
- Verify thresholds are configured
|
||||
- Check host is in `watchhosts` list
|
||||
- Ensure plugins are collecting metrics
|
||||
- Review server logs for threshold checker errors
|
||||
|
||||
---
|
||||
|
||||
## See Also
|
||||
|
||||
- [User Management](USERS.md)
|
||||
- [Plugin Development Guide](PLUGIN_DEVELOPMENT.md)
|
||||
- [Threshold Alerting Documentation](THRESHOLD_ALERTING.md)
|
||||
- [Message Journal Documentation](MESSAGE_JOURNAL.md)
|
||||
- Configuration examples: `hbd/config_example.yaml`
|
||||
@@ -0,0 +1,413 @@
|
||||
# Message Journal
|
||||
|
||||
The message journal provides persistent logging of all received heartbeat messages with automatic size-based log rotation.
|
||||
|
||||
## Overview
|
||||
|
||||
The journal logs every message received by the heartbeat daemon (hbd) in JSON format, making it easy to:
|
||||
- Audit message history
|
||||
- Debug connection issues
|
||||
- Analyze traffic patterns
|
||||
- Replay messages for testing
|
||||
- Create historical reports
|
||||
|
||||
## Features
|
||||
|
||||
- **JSON Format**: Each message is logged as a single JSON line for easy parsing
|
||||
- **Size-Based Rotation**: Automatically rotates logs when size threshold is reached
|
||||
- **Automatic Cleanup**: Keeps only a configurable number of backup files
|
||||
- **Thread-Safe**: Safe for concurrent access from multiple async tasks
|
||||
- **Configurable**: All settings controllable via configuration file
|
||||
- **Performance**: Non-blocking async operation with minimal overhead
|
||||
|
||||
## Configuration
|
||||
|
||||
Add these settings to your hbd configuration file (e.g., `.hb.yaml`):
|
||||
|
||||
```yaml
|
||||
# Message journal configuration
|
||||
journal_enabled: true # Enable/disable journaling
|
||||
journal_dir: /var/log/heartbeat # Directory for journal files
|
||||
journal_file: messages.journal # Base filename
|
||||
journal_max_size: 104857600 # Max size in bytes (100MB default)
|
||||
journal_max_backups: 10 # Number of backup files to keep
|
||||
```
|
||||
|
||||
### Configuration Options
|
||||
|
||||
| Option | Default | Description |
|
||||
|--------|---------|-------------|
|
||||
| `journal_enabled` | `true` | Enable or disable message journaling |
|
||||
| `journal_dir` | `/var/log/heartbeat` | Directory where journal files are stored |
|
||||
| `journal_file` | `messages.journal` | Base filename for the journal |
|
||||
| `journal_max_size` | `104857600` (100MB) | Maximum file size before rotation |
|
||||
| `journal_max_backups` | `10` | Number of rotated backup files to keep |
|
||||
|
||||
## File Format
|
||||
|
||||
Messages are logged in JSONL (JSON Lines) format - one JSON object per line:
|
||||
|
||||
```json
|
||||
{"timestamp":1711234567.123,"datetime":"2026-03-28T12:34:56","source_ip":"192.168.1.100","source_port":50003,"message":{"ID":"HTB","name":"webserver1","interval":30}}
|
||||
{"timestamp":1711234597.456,"datetime":"2026-03-28T12:35:37","source_ip":"192.168.1.101","source_port":50003,"message":{"ID":"PLG","plugin":"cpu_monitor","cpu_percent":45.2,"load_1min":1.5}}
|
||||
```
|
||||
|
||||
### Entry Structure
|
||||
|
||||
Each journal entry contains:
|
||||
|
||||
| Field | Type | Description |
|
||||
|-------|------|-------------|
|
||||
| `timestamp` | float | Unix timestamp (seconds since epoch) |
|
||||
| `datetime` | string | ISO 8601 formatted datetime |
|
||||
| `source_ip` | string | Source IP address |
|
||||
| `source_port` | integer | Source UDP port |
|
||||
| `message` | object | Complete parsed message dictionary |
|
||||
|
||||
## Log Rotation
|
||||
|
||||
### How Rotation Works
|
||||
|
||||
1. Journal writes messages to the current file
|
||||
2. When file size exceeds `journal_max_size`, rotation is triggered
|
||||
3. Current file is renamed with timestamp: `messages.journal.YYYYMMDD-HHMMSS`
|
||||
4. New empty file is created as the current journal
|
||||
5. Old backup files exceeding `journal_max_backups` are deleted
|
||||
|
||||
### Example File Structure
|
||||
|
||||
```
|
||||
/var/log/heartbeat/
|
||||
├── messages.journal # Current active journal
|
||||
├── messages.journal.20260328-120000 # Rotated backup
|
||||
├── messages.journal.20260328-140000 # Rotated backup
|
||||
└── messages.journal.20260328-160000 # Rotated backup (newest)
|
||||
```
|
||||
|
||||
### Rotation Behavior
|
||||
|
||||
- Rotation is triggered when the next message would exceed the size limit
|
||||
- Rotation is automatic and requires no manual intervention
|
||||
- Old backups are deleted in FIFO order (oldest first)
|
||||
- Rotation is thread-safe and won't lose messages
|
||||
|
||||
## Usage Examples
|
||||
|
||||
### Reading Journal Files
|
||||
|
||||
#### Using Python
|
||||
|
||||
```python
|
||||
import json
|
||||
|
||||
# Read all entries from current journal
|
||||
with open('/var/log/heartbeat/messages.journal', 'r') as f:
|
||||
for line in f:
|
||||
entry = json.loads(line)
|
||||
print(f"{entry['datetime']} - {entry['source_ip']} - {entry['message']['ID']}")
|
||||
```
|
||||
|
||||
#### Using jq (command line)
|
||||
|
||||
```bash
|
||||
# View all messages
|
||||
cat /var/log/heartbeat/messages.journal | jq .
|
||||
|
||||
# Filter by message type
|
||||
cat /var/log/heartbeat/messages.journal | jq 'select(.message.ID == "HTB")'
|
||||
|
||||
# Filter by hostname
|
||||
cat /var/log/heartbeat/messages.journal | jq 'select(.message.name == "webserver1")'
|
||||
|
||||
# Count messages by type
|
||||
cat /var/log/heartbeat/messages.journal | jq -r '.message.ID' | sort | uniq -c
|
||||
|
||||
# Extract timestamps and source IPs
|
||||
cat /var/log/heartbeat/messages.journal | jq -r '[.datetime, .source_ip, .message.ID] | @tsv'
|
||||
```
|
||||
|
||||
#### Using shell tools
|
||||
|
||||
```bash
|
||||
# Count total messages
|
||||
wc -l /var/log/heartbeat/messages.journal
|
||||
|
||||
# View recent messages
|
||||
tail -n 100 /var/log/heartbeat/messages.journal | jq .
|
||||
|
||||
# Search for specific host
|
||||
grep -F '"name":"webserver1"' /var/log/heartbeat/messages.journal
|
||||
|
||||
# Check journal file size
|
||||
du -h /var/log/heartbeat/messages.journal
|
||||
```
|
||||
|
||||
### Analyzing Historical Data
|
||||
|
||||
```bash
|
||||
# Combine all journal files (current + backups)
|
||||
cat /var/log/heartbeat/messages.journal* | jq . > all_messages.json
|
||||
|
||||
# Count messages per host
|
||||
cat /var/log/heartbeat/messages.journal* | jq -r '.message.name // "unknown"' | sort | uniq -c
|
||||
|
||||
# Find all plugin messages
|
||||
cat /var/log/heartbeat/messages.journal* | jq 'select(.message.ID == "PLG")'
|
||||
|
||||
# Extract CPU metrics from plugin messages
|
||||
cat /var/log/heartbeat/messages.journal* | \
|
||||
jq 'select(.message.plugin == "cpu_monitor") | {time: .datetime, host: .message.name, cpu: .message.cpu_percent}'
|
||||
```
|
||||
|
||||
## Integration with Log Management
|
||||
|
||||
### Logrotate
|
||||
|
||||
While the journal has built-in rotation, you can also use logrotate for additional management:
|
||||
|
||||
```
|
||||
/var/log/heartbeat/messages.journal.* {
|
||||
daily
|
||||
rotate 30
|
||||
compress
|
||||
delaycompress
|
||||
missingok
|
||||
notifempty
|
||||
}
|
||||
```
|
||||
|
||||
### Elasticsearch/OpenSearch
|
||||
|
||||
Import journal data into Elasticsearch for advanced analysis:
|
||||
|
||||
```python
|
||||
from elasticsearch import Elasticsearch
|
||||
import json
|
||||
|
||||
es = Elasticsearch(['localhost:9200'])
|
||||
|
||||
with open('/var/log/heartbeat/messages.journal', 'r') as f:
|
||||
for line in f:
|
||||
entry = json.loads(line)
|
||||
es.index(index='heartbeat-messages', body=entry)
|
||||
```
|
||||
|
||||
### Splunk
|
||||
|
||||
Create a Splunk input for the journal:
|
||||
|
||||
```ini
|
||||
[monitor:///var/log/heartbeat/messages.journal*]
|
||||
sourcetype = heartbeat_json
|
||||
index = heartbeat
|
||||
```
|
||||
|
||||
## Performance Considerations
|
||||
|
||||
### Overhead
|
||||
|
||||
- Journal writing is async and non-blocking
|
||||
- Typical overhead: < 1ms per message
|
||||
- Minimal impact on heartbeat processing
|
||||
|
||||
### Disk Usage
|
||||
|
||||
Calculate expected disk usage:
|
||||
|
||||
```
|
||||
Messages per day = (86400 seconds / interval) * number_of_hosts
|
||||
Average message size ≈ 200-500 bytes
|
||||
Daily disk usage = Messages per day * Average message size
|
||||
|
||||
Example:
|
||||
- 100 hosts
|
||||
- 30 second interval
|
||||
- 2880 messages/day per host
|
||||
- 288,000 messages/day total
|
||||
- ~60-140 MB/day
|
||||
```
|
||||
|
||||
### Recommendations
|
||||
|
||||
- **Small deployments** (< 50 hosts): Default settings work well
|
||||
- **Medium deployments** (50-500 hosts): Increase `journal_max_size` to 500MB, `journal_max_backups` to 20
|
||||
- **Large deployments** (> 500 hosts): Consider 1GB+ journal files, 30+ backups, or external log aggregation
|
||||
|
||||
## Monitoring
|
||||
|
||||
### Check Journal Status
|
||||
|
||||
The journal exposes statistics that can be queried:
|
||||
|
||||
```python
|
||||
from hbd.journal import get_journal
|
||||
|
||||
journal = get_journal()
|
||||
stats = journal.get_stats()
|
||||
print(f"Current size: {stats['current_size']:,} bytes")
|
||||
print(f"Rotation threshold: {stats['rotation_threshold']}")
|
||||
```
|
||||
|
||||
### Log Messages
|
||||
|
||||
Journal operations are logged at appropriate levels:
|
||||
|
||||
- `INFO`: Initialization, rotation events, cleanup
|
||||
- `DEBUG`: Individual message logging
|
||||
- `WARNING`: Non-critical issues
|
||||
- `ERROR`: Critical failures
|
||||
|
||||
Check hbd logs for journal-related messages:
|
||||
|
||||
```bash
|
||||
grep journal /var/log/heartbeat.log
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Journal Files Not Created
|
||||
|
||||
**Problem**: No journal files appear in the configured directory.
|
||||
|
||||
**Solutions**:
|
||||
- Check `journal_enabled: true` in configuration
|
||||
- Verify directory exists and hbd has write permissions
|
||||
- Check hbd logs for initialization errors
|
||||
- Verify disk space is available
|
||||
|
||||
### Rotation Not Working
|
||||
|
||||
**Problem**: Journal file grows beyond `journal_max_size`.
|
||||
|
||||
**Solutions**:
|
||||
- Check that `journal_max_size` is properly configured
|
||||
- Verify hbd has permission to rename/create files
|
||||
- Check for filesystem issues
|
||||
- Review hbd logs for rotation errors
|
||||
|
||||
### Missing Messages
|
||||
|
||||
**Problem**: Some messages don't appear in journal.
|
||||
|
||||
**Solutions**:
|
||||
- Verify `journal_enabled: true`
|
||||
- Check for write errors in hbd logs
|
||||
- Verify sufficient disk space
|
||||
- Check if filesystem is read-only
|
||||
|
||||
### Performance Issues
|
||||
|
||||
**Problem**: Journal causing slow message processing.
|
||||
|
||||
**Solutions**:
|
||||
- Use faster storage (SSD) for journal directory
|
||||
- Increase `journal_max_size` to reduce rotation frequency
|
||||
- Disable journal if not needed: `journal_enabled: false`
|
||||
- Consider async syslog forwarding instead
|
||||
|
||||
## Security Considerations
|
||||
|
||||
### File Permissions
|
||||
|
||||
Ensure proper permissions on journal files:
|
||||
|
||||
```bash
|
||||
# Journal directory
|
||||
chmod 750 /var/log/heartbeat
|
||||
chown hbd:hbd /var/log/heartbeat
|
||||
|
||||
# Journal files
|
||||
chmod 640 /var/log/heartbeat/messages.journal*
|
||||
```
|
||||
|
||||
### Sensitive Data
|
||||
|
||||
Journal files may contain:
|
||||
- Hostnames and IP addresses
|
||||
- System metrics
|
||||
- Custom message content
|
||||
|
||||
**Recommendations**:
|
||||
- Restrict read access to authorized users only
|
||||
- Consider encryption for archived journals
|
||||
- Implement log retention policies
|
||||
- Sanitize data if sharing for debugging
|
||||
|
||||
## API Reference
|
||||
|
||||
### MessageJournal Class
|
||||
|
||||
```python
|
||||
class MessageJournal:
|
||||
def __init__(self, config: Dict[str, Any])
|
||||
async def initialize(self) -> bool
|
||||
async def log_message(self, msg: Dict, addr: tuple, timestamp: float)
|
||||
async def close(self)
|
||||
def get_stats(self) -> Dict[str, Any]
|
||||
```
|
||||
|
||||
### Module Functions
|
||||
|
||||
```python
|
||||
def get_journal(config: Dict = None) -> MessageJournal
|
||||
async def log_message(msg: Dict, addr: tuple, timestamp: float = None)
|
||||
```
|
||||
|
||||
## Example: Custom Message Processing
|
||||
|
||||
Process journal messages in real-time:
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
async def tail_journal(journal_path):
|
||||
"""Follow journal file and process new messages."""
|
||||
path = Path(journal_path)
|
||||
|
||||
with open(path, 'r') as f:
|
||||
# Jump to end
|
||||
f.seek(0, 2)
|
||||
|
||||
while True:
|
||||
line = f.readline()
|
||||
if line:
|
||||
entry = json.loads(line)
|
||||
await process_message(entry)
|
||||
else:
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
async def process_message(entry):
|
||||
"""Process a journal entry."""
|
||||
msg = entry['message']
|
||||
|
||||
# Alert on boot messages
|
||||
if msg.get('boot'):
|
||||
print(f"ALERT: {msg['name']} rebooted at {entry['datetime']}")
|
||||
|
||||
# Track CPU usage
|
||||
if msg.get('ID') == 'PLG' and msg.get('plugin') == 'cpu_monitor':
|
||||
cpu = msg.get('cpu_percent', 0)
|
||||
if cpu > 90:
|
||||
print(f"WARNING: {entry['source_ip']} CPU usage: {cpu}%")
|
||||
```
|
||||
|
||||
## Future Enhancements
|
||||
|
||||
Potential improvements for future versions:
|
||||
|
||||
- Compression of rotated logs (gzip)
|
||||
- Time-based rotation in addition to size-based
|
||||
- Filtering to exclude certain message types
|
||||
- Structured logging output formats (CEF, GELF)
|
||||
- Remote syslog forwarding
|
||||
- Message deduplication
|
||||
- Journal file encryption
|
||||
- Signed journal entries
|
||||
|
||||
## See Also
|
||||
|
||||
- [Configuration Guide](../hbd/config.py) - Full configuration options
|
||||
- [UDP Protocol](../hbd/udp.py) - Message handling
|
||||
- [Server Architecture](../hbd/server.py) - Server initialization
|
||||
@@ -0,0 +1,326 @@
|
||||
# Nagios Plugin Integration Guide
|
||||
|
||||
The Heartbeat monitoring system now supports running existing Nagios-compatible monitoring plugins through the `nagios_runner` plugin. This allows you to leverage the thousands of existing Nagios plugins without modification.
|
||||
|
||||
## Quick Start
|
||||
|
||||
### 1. Install Nagios Plugins
|
||||
|
||||
**Debian/Ubuntu:**
|
||||
```bash
|
||||
sudo apt-get install nagios-plugins
|
||||
```
|
||||
|
||||
**RHEL/CentOS/Fedora:**
|
||||
```bash
|
||||
sudo yum install nagios-plugins-all
|
||||
# or
|
||||
sudo dnf install nagios-plugins-all
|
||||
```
|
||||
|
||||
**Arch Linux:**
|
||||
```bash
|
||||
sudo pacman -S monitoring-plugins
|
||||
```
|
||||
|
||||
### 2. Configure Heartbeat
|
||||
|
||||
Add the `nagios_runner` section to your `~/.hb.yaml` config:
|
||||
|
||||
```yaml
|
||||
nagios_runner:
|
||||
interval: 60 # Run plugins every 60 seconds
|
||||
timeout: 30 # Command timeout in seconds
|
||||
commands:
|
||||
- name: check_disk_root
|
||||
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
|
||||
|
||||
- name: check_load
|
||||
command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
|
||||
|
||||
- name: check_procs
|
||||
command: /usr/lib/nagios/plugins/check_procs -w 250 -c 400
|
||||
```
|
||||
|
||||
### 3. Start Heartbeat Client
|
||||
|
||||
```bash
|
||||
hbc -v localhost
|
||||
```
|
||||
|
||||
The client will now execute the configured Nagios plugins and send their results to the server.
|
||||
|
||||
## How It Works
|
||||
|
||||
### Nagios Plugin Standard
|
||||
|
||||
Nagios plugins follow a simple interface:
|
||||
|
||||
1. **Exit Codes:**
|
||||
- `0` = OK
|
||||
- `1` = WARNING
|
||||
- `2` = CRITICAL
|
||||
- `3` = UNKNOWN
|
||||
|
||||
2. **Output Format:**
|
||||
```
|
||||
STATUS - Message | performance_data
|
||||
```
|
||||
|
||||
3. **Performance Data Format:**
|
||||
```
|
||||
'label'=value[UOM];[warn];[crit];[min];[max]
|
||||
```
|
||||
|
||||
### Example Plugin Output
|
||||
|
||||
```bash
|
||||
$ /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
|
||||
DISK OK - free space: / 156 GB (78%); | /=44GB;127;142;0;159
|
||||
```
|
||||
|
||||
This output includes:
|
||||
- **Status:** `DISK OK`
|
||||
- **Message:** `free space: / 156 GB (78%)`
|
||||
- **Performance Data:** `/=44GB;127;142;0;159`
|
||||
- Current value: 44GB
|
||||
- Warning threshold: 127GB
|
||||
- Critical threshold: 142GB
|
||||
- Min: 0GB
|
||||
- Max: 159GB
|
||||
|
||||
### Data Collected
|
||||
|
||||
The `nagios_runner` plugin collects:
|
||||
|
||||
**For each configured command:**
|
||||
- `{name}_status` - Status string (OK, WARNING, CRITICAL, UNKNOWN)
|
||||
- `{name}_status_code` - Numeric exit code (0-3)
|
||||
- `{name}_output` - Status message
|
||||
- `{name}_{metric}` - Each performance metric value
|
||||
- `{name}_{metric}_uom` - Unit of measurement (if present)
|
||||
- `{name}_{metric}_warn` - Warning threshold (if present)
|
||||
- `{name}_{metric}_crit` - Critical threshold (if present)
|
||||
- `{name}_{metric}_min` - Minimum value (if present)
|
||||
- `{name}_{metric}_max` - Maximum value (if present)
|
||||
|
||||
## Configuration Options
|
||||
|
||||
```yaml
|
||||
nagios_runner:
|
||||
# Collection interval in seconds (default: 60)
|
||||
interval: 60
|
||||
|
||||
# Command execution timeout in seconds (default: 30)
|
||||
timeout: 30
|
||||
|
||||
# Execute commands via shell (default: true)
|
||||
# Set to false for direct execution (more secure but less flexible)
|
||||
shell: true
|
||||
|
||||
# List of Nagios plugins to run
|
||||
commands:
|
||||
- name: unique_name # Required: unique identifier
|
||||
command: /path/to/plugin [args] # Required: full command to execute
|
||||
```
|
||||
|
||||
## Common Nagios Plugins
|
||||
|
||||
### System Resources
|
||||
|
||||
**Disk Space:**
|
||||
```yaml
|
||||
- name: check_disk_root
|
||||
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
|
||||
```
|
||||
|
||||
**Load Average:**
|
||||
```yaml
|
||||
- name: check_load
|
||||
command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
|
||||
```
|
||||
|
||||
**Swap Usage:**
|
||||
```yaml
|
||||
- name: check_swap
|
||||
command: /usr/lib/nagios/plugins/check_swap -w 20% -c 10%
|
||||
```
|
||||
|
||||
**Process Count:**
|
||||
```yaml
|
||||
- name: check_procs
|
||||
command: /usr/lib/nagios/plugins/check_procs -w 250 -c 400
|
||||
```
|
||||
|
||||
**Users Logged In:**
|
||||
```yaml
|
||||
- name: check_users
|
||||
command: /usr/lib/nagios/plugins/check_users -w 5 -c 10
|
||||
```
|
||||
|
||||
### Network Services
|
||||
|
||||
**SSH:**
|
||||
```yaml
|
||||
- name: check_ssh
|
||||
command: /usr/lib/nagios/plugins/check_ssh localhost
|
||||
```
|
||||
|
||||
**HTTP:**
|
||||
```yaml
|
||||
- name: check_http_local
|
||||
command: /usr/lib/nagios/plugins/check_http -H localhost
|
||||
|
||||
- name: check_http_ssl
|
||||
command: /usr/lib/nagios/plugins/check_http -H example.com --ssl
|
||||
```
|
||||
|
||||
**DNS:**
|
||||
```yaml
|
||||
- name: check_dns
|
||||
command: /usr/lib/nagios/plugins/check_dns -H google.com
|
||||
```
|
||||
|
||||
**Ping:**
|
||||
```yaml
|
||||
- name: check_ping_gateway
|
||||
command: /usr/lib/nagios/plugins/check_ping -H 192.168.1.1 -w 100,20% -c 500,60%
|
||||
```
|
||||
|
||||
### Databases
|
||||
|
||||
**MySQL:**
|
||||
```yaml
|
||||
- name: check_mysql
|
||||
command: /usr/lib/nagios/plugins/check_mysql -H localhost -u user -p password
|
||||
```
|
||||
|
||||
**PostgreSQL:**
|
||||
```yaml
|
||||
- name: check_pgsql
|
||||
command: /usr/lib/nagios/plugins/check_pgsql -H localhost -d database
|
||||
```
|
||||
|
||||
## Writing Custom Nagios Plugins
|
||||
|
||||
You can write your own Nagios-compatible plugins in any language. Here are two simple examples, one in Bash and one in Python:
|
||||
|
||||
**Bash:**
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# /usr/local/bin/check_example.sh
|
||||
|
||||
# Get the value to check
|
||||
value=$(some_command)
|
||||
|
||||
# Define thresholds
|
||||
warn=80
|
||||
crit=90
|
||||
|
||||
# Check and output result
|
||||
if [ $value -ge $crit ]; then
|
||||
echo "CRITICAL - Value is $value | value=${value};${warn};${crit};0;100"
|
||||
exit 2
|
||||
elif [ $value -ge $warn ]; then
|
||||
echo "WARNING - Value is $value | value=${value};${warn};${crit};0;100"
|
||||
exit 1
|
||||
else
|
||||
echo "OK - Value is $value | value=${value};${warn};${crit};0;100"
|
||||
exit 0
|
||||
fi
|
||||
```
|
||||
|
||||
**Python:**
|
||||
```python
|
||||
#!/usr/bin/env python3
|
||||
# /usr/local/bin/check_example.py
|
||||
|
||||
import sys
|
||||
|
||||
def check_something():
|
||||
value = get_value() # Your check logic here
|
||||
warn = 80
|
||||
crit = 90
|
||||
|
||||
perfdata = f"value={value};{warn};{crit};0;100"
|
||||
|
||||
if value >= crit:
|
||||
print(f"CRITICAL - Value is {value} | {perfdata}")
|
||||
sys.exit(2)
|
||||
elif value >= warn:
|
||||
print(f"WARNING - Value is {value} | {perfdata}")
|
||||
sys.exit(1)
|
||||
else:
|
||||
print(f"OK - Value is {value} | {perfdata}")
|
||||
sys.exit(0)
|
||||
|
||||
if __name__ == "__main__":
|
||||
check_something()
|
||||
```
|
||||
|
||||
Then configure the plugin in Heartbeat:
|
||||
```yaml
|
||||
nagios_runner:
|
||||
commands:
|
||||
- name: my_custom_check
|
||||
command: /usr/local/bin/check_example.sh
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Plugin not found
|
||||
```
|
||||
Error: Command not found
|
||||
```
|
||||
**Solution:** Use the full path to the plugin. Common locations:
|
||||
- `/usr/lib/nagios/plugins/`
|
||||
- `/usr/lib64/nagios/plugins/`
|
||||
- `/usr/local/nagios/libexec/`
|
||||
|
||||
### Permission denied
|
||||
```
|
||||
Error: Permission denied
|
||||
```
|
||||
**Solution:** Ensure the plugin is executable:
|
||||
```bash
|
||||
chmod +x /path/to/plugin
|
||||
```
|
||||
|
||||
### Timeout errors
|
||||
```
|
||||
Command timed out after 30s
|
||||
```
|
||||
**Solution:** Increase the timeout in config:
|
||||
```yaml
|
||||
nagios_runner:
|
||||
timeout: 60 # Increase timeout
|
||||
```
|
||||
|
||||
### No performance data
|
||||
If performance data is not being parsed:
|
||||
1. Check plugin output includes `|` separator
|
||||
2. Verify performance data format: `'label'=value[UOM];...`
|
||||
3. Enable debug logging: `hbc -v -x localhost`
|
||||
|
||||
## Benefits
|
||||
|
||||
1. **Massive Plugin Library:** Thousands of existing Nagios plugins available
|
||||
2. **No Rewriting:** Use plugins as-is without modification
|
||||
3. **Community Support:** Well-documented and maintained plugins
|
||||
4. **Flexibility:** Mix Nagios plugins with native Heartbeat plugins
|
||||
5. **Standard Interface:** Consistent exit codes and output format
|
||||
6. **Performance Data:** Automatic extraction of metrics
|
||||
|
||||
## Resources
|
||||
|
||||
- [Nagios Plugin Development Guidelines](https://nagios-plugins.org/doc/guidelines.html)
|
||||
- [Monitoring Plugins Project](https://www.monitoring-plugins.org/)
|
||||
- [Nagios Exchange](https://exchange.nagios.org/) - Plugin repository
|
||||
- [Check_MK Local Checks](https://docs.checkmk.com/latest/en/localchecks.html) - Compatible format
|
||||
|
||||
## Next Steps
|
||||
|
||||
- Configure threshold alerts based on Nagios plugin status codes
|
||||
- View plugin data in the Heartbeat web UI
|
||||
- Create custom plugins for your specific monitoring needs
|
||||
- Integrate with existing Nagios/Icinga configurations
|
||||
@@ -0,0 +1,325 @@
|
||||
# Notification System
|
||||
|
||||
## Overview
|
||||
|
||||
Notifications are dispatched to the **owner and managers** of a host, each via their own configured notification channels. Channel definitions are global; users reference them by name. If no users are configured, no notifications are sent.
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
Alert event (udp.py / threshold.py)
|
||||
└─ notify.send_notification(host_name, Notification)
|
||||
├─ look up host.owner + host.managers
|
||||
├─ for each user → user.notification_channels
|
||||
└─ for each channel → _dispatch_to_channel (filtered by min_level)
|
||||
```
|
||||
|
||||
Every notification carries:
|
||||
- **title** — `[LEVEL] hostname` (e.g. `[CRITICAL] webserver01`)
|
||||
- **body** — detail message (metric value, threshold, duration)
|
||||
- **url** — link to the plugin metrics page (`{base_url}/plugins#{hostname}`)
|
||||
- **level** — `RECOVER | WARNING | CRITICAL | INFO`
|
||||
|
||||
## Configuration
|
||||
|
||||
### Base URL
|
||||
|
||||
Set `base_url` so notification links point to your hbd instance:
|
||||
|
||||
```yaml
|
||||
base_url: https://hbd.example.com
|
||||
```
|
||||
|
||||
### Channel definitions
|
||||
|
||||
Channels are defined under `notification_channels`. Each entry specifies a delivery type and its credentials. Three optional fields control ownership, visibility, and level filtering:
|
||||
|
||||
| Field | Default | Description |
|
||||
|---|---|---|
|
||||
| `owner` | *(absent)* | Username who created/owns this channel. Absent = admin-created. |
|
||||
| `private` | `false` | When `true`, only the owner can see and select this channel. |
|
||||
| `min_level` | `WARNING` | Minimum alert level this channel receives. |
|
||||
|
||||
**Admin-created channels** (set in the config file or via the admin settings UI) are public by default — all users can select them:
|
||||
|
||||
```yaml
|
||||
notification_channels:
|
||||
|
||||
pushover_ops:
|
||||
type: pushover
|
||||
token: your-app-token
|
||||
user: your-user-key
|
||||
min_level: WARNING
|
||||
|
||||
email_ops:
|
||||
type: email
|
||||
recipients: [ops@example.com]
|
||||
sender: hbd@example.com
|
||||
smtp_server: smtp.example.com
|
||||
smtp_port: 587
|
||||
smtp_user: hbd@example.com
|
||||
smtp_password: secret
|
||||
min_level: WARNING
|
||||
|
||||
matrix_oncall:
|
||||
type: matrix
|
||||
homeserver: https://matrix.example.org
|
||||
access_token: syt_xxx
|
||||
room_id: "!abc:matrix.example.org"
|
||||
min_level: CRITICAL
|
||||
|
||||
sms_oncall:
|
||||
type: sms_voipms
|
||||
api_user: me@example.com
|
||||
api_password: secret
|
||||
did: "5551234567"
|
||||
dst: "5559876543"
|
||||
min_level: CRITICAL
|
||||
|
||||
signal_ops:
|
||||
type: signal
|
||||
cli_path: /usr/local/bin/signal-cli
|
||||
user: +12025551234
|
||||
recipient: +12025559999
|
||||
|
||||
mattermost_devops:
|
||||
type: mattermost
|
||||
host: mattermost.example.com
|
||||
token: webhook-token
|
||||
channel: devops-alerts
|
||||
username: heartbeat-bot
|
||||
```
|
||||
|
||||
**User-created channels** are written by authenticated users through the API or their profile page. They carry an `owner` field and optionally `private: true`:
|
||||
|
||||
```yaml
|
||||
notification_channels:
|
||||
|
||||
alice_personal:
|
||||
type: pushover
|
||||
token: personal-token
|
||||
user: personal-key
|
||||
owner: alice # created by alice
|
||||
private: true # only alice can see this channel
|
||||
```
|
||||
|
||||
### Channel visibility
|
||||
|
||||
| Channel | Who can see / select it |
|
||||
|---|---|
|
||||
| No `private` field (or `private: false`) | All users |
|
||||
| `private: true` | Only the `owner` |
|
||||
| Any channel | Admins always see everything |
|
||||
|
||||
### Users with notification channels
|
||||
|
||||
Each user lists which channels they receive notifications on. Users can manage their own selection from the profile page:
|
||||
|
||||
```yaml
|
||||
users:
|
||||
alice:
|
||||
full_name: Alice Smith
|
||||
password: pbkdf2:sha256:...
|
||||
admin: true
|
||||
notification_channels: [pushover_ops, email_ops]
|
||||
|
||||
bob:
|
||||
full_name: Bob Jones
|
||||
password: pbkdf2:sha256:...
|
||||
notification_channels: [sms_oncall, matrix_oncall]
|
||||
```
|
||||
|
||||
### Host access — owner and managers
|
||||
|
||||
Notifications for a host go to its owner and all managers:
|
||||
|
||||
```yaml
|
||||
hosts:
|
||||
webserver01:
|
||||
owner: alice # receives all notifications for this host
|
||||
managers: [bob] # also receives notifications
|
||||
threshold_config: default
|
||||
watch: true # bold in dashboard (cosmetic only)
|
||||
dyndns: false
|
||||
|
||||
dbserver01:
|
||||
owner: alice
|
||||
managers: [bob]
|
||||
threshold_config: database
|
||||
dyndns: false
|
||||
```
|
||||
|
||||
`watch: true` only affects display (bold name in the live dashboard). Notifications are now controlled entirely by owner/managers.
|
||||
|
||||
## Channel Types
|
||||
|
||||
### `min_level` filtering
|
||||
|
||||
Every channel accepts an optional `min_level` field:
|
||||
|
||||
| Value | Channels receive |
|
||||
|---|---|
|
||||
| `WARNING` (default) | WARNING, CRITICAL, RECOVER |
|
||||
| `CRITICAL` | CRITICAL only (and RECOVER) |
|
||||
|
||||
`RECOVER` is always passed through — you don't want to miss a recovery.
|
||||
|
||||
### pushover
|
||||
|
||||
Sends push notifications via [Pushover](https://pushover.net). Includes title, body, and a clickable URL.
|
||||
|
||||
```yaml
|
||||
type: pushover
|
||||
token: your-app-token # Required: Pushover application token
|
||||
user: your-user-key # Required: Recipient's user key
|
||||
min_level: WARNING
|
||||
```
|
||||
|
||||
### email
|
||||
|
||||
Sends via SMTP. The subject is the notification title; the body is the message followed by the URL on the final line.
|
||||
|
||||
```yaml
|
||||
type: email
|
||||
recipients: [ops@example.com, oncall@example.com]
|
||||
sender: hbd@example.com
|
||||
smtp_server: smtp.example.com
|
||||
smtp_port: 587 # 587 = STARTTLS (default), 465 = SSL
|
||||
smtp_user: hbd@example.com
|
||||
smtp_password: secret
|
||||
min_level: WARNING
|
||||
```
|
||||
|
||||
### matrix
|
||||
|
||||
Sends a formatted HTML message to a Matrix room via [matrix-nio](https://github.com/poljar/matrix-nio).
|
||||
|
||||
```yaml
|
||||
type: matrix
|
||||
homeserver: https://matrix.example.org
|
||||
access_token: syt_xxx # Bot account access token
|
||||
room_id: "!abc:matrix.example.org"
|
||||
min_level: WARNING
|
||||
```
|
||||
|
||||
**Setup:**
|
||||
1. Create a bot Matrix account
|
||||
2. Obtain its access token (Element → Settings → Help & About → Access Token)
|
||||
3. Invite the bot to the target room and note the room ID
|
||||
|
||||
### sms_voipms
|
||||
|
||||
Sends SMS via the [voip.ms REST API](https://voip.ms/api/v1/rest.php). Message is truncated to 160 characters.
|
||||
|
||||
```yaml
|
||||
type: sms_voipms
|
||||
api_user: me@example.com # voip.ms account email
|
||||
api_password: secret # voip.ms API password
|
||||
did: "5551234567" # Your voip.ms DID (sending number)
|
||||
dst: "5559876543" # Destination number
|
||||
min_level: CRITICAL
|
||||
```
|
||||
|
||||
### signal
|
||||
|
||||
Sends via [signal-cli](https://github.com/AsamK/signal-cli).
|
||||
|
||||
```yaml
|
||||
type: signal
|
||||
cli_path: /usr/local/bin/signal-cli
|
||||
user: +12025551234 # Your registered Signal number
|
||||
recipient: +12025559999 # Recipient number
|
||||
min_level: WARNING
|
||||
```
|
||||
|
||||
**Setup:**
|
||||
```bash
|
||||
signal-cli -u +12025551234 register
|
||||
signal-cli -u +12025551234 verify CODE
|
||||
```
|
||||
|
||||
### mattermost
|
||||
|
||||
Sends via Mattermost incoming webhook. Message is formatted as Markdown.
|
||||
|
||||
```yaml
|
||||
type: mattermost
|
||||
host: mattermost.example.com
|
||||
token: your-webhook-token
|
||||
channel: devops-alerts
|
||||
username: heartbeat-bot # Optional: display name
|
||||
icon: https://…/icon.png # Optional: bot icon URL
|
||||
min_level: WARNING
|
||||
```
|
||||
|
||||
## Notification events
|
||||
|
||||
| Source | Level | Title example | Body example |
|
||||
|---|---|---|---|
|
||||
| Host overdue | CRITICAL | `[CRITICAL] webserver01` | `IPv4 overdue` |
|
||||
| Host recover | RECOVER | `[RECOVER] webserver01` | `IPv4 back after being overdue for 5:23` |
|
||||
| Host boot | INFO | `[INFO] webserver01` | `webserver01 booted` |
|
||||
| Host shutdown | INFO | `[INFO] webserver01` | `IPv4 shutdown` |
|
||||
| Threshold breach | WARNING/CRITICAL | `[CRITICAL] webserver01` | `cpu_percent = 95.2 (threshold: > 90.0)` |
|
||||
| Threshold reminder | CRITICAL | `[REMINDER/CRITICAL] webserver01` | `REMINDER (CRITICAL): … ongoing for 3600s` |
|
||||
| Connection issue | WARNING | `[WARNING] webserver01` | `new address detected …` |
|
||||
|
||||
Reminder notifications (re-notify) are sent only for CRITICAL level alerts.
|
||||
|
||||
## API reference
|
||||
|
||||
### `send_notification(host_name, notif) -> dict`
|
||||
|
||||
Main entry point. Dispatches to owner + managers.
|
||||
|
||||
```python
|
||||
from hbd.server.notify import send_notification, Notification
|
||||
|
||||
send_notification(
|
||||
"webserver01",
|
||||
Notification(
|
||||
title="[CRITICAL] webserver01",
|
||||
body="cpu_percent = 95.2 (threshold: > 90.0)",
|
||||
level="CRITICAL",
|
||||
url="https://hbd.example.com/plugins#webserver01",
|
||||
),
|
||||
)
|
||||
```
|
||||
|
||||
Returns `{channel_name: bool}` for each channel dispatched.
|
||||
|
||||
### `setup(cfg, loop=None)`
|
||||
|
||||
Called once at startup from `main.py`. Pass the running asyncio event loop so Matrix sends work correctly.
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
**No notifications sent:**
|
||||
- Check that users are configured (`users:` section in yaml)
|
||||
- Check that the host has an `owner` or `managers` set
|
||||
- Check that users have `notification_channels` listed
|
||||
- Check that the channel names in user config match keys under `notification_channels:`
|
||||
- If a user can't select a channel, check whether it is `private: true` and owned by someone else
|
||||
|
||||
**min_level filtering too aggressive:**
|
||||
- Default is `WARNING` — both WARNING and CRITICAL are sent
|
||||
- If you expected to receive warnings but the channel is set to `min_level: CRITICAL`, change it to `min_level: WARNING`
|
||||
|
||||
**Matrix sends time out:**
|
||||
- Verify the access token is valid and the bot is in the room
|
||||
- `matrix-nio` must be installed: `pip install matrix-nio`
|
||||
|
||||
**voip.ms SMS fails:**
|
||||
- Enable the API in your voip.ms account (Account → API)
|
||||
- Verify the DID is SMS-capable in your voip.ms account
|
||||
|
||||
**Signal not found:**
|
||||
- Specify full `cli_path`
|
||||
- Run `signal-cli -u +NUMBER receive` to sync trust store
|
||||
|
||||
**Email authentication failed:**
|
||||
- Use app-specific passwords for Gmail/Fastmail
|
||||
- Verify port: 587 for STARTTLS, 465 for SSL
|
||||
|
||||
**Pushover `400` errors:**
|
||||
- Double-check `token` (app) and `user` (user key) — they are different values
|
||||
@@ -0,0 +1,567 @@
|
||||
# Plugin Development Guide
|
||||
|
||||
This guide explains how to create custom plugins for the Heartbeat monitoring system.
|
||||
|
||||
## Table of Contents
|
||||
|
||||
- [Plugin Architecture](#plugin-architecture)
|
||||
- [Plugin Types](#plugin-types)
|
||||
- [Creating a Plugin](#creating-a-plugin)
|
||||
- [Plugin Lifecycle](#plugin-lifecycle)
|
||||
- [Server-initiated InfoPlugin refresh](#server-initiated-infoplugin-refresh)
|
||||
- [Configuration](#configuration)
|
||||
- [Best Practices](#best-practices)
|
||||
- [Examples](#examples)
|
||||
- [Testing](#testing)
|
||||
|
||||
## Plugin Architecture
|
||||
|
||||
Heartbeat's plugin system is designed to be simple yet powerful. Plugins are Python classes that inherit from one of the base plugin types and implement a few key methods.
|
||||
|
||||
### Key Concepts
|
||||
|
||||
- **Plugin Registry**: Central registry that manages all loaded plugins
|
||||
- **Plugin Loader**: Automatically discovers and loads plugins from the `hbd/plugins/` directory
|
||||
- **Plugin Types**: InfoPlugin (static data) and MonitorPlugin (periodic metrics)
|
||||
- **Async/Await**: All plugin methods are async for non-blocking operation
|
||||
|
||||
## Plugin Types
|
||||
|
||||
### InfoPlugin
|
||||
|
||||
InfoPlugins collect static information that doesn't change frequently (OS version, hardware specs, etc.).
|
||||
|
||||
- **Runs once** at startup (interval = 0)
|
||||
- **Cached** - data is collected once and reused
|
||||
- **Lightweight** - no periodic overhead
|
||||
|
||||
**Use InfoPlugin for:**
|
||||
- Operating system details
|
||||
- Hardware information
|
||||
- Software versions
|
||||
- Configuration data
|
||||
- Static inventory
|
||||
|
||||
### MonitorPlugin
|
||||
|
||||
MonitorPlugins collect metrics that change over time (CPU usage, memory, network traffic).
|
||||
|
||||
- **Runs periodically** based on configured interval
|
||||
- **Scheduled** - collected at regular intervals
|
||||
- **Dynamic** - captures changing system state
|
||||
|
||||
**Use MonitorPlugin for:**
|
||||
- Resource usage (CPU, memory, disk, network)
|
||||
- Performance metrics
|
||||
- Counters and gauges
|
||||
- Time-series data
|
||||
|
||||
## Creating a Plugin
|
||||
|
||||
### Step 1: Choose Plugin Type
|
||||
|
||||
Decide whether your plugin collects static information (InfoPlugin) or dynamic metrics (MonitorPlugin).
|
||||
|
||||
### Step 2: Create Plugin File
|
||||
|
||||
Create a new Python file in `hbd/plugins/` directory:
|
||||
|
||||
```python
|
||||
"""
|
||||
My awesome plugin for Heartbeat.
|
||||
|
||||
Brief description of what this plugin does.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
# Import psutil or other dependencies if needed
|
||||
try:
|
||||
import psutil
|
||||
except ImportError:
|
||||
psutil = None
|
||||
|
||||
from hbd.plugin import MonitorPlugin # or InfoPlugin
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MyAwesomePlugin(MonitorPlugin): # or InfoPlugin
|
||||
"""
|
||||
One-line description of the plugin.
|
||||
|
||||
Collects:
|
||||
- List of metrics/data collected
|
||||
- Another metric
|
||||
|
||||
Configuration:
|
||||
interval: Collection interval in seconds (default: 60)
|
||||
option1: Description of option1 (default: value)
|
||||
option2: Description of option2 (default: value)
|
||||
"""
|
||||
|
||||
name = "my_awesome_plugin" # Unique plugin name
|
||||
interval = 60 # Collection interval in seconds for a MonitorPlugin; use 0 for an InfoPlugin
|
||||
|
||||
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
||||
"""Initialize the plugin with optional configuration."""
|
||||
super().__init__(config)
|
||||
|
||||
# Extract configuration options
|
||||
self.option1 = self.config.get('option1', 'default_value')
|
||||
self.option2 = self.config.get('option2', True)
|
||||
|
||||
# Check dependencies
|
||||
if psutil is None:
|
||||
raise ImportError("psutil is required for my_awesome_plugin")
|
||||
|
||||
async def initialize(self):
|
||||
"""
|
||||
Initialize the plugin.
|
||||
|
||||
This is called once when the plugin is loaded.
|
||||
Use this to verify dependencies, establish connections, etc.
|
||||
|
||||
Returns:
|
||||
True if initialization successful, False otherwise
|
||||
"""
|
||||
logger.info(f"My awesome plugin initialized (option1: {self.option1})")
|
||||
return True
|
||||
|
||||
async def collect(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Collect data.
|
||||
|
||||
This is called periodically (MonitorPlugin) or once (InfoPlugin).
|
||||
|
||||
Returns:
|
||||
Dictionary of collected data (will be sent to server)
|
||||
"""
|
||||
try:
|
||||
data = await self._collect_metrics()
|
||||
logger.debug(f"Collected {len(data)} metrics")
|
||||
return data
|
||||
except Exception as e:
|
||||
logger.error(f"Error collecting data: {e}")
|
||||
return {"error": str(e)}
|
||||
|
||||
async def _collect_metrics(self) -> Dict[str, Any]:
|
||||
"""Internal method to collect actual metrics."""
|
||||
metrics = {}
|
||||
|
||||
# Collect your data here
|
||||
metrics['metric1'] = self._get_metric1()
|
||||
metrics['metric2'] = self._get_metric2()
|
||||
|
||||
return metrics
|
||||
|
||||
def _get_metric1(self):
|
||||
"""Helper method for metric collection."""
|
||||
# Implementation here
|
||||
return 42
|
||||
|
||||
def _get_metric2(self):
|
||||
"""Helper method for metric collection."""
|
||||
# Implementation here
|
||||
return "hello"
|
||||
|
||||
async def cleanup(self):
|
||||
"""
|
||||
Cleanup resources.
|
||||
|
||||
This is called when the plugin is unloaded or the client shuts down.
|
||||
Use this to close connections, release resources, etc.
|
||||
"""
|
||||
logger.info("My awesome plugin cleanup")
|
||||
|
||||
|
||||
# Plugin instance for automatic discovery
|
||||
plugin = MyAwesomePlugin
|
||||
```
|
||||
|
||||
### Step 3: Test Your Plugin
|
||||
|
||||
Create a test script to verify your plugin works:
|
||||
|
||||
```python
|
||||
#!/usr/bin/env python3
|
||||
import asyncio
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add parent directory to path
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
from hbd.plugins.my_awesome_plugin import MyAwesomePlugin
|
||||
|
||||
async def test():
|
||||
# Create plugin instance
|
||||
plugin = MyAwesomePlugin({'option1': 'test_value'})
|
||||
|
||||
# Initialize
|
||||
if not await plugin.initialize():
|
||||
print("Failed to initialize")
|
||||
return False
|
||||
|
||||
# Collect data
|
||||
data = await plugin.collect()
|
||||
print(f"Collected data: {data}")
|
||||
|
||||
# Cleanup
|
||||
await plugin.cleanup()
|
||||
|
||||
return True
|
||||
|
||||
if __name__ == '__main__':
|
||||
success = asyncio.run(test())
|
||||
sys.exit(0 if success else 1)
|
||||
```
|
||||
|
||||
## Plugin Lifecycle
|
||||
|
||||
Understanding the plugin lifecycle helps you implement plugins correctly:
|
||||
|
||||
```
|
||||
1. Plugin Discovery
|
||||
└─> Loader scans hbd/plugins/ directory
|
||||
└─> Finds Python files (except those starting with _)
|
||||
└─> Imports modules
|
||||
|
||||
2. Plugin Instantiation
|
||||
└─> Creates instance with configuration
|
||||
└─> __init__() is called
|
||||
|
||||
3. Plugin Initialization
|
||||
└─> initialize() is called
|
||||
└─> Plugin verifies dependencies, establishes connections
|
||||
└─> Returns True/False for success/failure
|
||||
|
||||
4. Plugin Registration
|
||||
└─> If initialization succeeds, plugin is registered
|
||||
└─> Plugin becomes active
|
||||
|
||||
5. Data Collection
|
||||
└─> For InfoPlugin: collect() called once after initialization
|
||||
└─> For MonitorPlugin: collect() called periodically based on interval
|
||||
└─> Data is sent to server via PLG message
|
||||
|
||||
6. Plugin Shutdown
|
||||
└─> cleanup() is called
|
||||
└─> Plugin releases resources, closes connections
|
||||
```
|
||||
|
||||
## Server-initiated InfoPlugin refresh
|
||||
|
||||
When a heartbeat packet arrives from a host the server has no plugin data for (e.g. after a server restart), the server sets `request_update = 1` in the ACK reply. The client detects this flag and immediately re-runs all InfoPlugins — clearing their cached results first — then resends the data as PLG messages.
|
||||
|
||||
This means InfoPlugin data will always reach the server as soon as possible without requiring a client restart. No action is needed from plugin authors: the framework handles cache invalidation and re-collection automatically.
|
||||
|
||||
The lifecycle for this case looks like:
|
||||
|
||||
```
|
||||
Server restarts, host reconnects
|
||||
└─> hbd receives HTB with no existing plugin_data for host
|
||||
└─> hbd sets request_update=1 in ACK
|
||||
|
||||
Client receives ACK
|
||||
└─> Detects request_update flag
|
||||
└─> Clears _cache on every registered InfoPlugin
|
||||
└─> Calls collect() on each InfoPlugin
|
||||
└─> Sends fresh PLG messages to server
|
||||
```
|
||||
|
||||
If you write an `InfoPlugin` with side effects in `_collect_info()` (opening connections, writing files, etc.), be aware it may be called more than once per client session when this mechanism triggers.
|
||||
|
||||
## Configuration
|
||||
|
||||
### Plugin-Specific Configuration
|
||||
|
||||
Plugins receive configuration through the `config` parameter in `__init__`:
|
||||
|
||||
```python
|
||||
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
||||
super().__init__(config)
|
||||
|
||||
# Access configuration with defaults
|
||||
self.interval = self.config.get('interval', 60)
|
||||
self.threshold = self.config.get('threshold', 80)
|
||||
self.enabled_features = self.config.get('features', ['feature1', 'feature2'])
|
||||
```
|
||||
|
||||
### Client Configuration File
|
||||
|
||||
Users configure plugins in the client configuration YAML:
|
||||
|
||||
```yaml
|
||||
plugins:
|
||||
my_awesome_plugin:
|
||||
enabled: true
|
||||
interval: 120
|
||||
option1: custom_value
|
||||
option2: false
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
### 1. Error Handling
|
||||
|
||||
Always handle errors gracefully:
|
||||
|
||||
```python
|
||||
async def collect(self) -> Dict[str, Any]:
|
||||
try:
|
||||
return await self._collect_metrics()
|
||||
except Exception as e:
|
||||
logger.error(f"Error collecting metrics: {e}")
|
||||
return {"error": str(e)}
|
||||
```
|
||||
|
||||
### 2. Logging
|
||||
|
||||
Use appropriate log levels:
|
||||
|
||||
```python
|
||||
logger.debug("Detailed information for debugging")
|
||||
logger.info("Normal operation messages")
|
||||
logger.warning("Warning messages for unusual but handled situations")
|
||||
logger.error("Error messages for failures")
|
||||
```
|
||||
|
||||
### 3. Dependencies
|
||||
|
||||
Check for optional dependencies:
|
||||
|
||||
```python
|
||||
try:
|
||||
import some_optional_library
|
||||
except ImportError:
|
||||
some_optional_library = None
|
||||
|
||||
# Later in __init__:
|
||||
if some_optional_library is None:
|
||||
raise ImportError("some_optional_library is required")
|
||||
```
|
||||
|
||||
### 4. Performance
|
||||
|
||||
- Keep collection methods fast (< 1 second)
|
||||
- Use async/await for I/O operations
|
||||
- Cache expensive computations
|
||||
- Don't block the event loop
|
||||
|
||||
### 5. Data Structure
|
||||
|
||||
Return clean, structured data:
|
||||
|
||||
```python
|
||||
{
|
||||
'metric_name': value,
|
||||
'nested_data': {
|
||||
'sub_metric': value
|
||||
},
|
||||
'list_data': [item1, item2],
|
||||
'timestamp': time.time() # Optional timestamp
|
||||
}
|
||||
```
|
||||
|
||||
### 6. Documentation
|
||||
|
||||
Document your plugin thoroughly:
|
||||
|
||||
- Class docstring with description and configuration
|
||||
- Method docstrings explaining purpose and return values
|
||||
- Inline comments for complex logic
|
||||
|
||||
## Examples
|
||||
|
||||
### Example 1: Simple InfoPlugin
|
||||
|
||||
```python
|
||||
from typing import Any, Dict

import platform

from hbd.plugin import InfoPlugin


class SimpleInfoPlugin(InfoPlugin):
    """Collect basic system information.

    Collects:
        - hostname
        - operating system name
        - Python version of the running interpreter
    """

    name = "simple_info"
    interval = 0  # InfoPlugin

    async def initialize(self):
        """Nothing to set up; always reports success."""
        return True

    async def collect(self) -> Dict[str, Any]:
        """Return static facts about the host taken from the `platform` module."""
        return {
            'hostname': platform.node(),
            'system': platform.system(),
            'python_version': platform.python_version()
        }

    async def cleanup(self):
        """No resources are held, so there is nothing to release."""
        pass


plugin = SimpleInfoPlugin
|
||||
```
|
||||
|
||||
### Example 2: MonitorPlugin with State
|
||||
|
||||
```python
|
||||
from typing import Any, Dict

import time

from hbd.plugin import MonitorPlugin


class CounterPlugin(MonitorPlugin):
    """Track a counter over time.

    Collects:
        - count: number of collect() calls so far
        - uptime: seconds elapsed since the plugin instance was created
        - rate: collect() calls per second of uptime
    """

    name = "counter"
    interval = 30

    def __init__(self, config=None):
        super().__init__(config)
        self._counter = 0
        # Monotonic clock: elapsed time must not jump when the wall clock
        # is adjusted.
        self._start_time = time.monotonic()

    async def initialize(self):
        """Nothing to set up; always reports success."""
        return True

    async def collect(self) -> Dict[str, Any]:
        """Increment the counter and report it together with uptime and rate."""
        self._counter += 1
        uptime = time.monotonic() - self._start_time

        return {
            'count': self._counter,
            'uptime': uptime,
            # Guard the first sample: on coarse clocks uptime can still be 0.
            'rate': self._counter / uptime if uptime > 0 else 0.0
        }

    async def cleanup(self):
        """No resources are held, so there is nothing to release."""
        pass


plugin = CounterPlugin
|
||||
```
|
||||
|
||||
### Example 3: Plugin with External Command
|
||||
|
||||
```python
|
||||
from typing import Any, Dict

import asyncio

from hbd.plugin import MonitorPlugin


class CommandPlugin(MonitorPlugin):
    """Execute external command and capture output.

    Configuration:
        command: Shell command line to run (default: echo "no command")
    """

    name = "command_executor"
    interval = 60

    def __init__(self, config=None):
        super().__init__(config)
        # NOTE: the command string is handed to a shell unchanged, so it must
        # only ever come from trusted configuration.
        self.command = self.config.get('command', 'echo "no command"')

    async def initialize(self):
        """Nothing to set up; always reports success."""
        return True

    async def collect(self) -> Dict[str, Any]:
        """Run the configured command and return its exit code and output.

        Returns:
            A dict with 'exit_code', 'stdout' and 'stderr' on completion, or
            a dict with a single 'error' key if the command could not be run
            or did not finish within 30 seconds.
        """
        try:
            process = await asyncio.create_subprocess_shell(
                self.command,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )
            try:
                stdout, stderr = await asyncio.wait_for(
                    process.communicate(),
                    timeout=30
                )
            except asyncio.TimeoutError:
                # Do not leave the child running after giving up on it.
                process.kill()
                await process.wait()
                return {'error': 'command timed out after 30 seconds'}

            return {
                'exit_code': process.returncode,
                # errors='replace': non-UTF-8 output must not discard the result.
                'stdout': stdout.decode('utf-8', errors='replace'),
                'stderr': stderr.decode('utf-8', errors='replace')
            }
        except Exception as e:
            return {'error': str(e)}

    async def cleanup(self):
        """No resources are held between collections."""
        pass


plugin = CommandPlugin
|
||||
```
|
||||
|
||||
## Testing
|
||||
|
||||
### Unit Testing
|
||||
|
||||
Create unit tests for your plugins:
|
||||
|
||||
```python
|
||||
import unittest
import asyncio

from hbd.plugins.my_awesome_plugin import MyAwesomePlugin


class TestMyPlugin(unittest.TestCase):
    """Unit tests for MyAwesomePlugin, driving its coroutines with asyncio.run()."""

    def setUp(self):
        # A fresh plugin instance per test keeps the tests independent.
        self.plugin = MyAwesomePlugin({'option1': 'test'})

    def test_initialization(self):
        result = asyncio.run(self.plugin.initialize())
        self.assertTrue(result)

    def test_collection(self):
        asyncio.run(self.plugin.initialize())
        data = asyncio.run(self.plugin.collect())

        self.assertIsInstance(data, dict)
        self.assertIn('metric1', data)
        self.assertGreater(data['metric1'], 0)

    def tearDown(self):
        asyncio.run(self.plugin.cleanup())


if __name__ == '__main__':
    unittest.main()
|
||||
```
|
||||
|
||||
### Integration Testing
|
||||
|
||||
Test your plugin with the actual client:
|
||||
|
||||
```bash
|
||||
# Create test configuration
|
||||
cat > test_config.yaml <<EOF
|
||||
server: localhost
|
||||
plugins:
|
||||
my_awesome_plugin:
|
||||
enabled: true
|
||||
interval: 10
|
||||
option1: test_value
|
||||
EOF
|
||||
|
||||
# Run client in test mode
|
||||
python -m hbd.hbc -c test_config.yaml --verbose
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### My plugin isn't loading
|
||||
|
||||
1. Check filename doesn't start with underscore
|
||||
2. Verify plugin class inherits from InfoPlugin or MonitorPlugin
|
||||
3. Check `initialize()` returns True
|
||||
4. Look for import errors in logs
|
||||
|
||||
### Plugin loads but doesn't collect data
|
||||
|
||||
1. Check `interval` is set correctly (0 for InfoPlugin, > 0 for MonitorPlugin)
|
||||
2. Verify `collect()` returns a dictionary
|
||||
3. Check for exceptions in `collect()` method
|
||||
4. Enable DEBUG logging to see detailed errors
|
||||
|
||||
### Data isn't appearing on server
|
||||
|
||||
1. Verify client is connected to server
|
||||
2. Check server logs for PLG message handling
|
||||
3. Verify returned data is JSON-serializable
|
||||
4. Check for large data sizes (may exceed UDP packet size)
|
||||
|
||||
## Further Reading
|
||||
|
||||
- [Plugin Framework Source](../hbd/plugin.py) - Core plugin implementation
|
||||
- [Built-in Plugins](../hbd/plugins/) - Examples of working plugins
|
||||
- [Nagios Integration](NAGIOS_INTEGRATION.md) - Running external plugins
|
||||
- [Configuration Guide](../hbd/config_example.yaml) - Full configuration reference
|
||||
File diff suppressed because it is too large
Load Diff
+286
@@ -0,0 +1,286 @@
|
||||
# User Management
|
||||
|
||||
Heartbeat supports optional user accounts with role-based access control per host. When no users are configured the server runs in **unauthenticated mode** — all existing behaviour is unchanged.
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
Users are defined in the server config file. Each host can have an **owner**, zero or more **managers**, and zero or more **monitors**. A **default owner** catches any host that does not name an explicit owner.
|
||||
|
||||
### Roles
|
||||
|
||||
| Role | Inherits | Permissions |
|
||||
|------|----------|-------------|
|
||||
| **monitor** | — | View host status, plugin data, alerts; acknowledge alerts they were notified for |
|
||||
| **manager** | monitor | + Queue commands (`/c`), trigger DNS re-registration (`/n`), queue upgrades (`/u`); add/remove monitors |
|
||||
| **owner** | manager | + Drop host (`/d`); add/remove managers; transfer ownership; update host access |
|
||||
| **admin** *(flag)* | owner on all hosts | Full access to every host and the user list |
|
||||
|
||||
`admin` is a flag on the user, not a per-host role. An admin user has owner-level access on every host without being listed as owner/manager/monitor.
|
||||
|
||||
---
|
||||
|
||||
## Configuration
|
||||
|
||||
### Defining users
|
||||
|
||||
```yaml
|
||||
users:
|
||||
andreas:
|
||||
full_name: Andreas Wrede
|
||||
avatar: /path/to/avatar.png # file path, URL, or base64 data URI (optional)
|
||||
password: pbkdf2:sha256:... # generated with: hbd passwd andreas
|
||||
admin: true # optional — grants server-wide owner access
|
||||
|
||||
bob:
|
||||
full_name: Bob Smith
|
||||
password: pbkdf2:sha256:...
|
||||
notification_channels: [pushover_standard] # channels bob has selected
|
||||
|
||||
carol:
|
||||
full_name: Carol Jones
|
||||
password: pbkdf2:sha256:...
|
||||
|
||||
default_owner: andreas # owns hosts with no explicit owner
|
||||
# falls back to the first admin user if omitted
|
||||
```
|
||||
|
||||
### Client-declared host ownership
|
||||
|
||||
A host can declare its own owner directly in the hbc or hbc_mini client configuration. This is useful for hosts that are not listed in the server config, or during initial setup before a server-side config entry has been created.
|
||||
|
||||
**`~/.hbc.yaml`** (hbc):
|
||||
```yaml
|
||||
owner: andreas
|
||||
```
|
||||
|
||||
**`~/.hbc.json`** (hbc_mini):
|
||||
```json
|
||||
{ "owner": "andreas" }
|
||||
```
|
||||
|
||||
When set, the value is included in the `os_info` plugin data sent to the server. The server applies it as `host.owner` the first time `os_info` arrives, provided no owner has been configured server-side for that host. Server-configured ownership always takes precedence.
|
||||
|
||||
---
|
||||
|
||||
### Assigning roles to hosts
|
||||
|
||||
```yaml
|
||||
hosts:
|
||||
webserver01:
|
||||
owner: andreas
|
||||
managers: [bob]
|
||||
monitors: [carol]
|
||||
threshold_config: default
|
||||
watch: true
|
||||
notification_channels: [pushover_standard]
|
||||
|
||||
unattended-host: # no owner → owned by default_owner
|
||||
threshold_config: default
|
||||
watch: true
|
||||
```
|
||||
|
||||
### Generating a password hash
|
||||
|
||||
```bash
|
||||
hbd passwd andreas
|
||||
```
|
||||
|
||||
Enter and confirm the password when prompted. Paste the printed hash into the config file under the user's `password` key.
|
||||
|
||||
You can also generate a hash non-interactively from Python:
|
||||
|
||||
```python
|
||||
from hbd.server.users import hash_password
|
||||
print(hash_password("mysecret"))
|
||||
```
|
||||
|
||||
Passwords are stored as PBKDF2-HMAC-SHA256 hashes (260 000 iterations). No third-party libraries are required — only Python's standard `hashlib`.
|
||||
|
||||
---
|
||||
|
||||
## Authentication
|
||||
|
||||
When at least one user is defined, every request must be authenticated. Unauthenticated requests to HTML pages are redirected to `/login`; unauthenticated API requests receive `401 Unauthorized`.
|
||||
|
||||
### Browser login
|
||||
|
||||
Navigate to any page — you will be redirected to `/login` automatically. After submitting valid credentials the server sets an `hbd_session` cookie (HttpOnly, SameSite=Lax, 24 h lifetime). All subsequent requests, including JavaScript `fetch()` calls on the dashboards, carry the cookie automatically.
|
||||
|
||||
To log out, visit `/logout`.
|
||||
|
||||
### API / programmatic login
|
||||
|
||||
```bash
|
||||
# Log in and capture the token
|
||||
TOKEN=$(curl -s -X POST http://localhost:50004/api/0/auth/login \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{"username":"andreas","password":"mysecret"}' | jq -r .token)
|
||||
|
||||
# Use the token in subsequent requests
|
||||
curl -H "Authorization: Bearer $TOKEN" http://localhost:50004/api/0/hosts
|
||||
```
|
||||
|
||||
The token is identical to the session cookie value — both mechanisms work simultaneously.
|
||||
|
||||
```bash
|
||||
# Log out
|
||||
curl -s -X POST http://localhost:50004/api/0/auth/logout \
|
||||
-H "Authorization: Bearer $TOKEN"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## API Endpoints
|
||||
|
||||
### Authentication
|
||||
|
||||
#### POST /api/0/auth/login
|
||||
Obtain a session token.
|
||||
|
||||
**Request body:**
|
||||
```json
|
||||
{ "username": "andreas", "password": "mysecret" }
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{ "token": "<opaque-hex-token>", "username": "andreas" }
|
||||
```
|
||||
Also sets the `hbd_session` cookie for browser clients.
|
||||
|
||||
**Status codes:** `200 OK`, `401 Unauthorized`, `404` (auth not configured)
|
||||
|
||||
---
|
||||
|
||||
#### POST /api/0/auth/logout
|
||||
Invalidate the current session.
|
||||
|
||||
**Headers:** `Authorization: Bearer <token>` or cookie
|
||||
|
||||
**Response:** `{ "success": true }`
|
||||
|
||||
---
|
||||
|
||||
### Users
|
||||
|
||||
#### GET /api/0/users
|
||||
List all users. **Admin only.**
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
[
|
||||
{ "username": "andreas", "full_name": "Andreas Wrede", "avatar": "", "admin": true, "notification_channels": [] },
|
||||
{ "username": "bob", "full_name": "Bob Smith", "avatar": "", "admin": false, "notification_channels": ["pushover_standard"] }
|
||||
]
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
#### GET /api/0/users/me
|
||||
Return the currently authenticated user's profile.
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{ "username": "carol", "full_name": "Carol Jones", "avatar": "", "admin": false, "notification_channels": [] }
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
#### PUT /api/0/users/me
|
||||
Update the current user's profile. All fields are optional — send only what you want to change.
|
||||
|
||||
**Update display name and avatar:**
|
||||
```json
|
||||
{ "full_name": "Carol Jones", "avatar": "/avatars/carol.png" }
|
||||
```
|
||||
|
||||
**Change notification channel selection:**
|
||||
```json
|
||||
{ "notification_channels": ["pushover_ops", "email_ops"] }
|
||||
```
|
||||
Only channels visible to the user (public + own private) are accepted; others are silently dropped.
|
||||
|
||||
**Change password:**
|
||||
```json
|
||||
{ "password": { "current": "oldpass", "new": "newpass" } }
|
||||
```
|
||||
Requires the correct current password. New password is hashed before storage.
|
||||
|
||||
**Response:** `{"ok": true}`
|
||||
|
||||
**Status codes:** `200 OK`, `400` (missing/invalid field), `401` (unauthenticated), `403` (wrong current password)
|
||||
|
||||
---
|
||||
|
||||
### Host Access
|
||||
|
||||
#### GET /api/0/hosts/{hostname}/access
|
||||
Return owner/managers/monitors for a host. Requires at least **monitor** role.
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"owner": "andreas",
|
||||
"managers": ["bob"],
|
||||
"monitors": ["carol"]
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
#### PUT /api/0/hosts/{hostname}/access
|
||||
Update owner/managers/monitors. Requires **owner** role or admin.
|
||||
|
||||
**Request body** (all fields optional):
|
||||
```json
|
||||
{
|
||||
"owner": "bob",
|
||||
"managers": ["carol"],
|
||||
"monitors": []
|
||||
}
|
||||
```
|
||||
|
||||
Changes take effect immediately in memory. They are not written back to the config file, so a reload (`SIGHUP`) re-applies the values from the config file. To make changes permanent, update the config file.
|
||||
|
||||
---
|
||||
|
||||
## Host visibility
|
||||
|
||||
When users are configured, `GET /api/0/hosts` only returns hosts the authenticated user has at least monitor access to. Admins see all hosts.
|
||||
|
||||
---
|
||||
|
||||
## Config reload
|
||||
|
||||
On `SIGHUP`, the server reloads the config file, re-loads the user registry, and re-applies `owner`/`managers`/`monitors` from config to all known hosts. Existing sessions remain valid after a reload.
|
||||
|
||||
---
|
||||
|
||||
## No-auth mode
|
||||
|
||||
If `users:` is absent or empty, the server starts in **unauthenticated mode**:
|
||||
|
||||
- No login required — all pages and API endpoints are accessible without credentials.
|
||||
- All permission checks pass unconditionally.
|
||||
- `/login`, `/logout`, and the auth/user API endpoints return `404`.
|
||||
|
||||
This preserves full backwards compatibility with existing deployments.
|
||||
|
||||
---
|
||||
|
||||
## Security notes
|
||||
|
||||
- Session tokens are 64-character cryptographically random hex strings (`secrets.token_hex(32)`).
|
||||
- Sessions expire after 24 hours (configurable via `users_mod.SESSION_TTL`).
|
||||
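- Cookies are `HttpOnly` and `SameSite=Lax` — they are not accessible to JavaScript and are not sent on cross-site subrequests (e.g. `fetch()`, images, form POSTs); browsers still send them on top-level cross-site navigations such as following a link.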
|
||||
- The HTTP API does not yet enforce TLS. For production use, place hbd behind a TLS-terminating reverse proxy (nginx, Caddy, etc.) or enable WSS.
|
||||
|
||||
---
|
||||
|
||||
## See Also
|
||||
|
||||
- [HTTP API Documentation](HTTP_API.md)
|
||||
- [Notifications](NOTIFICATIONS.md)
|
||||
- Configuration example: `hbd/config_example.yaml`
|
||||
+14
-8
@@ -1,11 +1,17 @@
|
||||
"""hbd package - scaffolding for heartbeat daemon
|
||||
"""hbd package - heartbeat monitoring system
|
||||
|
||||
This package contains the refactored modules for the original monolithic
|
||||
`hbd` script. The initial implementation contains small scaffolds so you can
|
||||
start moving functionality into the package.
|
||||
This package contains both the heartbeat client (hbc) and server (hbd) components,
|
||||
organized into separate subpackages:
|
||||
|
||||
- hbd.client: Client component with system monitoring plugins
|
||||
- hbd.server: Server/daemon component with web UI and notifications
|
||||
- hbd.common: Shared utilities and protocol definitions
|
||||
|
||||
Install options:
|
||||
- pip install hbd[client] # Client only
|
||||
- pip install hbd[server] # Server only
|
||||
- pip install hbd[all] # Both client and server
|
||||
"""
|
||||
|
||||
__all__ = ["main", "__version__"]
|
||||
__version__ = "5.0.3"
|
||||
|
||||
from .cli import main
|
||||
__all__ = ["__version__"]
|
||||
__version__ = "5.3.7"
|
||||
|
||||
-54
@@ -1,54 +0,0 @@
|
||||
"""Command line interface for hbd package."""
|
||||
|
||||
import argparse
|
||||
|
||||
from .config import load_config
|
||||
from .server import run as run_server
|
||||
|
||||
# Values accepted by the -p/--pushsrv option.
PUSHSRVS = ["all", "pushover", "mattermost"]


def build_parser():
    """Build the argument parser for the ``hbd`` command line.

    Returns:
        argparse.ArgumentParser with these options:
          -c/--config      path of the YAML config file (dest ``configfile``)
          -f/--foreground  flag
          -v/--verbose     flag
          -p/--pushsrv     one of PUSHSRVS
          -x/--debug       repeatable counter, starts at 0
    """
    parser = argparse.ArgumentParser(
        prog="hbd",
        description="HeartBeatDaemon - Wait for heartbeat messages and act on them (or their absence)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "-c", "--config", dest="configfile", help="Config file path (YAML)"
    )
    parser.add_argument(
        "-f", "--foreground", action="store_true", help="Run in foreground"
    )
    parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output")
    parser.add_argument(
        "-p", "--pushsrv", dest="pushsrv", choices=PUSHSRVS, help="Push service to use"
    )
    parser.add_argument(
        "-x", "--debug", action="count", default=0, help="Increase debug level"
    )
    return parser
|
||||
|
||||
|
||||
def main(argv=None):
    """Parse the command line, load the configuration and start the server.

    Args:
        argv: Argument list handed to the parser; ``None`` makes argparse
            read ``sys.argv[1:]``.
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    config = load_config(args.configfile)

    # Apply CLI overrides
    if args.foreground:
        config["foreground"] = True
    if args.verbose:
        config["verbose"] = True
    if args.pushsrv:
        config["pushsrv"] = args.pushsrv
    if args.debug:
        # `or 0` covers both a missing key and an explicit null in the config,
        # which would otherwise make the addition raise TypeError.
        config["debug"] = (config.get("debug") or 0) + args.debug

    run_server(config)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,3 @@
|
||||
"""HeartBeat Client (hbc) - System monitoring client."""
|
||||
|
||||
from hbd import __version__
|
||||
@@ -0,0 +1,61 @@
|
||||
"""Configuration loader and defaults for hbc (HeartBeat Client)."""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
try:
|
||||
import yaml
|
||||
except Exception:
|
||||
yaml = None
|
||||
|
||||
CLIENT_DEFAULTS = {
|
||||
# Network settings
|
||||
"hb_port": 50003, # Port where hbd servers listen
|
||||
"interval": 10, # Heartbeat interval in seconds
|
||||
|
||||
# Host identity
|
||||
"owner": None, # Optional username to set as this host's owner on the server
|
||||
|
||||
# Runtime flags
|
||||
"foreground": False,
|
||||
"verbose": False,
|
||||
"debug": 0,
|
||||
|
||||
# Plugin configuration
|
||||
"plugins": {}, # Per-plugin configuration
|
||||
"thresholds": {}, # Threshold configuration for monitoring
|
||||
}
|
||||
|
||||
|
||||
def load_config(path=None):
    """Load configuration from a YAML file and merge with client defaults.

    If YAML is not available, the file does not exist, or the file is empty
    or does not contain a mapping at the top level, defaults are returned.

    Args:
        path: Path to YAML config file (default: ~/.hbc.yaml)

    Returns:
        Dictionary with configuration. The dictionary is independent of
        CLIENT_DEFAULTS, so callers may modify it (including the nested
        "plugins" and "thresholds" dicts) without changing the defaults.
    """
    import copy  # local import: only needed here

    # Deep copy so the mutable nested defaults are not shared between calls.
    cfg = copy.deepcopy(CLIENT_DEFAULTS)
    if not path:
        # default path (~/.hbc.yaml)
        path = os.path.join(os.path.expanduser("~"), ".hbc.yaml")

    if not os.path.exists(path):
        return cfg

    if not yaml:
        # yaml not installed: do not attempt to parse; user must ensure defaults
        logger.warning("PyYAML not available - cannot load config from %s, using defaults", path)
        return cfg

    logger.info("Loading configuration from %s", path)
    with open(path) as fh:
        data = yaml.safe_load(fh)

    if data is None:
        # Empty (or comment-only) file: safe_load() returns None.
        return cfg
    if not isinstance(data, dict):
        logger.warning(
            "Ignoring configuration in %s: expected a mapping at top level, got %s",
            path, type(data).__name__,
        )
        return cfg

    # Merge YAML data with defaults.
    # Keep all keys from YAML to support plugin configs and future extensions.
    cfg.update(data)
    return cfg
|
||||
@@ -0,0 +1,801 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
HeartBeat Client (hbc) - Async version with plugin support.
|
||||
|
||||
Sends heartbeat messages to HeartBeat Daemon (hbd) servers and collects
|
||||
system information via plugins.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import signal
|
||||
import socket
|
||||
import sys
|
||||
import time
|
||||
from logging.handlers import SysLogHandler
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
# Import protocol and config
|
||||
from .config import load_config
|
||||
from ..common.proto import dicttos, stodict
|
||||
from .. import __version__
|
||||
|
||||
# Import plugin system
|
||||
from .plugin import PluginRegistry, PluginLoader, InfoPlugin, MonitorPlugin
|
||||
|
||||
# Constants
|
||||
PORT = 50003
|
||||
INTERVAL = 10
|
||||
MAXRECV = 32767
|
||||
|
||||
# Global state
|
||||
running = True
|
||||
dorestart = False
|
||||
shutdown_event: Optional[asyncio.Event] = None
|
||||
active_tasks: List[asyncio.Task] = []
|
||||
|
||||
|
||||
class AsyncConnection:
    """Async UDP connection to a heartbeat server.

    Holds the datagram transport for one server address together with
    send/ACK counters and a short history of round-trip times.
    """

    def __init__(self, conn_id: int, addr: str, port: int, af: int, name: str):
        """Store the connection parameters; the socket is opened lazily.

        Args:
            conn_id: Value sent as the "id" field of every message
            addr: Server address datagrams are sent to
            port: Server port datagrams are sent to
            af: Address family passed to create_datagram_endpoint()
            name: Host name; sent (after shortname()) as the "name" field
        """
        self.conn_id = conn_id
        self.addr = addr
        self.port = port
        self.af = af
        self.name = name

        self.ackcount = 0
        self.lastack = 0.0
        self.send_count = 0
        self.lastsend = 0.0
        self.rtts = [0.0]  # most recent RTTs in ms (capped in handle_ack)

        self.transport: Optional[asyncio.DatagramTransport] = None
        self.protocol: Optional[asyncio.DatagramProtocol] = None
        self._dead = False
        self._ever_opened = False
        self._open_fail_count = 0  # consecutive failures before first success
        # Set by handle_ack() when the server asks for a plugin info refresh.
        self.request_info_event: asyncio.Event = asyncio.Event()

        self.logger = logging.getLogger(f"hbc.conn.{addr}")

    async def open(self) -> bool:
        """Open the UDP connection.

        Returns:
            True if successful, False otherwise
        """
        try:
            # We are inside a coroutine, so a loop is guaranteed to be running.
            loop = asyncio.get_running_loop()

            # Create datagram endpoint
            self.transport, self.protocol = await loop.create_datagram_endpoint(
                lambda: HeartbeatProtocol(self),
                family=self.af
            )
            self._ever_opened = True
            self.logger.debug(f"Opened connection to {self.addr}:{self.port}")
            return True
        except Exception as e:
            self.logger.error(f"Failed to open connection: {e}")
            return False

    def close(self):
        """Close the connection."""
        if self.transport:
            self.transport.close()
            self.transport = None
            self.protocol = None

    async def sendto(self, msg: dict, msg_id: str = "HTB"):
        """Send a message to the server.

        The transport is opened on demand. Nothing is sent if the connection
        is marked dead or the transport cannot be opened.

        Args:
            msg: Message dictionary; the "name", "id" and "time" fields are
                written into it in place before encoding
            msg_id: Message ID (HTB, PLG, etc.)
        """
        if self._dead:
            return

        if not self.transport:
            await self.open()

        if not self.transport:
            self.logger.error("Cannot send - no transport")
            return

        # Add standard fields
        msg["name"] = shortname(self.name)
        msg["id"] = self.conn_id
        msg["time"] = time.time()

        # Encode message
        data = dicttos(msg_id, msg)

        # Send
        self.transport.sendto(data, (self.addr, self.port))
        self.send_count += 1
        self.lastsend = time.time()

        self.logger.debug(f"Sent {msg_id} message ({len(data)} bytes)")

    def handle_ack(self, msg: dict, now: float):
        """Handle ACK message from server.

        RTT is calculated as: (time ACK received) - (time of the last send).
        NOTE(review): lastsend is updated by every sendto() call, not only
        HTB messages — confirm this is the intended RTT reference.

        Args:
            msg: Decoded ACK message
            now: Time the ACK was received, in seconds
        """
        self.lastack = now

        # Calculate RTT: time ACK received minus time of last send
        rtt = (now - self.lastsend) * 1000.0  # Convert to ms

        # Keep only the 10 most recent samples
        self.rtts.append(rtt)
        if len(self.rtts) > 10:
            self.rtts.pop(0)

        self.ackcount += 1
        self.logger.debug(f"ACK received, RTT: {rtt:.1f}ms")
        if msg.get("request_update"):
            self.logger.info("server requested plugin info refresh")
            self.request_info_event.set()
|
||||
|
||||
|
||||
class HeartbeatProtocol(asyncio.DatagramProtocol):
    """Datagram protocol that dispatches incoming server messages.

    ACKs are handled synchronously on the owning connection; CMD and UPD
    messages are handed to background tasks. The protocol keeps a strong
    reference to every task it spawns, because the event loop itself only
    holds weak references and an unreferenced task may be garbage-collected
    before it finishes.
    """

    def __init__(self, connection: AsyncConnection):
        """Bind the protocol to the connection whose traffic it handles."""
        self.connection = connection
        self.logger = logging.getLogger("hbc.protocol")
        # In-flight handler tasks; entries remove themselves on completion.
        self._background_tasks = set()

    def _spawn(self, coro):
        """Schedule ``coro`` as a task and keep it alive until it finishes."""
        task = asyncio.create_task(coro)
        self._background_tasks.add(task)
        task.add_done_callback(self._on_task_done)
        return task

    def _on_task_done(self, task):
        """Forget a finished task and log any exception it raised."""
        self._background_tasks.discard(task)
        if task.cancelled():
            return
        exc = task.exception()
        if exc is not None:
            self.logger.error(f"Background handler failed: {exc}", exc_info=exc)

    def datagram_received(self, data: bytes, addr):
        """Decode one datagram and route it by its ``ID`` field.

        Any exception raised while decoding or dispatching is logged and
        swallowed so that a bad datagram cannot break the protocol.
        """
        try:
            msg = stodict(data)
            if not msg:
                self.logger.warning(f"Failed to parse message from {addr}")
                return

            now = time.time()
            msg_id = msg.get("ID")

            if msg_id == "ACK":
                self.connection.handle_ack(msg, now)
            elif msg_id == "CMD":
                # Command from server
                self._spawn(handle_command(self.connection, msg))
            elif msg_id == "UPD":
                # Update from server
                self._spawn(handle_update(self.connection, msg))
            else:
                self.logger.warning(f"Unknown message type: {msg_id}")

        except Exception as e:
            self.logger.error(f"Error processing datagram: {e}", exc_info=True)

    def error_received(self, exc):
        """Handle protocol errors — close transport so the heartbeat sender retries."""
        self.logger.warning(f"Protocol error on {self.connection.addr}: {exc} — will retry")
        self.connection.close()
|
||||
|
||||
|
||||
async def handle_command(conn: AsyncConnection, msg: dict):
    """Execute a command received from server and report the outcome.

    The command string from ``msg["cmd"]`` is run through the shell with a
    30 second timeout. The blocking subprocess call is executed in the
    event loop's default executor so heartbeats and other tasks keep
    running while the command is in progress. The reply is sent on
    ``conn`` as ``{"service": "command", "msg": "<status> <result>"}``.

    Args:
        conn: Connection used to send the response
        msg: Decoded CMD message; ignored when it has no ``cmd`` value
    """
    import subprocess

    cmd = msg.get("cmd", "")
    if not cmd:
        return

    logger = logging.getLogger("hbc.command")
    logger.info(f"Executing command: {cmd}")

    # NOTE(security): this runs a server-supplied string with shell=True.
    # Whether the sender is authenticated is decided outside this function;
    # review before exposing the client to untrusted networks.
    def _run_blocking() -> str:
        return subprocess.check_output(
            cmd, shell=True, stderr=subprocess.STDOUT, timeout=30
        ).decode()

    try:
        loop = asyncio.get_running_loop()
        # Off-load the blocking call so the event loop is not stalled.
        result = await loop.run_in_executor(None, _run_blocking)
        status = "OK"
    except subprocess.CalledProcessError as e:
        result = str(e)
        status = "CalledProcessError"
    except subprocess.TimeoutExpired:
        result = "Command timed out"
        status = "Timeout"
    except Exception as e:
        result = str(e)
        status = "Error"

    # Send response
    response = {
        "service": "command",
        "msg": f"{status} {result}"
    }
    await conn.sendto(response)
|
||||
|
||||
|
||||
async def handle_update(conn: AsyncConnection, _msg: dict):  # pyright: ignore[reportUnusedParameter]
    """Handle self-update by running hb_install.sh.

    The installer is looked up on PATH first and then next to the running
    script. It is invoked as ``hb_install.sh client`` with a 120 second
    limit. Every outcome is reported to the server as an ``update`` service
    message; on success a restart is requested via ``dorestart`` and
    ``stop()``.

    If the installer exceeds the time limit it is killed and reaped so that
    no orphaned installer process is left behind.

    Args:
        conn: Connection used to report the result
        _msg: The UPD message (unused)
    """
    import shutil

    global dorestart

    logger = logging.getLogger("hbc.update")

    async def _report_failure(error: str) -> None:
        """Log ``error`` and send it to the server."""
        logger.error(error)
        await conn.sendto({"service": "update", "msg": error})

    installer = shutil.which("hb_install.sh")
    if installer is None:
        candidate = Path(sys.argv[0]).parent / "hb_install.sh"
        if candidate.exists():
            installer = str(candidate)

    if installer is None:
        await _report_failure("hb_install.sh not found in PATH or alongside hbc")
        return

    logger.info(f"Running installer: {installer}")
    proc = None
    try:
        proc = await asyncio.create_subprocess_exec(
            installer, "client",
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.STDOUT,
        )
        out, _ = await asyncio.wait_for(proc.communicate(), timeout=120)
    except asyncio.TimeoutError:
        # wait_for() only cancels communicate(); the child itself keeps
        # running unless we terminate it explicitly.
        if proc is not None and proc.returncode is None:
            try:
                proc.kill()
            except ProcessLookupError:
                pass  # exited between the check and the kill
            await proc.wait()
        await _report_failure("Installer timed out")
        return
    except Exception as e:
        await _report_failure(f"Installer failed: {e}")
        return

    if proc.returncode != 0:
        # Installer output is arbitrary bytes; never let decoding raise.
        output = out.decode(errors="replace").strip()
        await _report_failure(f"Installer exited {proc.returncode}: {output}")
        return

    logger.info("Update successful, restart required")
    await conn.sendto({"service": "update", "msg": "OK"})

    # Trigger restart
    dorestart = True
    stop()
|
||||
|
||||
|
||||
async def heartbeat_sender(conn: AsyncConnection, interval: int):
    """Emit one HTB message per interval, reopening the transport as needed.

    While the transport is missing, each loop iteration tries ``conn.open()``.
    An IPv6 connection that has never opened successfully is marked dead and
    abandoned once it has failed IPV6_EARLY_FAIL_LIMIT times; all other
    connections keep retrying once per interval. The loop ends when the
    module-level ``running`` flag is cleared, the connection is dead, or the
    shutdown event fires during a pause.

    Args:
        conn: Connection to send on
        interval: Heartbeat interval in seconds
    """
    logger = logging.getLogger("hbc.heartbeat")
    IPV6_EARLY_FAIL_LIMIT = 3

    async def _shutdown_during_pause() -> bool:
        """Pause for one interval; return True if shutdown was signalled."""
        if not shutdown_event:
            await asyncio.sleep(interval)
            return False
        try:
            await asyncio.wait_for(shutdown_event.wait(), timeout=interval)
        except asyncio.TimeoutError:
            return False
        return True

    while running and not conn._dead:
        # Ensure transport is open before attempting to send.
        if not conn.transport:
            if await conn.open():
                conn._open_fail_count = 0
            else:
                conn._open_fail_count += 1
                # Drop an IPv6 connection that has never come up within the
                # first few attempts — it is likely unavailable on this network.
                never_came_up = (
                    not conn._ever_opened
                    and conn.af == socket.AF_INET6
                    and conn._open_fail_count >= IPV6_EARLY_FAIL_LIMIT
                )
                if never_came_up:
                    logger.warning(
                        f"IPv6 connection to {conn.addr} unreachable after "
                        f"{conn._open_fail_count} attempts, disabling"
                    )
                    conn._dead = True
                    break
                # Retry after the normal interval; IPv4 retries forever.
                if await _shutdown_during_pause():
                    break
                continue

        try:
            heartbeat = {
                "acks": conn.ackcount,
                "rtt": conn.rtts[-1],
                "interval": interval
            }
            await conn.sendto(heartbeat, "HTB")
        except asyncio.CancelledError:
            logger.debug("Heartbeat sender cancelled")
            raise
        except Exception as e:
            logger.error(f"Error sending heartbeat: {e}", exc_info=True)

        # Wait for next interval or shutdown event
        try:
            if await _shutdown_during_pause():
                break
        except asyncio.CancelledError:
            logger.debug("Heartbeat sender cancelled during sleep")
            raise
|
||||
|
||||
|
||||
async def _info_plugin_refresh_loop(conn: AsyncConnection, info_plugins: List):
    """Wait for server requests to re-send InfoPlugin data.

    Each time ``conn.request_info_event`` is set, every plugin in
    ``info_plugins`` has its cached result discarded, is collected again,
    and the fresh data is sent as a PLG message. A failure in one plugin is
    logged and does not stop the others.

    Args:
        conn: Connection whose event is watched and on which data is sent
        info_plugins: InfoPlugin instances to refresh
    """
    logger = logging.getLogger("hbc.plugins")
    while running:
        await conn.request_info_event.wait()
        if not running:
            break
        conn.request_info_event.clear()
        logger.info("refreshing InfoPlugins on server request")
        for plugin in info_plugins:
            try:
                # InfoPlugin.collect() serves its cached result until the
                # cache is invalidated, so drop it through the public API;
                # fall back to the legacy attribute for objects without it.
                invalidate = getattr(plugin, "invalidate_cache", None)
                if callable(invalidate):
                    invalidate()
                else:
                    plugin._cache = None
                data = await plugin.collect()
                if data:
                    await conn.sendto({"plugin": plugin.name, **data}, "PLG")
                    logger.info(f"Resent {plugin.name} data")
            except Exception as e:
                logger.error(f"Error re-collecting {plugin.name}: {e}", exc_info=True)
|
||||
|
||||
|
||||
async def plugin_collector(conn: AsyncConnection, registry: PluginRegistry):
    """Send InfoPlugin data once, then run the periodic collectors.

    InfoPlugins are collected a single time at startup. MonitorPlugins are
    grouped by their ``interval`` and each group gets its own collector
    task; one further task watches for server-requested info refreshes.
    The coroutine then waits for all of those tasks, cancelling them if it
    is cancelled itself.

    Args:
        conn: Connection to send on
        registry: Plugin registry
    """
    logger = logging.getLogger("hbc.plugins")

    # One-shot collection of static information.
    info_plugins = registry.get_by_type(InfoPlugin)
    for info_plugin in info_plugins:
        try:
            payload = await info_plugin.collect()
            if payload:
                # PLG message = plugin name + collected fields
                await conn.sendto({"plugin": info_plugin.name, **payload}, "PLG")
                logger.info(f"Sent {info_plugin.name} data")
        except Exception as e:
            logger.error(f"Error collecting {info_plugin.name}: {e}", exc_info=True)

    # Bucket the monitor plugins by collection interval.
    groups = {}
    for monitor in registry.get_by_type(MonitorPlugin):
        groups.setdefault(monitor.interval, []).append(monitor)

    # The info-refresh watcher always runs, followed by one task per bucket.
    tasks = [asyncio.create_task(_info_plugin_refresh_loop(conn, info_plugins))]
    tasks.extend(
        asyncio.create_task(plugin_collector_interval(conn, members, period))
        for period, members in groups.items()
    )

    try:
        await asyncio.gather(*tasks, return_exceptions=True)
    except asyncio.CancelledError:
        logger.debug("Plugin collector cancelled, cancelling sub-tasks")
        for pending in tasks:
            if not pending.done():
                pending.cancel()
        raise
|
||||
|
||||
|
||||
async def plugin_collector_interval(
    conn: AsyncConnection,
    plugins: List,
    interval: int
):
    """Repeatedly collect a group of plugins that share one interval.

    Every pass collects each plugin in turn and sends non-empty results as
    PLG messages; errors are logged per plugin. Between passes the task
    waits ``interval`` seconds, ending early if the shutdown event fires.

    Args:
        conn: Connection to send on
        plugins: List of plugins to collect
        interval: Collection interval in seconds
    """
    logger = logging.getLogger(f"hbc.plugins.{interval}s")

    while running:
        for member in plugins:
            try:
                sample = await member.collect()
                if sample:
                    # Build the message dict directly (no encode_plugin_data).
                    await conn.sendto({"plugin": member.name, **sample}, "PLG")
                    logger.debug(f"Sent {member.name} data")
            except asyncio.CancelledError:
                logger.debug("Plugin collector cancelled")
                raise
            except Exception as e:
                logger.error(
                    f"Error collecting {member.name}: {e}",
                    exc_info=True
                )

        # Pause until the next pass, or stop if shutdown is signalled.
        try:
            if not shutdown_event:
                await asyncio.sleep(interval)
                continue
            await asyncio.wait_for(
                shutdown_event.wait(),
                timeout=interval
            )
        except asyncio.TimeoutError:
            continue  # Normal timeout, next pass
        except asyncio.CancelledError:
            logger.debug("Plugin collector cancelled during sleep")
            raise
        break
|
||||
|
||||
|
||||
def shortname(name: str) -> str:
    """Return the part of ``name`` before the first dot (the host label)."""
    head, _, _ = name.partition(".")
    return head
|
||||
|
||||
|
||||
def stop():
    """Request shutdown: clear ``running``, wake sleepers, cancel tasks."""
    global running
    running = False

    # Wake every task currently waiting on the shutdown event.
    if shutdown_event:
        shutdown_event.set()

    # Cancel whatever is still in flight.
    for pending in (t for t in active_tasks if not t.done()):
        pending.cancel()
|
||||
|
||||
|
||||
async def cleanup(connections: List[AsyncConnection]):
    """Send the optional shutdown notice, then close every connection.

    The notice goes out on the first connection that has an open transport
    (or the first connection when none is open) and only when the
    module-level ``send_shutdown`` flag is set. A send failure is logged and
    does not prevent the connections from being closed.
    """
    logger = logging.getLogger("hbc.cleanup")
    logger.info("Cleaning up connections")

    fallback = connections[0] if connections else None
    target = next((c for c in connections if c.transport), fallback)

    if target and send_shutdown:
        notice = {"shutdown": 1, "acks": target.ackcount}
        try:
            await target.sendto(notice)
        except Exception as e:
            logger.error(f"Error sending shutdown: {e}")

    for conn in connections:
        conn.close()

    # Yield briefly so pending transport work can complete.
    await asyncio.sleep(0.5)
|
||||
|
||||
|
||||
async def async_main(args, config):
    """Async main function.

    Resolves the heartbeat hosts (retrying with exponential back-off until
    at least one connection object exists), optionally sends a boot and/or
    service message, loads plugins, installs signal handlers, and then runs
    the heartbeat senders and the plugin collector until shutdown.

    Args:
        args: Parsed command-line namespace (see ``build_parser``)
        config: Configuration mapping (``hb_port``, ``interval``, plugins...)

    Returns:
        0 on normal completion, 1 if no connection could be created.
    """
    global running, shutdown_event, active_tasks, send_shutdown

    # Create shutdown event
    shutdown_event = asyncio.Event()
    active_tasks = []

    logger = logging.getLogger("hbc.main")

    # Setup
    iam = socket.gethostname()
    if args.name:
        iam = args.name

    hb_hosts = args.hosts
    hb_port = config.get("hb_port", PORT)
    interval = config.get("interval", INTERVAL)

    logger.info(f"hbc {__version__} on {iam} -> {hb_hosts} port={hb_port}, interval={interval}s")

    # 0 means "any address family".
    af_filter = (socket.AF_INET if getattr(args, "ipv4_only", False)
                 else socket.AF_INET6 if getattr(args, "ipv6_only", False)
                 else 0)

    # Create connections
    connections = []
    conn_id = 1
    _retry_delay = 5

    while running and not connections:
        for host in hb_hosts:
            try:
                addrs = socket.getaddrinfo(host, hb_port, af_filter, 0, socket.SOL_UDP)
            except socket.gaierror as e:
                logger.warning(f"Cannot resolve {host}: {e} — retrying in {_retry_delay}s")
                continue
            for addr_info in addrs:
                af = addr_info[0]
                addr = addr_info[4][0]
                conn = AsyncConnection(conn_id, addr, hb_port, af, iam)
                # A failed open is not fatal: the connection is kept and
                # its heartbeat sender retries later.
                if not await conn.open():
                    logger.warning(f"Initial open to {addr} failed, heartbeat sender will retry")
                connections.append(conn)
                conn_id += 1
        if not connections:
            try:
                if shutdown_event:
                    await asyncio.wait_for(shutdown_event.wait(), timeout=_retry_delay)
                else:
                    await asyncio.sleep(_retry_delay)
            except asyncio.TimeoutError:
                pass
            # Exponential back-off, capped at one minute.
            _retry_delay = min(_retry_delay * 2, 60)

    if not connections:
        return 1

    logger.info(f"Created {len(connections)} connections")

    # Send boot/message if requested
    send_shutdown = False
    if args.boot or args.message:
        boot_msg = {}
        if args.boot:
            boot_msg["boot"] = 1
            args.boot = False  # Clear boot flag so we don't send it again in main loop
            send_shutdown = True
        if args.message:
            boot_msg["service"] = "service"
            boot_msg["msg"] = args.message

        boot_msg["acks"] = 0
        target = next((c for c in connections if c.transport), connections[0])
        await target.sendto(boot_msg)

        if args.message and not args.daemon:
            # Message-only mode
            await cleanup(connections)
            return 0

    # Load plugins
    registry = PluginRegistry()
    loader = PluginLoader(registry)

    plugin_dir = Path(__file__).parent / "plugins"
    if plugin_dir.exists():
        count = await loader.load_from_directory(plugin_dir, config)
        logger.info(f"Loaded {count} plugins")
    else:
        logger.warning(f"Plugin directory not found: {plugin_dir}")

    # Setup signal handlers
    loop = asyncio.get_running_loop()
    for sig in (signal.SIGTERM, signal.SIGINT):
        loop.add_signal_handler(sig, stop)

    def _sighup():
        """SIGHUP: stop and ask main() to re-exec the client."""
        global dorestart
        dorestart = True
        stop()

    loop.add_signal_handler(signal.SIGHUP, _sighup)

    # Start async tasks
    # Heartbeat senders (one per connection)
    for conn in connections:
        task = asyncio.create_task(heartbeat_sender(conn, interval))
        active_tasks.append(task)

    # Plugin collector: a single connection carries plugin data. Prefer one
    # whose transport actually opened (same rule as the boot message and
    # cleanup) instead of blindly using the first entry.
    if connections and registry.get_enabled():
        plugin_conn = next((c for c in connections if c.transport), connections[0])
        task = asyncio.create_task(plugin_collector(plugin_conn, registry))
        active_tasks.append(task)

    # Wait for stop or tasks to complete
    try:
        await asyncio.gather(*active_tasks, return_exceptions=True)
    except asyncio.CancelledError:
        logger.info("Tasks cancelled")

    # Cleanup
    logger.info("Shutting down...")
    await cleanup(connections)
    await loader.unload_all()

    return 0
|
||||
|
||||
|
||||
def daemonize(
    working_dir="/",
    stdin="/dev/zero",
    stdout="/dev/null",
    stderr="/dev/null"
):
    """Detach from the controlling terminal using the UNIX double fork.

    The original process and the intermediate child both exit; the
    grandchild continues with ``working_dir`` as its cwd, a new session,
    umask 0, and its three standard descriptors redirected to the given
    paths.

    Args:
        working_dir: Directory to change into
        stdin: File opened for reading and duplicated onto standard input
        stdout: File opened in append mode for standard output
        stderr: File opened in append mode for standard error
    """
    # First fork: the parent leaves, the child carries on.
    try:
        if os.fork() > 0:
            os._exit(0)
    except OSError as e:
        sys.stderr.write(f"fork #1 failed: {e}\n")
        os._exit(1)

    os.chdir(working_dir)
    os.setsid()
    os.umask(0)

    # Second fork: the session leader leaves.
    try:
        if os.fork() > 0:
            os._exit(0)
    except OSError as e:
        sys.stderr.write(f"fork #2 failed: {e}\n")
        sys.exit(1)

    sys.stdout.flush()
    sys.stderr.flush()

    # Open all replacements first, then swap them onto the std descriptors.
    replacements = (
        open(stdin, "r"),
        open(stdout, "a+"),
        open(stderr, "a+"),
    )
    for replacement, standard in zip(replacements, (sys.stdin, sys.stdout, sys.stderr)):
        os.dup2(replacement.fileno(), standard.fileno())
|
||||
|
||||
|
||||
def _reconfigure_logging_for_daemon(log_level: int) -> None:
    """Swap every root log handler for a single syslog handler.

    All handlers currently attached to the root logger are removed and
    closed. The replacement ``SysLogHandler`` uses the ``/dev/log`` socket
    when that path exists and UDP ``localhost:514`` otherwise (in which case
    a warning is logged through the new handler).

    Args:
        log_level: Level to set on the root logger
    """
    root = logging.getLogger()
    for stale in list(root.handlers):
        root.removeHandler(stale)
        stale.close()

    have_dev_log = os.path.exists("/dev/log")
    address = "/dev/log" if have_dev_log else ("localhost", 514)

    syslog_handler = SysLogHandler(
        address=address,
        facility=SysLogHandler.LOG_DAEMON,
    )
    formatter = logging.Formatter("hbc[%(process)d]: %(name)s %(levelname)s: %(message)s")
    syslog_handler.setFormatter(formatter)

    root.addHandler(syslog_handler)
    root.setLevel(log_level)

    if not have_dev_log:
        logging.warning("/dev/log not found, using syslog UDP localhost:514")
|
||||
|
||||
|
||||
def build_parser():
    """Create the ``hbc`` command-line parser.

    Options: -b/--boot, -c/--config, -m/--message, -n/--name, -d/--daemon,
    -v/--verbose, -x/--debug (repeatable), the mutually exclusive -4 / -6,
    and one or more positional ``hosts``.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(
        prog="hbc",
        description="HeartBeatClient - send heartbeat messages to HeartBeatDaemon",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    add = parser.add_argument

    add("-b", "--boot", action="store_true", help="Send a boot message")
    add("-c", "--config", dest="configfile", help="Config file path (YAML)")
    add("-m", "--message", dest="message", help="Send a message")
    add("-n", "--name", dest="name", help="Name to use in heartbeat message")
    add("-d", "--daemon", action="store_true", help="Run in daemon mode")
    add("-v", "--verbose", action="store_true", help="Verbose output")
    add("-x", "--debug", action="count", default=0, help="Increase debug level")

    # Address family selection: at most one of -4 / -6.
    family = parser.add_mutually_exclusive_group()
    family.add_argument("-4", dest="ipv4_only", action="store_true", help="Use IPv4 only")
    family.add_argument("-6", dest="ipv6_only", action="store_true", help="Use IPv6 only")

    add("hosts", nargs="+", help="Heartbeat daemon hosts to send to")
    return parser
|
||||
|
||||
|
||||
def main(argv=None):
    """Main entry point.

    Parses arguments, configures logging, loads the config, optionally
    daemonizes, and runs ``async_main``. If a restart was requested (the
    module-level ``dorestart`` flag), the process re-executes itself with
    its original arguments; otherwise it exits with the code returned by
    ``async_main`` (0 on Ctrl-C, 1 on an unexpected exception).

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``
    """
    global running, dorestart

    # Resolve the script path now: daemonize() changes the working directory
    # to "/", after which a relative sys.argv[0] would no longer be found by
    # os.execv() at restart time.
    restart_path = os.path.abspath(sys.argv[0])

    parser = build_parser()
    args = parser.parse_args(argv)

    # Setup logging: -x wins over -v, default is WARNING.
    log_level = logging.WARNING
    if args.verbose:
        log_level = logging.INFO
    if args.debug:
        log_level = logging.DEBUG

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s %(name)s %(levelname)s: %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S"
    )

    # Load config
    config = load_config(args.configfile)

    # Daemonize if requested
    if args.daemon:
        logging.info("Daemonizing...")
        daemonize()
        # stdout/stderr now point at /dev/null, so switch to syslog.
        _reconfigure_logging_for_daemon(log_level)
        logging.info(f"hbc starting, sending heartbeat to {', '.join(args.hosts)}")

    # Run async main
    try:
        exit_code = asyncio.run(async_main(args, config))
    except KeyboardInterrupt:
        logging.info("Interrupted by user")
        exit_code = 0
    except Exception as e:
        logging.error(f"Fatal error: {e}", exc_info=True)
        exit_code = 1

    # Handle restart
    if dorestart:
        logging.info("Restarting...")
        os.execv(restart_path, [restart_path] + sys.argv[1:])

    sys.exit(exit_code)
|
||||
|
||||
|
||||
# Script entry point: call main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,425 @@
|
||||
"""Plugin system for extending Heartbeat data collection and monitoring.
|
||||
|
||||
This module provides the base classes and infrastructure for the plugin system
|
||||
that enables extending hbc (client) data collection and hbd (server) processing.
|
||||
|
||||
Plugin Types:
|
||||
- InfoPlugin: Collects static or rarely-changing information (OS, hardware)
|
||||
- MonitorPlugin: Collects periodic monitoring data (CPU, memory, disk usage)
|
||||
|
||||
Plugins run on the client (hbc) to gather data, which is then sent to the server
|
||||
(hbd) for storage, threshold checking, and display.
|
||||
"""
|
||||
|
||||
import importlib.util
|
||||
import inspect
|
||||
import logging
|
||||
import sys
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Type
|
||||
|
||||
|
||||
class Plugin(ABC):
    """Abstract base shared by every plugin.

    Subclasses override the class attributes below and implement
    ``initialize()`` and ``collect()``.

    Attributes:
        name: Unique plugin identifier (e.g., "os_info", "cpu_monitor")
        version: Plugin version string
        description: Human-readable description
        interval: Collection interval in seconds (0 for InfoPlugin = collect once)
        enabled: Whether plugin is active (can be disabled via config)
        skip_reason: Set by plugin before returning False from initialize();
            causes loader to log INFO instead of WARNING.
    """

    name: str = ""
    version: str = "1.0.0"
    description: str = ""
    interval: int = 0
    enabled: bool = True

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Store the plugin configuration and create a per-plugin logger.

        Args:
            config: Plugin-specific configuration from YAML (e.g., thresholds,
                paths); a falsy value is replaced by an empty dict
        """
        self.config = config if config else {}
        self.logger = logging.getLogger(f"plugin.{self.name}")
        self._initialized = False
        self.skip_reason: Optional[str] = None

    @abstractmethod
    async def initialize(self) -> bool:
        """Prepare the plugin for use (load resources, check dependencies).

        Called once when the plugin is loaded. Implementations should verify
        that what they need is available (e.g., that psutil can be imported)
        and set up any resources.

        Returns:
            True if initialization succeeded, False otherwise
        """

    @abstractmethod
    async def collect(self) -> Dict[str, Any]:
        """Gather data from the system.

        Invoked on every collection interval. The result maps metric names
        (strings) to values, which may be:
            - Scalars: int, float, str, bool
            - Lists/dicts (will be serialized appropriately)

        Returns:
            Dictionary of collected metrics, or empty dict on error
        """

    async def cleanup(self) -> None:
        """Release plugin resources before shutdown.

        Called when the plugin is unloaded or the system shuts down. The
        base implementation does nothing; override to close connections,
        free resources, etc.
        """

    def validate_data(self, data: Dict[str, Any]) -> bool:
        """Check collected data before it is sent to the server.

        The base implementation only checks the container type; override
        for plugin-specific validation.

        Args:
            data: Data returned from collect()

        Returns:
            True if data is valid, False otherwise
        """
        return isinstance(data, dict)
|
||||
|
||||
|
||||
class InfoPlugin(Plugin):
    """Base for plugins reporting static or rarely-changing information.

    Typical data:
        - OS name and version
        - Hardware specifications (CPU model, RAM size)
        - Network interface MAC addresses

    Characteristics:
        - interval = 0 (collected once at startup by default)
        - Can specify interval > 0 for periodic refresh (e.g., check for hardware changes)
        - The first result is cached and returned by later collect() calls
          until invalidate_cache() is called
    """

    interval: int = 0  # Collect once at startup

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialise the base plugin and start with an empty cache."""
        super().__init__(config)
        self._cached_data: Optional[Dict[str, Any]] = None

    async def get_cached_data(self) -> Optional[Dict[str, Any]]:
        """Return the cached result without triggering a collection.

        Returns:
            Cached data dict, or None if not yet collected
        """
        return self._cached_data

    async def collect(self) -> Dict[str, Any]:
        """Return the cached information, collecting it first if needed."""
        cached = self._cached_data
        if cached is None:
            cached = await self._collect_info()
            self._cached_data = cached
        return cached

    @abstractmethod
    async def _collect_info(self) -> Dict[str, Any]:
        """Perform the actual data collection.

        InfoPlugins override this method rather than collect().
        """

    def invalidate_cache(self) -> None:
        """Drop the cached result so the next collect() gathers fresh data."""
        self._cached_data = None
|
||||
|
||||
|
||||
class MonitorPlugin(Plugin):
    """Base for plugins that collect periodic monitoring data.

    Typical time-series metrics:
        - CPU usage percentage
        - Memory consumption
        - Disk I/O statistics
        - Network traffic

    Characteristics:
        - interval > 0 (e.g., 30 seconds for CPU, 60 for disk)
        - Collected continuously on schedule
        - Non-empty readings are stamped with a ``_timestamp`` field
    """

    interval: int = 30  # Default: collect every 30 seconds

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialise the base plugin with no reading recorded yet."""
        super().__init__(config)
        self._last_reading: Optional[Dict[str, Any]] = None

    def get_last_reading(self) -> Optional[Dict[str, Any]]:
        """Return the most recent stored reading.

        Returns:
            Last reading dict with timestamp, or None if not yet collected
        """
        return self._last_reading

    async def collect(self) -> Dict[str, Any]:
        """Collect metrics; timestamp and remember them when non-empty."""
        reading = await self._collect_metrics()
        if not reading:
            return reading

        # Stamp the reading with the collection time.
        import time
        reading['_timestamp'] = time.time()
        self._last_reading = reading
        return reading

    @abstractmethod
    async def _collect_metrics(self) -> Dict[str, Any]:
        """Perform the actual metric collection.

        MonitorPlugins override this method rather than collect().
        """
|
||||
|
||||
|
||||
class PluginRegistry:
    """Name-keyed collection of loaded plugin instances.

    Offers registration/unregistration plus lookups by name, by plugin
    class, by interval, and by enabled state.
    """

    def __init__(self):
        """Create an empty registry."""
        self._plugins: Dict[str, Plugin] = {}
        self.logger = logging.getLogger("plugin.registry")

    def register(self, plugin: Plugin) -> bool:
        """Add a plugin instance under its ``name``.

        Args:
            plugin: Plugin instance to register

        Returns:
            True if registered successfully, False if name conflict
        """
        key = plugin.name
        if key in self._plugins:
            self.logger.error(f"Plugin '{key}' already registered")
            return False

        self._plugins[key] = plugin
        self.logger.info(f"Registered plugin: {key} v{plugin.version}")
        return True

    def unregister(self, name: str) -> bool:
        """Remove the plugin registered under ``name``.

        Args:
            name: Plugin name to unregister

        Returns:
            True if unregistered, False if not found
        """
        if name not in self._plugins:
            return False

        del self._plugins[name]
        self.logger.info(f"Unregistered plugin: {name}")
        return True

    def get(self, name: str) -> Optional[Plugin]:
        """Look a plugin up by name.

        Args:
            name: Plugin name

        Returns:
            Plugin instance or None if not found
        """
        return self._plugins.get(name)

    def get_all(self) -> List[Plugin]:
        """Return every registered plugin, in registration order."""
        return [*self._plugins.values()]

    def get_enabled(self) -> List[Plugin]:
        """Return the plugins whose ``enabled`` attribute is truthy."""
        return [candidate for candidate in self._plugins.values() if candidate.enabled]

    def get_by_type(self, plugin_type: Type[Plugin]) -> List[Plugin]:
        """Return the plugins that are instances of ``plugin_type``.

        Args:
            plugin_type: Plugin class (InfoPlugin or MonitorPlugin)

        Returns:
            List of plugins matching the type
        """
        return [
            candidate
            for candidate in self._plugins.values()
            if isinstance(candidate, plugin_type)
        ]

    def get_by_interval(self, interval: int) -> List[Plugin]:
        """Return the plugins whose collection interval equals ``interval``.

        Args:
            interval: Interval in seconds (0 for one-time collection)

        Returns:
            List of plugins with matching interval
        """
        return [
            candidate
            for candidate in self._plugins.values()
            if candidate.interval == interval
        ]
|
||||
|
||||
|
||||
class PluginLoader:
|
||||
"""Load plugins from filesystem and instantiate them.
|
||||
|
||||
Scans plugin directories for Python modules containing Plugin subclasses,
|
||||
loads them dynamically, and registers them with the PluginRegistry.
|
||||
"""
|
||||
|
||||
def __init__(self, registry: PluginRegistry):
|
||||
self.registry = registry
|
||||
self.logger = logging.getLogger("plugin.loader")
|
||||
self._loaded_modules: Dict[str, Any] = {}
|
||||
|
||||
async def load_from_directory(
    self,
    directory: Path,
    config: Optional[Dict[str, Any]] = None
) -> int:
    """Load all plugins from a directory.

    Scans for .py files (in sorted order, skipping names that start with
    "_"), imports each one, finds Plugin subclasses, instantiates them with
    their config, initializes them, and registers them.

    A failure in one file or in one plugin class is logged and does not
    prevent the remaining files/classes from being processed.

    Args:
        directory: Path to plugin directory
        config: Configuration dict. Per-plugin settings are looked up by
            plugin name under the "plugins" key first, then at top level.
            A top-level "owner" value is copied into each plugin's config
            unless that config already defines "owner".

    Returns:
        Number of plugins successfully loaded
    """
    if not directory.exists() or not directory.is_dir():
        self.logger.warning(f"Plugin directory not found: {directory}")
        return 0

    loaded_count = 0
    raw_config = config or {}
    # Per-plugin config lives under the 'plugins' key or at top-level.
    # Use `or {}` rather than a .get() default: a config file containing a
    # bare `plugins:` yields None for the key, which would otherwise break
    # every per-plugin lookup below.
    plugins_subconfig = raw_config.get("plugins") or {}

    # Sorted so the load (and therefore registration) order is reproducible.
    for plugin_file in sorted(directory.glob("*.py")):
        if plugin_file.name.startswith("_"):
            continue  # Skip __init__.py and private modules

        self.logger.debug(f"Processing plugin file: {plugin_file.name}")

        try:
            # Load module dynamically
            module_name = f"plugins.{plugin_file.stem}"
            spec = importlib.util.spec_from_file_location(module_name, plugin_file)
            if not spec or not spec.loader:
                self.logger.warning(f"Could not create spec for {plugin_file}")
                continue

            module = importlib.util.module_from_spec(spec)
            previous = sys.modules.get(module_name)
            sys.modules[module_name] = module
            try:
                spec.loader.exec_module(module)
            except BaseException:
                # Do not leave a half-initialized module importable; put
                # back whatever was registered under this name before.
                if previous is not None:
                    sys.modules[module_name] = previous
                else:
                    sys.modules.pop(module_name, None)
                raise
            self._loaded_modules[module_name] = module

            self.logger.debug(f"Loaded module: {module_name}")

            # Track which plugin classes we've already processed to avoid duplicates
            processed_classes = set()

            # Find Plugin subclasses in module
            for name, obj in inspect.getmembers(module, inspect.isclass):
                # Skip base classes and non-Plugin classes
                if obj in (Plugin, InfoPlugin, MonitorPlugin):
                    self.logger.debug(f"Skipping base class: {name}")
                    continue
                if not issubclass(obj, Plugin):
                    self.logger.debug(f"Skipping non-Plugin class: {name}")
                    continue

                # Skip if we've already processed this class (handles module-level aliases)
                if id(obj) in processed_classes:
                    self.logger.debug(f"Skipping duplicate reference to: {obj.__name__}")
                    continue
                processed_classes.add(id(obj))

                self.logger.debug(f"Found plugin class: {name}")

                # Instantiate plugin with config — check plugins subdict first,
                # then top-level keys (e.g. nagios_runner: ... at root of config).
                plugin_instance_config = dict(
                    plugins_subconfig.get(obj.name) or raw_config.get(obj.name) or {}
                )
                # Propagate top-level owner so os_info (and any future plugin) can report it.
                if "owner" in raw_config and "owner" not in plugin_instance_config:
                    plugin_instance_config["owner"] = raw_config["owner"]

                # A constructor that raises must only cost us this one
                # class, not the remaining classes in the same file.
                try:
                    plugin = obj(config=plugin_instance_config)
                except Exception as e:
                    self.logger.error(
                        f"Error instantiating plugin {obj.name}: {e}",
                        exc_info=True
                    )
                    continue

                # Initialize plugin
                try:
                    initialized = await plugin.initialize()
                    if not initialized:
                        if plugin.skip_reason:
                            self.logger.info(
                                f"Plugin {plugin.name} skipped: {plugin.skip_reason}"
                            )
                        else:
                            self.logger.warning(
                                f"Plugin {plugin.name} failed initialization, skipping"
                            )
                        continue
                except Exception as e:
                    self.logger.error(
                        f"Error initializing plugin {plugin.name}: {e}",
                        exc_info=True
                    )
                    continue

                # Register with registry
                if self.registry.register(plugin):
                    loaded_count += 1
                    self.logger.info(
                        f"Loaded plugin: {plugin.name} v{plugin.version} "
                        f"(interval: {plugin.interval}s)"
                    )

        except Exception as e:
            self.logger.error(
                f"Error loading plugin from {plugin_file}: {e}",
                exc_info=True
            )

    return loaded_count
|
||||
|
||||
async def unload_all(self) -> None:
    """Unload all plugins and cleanup resources.

    Calls ``cleanup()`` on every registered plugin (errors are logged, not
    raised), unregisters each plugin by name, and finally removes every
    module this loader imported from ``sys.modules``.
    """
    # Iterate over a snapshot: unregister() runs inside the loop, and if the
    # registry returns a live view of its storage, mutating it mid-iteration
    # would raise RuntimeError.
    for plugin in list(self.registry.get_all()):
        try:
            await plugin.cleanup()
        except Exception as e:
            self.logger.error(
                f"Error cleaning up plugin {plugin.name}: {e}",
                exc_info=True
            )
        self.registry.unregister(plugin.name)

    # Remove loaded modules (tolerating ones already gone from sys.modules)
    for module_name in self._loaded_modules:
        sys.modules.pop(module_name, None)
    self._loaded_modules.clear()
|
||||
@@ -0,0 +1,136 @@
|
||||
"""CPU Monitoring Plugin for Heartbeat.
|
||||
|
||||
Collects CPU usage statistics including overall CPU percentage, per-core usage,
|
||||
load average, and process counts.
|
||||
"""
|
||||
|
||||
from typing import Any, Dict, Optional
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Import from parent package
|
||||
from hbd.client.plugin import MonitorPlugin
|
||||
|
||||
|
||||
class CPUMonitorPlugin(MonitorPlugin):
    """Monitor CPU usage and load.

    Collects:
    - Overall CPU usage percentage
    - Per-core CPU usage (if enabled in config)
    - Load average (1min, 5min, 15min)
    - Process count
    - CPU frequency (if available)
    - CPU time split (user/system/idle/iowait) and uptime

    Configuration:
        per_core: Report per-core usage (default: False)
        interval: Collection interval in seconds (default: 300)
    """

    name = "cpu_monitor"
    version = "1.0.0"
    description = "CPU usage and load monitoring"
    interval = 300  # MonitorPlugin: collect every 5 minutes by default

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Store configuration; psutil itself is imported in initialize().

        Args:
            config: Optional plugin configuration (see class docstring).
        """
        super().__init__(config)
        self.psutil = None  # set by initialize() once the import succeeds
        self.per_core = config.get("per_core", False) if config else False
        self.interval = config.get("interval", 300) if config else 300

    async def initialize(self) -> bool:
        """Initialize the CPU monitor plugin.

        Checks if psutil is available and primes the counters that are
        later read with ``interval=0``.

        Returns:
            True if psutil is available, False otherwise
        """
        self.logger.info(f"Initializing {self.name} plugin")

        try:
            import psutil
        except ImportError:
            self.logger.error(
                "psutil module not available. Install with: pip install psutil"
            )
            return False

        self.psutil = psutil

        # The interval=0 calls in _collect_metrics() report usage since the
        # previous call, and psutil documents the very first such call as a
        # meaningless 0.0. Take that throw-away reading now so the first
        # real collection has a reference point.
        try:
            psutil.cpu_percent(interval=None, percpu=True)
            psutil.cpu_times_percent(interval=None)
        except Exception as e:
            self.logger.debug(f"Could not prime CPU counters: {e}")

        self.logger.info(f"{self.name} initialized successfully")
        return True

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Collect CPU metrics.

        Returns:
            Dictionary with CPU metrics; empty if psutil is unavailable or
            collection failed as a whole. Individual optional metrics that
            cannot be read are simply omitted.
        """
        if not self.psutil:
            return {}

        try:
            # Local import, matching this module's lazy-import style.
            import asyncio

            data = {}

            # Overall CPU usage percentage. interval=1 makes psutil sleep
            # for one second between two samples, so run it in the default
            # executor instead of stalling the event loop for that second.
            loop = asyncio.get_running_loop()
            data["cpu_percent"] = await loop.run_in_executor(
                None, lambda: self.psutil.cpu_percent(interval=1)
            )

            # Per-core CPU usage (if enabled). interval=0 is non-blocking and
            # measures since the previous per-core call; it stays on this
            # thread so it keeps comparing against the same reference.
            if self.per_core:
                per_core_percents = self.psutil.cpu_percent(interval=0, percpu=True)
                data["cpu_per_core"] = per_core_percents
                data["cpu_core_count"] = len(per_core_percents)
            else:
                # Just report core count
                data["cpu_core_count"] = self.psutil.cpu_count()

            # Load average (Unix-like systems only)
            try:
                load_avg = self.psutil.getloadavg()
                data["load_1min"] = round(load_avg[0], 2)
                data["load_5min"] = round(load_avg[1], 2)
                data["load_15min"] = round(load_avg[2], 2)
            except (AttributeError, OSError):
                # Not available on Windows
                pass

            # Process count
            try:
                data["process_count"] = len(self.psutil.pids())
            except Exception as e:
                self.logger.warning(f"Could not get process count: {e}")

            # CPU frequency (if available)
            try:
                freq = self.psutil.cpu_freq()
                if freq:
                    data["cpu_freq_current"] = round(freq.current, 2)
                    data["cpu_freq_min"] = round(freq.min, 2)
                    data["cpu_freq_max"] = round(freq.max, 2)
            except (AttributeError, OSError, RuntimeError, SystemError) as e:
                # Not available on all systems, or may fail on FreeBSD with sysctl issues
                self.logger.debug(f"CPU frequency not available: {e}")

            # CPU times (user, system, idle, etc.), non-blocking
            try:
                cpu_times = self.psutil.cpu_times_percent(interval=0)
                data["cpu_user"] = round(cpu_times.user, 1)
                data["cpu_system"] = round(cpu_times.system, 1)
                data["cpu_idle"] = round(cpu_times.idle, 1)
                if hasattr(cpu_times, "iowait"):
                    data["cpu_iowait"] = round(cpu_times.iowait, 1)
            except Exception as e:
                self.logger.debug(f"Could not get CPU times: {e}")

            # Uptime in seconds
            try:
                import time
                data["uptime_seconds"] = int(time.time() - self.psutil.boot_time())
            except Exception as e:
                self.logger.debug(f"Could not get uptime: {e}")

            self.logger.debug(
                f"Collected CPU metrics: {data.get('cpu_percent', 'N/A')}% usage"
            )
            return data

        except Exception as e:
            self.logger.error(f"Error collecting CPU metrics: {e}", exc_info=True)
            return {}
|
||||
@@ -0,0 +1,199 @@
|
||||
"""
|
||||
Disk monitoring plugin for Heartbeat.
|
||||
|
||||
Collects disk usage and I/O statistics using psutil.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Dict, Any, Optional, List
|
||||
|
||||
try:
|
||||
import psutil
|
||||
except ImportError:
|
||||
psutil = None
|
||||
|
||||
from hbd.client.plugin import MonitorPlugin
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DiskMonitorPlugin(MonitorPlugin):
    """
    Monitor disk usage and I/O statistics.

    Collects:
    - Disk partition information
    - Disk usage per partition (total, used, free, percent)
    - Disk I/O counters (read/write bytes, read/write count)
    - Disk I/O time statistics

    Configuration:
        interval: Collection interval in seconds (default: 300)
        partitions: Mount point or list of mount points to monitor (default: all)
        include_io: Include disk I/O statistics (default: True)
        exclude_types: Filesystem type or list of types to exclude
            (default: tmpfs, devtmpfs, squashfs)
    """

    name = "disk_monitor"
    interval = 300  # Collect every 5 minutes by default

    @staticmethod
    def _as_list(value) -> List[Any]:
        """Normalize a config value to a list.

        None becomes [], a single string becomes a one-element list (so it
        is never iterated character by character), anything else is
        converted with list().
        """
        if value is None:
            return []
        if isinstance(value, str):
            return [value]
        return list(value)

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Initialize the disk monitor plugin.

        Args:
            config: Optional configuration dict with keys:
                - interval: Collection interval in seconds (default: 300)
                - partitions: Mount point(s) to monitor; a single string or a list
                - include_io: Include I/O statistics (default: True)
                - exclude_types: Filesystem type(s) to exclude; a single
                  string or a list

        Raises:
            ImportError: If psutil could not be imported.
        """
        super().__init__(config)

        # A YAML scalar such as `partitions: /home` arrives as a plain str.
        # Left as-is, the `in` test in _collect_metrics() would become a
        # substring match ("/" in "/home"), so wrap scalars in a list.
        # None or an empty list means "all partitions".
        self.partitions = self._as_list(self.config.get('partitions', None)) or None
        self.include_io = self.config.get('include_io', True)

        # Same for `exclude_types: tmpfs`: set("tmpfs") would be a set of
        # single characters. An explicit null falls back to the defaults.
        exclude = self.config.get('exclude_types')
        if exclude is None:
            exclude = ['tmpfs', 'devtmpfs', 'squashfs']
        self.exclude_types = set(self._as_list(exclude))
        self.interval = self.config.get('interval', 300)

        if psutil is None:
            raise ImportError("psutil library is required for disk_monitor plugin")

        # Store previous I/O counters for delta calculation
        self._prev_io = {}

    async def initialize(self):
        """Initialize the plugin (check psutil availability).

        When I/O statistics are enabled, also takes a baseline reading of
        the per-disk counters so the first collection can report deltas.

        Returns:
            True when ready, False if psutil is unavailable.
        """
        if psutil is None:
            logger.error("psutil not available - disk_monitor cannot run")
            return False

        logger.info(f"Disk monitor initialized (interval: {self.interval}s, io: {self.include_io})")

        # Initialize I/O counters if available
        if self.include_io:
            try:
                self._prev_io = psutil.disk_io_counters(perdisk=True)
            except Exception as e:
                logger.warning(f"Could not initialize disk I/O counters: {e}")

        return True

    async def collect(self) -> Dict[str, Any]:
        """
        Collect current disk statistics.

        Returns:
            Dictionary with disk metrics organized by partition:
            - partitions: Dict of partition data, keyed by mount point
                - device: Device name (e.g., /dev/sda1)
                - fstype: Filesystem type (e.g., ext4)
                - total: Total space in bytes
                - used: Used space in bytes
                - free: Free space in bytes
                - percent: Usage percentage
            - io_counters: Dict of I/O statistics, keyed by disk name (if include_io)
                - read_count: Number of reads
                - write_count: Number of writes
                - read_bytes: Bytes read
                - write_bytes: Bytes written
                - read_time: Time spent reading in ms (if reported)
                - write_time: Time spent writing in ms (if reported)
                - busy_time: Included when the platform reports it
                - read_bytes_delta: Bytes read since last collection
                - write_bytes_delta: Bytes written since last collection
                - read_count_delta: Reads since last collection
                - write_count_delta: Writes since last collection

            {} if psutil is unavailable; {"error": <message>} if collection
            raised.
        """
        if psutil is None:
            logger.error("psutil not available")
            return {}

        try:
            data = await self._collect_metrics()
            logger.debug(f"Collected disk metrics: {len(data.get('partitions', {}))} partitions")
            return data
        except Exception as e:
            logger.error(f"Error collecting disk metrics: {e}")
            return {"error": str(e)}

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Collect disk metrics from psutil."""
        metrics = {}

        # Collect partition usage
        partitions_data = {}
        partitions = psutil.disk_partitions(all=False)

        for partition in partitions:
            # Skip unwanted filesystem types
            if partition.fstype in self.exclude_types:
                continue

            # Skip if we're only monitoring specific partitions
            if self.partitions and partition.mountpoint not in self.partitions:
                continue

            try:
                usage = psutil.disk_usage(partition.mountpoint)
                partitions_data[partition.mountpoint] = {
                    'device': partition.device,
                    'fstype': partition.fstype,
                    'total': usage.total,
                    'used': usage.used,
                    'free': usage.free,
                    'percent': usage.percent
                }
            except PermissionError:
                logger.debug(f"Permission denied accessing {partition.mountpoint}")
                continue
            except Exception as e:
                logger.warning(f"Error reading {partition.mountpoint}: {e}")
                continue

        metrics['partitions'] = partitions_data

        # Collect I/O statistics
        if self.include_io:
            try:
                io_counters = psutil.disk_io_counters(perdisk=True)
                io_data = {}

                for disk_name, counters in io_counters.items():
                    disk_stats = {
                        'read_count': counters.read_count,
                        'write_count': counters.write_count,
                        'read_bytes': counters.read_bytes,
                        'write_bytes': counters.write_bytes,
                    }

                    # Add time statistics if available
                    if hasattr(counters, 'read_time'):
                        disk_stats['read_time'] = counters.read_time
                    if hasattr(counters, 'write_time'):
                        disk_stats['write_time'] = counters.write_time
                    if hasattr(counters, 'busy_time'):
                        disk_stats['busy_time'] = counters.busy_time

                    # Calculate deltas from previous collection; a disk seen
                    # for the first time has no previous reading and gets none.
                    if disk_name in self._prev_io:
                        prev = self._prev_io[disk_name]
                        disk_stats['read_bytes_delta'] = counters.read_bytes - prev.read_bytes
                        disk_stats['write_bytes_delta'] = counters.write_bytes - prev.write_bytes
                        disk_stats['read_count_delta'] = counters.read_count - prev.read_count
                        disk_stats['write_count_delta'] = counters.write_count - prev.write_count

                    io_data[disk_name] = disk_stats

                metrics['io_counters'] = io_data

                # Store current counters for next delta calculation
                self._prev_io = io_counters

            except Exception as e:
                logger.warning(f"Could not collect disk I/O statistics: {e}")

        return metrics

    async def cleanup(self):
        """Cleanup (nothing to do for this plugin)."""
        logger.info("Disk monitor cleanup")
|
||||
|
||||
|
||||
# Module-level alias to the plugin class (the class itself, not an instance),
# exposed for automatic discovery.
plugin = DiskMonitorPlugin
|
||||
@@ -0,0 +1,168 @@
|
||||
"""
|
||||
Filesystem information plugin for Heartbeat.
|
||||
|
||||
Collects static filesystem and partition information using psutil.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
try:
|
||||
import psutil
|
||||
except ImportError:
|
||||
psutil = None
|
||||
|
||||
from hbd.client.plugin import InfoPlugin
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class FilesystemInfoPlugin(InfoPlugin):
    """
    Collect filesystem and partition information.

    This is an InfoPlugin that collects static information once during startup.

    By default, only reports physical mounted filesystems (e.g., ext4, xfs, btrfs).
    Set include_pseudo=True to also include pseudo filesystems (proc, sysfs, tmpfs, etc.).

    Collects:
    - List of mounted filesystems
    - Partition details (device, mount point, filesystem type, options)
    - Filesystem capabilities and features

    Configuration:
        include_pseudo: Include pseudo/virtual filesystems (default: False)
        exclude_types: Filesystem type or list of additional types to exclude
            (default: [])
    """

    name = "filesystem_info"
    interval = 0  # InfoPlugin - collect once

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Initialize the filesystem info plugin.

        Args:
            config: Optional configuration dict with keys:
                - include_pseudo: Include pseudo/virtual filesystems (default: False)
                - exclude_types: Filesystem type(s) to exclude; a single
                  string or a list (default: [])

        Raises:
            ImportError: If psutil could not be imported.
        """
        super().__init__(config)
        self.include_pseudo = self.config.get('include_pseudo', False)

        # By default, no exclusions since all=False filters most pseudo filesystems.
        # Users can add specific types to exclude if needed. A scalar such
        # as `exclude_types: tmpfs` arrives as a str; set("tmpfs") would be
        # a set of single characters, so wrap it. Explicit null means none.
        exclude = self.config.get('exclude_types', [])
        if exclude is None:
            exclude = []
        elif isinstance(exclude, str):
            exclude = [exclude]
        self.exclude_types = set(exclude)

        if psutil is None:
            raise ImportError("psutil library is required for filesystem_info plugin")

    async def initialize(self):
        """Initialize the plugin (check psutil availability).

        Returns:
            True when ready, False if psutil is unavailable.
        """
        if psutil is None:
            logger.error("psutil not available - filesystem_info cannot run")
            return False

        logger.info(f"Filesystem info initialized (pseudo: {self.include_pseudo})")
        return True

    async def collect(self) -> Dict[str, Any]:
        """
        Collect filesystem information.

        Returns only physical mounted filesystems by default.

        Returns:
            Dictionary with filesystem data:
            - filesystems: List of filesystem dictionaries:
                - device: Device name (e.g., /dev/sda1)
                - mountpoint: Mount point path
                - fstype: Filesystem type (e.g., ext4, xfs, btrfs)
                - opts: Mount options (comma-separated string)
                - maxfile: Maximum filename length (when available)
                - maxpath: Maximum path length (when available)
            - filesystem_types: Sorted list of unique filesystem types found
            - mount_count: Total number of mounted filesystems
            - boot_time: Value of psutil.boot_time() (when available)

            {} if psutil is unavailable; {"error": <message>} if collection
            raised.
        """
        if psutil is None:
            logger.error("psutil not available")
            return {}

        try:
            data = await self._collect_info()
            logger.info(f"Collected filesystem info: {len(data.get('filesystems', []))} filesystems")
            return data
        except Exception as e:
            logger.error(f"Error collecting filesystem info: {e}")
            return {"error": str(e)}

    async def _collect_info(self) -> Dict[str, Any]:
        """Collect filesystem information from psutil."""
        # Imported once per collection rather than once per partition.
        import os

        info = {}
        filesystems = []
        filesystem_types = set()

        # Get mounted disk partitions
        # all=False returns only physical devices (real mounted filesystems)
        # all=True would include pseudo filesystems (proc, sysfs, etc.)
        partitions = psutil.disk_partitions(all=self.include_pseudo)

        # Output key -> os.pathconf() name for the per-mount limits.
        pathconf_limits = (
            ('maxfile', 'PC_NAME_MAX'),  # Maximum filename length
            ('maxpath', 'PC_PATH_MAX'),  # Maximum path length
        )

        for partition in partitions:
            # Additional filtering if exclude_types is specified
            if partition.fstype in self.exclude_types:
                continue

            fs_info = {
                'device': partition.device,
                'mountpoint': partition.mountpoint,
                'fstype': partition.fstype,
                'opts': partition.opts,
            }

            # Try to get filesystem capabilities
            try:
                # os.pathconf does not exist on every platform
                if hasattr(os, 'pathconf'):
                    for key, conf_name in pathconf_limits:
                        try:
                            limit = os.pathconf(partition.mountpoint, conf_name)
                        except (OSError, ValueError):
                            continue
                        if limit:
                            fs_info[key] = limit
            except Exception as e:
                logger.debug(f"Could not get pathconf for {partition.mountpoint}: {e}")

            filesystems.append(fs_info)
            filesystem_types.add(partition.fstype)

        info['filesystems'] = filesystems
        info['filesystem_types'] = sorted(filesystem_types)
        info['mount_count'] = len(filesystems)

        # Add some additional filesystem statistics
        try:
            # Get boot time (useful for determining filesystem mount times)
            boot_time = psutil.boot_time()
            info['boot_time'] = boot_time
        except Exception as e:
            logger.debug(f"Could not get boot time: {e}")

        return info

    async def cleanup(self):
        """Cleanup (nothing to do for this plugin)."""
        logger.info("Filesystem info cleanup")
|
||||
|
||||
|
||||
# Module-level alias to the plugin class (the class itself, not an instance),
# exposed for automatic discovery.
plugin = FilesystemInfoPlugin
|
||||
@@ -0,0 +1,175 @@
|
||||
"""
|
||||
Memory monitoring plugin for Heartbeat.
|
||||
|
||||
Collects memory and swap usage statistics using psutil.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
try:
|
||||
import psutil
|
||||
except ImportError:
|
||||
psutil = None
|
||||
|
||||
from hbd.client.plugin import MonitorPlugin
|
||||
|
||||
|
||||
def _zfs_arc_bytes() -> int:
|
||||
"""Return current ZFS ARC size in bytes, or 0 if ZFS is not present.
|
||||
|
||||
ZFS ARC is reclaimable but is not included in MemAvailable by the Linux
|
||||
kernel (it is not in SReclaimable), so it would otherwise be counted as
|
||||
used memory.
|
||||
"""
|
||||
try:
|
||||
with open("/proc/spl/kstat/zfs/arcstats") as fh:
|
||||
for line in fh:
|
||||
parts = line.split()
|
||||
if len(parts) >= 3 and parts[0] == "size":
|
||||
return int(parts[2])
|
||||
except (OSError, ValueError):
|
||||
pass
|
||||
return 0
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MemoryMonitorPlugin(MonitorPlugin):
    """
    Report RAM and swap usage.

    Metrics cover physical memory totals and usage, platform-specific
    breakdowns where psutil exposes them, and (optionally) swap.

    Configuration:
        interval: Collection interval in seconds (default: 300)
        include_swap: Include swap statistics (default: True)
    """

    name = "memory_monitor"
    interval = 300  # Collect every 5 minutes by default

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Read plugin settings from ``config``.

        Args:
            config: Optional configuration dict with keys:
                - interval: Collection interval in seconds (default: 300)
                - include_swap: Include swap statistics (default: True)

        Raises:
            ImportError: If psutil could not be imported.
        """
        super().__init__(config)
        settings = self.config
        self.include_swap = settings.get('include_swap', True)
        self.interval = settings.get('interval', 300)

        if psutil is None:
            raise ImportError("psutil library is required for memory_monitor plugin")

    async def initialize(self):
        """Report whether the plugin can run (psutil must be importable)."""
        if psutil is not None:
            logger.info(f"Memory monitor initialized (interval: {self.interval}s, swap: {self.include_swap})")
            return True

        logger.error("psutil not available - memory_monitor cannot run")
        return False

    async def collect(self) -> Dict[str, Any]:
        """
        Gather the current memory statistics.

        Returns:
            Dictionary with memory metrics:
            - memory_total: Total physical RAM in bytes
            - memory_available: Available memory in bytes (ZFS ARC added back)
            - memory_used: memory_total minus memory_available
            - memory_free: Free memory in bytes
            - memory_percent: Memory usage percentage
            - memory_active / memory_inactive / memory_buffers /
              memory_cached / memory_shared: present only when psutil
              exposes the corresponding field
            - swap_total, swap_used, swap_free, swap_percent (if include_swap)
            - swap_sin, swap_sout: swap in/out counters (if include_swap and
              exposed by psutil)

            {} if psutil is unavailable; {"error": <message>} if collection
            raised.
        """
        if psutil is None:
            logger.error("psutil not available")
            return {}

        try:
            data = await self._collect_metrics()
        except Exception as e:
            logger.error(f"Error collecting memory metrics: {e}")
            return {"error": str(e)}

        logger.debug(f"Collected memory metrics: {len(data)} fields")
        return data

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Build the metrics dict from psutil readings."""
        vmem = psutil.virtual_memory()

        # vmem.available already leaves out page cache / file buffers, but
        # not ZFS ARC: the kernel keeps ARC out of SReclaimable/MemAvailable
        # although it can be reclaimed. Add it back, capped at total RAM.
        arc_bytes = _zfs_arc_bytes()
        available = min(vmem.available + arc_bytes, vmem.total)
        used = vmem.total - available
        percent = round(used / vmem.total * 100, 1) if vmem.total else 0.0

        metrics = {
            'memory_total': vmem.total,
            'memory_available': available,
            'memory_used': used,
            'memory_free': vmem.free,
            'memory_percent': percent,
        }

        # Platform-specific fields: copy only those this psutil build has.
        for field in ('active', 'inactive', 'buffers', 'cached', 'shared'):
            if hasattr(vmem, field):
                metrics[f'memory_{field}'] = getattr(vmem, field)

        if not self.include_swap:
            return metrics

        try:
            swap = psutil.swap_memory()
            for field in ('total', 'used', 'free', 'percent'):
                metrics[f'swap_{field}'] = getattr(swap, field)

            # Swap in/out counters (may not be available on all platforms)
            for field in ('sin', 'sout'):
                if hasattr(swap, field):
                    metrics[f'swap_{field}'] = getattr(swap, field)
        except Exception as e:
            logger.warning(f"Could not collect swap statistics: {e}")

        return metrics

    async def cleanup(self):
        """No resources to release; only logs that cleanup ran."""
        logger.info("Memory monitor cleanup")
|
||||
|
||||
|
||||
# Module-level alias to the plugin class (the class itself, not an instance),
# exposed for automatic discovery.
plugin = MemoryMonitorPlugin
|
||||
@@ -0,0 +1,287 @@
|
||||
"""Nagios Plugin Runner for Heartbeat.
|
||||
|
||||
Executes Nagios-compatible monitoring plugins and parses their output.
|
||||
|
||||
Nagios Plugin Standard:
|
||||
- Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
|
||||
- Output format: Single line status message, optional performance data
|
||||
- Performance data format: 'label'=value[UOM];[warn];[crit];[min];[max]
|
||||
|
||||
Example configuration in ~/.hb.yaml:
|
||||
```yaml
|
||||
nagios_runner:
|
||||
interval: 60
|
||||
commands:
|
||||
- name: check_disk_root
|
||||
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
|
||||
- name: check_procs
|
||||
command: /usr/lib/nagios/plugins/check_procs -w 250 -c 400
|
||||
- name: check_load
|
||||
command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
|
||||
```
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import re
|
||||
import shlex
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from hbd.client.plugin import MonitorPlugin
|
||||
|
||||
|
||||
# Nagios exit codes
# UNKNOWN is also what this module reports when a check could not be run
# normally (timeout, killed by a signal, exit code above 3, execution error).
NAGIOS_UNKNOWN = 3

# Human-readable status for each standard Nagios exit code.
STATUS_NAMES = {
    0: "OK",
    1: "WARNING",
    2: "CRITICAL",
    3: "UNKNOWN",
}
|
||||
|
||||
|
||||
class NagiosRunnerPlugin(MonitorPlugin):
|
||||
"""Run Nagios-compatible monitoring plugins.
|
||||
|
||||
This plugin executes external Nagios plugins and collects their output,
|
||||
including status codes, messages, and performance data.
|
||||
|
||||
Configuration:
|
||||
interval: Collection interval in seconds (default: 300)
|
||||
commands: List of command definitions with 'name' and 'command' keys
|
||||
timeout: Command execution timeout in seconds (default: 30)
|
||||
|
||||
Example:
|
||||
nagios_runner:
|
||||
interval: 300 # Check every 5 minutes
|
||||
timeout: 30
|
||||
commands:
|
||||
- name: check_disk
|
||||
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10%
|
||||
- name: check_load
|
||||
command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
|
||||
"""
|
||||
|
||||
name = "nagios_runner"
|
||||
version = "1.0.0"
|
||||
description = "Execute Nagios-compatible monitoring plugins"
|
||||
interval = 300 # MonitorPlugin: collect every 5 minutes by default
|
||||
|
||||
def __init__(self, config: Optional[Dict[str, Any]] = None):
    """Read the runner's settings from ``config``.

    Args:
        config: Optional configuration dict with keys:
            - commands: List of dicts with 'name' and 'command' (default: [])
            - timeout: Per-command timeout in seconds (default: 30)
            - interval: Collection interval in seconds (default: 300)
    """
    super().__init__(config)

    # A missing or empty config means every setting takes its default.
    settings = config if config else {}
    self.commands: List[Dict[str, str]] = settings.get("commands", [])
    self.timeout: int = settings.get("timeout", 30)
    self.interval = settings.get("interval", 300)
|
||||
|
||||
async def initialize(self) -> bool:
    """Prepare the Nagios runner.

    Logs the configured commands, then warns about commands whose
    executable is given as an absolute path that is missing or not
    executable. These warnings do not fail initialization.

    Returns:
        True if at least one command is configured, False otherwise
        (in which case ``skip_reason`` is set).
    """
    self.logger.info(f"Initializing {self.name} plugin")

    if not self.commands:
        self.skip_reason = "no commands configured (add nagios_runner.commands to config)"
        return False

    self.logger.info(f"Configured to run {len(self.commands)} Nagios plugin(s)")
    for entry in self.commands:
        label = entry.get("name", "unnamed")
        self.logger.info(f" - {label}: {entry.get('command', 'N/A')}")

    # Validate absolute command paths early
    for entry in self.commands:
        label = entry.get("name", "unnamed")
        command_line = entry.get("command", "")
        if not command_line:
            continue

        try:
            argv = shlex.split(command_line)
        except ValueError:
            continue  # malformed command string; skip validation

        # Relative names are left to the shell's PATH lookup at run time.
        if not argv or not os.path.isabs(argv[0]):
            continue

        exe = argv[0]
        if not os.path.isfile(exe):
            self.logger.warning(
                f"Command '{label}': executable not found: {exe}"
            )
        elif not os.access(exe, os.X_OK):
            self.logger.warning(
                f"Command '{label}': executable not executable: {exe}"
            )

    return True
|
||||
|
||||
async def _collect_metrics(self) -> Dict[str, Any]:
    """Run every configured Nagios plugin and flatten the results.

    For a command named ``N`` the result contains ``N_status``,
    ``N_status_code`` and ``N_output``, plus one ``N_<metric>`` entry per
    performance-data item. Entries lacking a name or a command are
    skipped with a warning.

    Returns:
        Dictionary with results from all plugins
    """
    results: Dict[str, Any] = {}

    for entry in self.commands:
        name = entry.get("name")
        command = entry.get("command")

        if not (name and command):
            self.logger.warning("Skipping command with missing name or command")
            continue

        try:
            status_code, output, perfdata = await self._run_nagios_plugin(command)
            status_text = STATUS_NAMES.get(status_code, "UNKNOWN")

            results[f"{name}_status"] = status_text
            results[f"{name}_status_code"] = status_code
            results[f"{name}_output"] = output

            # Performance data is merged in under the command's prefix.
            if perfdata:
                results.update(
                    {f"{name}_{metric}": value for metric, value in perfdata.items()}
                )

            self.logger.info(
                f"Executed {name}: {status_text} - {output[:50]}"
            )
        except Exception as e:
            self.logger.error(f"Error running {name}: {e}", exc_info=True)
            results[f"{name}_status"] = "ERROR"
            results[f"{name}_status_code"] = NAGIOS_UNKNOWN
            results[f"{name}_output"] = str(e)

    return results
|
||||
|
||||
async def _run_nagios_plugin(
|
||||
self,
|
||||
command: str
|
||||
) -> Tuple[int, str, Dict[str, Any]]:
|
||||
"""Execute a Nagios plugin and parse its output."""
|
||||
try:
|
||||
proc = await asyncio.create_subprocess_shell(
|
||||
command,
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.PIPE,
|
||||
)
|
||||
try:
|
||||
stdout_bytes, stderr_bytes = await asyncio.wait_for(
|
||||
proc.communicate(), timeout=self.timeout
|
||||
)
|
||||
except asyncio.TimeoutError:
|
||||
proc.kill()
|
||||
await proc.communicate()
|
||||
self.logger.error(f"Command timed out: {command}")
|
||||
return NAGIOS_UNKNOWN, f"Command timed out after {self.timeout}s", {}
|
||||
|
||||
status_code = proc.returncode
|
||||
|
||||
if status_code < 0:
|
||||
return NAGIOS_UNKNOWN, f"Process killed by signal {-status_code}", {}
|
||||
|
||||
if status_code > 3:
|
||||
status_code = NAGIOS_UNKNOWN
|
||||
|
||||
stdout = stdout_bytes.decode(errors="replace").strip()
|
||||
stderr = stderr_bytes.decode(errors="replace").strip()
|
||||
|
||||
# Parse perfdata from stdout before mixing in stderr
|
||||
perfdata = self._parse_perfdata(stdout)
|
||||
|
||||
# Build status message
|
||||
status_part = stdout.split('|')[0].strip() if '|' in stdout else stdout
|
||||
|
||||
if not stdout and stderr:
|
||||
output_msg = stderr
|
||||
elif stdout and stderr:
|
||||
output_msg = f"{status_part} [stderr: {stderr}]"
|
||||
else:
|
||||
output_msg = status_part
|
||||
|
||||
return status_code, output_msg, perfdata
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error executing command: {e}")
|
||||
return NAGIOS_UNKNOWN, f"Execution error: {str(e)}", {}
|
||||
|
||||
def _parse_perfdata(self, output: str) -> Dict[str, Any]:
|
||||
"""Parse Nagios performance data from plugin output.
|
||||
|
||||
Nagios performance data format:
|
||||
'label'=value[UOM];[warn];[crit];[min];[max]
|
||||
|
||||
Multiple metrics separated by spaces.
|
||||
|
||||
Args:
|
||||
output: Plugin output string
|
||||
|
||||
Returns:
|
||||
Dictionary of metric_name: value
|
||||
"""
|
||||
perfdata = {}
|
||||
|
||||
# Performance data comes after the pipe character
|
||||
if '|' not in output:
|
||||
return perfdata
|
||||
|
||||
perf_section = output.split('|', 1)[1].strip()
|
||||
|
||||
# Regex to match performance data format
|
||||
# Matches: 'label'=value or label=value
|
||||
perf_regex = r"'?([^'=]+)'?=([\d.]+)([a-zA-Z%]*);?([\d.]*);?([\d.]*);?([\d.]*);?([\d.]*)"
|
||||
|
||||
for match in re.finditer(perf_regex, perf_section):
|
||||
label = match.group(1).strip()
|
||||
value_str = match.group(2)
|
||||
uom = match.group(3) or ""
|
||||
warn = match.group(4)
|
||||
crit = match.group(5)
|
||||
min_val = match.group(6)
|
||||
max_val = match.group(7)
|
||||
|
||||
# Convert value to float
|
||||
try:
|
||||
value = float(value_str)
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
# Store the value
|
||||
perfdata[label] = value
|
||||
|
||||
# Optionally store UOM as separate field
|
||||
if uom:
|
||||
perfdata[f"{label}_uom"] = uom
|
||||
|
||||
# Store thresholds if present
|
||||
if warn:
|
||||
try:
|
||||
perfdata[f"{label}_warn"] = float(warn)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
if crit:
|
||||
try:
|
||||
perfdata[f"{label}_crit"] = float(crit)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
if min_val:
|
||||
try:
|
||||
perfdata[f"{label}_min"] = float(min_val)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
if max_val:
|
||||
try:
|
||||
perfdata[f"{label}_max"] = float(max_val)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
return perfdata
|
||||
@@ -0,0 +1,240 @@
|
||||
"""
|
||||
Network monitoring plugin for Heartbeat.
|
||||
|
||||
Collects network interface statistics and connection information using psutil.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Dict, Any, Optional, List
|
||||
|
||||
try:
|
||||
import psutil
|
||||
except ImportError:
|
||||
psutil = None
|
||||
|
||||
from hbd.client.plugin import MonitorPlugin
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class NetworkMonitorPlugin(MonitorPlugin):
    """
    Monitor network interface statistics and connections via psutil.

    Collects:
    - Per-interface I/O counters (bytes, packets, errors, drops) and their
      change since the previous collection
    - Connection counts grouped by state (optional)
    - Interface addresses (optional)
    - Interface link status (up/down, duplex, speed, MTU)

    Configuration:
        interval: Collection interval in seconds (default: 300)
        interfaces: List of interfaces to monitor (default: all)
        include_connections: Include connection statistics (default: True)
        include_addresses: Include interface addresses (default: False)
    """

    name = "network_monitor"
    interval = 300  # seconds; overridden per instance from the config

    # Counter attributes copied verbatim from psutil's per-NIC counters.
    _IO_FIELDS = (
        'bytes_sent', 'bytes_recv', 'packets_sent', 'packets_recv',
        'errin', 'errout', 'dropin', 'dropout',
    )
    # Counters for which a "<field>_delta" entry is also reported.
    _DELTA_FIELDS = ('bytes_sent', 'bytes_recv', 'packets_sent', 'packets_recv')

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Read the plugin options from the configuration.

        Args:
            config: Optional configuration dict with keys:
                - interval: Collection interval in seconds (default: 300)
                - interfaces: List of specific interfaces to monitor
                - include_connections: Include connection stats (default: True)
                - include_addresses: Include interface addresses (default: False)

        Raises:
            ImportError: If psutil could not be imported.
        """
        super().__init__(config)
        options = self.config
        self.interfaces = options.get('interfaces', None)  # falsy = all interfaces
        self.include_connections = options.get('include_connections', True)
        self.include_addresses = options.get('include_addresses', False)
        self.interval = options.get('interval', 300)

        if psutil is None:
            raise ImportError("psutil library is required for network_monitor plugin")

        # Previous per-NIC counters, used to compute deltas.
        self._prev_io = {}

    async def initialize(self):
        """Take the baseline I/O snapshot; returns False when psutil is missing."""
        if psutil is None:
            logger.error("psutil not available - network_monitor cannot run")
            return False

        logger.info(f"Network monitor initialized (interval: {self.interval}s, "
                    f"connections: {self.include_connections})")

        try:
            self._prev_io = psutil.net_io_counters(pernic=True)
        except Exception as e:
            logger.warning(f"Could not initialize network I/O counters: {e}")

        return True

    async def collect(self) -> Dict[str, Any]:
        """
        Collect current network statistics.

        Returns:
            Dictionary with these keys (a key is absent when its section
            could not be collected):
                - interfaces: per interface name, the raw counters
                  (bytes_sent, bytes_recv, packets_sent, packets_recv, errin,
                  errout, dropin, dropout) and, once a previous sample
                  exists, bytes_sent_delta, bytes_recv_delta,
                  packets_sent_delta and packets_recv_delta
                - connections: connection count per state
                  (only if include_connections)
                - addresses: per interface name, a list of address dicts
                  (only if include_addresses)
                - interface_stats: per interface name, isup / duplex /
                  speed / mtu
            ``{}`` when psutil is unavailable, ``{"error": ...}`` when the
            collection raised.
        """
        if psutil is None:
            logger.error("psutil not available")
            return {}

        try:
            data = await self._collect_metrics()
        except Exception as e:
            logger.error(f"Error collecting network metrics: {e}")
            return {"error": str(e)}

        logger.debug(f"Collected network metrics: {len(data.get('interfaces', {}))} interfaces")
        return data

    def _is_monitored(self, iface_name: str) -> bool:
        """True when no interface filter is set or the interface is listed in it."""
        return not self.interfaces or iface_name in self.interfaces

    def _io_stats(self, iface_name: str, counters: Any) -> Dict[str, Any]:
        """Build the counter dict for one interface, adding deltas when possible."""
        stats = {field: getattr(counters, field) for field in self._IO_FIELDS}
        if iface_name in self._prev_io:
            previous = self._prev_io[iface_name]
            for field in self._DELTA_FIELDS:
                stats[f'{field}_delta'] = getattr(counters, field) - getattr(previous, field)
        return stats

    @staticmethod
    def _address_info(addr: Any) -> Dict[str, Any]:
        """Describe one interface address; netmask/broadcast only when set."""
        info = {'family': str(addr.family), 'address': addr.address}
        if addr.netmask:
            info['netmask'] = addr.netmask
        if addr.broadcast:
            info['broadcast'] = addr.broadcast
        return info

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Collect each metric section independently; a failing section is logged and skipped."""
        metrics = {}

        # Per-interface I/O counters
        try:
            snapshot = psutil.net_io_counters(pernic=True)
            metrics['interfaces'] = {
                iface: self._io_stats(iface, counters)
                for iface, counters in snapshot.items()
                if self._is_monitored(iface)
            }
            # Remember this sample for the next delta calculation.
            self._prev_io = snapshot
        except Exception as e:
            logger.warning(f"Could not collect network I/O counters: {e}")

        # Connection counts by state
        if self.include_connections:
            try:
                per_state = {}
                for conn in psutil.net_connections(kind='inet'):
                    per_state[conn.status] = per_state.get(conn.status, 0) + 1
                metrics['connections'] = per_state
            except (PermissionError, psutil.AccessDenied):
                logger.debug("Permission denied for net_connections (requires root/admin)")
            except Exception as e:
                logger.warning(f"Could not collect connection statistics: {e}")

        # Interface addresses
        if self.include_addresses:
            try:
                metrics['addresses'] = {
                    iface: [self._address_info(addr) for addr in addrs]
                    for iface, addrs in psutil.net_if_addrs().items()
                    if self._is_monitored(iface)
                }
            except Exception as e:
                logger.warning(f"Could not collect interface addresses: {e}")

        # Link status (up/down, duplex, speed, mtu)
        try:
            metrics['interface_stats'] = {
                iface: {
                    'isup': link.isup,
                    'duplex': str(link.duplex) if hasattr(link, 'duplex') else None,
                    'speed': link.speed,
                    'mtu': link.mtu,
                }
                for iface, link in psutil.net_if_stats().items()
                if self._is_monitored(iface)
            }
        except Exception as e:
            logger.warning(f"Could not collect interface stats: {e}")

        return metrics

    async def cleanup(self):
        """Log the shutdown; the plugin holds no resources to release."""
        logger.info("Network monitor cleanup")
|
||||
|
||||
|
||||
# Plugin instance for automatic discovery
|
||||
plugin = NetworkMonitorPlugin
|
||||
@@ -0,0 +1,142 @@
|
||||
"""OS Information Plugin for Heartbeat.
|
||||
|
||||
Collects static operating system information including OS name, version,
|
||||
kernel, architecture, and distribution details.
|
||||
"""
|
||||
|
||||
import platform
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
# Import from parent package
|
||||
from hbd.client.plugin import InfoPlugin
|
||||
|
||||
|
||||
class OSInfoPlugin(InfoPlugin):
    """Collect operating system information.

    This plugin gathers static OS information that rarely changes:
    - OS name and version
    - Kernel version
    - Architecture (x86_64, arm64, etc.)
    - Distribution details (for Linux)
    - Python version (used by hbc)
    """

    name = "os_info"
    version = "1.0.0"
    description = "Operating system and platform information"
    interval = 0  # InfoPlugin: collect once at startup

    # Source key -> reported key, per release file format.
    _OS_RELEASE_KEYS = {
        "NAME": "distro_name",
        "VERSION": "distro_version",
        "ID": "distro_id",
        "VERSION_ID": "distro_version_id",
        "PRETTY_NAME": "distro_pretty_name",
    }
    _LSB_RELEASE_KEYS = {
        "DISTRIB_ID": "distro_id",
        "DISTRIB_RELEASE": "distro_version",
        "DISTRIB_DESCRIPTION": "distro_name",
    }

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        super().__init__(config)

    async def initialize(self) -> bool:
        """Initialize the OS info plugin.

        Returns:
            True (always succeeds - platform module is stdlib)
        """
        self.logger.info(f"Initializing {self.name} plugin")
        return True

    async def _collect_info(self) -> Dict[str, Any]:
        """Collect OS information.

        Returns:
            Dictionary with platform details, the hbd package version, the
            optional ``owner`` from the plugin config and OS-specific
            extras (Linux distribution, macOS version or Windows release
            fields). An empty dict is returned if anything raises.
        """
        try:
            from hbd import __version__ as hbc_version

            system = platform.system()
            data = {
                "system": system,  # e.g., "Linux", "Darwin", "Windows"
                "node": platform.node(),  # hostname
                "release": platform.release(),  # kernel version
                "version": platform.version(),  # detailed version
                "machine": platform.machine(),  # e.g., "x86_64", "arm64"
                "processor": platform.processor(),  # processor name
                "architecture": platform.architecture()[0],  # e.g., "64bit"
                "python_version": platform.python_version(),
                "python_implementation": platform.python_implementation(),
                "hbc_version": hbc_version,
                "hbc_type": "full",
            }
            if self.config.get("owner"):
                self.logger.debug(f"Adding owner from config: {self.config['owner']}")
                data["owner"] = self.config["owner"]

            if system == "Linux":
                data.update(self._get_linux_distro())
            elif system == "Darwin":
                data["macos_version"] = platform.mac_ver()[0]
            elif system == "Windows":
                win_ver = platform.win32_ver()
                data["windows_release"] = win_ver[0]
                data["windows_version"] = win_ver[1]
                data["windows_sp"] = win_ver[2]
                data["windows_type"] = win_ver[3]

            self.logger.debug(f"Collected OS info: {data['system']} {data['release']}")
            return data

        except Exception as e:
            self.logger.error(f"Error collecting OS info: {e}", exc_info=True)
            return {}

    def _get_linux_distro(self) -> Dict[str, str]:
        """Get Linux distribution information.

        Reads ``/etc/os-release`` when it exists, otherwise
        ``/etc/lsb-release``. Surrounding quotes are stripped from values in
        both files, so ``DISTRIB_DESCRIPTION="..."`` no longer keeps its
        quotes.

        Returns:
            Dictionary with the ``distro_*`` keys found; empty when neither
            file exists. A read error is logged and whatever was parsed up
            to that point is returned.
        """
        distro_info: Dict[str, str] = {}
        candidates = (
            (Path("/etc/os-release"), self._OS_RELEASE_KEYS),
            (Path("/etc/lsb-release"), self._LSB_RELEASE_KEYS),
        )

        for release_file, key_map in candidates:
            if not release_file.exists():
                continue
            try:
                with open(release_file) as f:
                    for line in f:
                        line = line.strip()
                        if "=" not in line or line.startswith("#"):
                            continue
                        key, value = line.split("=", 1)
                        target = key_map.get(key)
                        if target is not None:
                            # Values may be wrapped in double or single quotes.
                            distro_info[target] = value.strip('"').strip("'")
            except Exception as e:
                self.logger.warning(f"Could not read {release_file}: {e}")
            # Only the first existing file is consulted, even if reading it failed.
            break

        return distro_info
|
||||
@@ -0,0 +1,147 @@
|
||||
"""Ping Monitor Plugin for Heartbeat.
|
||||
|
||||
Pings one or more hosts and reports round-trip time. Results are sent as
|
||||
plugin metrics so the server-side threshold system can raise WARNING/CRITICAL
|
||||
alerts using the same RTT threshold configuration format used for heartbeat RTT.
|
||||
|
||||
Example configuration in ~/.hbc.yaml (or the plugins section of ~/.hb.yaml):
|
||||
|
||||
```yaml
|
||||
plugins:
|
||||
ping_monitor:
|
||||
interval: 60 # ping every 60 seconds (default)
|
||||
count: 3 # ICMP packets per ping run (default 3)
|
||||
timeout: 5 # seconds before a host is considered unreachable (default 5)
|
||||
hosts:
|
||||
- 8.8.8.8
|
||||
- 192.168.1.1
|
||||
```
|
||||
|
||||
Reported metrics per host (metric key uses the hostname with dots/colons replaced
|
||||
by underscores so it is a valid identifier):
|
||||
|
||||
<host_key>_rtt_avg – average RTT in ms (float, or inf if unreachable)
|
||||
<host_key>_rtt_min – minimum RTT in ms
|
||||
<host_key>_rtt_max – maximum RTT in ms
|
||||
<host_key>_loss – packet loss percentage (0–100)
|
||||
|
||||
Server-side threshold config example:
|
||||
|
||||
```yaml
|
||||
threshold_configs:
|
||||
default:
|
||||
thresholds:
|
||||
ping_monitor:
|
||||
8_8_8_8_rtt_avg:
|
||||
warning: 20.0
|
||||
critical: 100.0
|
||||
```
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import re
|
||||
import sys
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from hbd.client.plugin import MonitorPlugin
|
||||
|
||||
|
||||
def _host_key(host: str) -> str:
|
||||
"""Convert a hostname/IP to a safe metric key (replace . and : with _)."""
|
||||
return re.sub(r"[^a-zA-Z0-9_]", "_", host)
|
||||
|
||||
|
||||
class PingMonitorPlugin(MonitorPlugin):
    """Ping one or more configured hosts and report RTT metrics.

    For every host the collected data contains ``{key}_rtt_min``,
    ``{key}_rtt_avg``, ``{key}_rtt_max`` (milliseconds, ``inf`` when no RTT
    could be measured) and ``{key}_loss`` (percent), where ``key`` is the
    host name passed through ``_host_key``.
    """

    name = "ping_monitor"
    version = "1.0.0"
    description = "ICMP ping latency monitoring"
    interval = 60

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Read interval, count, timeout and hosts from the configuration.

        ``hosts`` may be a list of host names, a mapping keyed by host name,
        a single host name string, or missing/empty.
        """
        super().__init__(config)
        cfg = config or {}
        self.interval = cfg.get("interval", 60)
        self.count = int(cfg.get("count", 3))
        self.timeout = int(cfg.get("timeout", 5))

        raw_hosts = cfg.get("hosts", {})
        if isinstance(raw_hosts, str):
            # A bare string is one host, not an iterable of characters.
            self.hosts = {raw_hosts: {}}
        elif isinstance(raw_hosts, (list, tuple)):
            self.hosts = {h: {} for h in raw_hosts}
        else:
            # "hosts:" with no value loads as None; treat it as no hosts.
            self.hosts = dict(raw_hosts or {})

    async def initialize(self) -> bool:
        """Return False (plugin disabled) when no hosts are configured."""
        if not self.hosts:
            self.logger.warning("ping_monitor: no hosts configured, plugin disabled")
            return False
        self.logger.info(
            "ping_monitor initialized: %d host(s), interval=%ds, count=%d, timeout=%ds",
            len(self.hosts), self.interval, self.count, self.timeout,
        )
        return True

    @staticmethod
    def _unreachable(loss: float = 100.0) -> Dict[str, float]:
        """Result used when no round-trip time could be measured."""
        inf = float("inf")
        return {"rtt_min": inf, "rtt_avg": inf, "rtt_max": inf, "loss": loss}

    def _ping_command(self, host: str) -> List[str]:
        """Build the platform-specific ping command line."""
        count = str(self.count)
        if sys.platform == "win32":
            # Windows: -w is the per-reply timeout in milliseconds.
            return ["ping", "-n", count, "-w", str(self.timeout * 1000), host]
        if sys.platform == "darwin" or sys.platform.startswith("freebsd"):
            # macOS/FreeBSD ping: -W is in milliseconds (Linux uses seconds).
            return ["ping", "-c", count, "-W", str(self.timeout * 1000), host]
        return ["ping", "-c", count, "-W", str(self.timeout), host]

    @classmethod
    def _parse_ping_output(cls, output: str) -> Dict[str, float]:
        """Extract loss and RTT statistics from ping's summary text.

        Recognises the Linux/macOS summary lines and the English-language
        Windows summary. When no RTT line is found the RTT values are
        ``inf``; when no loss figure is found loss is 100.
        """
        loss = 100.0
        # Linux/macOS: "0% packet loss"; Windows: "(0% loss)"
        loss_match = re.search(r"(\d+(?:\.\d+)?)\s*%\s*(?:packet\s*)?loss", output)
        if loss_match:
            loss = float(loss_match.group(1))

        # Linux: "rtt min/avg/max/mdev = x/x/x/x ms"
        # macOS: "round-trip min/avg/max/stddev = x/x/x/x ms"
        rtt_match = re.search(
            r"(?:rtt|round-trip)\s+min/avg/max/\S+\s*=\s*([\d.]+)/([\d.]+)/([\d.]+)",
            output,
        )
        if rtt_match:
            return {
                "rtt_min": float(rtt_match.group(1)),
                "rtt_avg": float(rtt_match.group(2)),
                "rtt_max": float(rtt_match.group(3)),
                "loss": loss,
            }

        # Windows: "Minimum = 1ms, Maximum = 3ms, Average = 2ms"
        win_match = re.search(
            r"Minimum\s*=\s*([\d.]+)\s*ms.*?Maximum\s*=\s*([\d.]+)\s*ms"
            r".*?Average\s*=\s*([\d.]+)\s*ms",
            output,
            re.DOTALL,
        )
        if win_match:
            return {
                "rtt_min": float(win_match.group(1)),
                "rtt_avg": float(win_match.group(3)),
                "rtt_max": float(win_match.group(2)),
                "loss": loss,
            }

        # Host unreachable or all packets lost
        return cls._unreachable(loss)

    async def _ping(self, host: str) -> Dict[str, float]:
        """Run a system ping command and return rtt_min/avg/max/loss."""
        try:
            proc = await asyncio.create_subprocess_exec(
                *self._ping_command(host),
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
        except OSError as e:  # includes FileNotFoundError (no ping binary)
            self.logger.warning("ping_monitor: ping failed for %s: %s", host, e)
            return self._unreachable()

        deadline = self.timeout * self.count + 2
        try:
            stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=deadline)
        except asyncio.TimeoutError:
            # Kill and reap the child so a hung ping is not left running.
            try:
                proc.kill()
            except ProcessLookupError:
                pass  # already exited
            await proc.communicate()
            self.logger.warning(
                "ping_monitor: ping failed for %s: %s",
                host, f"timed out after {deadline}s",
            )
            return self._unreachable()
        except OSError as e:
            self.logger.warning("ping_monitor: ping failed for %s: %s", host, e)
            return self._unreachable()

        return self._parse_ping_output(stdout.decode(errors="replace"))

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Ping all hosts concurrently and flatten the results into one dict."""
        data: Dict[str, Any] = {}
        tasks = {host: asyncio.create_task(self._ping(host)) for host in self.hosts}
        for host, task in tasks.items():
            try:
                result = await task
            except Exception as e:
                self.logger.error("ping_monitor: error pinging %s: %s", host, e)
                result = self._unreachable()
            key = _host_key(host)
            for metric, value in result.items():
                data[f"{key}_{metric}"] = value
            status = "unreachable" if result["loss"] == 100.0 else f"{result['rtt_avg']:.1f}ms"
            self.logger.debug("ping_monitor: %s -> %s", host, status)
        return data
|
||||
@@ -0,0 +1,140 @@
|
||||
"""
|
||||
ZFS pool monitoring plugin for Heartbeat.
|
||||
|
||||
Collects per-pool health, capacity, and cumulative I/O statistics via zpool(8).
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import shutil
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from hbd.client.plugin import MonitorPlugin
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _int(s: str) -> Optional[int]:
|
||||
try:
|
||||
return int(s.strip().rstrip("KMGTkBkmgt%x"))
|
||||
except (ValueError, AttributeError):
|
||||
return None
|
||||
|
||||
|
||||
def _float(s: str) -> Optional[float]:
|
||||
try:
|
||||
return float(s.strip().rstrip("%x"))
|
||||
except (ValueError, AttributeError):
|
||||
return None
|
||||
|
||||
|
||||
class ZFSMonitorPlugin(MonitorPlugin):
    """Monitor ZFS pool health, capacity, and I/O statistics.

    Collects per pool:
        - health: health string as printed by ``zpool list``
        - status: 0 for ONLINE, 1 for DEGRADED / "ONLINE with errors",
          2 for FAULTED / OFFLINE / UNAVAIL, 3 for anything else
        - size / alloc / free: values of the corresponding ``zpool list -p``
          columns
        - capacity: percentage used
        - frag: fragmentation percentage
        - dedup: deduplication ratio
        - read_ops / write_ops / read_bw / write_bw: I/O columns of
          ``zpool iostat -H -p``
          (NOTE(review): confirm whether these are cumulative counters or
          since-boot averages)

    Configuration:
        interval: collection interval in seconds (default: 300)
        pools: list of pool names to monitor (default: all)
    """

    name = "zfs_monitor"
    description = "ZFS pool health, capacity, and I/O statistics"
    interval = 300

    # Seconds a zpool invocation may run before it is killed.
    _COMMAND_TIMEOUT = 15

    # Health string -> numeric status; unknown strings map to 3.
    _HEALTH_STATUS = {
        "ONLINE": 0,
        "DEGRADED": 1,
        "ONLINE with errors": 1,
        "FAULTED": 2,
        "OFFLINE": 2,
        "UNAVAIL": 2,
    }

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        super().__init__(config)
        self.interval = self.config.get("interval", 300)
        self._pools_filter: Optional[List[str]] = self.config.get("pools", None)

    async def initialize(self) -> bool:
        """Return False (with ``skip_reason`` set) when ``zpool`` is not on PATH."""
        if not shutil.which("zpool"):
            self.skip_reason = "zpool not found"
            return False
        logger.info("ZFS monitor initialized (interval: %ds)", self.interval)
        return True

    async def _run(self, *args: str) -> List[str]:
        """Run a command and return its stdout lines, or [] on error.

        stderr is discarded. If the command does not finish within the
        timeout it is killed and reaped before [] is returned, so a hung
        ``zpool`` is not left running.
        """
        try:
            proc = await asyncio.create_subprocess_exec(
                *args,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.DEVNULL,
            )
        except FileNotFoundError as exc:
            logger.warning("zfs_monitor: %s: %s", args[0], exc)
            return []

        try:
            stdout, _ = await asyncio.wait_for(
                proc.communicate(), timeout=self._COMMAND_TIMEOUT
            )
        except asyncio.TimeoutError:
            try:
                proc.kill()
            except ProcessLookupError:
                pass  # already exited
            await proc.communicate()
            logger.warning(
                "zfs_monitor: %s: %s",
                args[0], f"timed out after {self._COMMAND_TIMEOUT}s",
            )
            return []

        return stdout.decode(errors="replace").splitlines()

    async def _zpool_list(self) -> Dict[str, Dict]:
        """Return per-pool health and capacity from `zpool list`."""
        lines = await self._run(
            "zpool", "list", "-H", "-p",
            "-o", "name,health,size,alloc,free,cap,frag,dedup",
        )
        pools: Dict[str, Dict] = {}
        for line in lines:
            parts = line.split("\t")
            if len(parts) < 8:
                continue  # not a complete data row
            name = parts[0].strip()
            if self._pools_filter and name not in self._pools_filter:
                continue
            health = parts[1].strip()
            pools[name] = {
                "health": health,
                "status": self._HEALTH_STATUS.get(health, 3),
                "size": _int(parts[2]),
                "alloc": _int(parts[3]),
                "free": _int(parts[4]),
                "capacity": _float(parts[5]),
                "frag": _float(parts[6]),
                "dedup": _float(parts[7]),
            }
        return pools

    async def _zpool_iostat(self) -> Dict[str, Dict]:
        """Return the per-pool I/O columns from `zpool iostat`."""
        lines = await self._run("zpool", "iostat", "-H", "-p")
        io: Dict[str, Dict] = {}
        for line in lines:
            parts = line.split("\t")
            if len(parts) < 7:
                continue  # not a complete data row
            name = parts[0].strip()
            if not name:
                continue
            # Columns 1-2 (alloc/free) are already taken from `zpool list`.
            io[name] = {
                "read_ops": _int(parts[3]),
                "write_ops": _int(parts[4]),
                "read_bw": _int(parts[5]),
                "write_bw": _int(parts[6]),
            }
        return io

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Run both zpool queries concurrently and merge I/O stats into each listed pool."""
        pools, io = await asyncio.gather(self._zpool_list(), self._zpool_iostat())
        for name, stats in io.items():
            # Pools excluded by the filter are absent from `pools` and stay excluded.
            if name in pools:
                pools[name].update(stats)
        return {"pools": pools}
|
||||
|
||||
|
||||
plugin = ZFSMonitorPlugin
|
||||
@@ -0,0 +1,3 @@
|
||||
"""Common utilities shared between hbc and hbd."""
|
||||
|
||||
from hbd import __version__
|
||||
@@ -0,0 +1,162 @@
|
||||
"""Message encoding/decoding utilities for hbd protocol.
|
||||
|
||||
Message Types:
|
||||
HTB: Heartbeat message (client -> server)
|
||||
ACK: Acknowledgment (server -> client)
|
||||
CMD: Command message (server -> client)
|
||||
UPD: Update message (server -> client)
|
||||
PLG: Plugin data message (client -> server)
|
||||
"""
|
||||
|
||||
from typing import Dict, Any, Union
|
||||
import json
|
||||
import zlib
|
||||
|
||||
|
||||
def encode_value(v: Any) -> str:
    """Render a value as the string used on the wire.

    Args:
        v: Value to encode (int, float, str, bool, list, dict, etc.)

    Returns:
        * float: fixed-point text with five decimals
        * list / dict: ``@`` followed by the JSON encoding
        * bool: ``1`` or ``0``
        * anything else: ``str(v)``
    """
    if isinstance(v, float):
        return format(v, "0.5f")
    if isinstance(v, (list, dict)):
        # The leading "@" marks a JSON payload for decode_value.
        return "@" + json.dumps(v)
    if isinstance(v, bool):
        return str(int(v))  # True->1, False->0
    return str(v)
|
||||
|
||||
|
||||
def decode_value(val: str) -> Any:
    """Turn a wire-format string back into a Python value.

    Args:
        val: String value from protocol

    Returns:
        * a falsy ``val`` (empty string, None) unchanged
        * for ``@...``: the parsed JSON, or the text after ``@`` when it is
          not valid JSON
        * for text starting with a digit, or ``-`` followed by a digit: an
          int if it parses as one, else a float if it parses as one
        * otherwise the string itself (e.g. version strings like ``1.2.3``)
    """
    if not val:
        return val

    if val.startswith("@"):
        payload = val[1:]
        try:
            return json.loads(payload)
        except Exception:
            return payload  # not JSON after all: hand back the raw text

    # Only attempt numeric parsing when the text looks like a number; no eval.
    head = val[0]
    looks_numeric = head.isdigit() or (head == '-' and len(val) > 1 and val[1].isdigit())
    if looks_numeric:
        for convert in (int, float):
            try:
                return convert(val)
            except ValueError:
                continue

    return val
|
||||
|
||||
|
||||
def dicttos(ID: str, d: Dict[str, Any]):
    """Serialize a dict to protocol message bytes.

    The payload ``key=value;key=value;...`` (values rendered by
    ``encode_value``) is always zlib-compressed at level 6 and prefixed with
    ``!ID:``.

    Args:
        ID: Message ID placed in the header.
        d: Key/value pairs to transmit.

    Returns:
        ``b"!" + ID + b":"`` followed by the compressed payload.
    """
    payload = ";".join(f"{key}={encode_value(value)}" for key, value in d.items())
    compressed = zlib.compress(payload.encode(), 6)
    header = ("!" + ID + ":").encode()
    return header + compressed
|
||||
|
||||
|
||||
def stodict(msg: bytes):
    """Deserialize a protocol message into a dict.

    Two wire formats are accepted:

    * ``b"!ID:"`` followed by a zlib-compressed payload (what ``dicttos``
      emits)
    * ``b"ID:"`` followed by a plain payload

    where the payload is ``key=value;key=value;...``. The ID is everything
    before the first ``:``, so IDs of any length round-trip instead of only
    three-character ones.

    Returns:
        Dict with ``"ID"`` set to the message ID plus the decoded key/value
        pairs; a key without ``=`` maps to None. Malformed messages (no
        ``:``, undecodable ID or payload, invalid zlib data) yield ``{}``.
    """
    compressed = msg[:1] == b"!"
    head, sep, body = (msg[1:] if compressed else msg).partition(b":")
    if not sep:
        return {}

    try:
        if compressed:
            body = zlib.decompress(body)
        pk = body.decode()
        # Decoded inside the try so a non-UTF-8 ID also yields {}.
        d = {"ID": head.decode()}
    except Exception:
        return {}

    for item in pk.split(";"):
        if not item:
            continue
        key, has_value, raw = item.partition("=")
        d[key.strip()] = decode_value(raw.strip()) if has_value else None
    return d
|
||||
|
||||
|
||||
def oldmtodict(msg: bytes):
    """Parse an old-style message that carries no ID prefix.

    The bytes are prefixed with ``HTB:`` and handed to ``stodict``, so the
    result has ``"ID"`` set to ``"HTB"``.
    """
    with_prefix = b"HTB:" + msg
    return stodict(with_prefix)
|
||||
|
||||
|
||||
def encode_plugin_data(plugin_name: str, data: Dict[str, Any]) -> bytes:
    """Encode plugin data into a PLG message.

    Args:
        plugin_name: Name of the plugin (e.g., "os_info", "cpu_monitor")
        data: Plugin data dictionary; a ``plugin`` key in it overrides
            ``plugin_name``

    Returns:
        Encoded message bytes as produced by ``dicttos("PLG", ...)``
    """
    message = {"plugin": plugin_name}
    message.update(data)
    return dicttos("PLG", message)
|
||||
|
||||
|
||||
def decode_plugin_data(msg: bytes) -> Dict[str, Any]:
    """Decode a PLG message into plugin data.

    This is a thin alias for ``stodict``; the message ID is not checked.

    Args:
        msg: Raw message bytes

    Returns:
        Whatever ``stodict`` returns for ``msg``: the ``ID`` entry plus the
        transmitted fields (``plugin`` among them for messages built by
        ``encode_plugin_data``)
    """
    decoded = stodict(msg)
    return decoded
|
||||
|
||||
@@ -1,68 +0,0 @@
|
||||
"""Configuration loader and defaults for hbd."""
|
||||
|
||||
import logging
|
||||
import os
|
||||
|
||||
try:
|
||||
import yaml
|
||||
except Exception:
|
||||
yaml = None
|
||||
|
||||
# Built-in configuration values; load_config() starts from a copy of this
# mapping and only accepts keys that are present here.
DEFAULTS = {
    "hb_port": 50003,
    "hbd_port": 50004,
    "hbd_host": "",
    "pickfile": "/tmp/hb.pick",
    "logfile": "/var/log/heartbeat.log",
    "logfmt": "text",
    "pushsrv": "pushover",
    "pushover_token": "",
    "pushover_user": "",
    "interval": 20,
    "grace": 2,
    "dyndomains": ["wrede.org"],
    "watchhosts": [],
    "dyndnshosts": [],
    "drophosts": [],
    "nsupdate_bin": "/usr/bin/nsupdate",
    "foreground": False,
    "verbose": False,
    "debug": 0,
    "smtpserver": "smtp.fastmail.com",
    "smtpuser": "andreas@wrede.ca",
    # Never ship a credential in source control. The password now comes from
    # the HBD_SMTP_PASSWORD environment variable (or from the "smtppassword"
    # key of the YAML config file) and defaults to empty.
    # NOTE(review): the value previously committed here should be treated as
    # leaked and rotated.
    "smtppassword": os.environ.get("HBD_SMTP_PASSWORD", ""),
    "smtpport": 587,
    "toemail": ["aew.hbd.notify@wrede.ca"],
    "fromemail": "aew.hbd@wrede.ca",
    "ws_port": 50005,
    "wss_port": None,
    "cert_path": "/usr/local/etc/ssl/",
    "wss_pem": "fullchain.pem",
    "wss_key": "privkey.pem",
}
|
||||
|
||||
|
||||
def load_config(path=None):
    """Load configuration from a YAML file and merge it with the defaults.

    Args:
        path: Path of the YAML file. When falsy, ``~/.hb.yaml`` is used.

    Returns:
        A new dict that starts as a deep copy of ``DEFAULTS`` with every
        known key from the file overlaid. Unknown keys are logged and
        dropped. The defaults are returned unchanged when the file does not
        exist, is empty, does not contain a mapping, or PyYAML is not
        installed.
    """
    # Local import so this block does not depend on a file-level import.
    import copy

    # Deep copy: DEFAULTS holds lists (watchhosts, toemail, ...) and a
    # shallow copy would let callers mutate the shared defaults.
    cfg = copy.deepcopy(DEFAULTS)
    if not path:
        # default path (~/.hb.yaml)
        path = os.path.join(os.path.expanduser("~"), ".hb.yaml")

    if not os.path.exists(path):
        return cfg

    if not yaml:
        # Without PyYAML the file cannot be parsed; say so instead of
        # silently running on defaults.
        logging.warning("PyYAML is not installed; ignoring config file %s", path)
        return cfg

    with open(path) as fh:
        data = yaml.safe_load(fh)

    if data is None:
        # safe_load() yields None for an empty (or comment-only) document.
        return cfg
    if not isinstance(data, dict):
        logging.warning("config file %s does not contain a mapping; using defaults", path)
        return cfg

    # only keep known keys
    for k, v in data.items():
        if k in cfg:
            cfg[k] = v
        else:
            logging.warning("unknown config key %s in %s", k, path)
    return cfg
|
||||
@@ -0,0 +1,196 @@
|
||||
# Example Heartbeat Client Configuration
|
||||
# This file demonstrates all available configuration options for the heartbeat client (hbc)
|
||||
# and its plugin system.
|
||||
|
||||
# ==============================================================================
|
||||
# Server Configuration
|
||||
# ==============================================================================
|
||||
server: hbd.example.com # Heartbeat server hostname or IP
|
||||
port: 50003 # Server UDP port (default: 50003)
|
||||
interval: 30 # Heartbeat interval in seconds (default: 30)
|
||||
|
||||
# ==============================================================================
|
||||
# Plugin Configuration
|
||||
# ==============================================================================
|
||||
# Plugins are configured under the "plugins" section. Each plugin can be enabled/disabled
|
||||
# and configured with plugin-specific settings.
|
||||
|
||||
plugins:
|
||||
# --------------------------------------------------------------------------
|
||||
# OS Information Plugin (InfoPlugin - runs once at startup)
|
||||
# --------------------------------------------------------------------------
|
||||
os_info:
|
||||
enabled: true
|
||||
# No additional configuration needed
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# CPU Monitor Plugin (MonitorPlugin - periodic collection)
|
||||
# --------------------------------------------------------------------------
|
||||
cpu_monitor:
|
||||
enabled: true
|
||||
interval: 300 # Collection interval in seconds (default: 300 = 5 minutes)
|
||||
per_core: false # Collect per-core CPU statistics (default: false)
|
||||
# When per_core is true, will report CPU usage for each core separately
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Memory Monitor Plugin (MonitorPlugin)
|
||||
# --------------------------------------------------------------------------
|
||||
memory_monitor:
|
||||
enabled: true
|
||||
interval: 300 # Collection interval in seconds (default: 300 = 5 minutes)
|
||||
include_swap: true # Include swap memory statistics (default: true)
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Disk Monitor Plugin (MonitorPlugin)
|
||||
# --------------------------------------------------------------------------
|
||||
disk_monitor:
|
||||
enabled: true
|
||||
interval: 300 # Collection interval in seconds (default: 300 = 5 minutes)
|
||||
include_io: true # Include I/O statistics (default: true)
|
||||
# Optional: Monitor only specific partitions
|
||||
# partitions:
|
||||
# - /
|
||||
# - /home
|
||||
# - /var
|
||||
# Optional: Exclude specific filesystem types
|
||||
exclude_types:
|
||||
- tmpfs
|
||||
- devtmpfs
|
||||
- squashfs
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Network Monitor Plugin (MonitorPlugin)
|
||||
# --------------------------------------------------------------------------
|
||||
network_monitor:
|
||||
enabled: true
|
||||
interval: 300 # Collection interval in seconds (default: 300 = 5 minutes)
|
||||
include_connections: true # Include connection statistics (default: true)
|
||||
include_addresses: false # Include interface addresses (default: false)
|
||||
# Optional: Monitor only specific interfaces
|
||||
# interfaces:
|
||||
# - eth0
|
||||
# - wlan0
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Filesystem Info Plugin (InfoPlugin - runs once at startup)
|
||||
# --------------------------------------------------------------------------
|
||||
filesystem_info:
|
||||
enabled: true
|
||||
include_pseudo: false # Include pseudo/virtual filesystems (default: false)
|
||||
# When false (default), only reports physical mounted filesystems (ext4, zfs, xfs, etc.)
|
||||
# When true, also includes pseudo filesystems (proc, sysfs, tmpfs, devtmpfs, etc.)
|
||||
# Optional: Exclude additional specific filesystem types
|
||||
# exclude_types:
|
||||
# - squashfs
|
||||
# - iso9660
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Nagios Runner Plugin (MonitorPlugin)
|
||||
# --------------------------------------------------------------------------
|
||||
nagios_runner:
|
||||
enabled: true
|
||||
interval: 300 # Collection interval in seconds (default: 300 = 5 minutes)
|
||||
timeout: 30 # Plugin execution timeout in seconds (default: 30)
|
||||
|
||||
# List of Nagios plugins to execute
|
||||
# Each command is executed as-is, so provide full paths and arguments
|
||||
commands:
|
||||
# System load monitoring
|
||||
- /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
|
||||
|
||||
# Disk space monitoring
|
||||
- /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
|
||||
- /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /home
|
||||
|
||||
# Process monitoring
|
||||
- /usr/lib/nagios/plugins/check_procs -w 250 -c 400 -s RSZDT
|
||||
|
||||
# Swap usage
|
||||
- /usr/lib/nagios/plugins/check_swap -w 20% -c 10%
|
||||
|
||||
# Custom script example
|
||||
# - /usr/local/bin/check_my_app.sh
|
||||
|
||||
# ==============================================================================
|
||||
# Advanced Options
|
||||
# ==============================================================================
|
||||
# These options control client behavior
|
||||
|
||||
# Compression: Enable zlib compression for heartbeat messages (default: true)
|
||||
compress: true
|
||||
|
||||
# Hostname: Override the system hostname (default: auto-detect)
|
||||
# hostname: myhost.example.com
|
||||
|
||||
# Message: Custom message included in heartbeat (optional)
|
||||
# message: "Production web server"
|
||||
|
||||
# Logging
|
||||
log_level: INFO # Log level: DEBUG, INFO, WARNING, ERROR (default: INFO)
|
||||
# logfile: /var/log/hbc.log # Optional log file path
|
||||
|
||||
# ==============================================================================
|
||||
# Example Profiles
|
||||
# ==============================================================================
|
||||
# Below are example configuration profiles for different use cases
|
||||
|
||||
# Minimal Configuration (default settings):
|
||||
# -----------------------------------------
|
||||
# server: hbd.example.com
|
||||
# interval: 30
|
||||
|
||||
# Monitoring Server (comprehensive metrics):
|
||||
# ------------------------------------------
|
||||
# server: monitoring.example.com
|
||||
# interval: 30
|
||||
# plugins:
|
||||
# cpu_monitor:
|
||||
# enabled: true
|
||||
# interval: 15
|
||||
# per_core: true
|
||||
# memory_monitor:
|
||||
# enabled: true
|
||||
# interval: 15
|
||||
# disk_monitor:
|
||||
# enabled: true
|
||||
# interval: 60
|
||||
# network_monitor:
|
||||
# enabled: true
|
||||
# interval: 30
|
||||
# include_connections: true
|
||||
|
||||
# Nagios Integration (leverage existing plugins):
|
||||
# -----------------------------------------------
|
||||
# server: hbd.example.com
|
||||
# plugins:
|
||||
# nagios_runner:
|
||||
# enabled: true
|
||||
# interval: 300 # Check every 5 minutes
|
||||
# commands:
|
||||
# - /usr/lib/nagios/plugins/check_http -H localhost -p 80
|
||||
# - /usr/lib/nagios/plugins/check_mysql -H localhost -u monitor -p password
|
||||
# - /usr/lib/nagios/plugins/check_smtp -H mail.example.com
|
||||
|
||||
# ==============================================================================
|
||||
# Threshold Configuration (for Heartbeat Daemon)
|
||||
# ==============================================================================
|
||||
# NOTE: Thresholds are configured on the SERVER side (hbd), not the client (hbc).
|
||||
# This is just an example - see config_thresholds_example.yaml for comprehensive examples.
|
||||
#
|
||||
# Basic threshold example:
|
||||
# thresholds:
|
||||
# cpu_monitor:
|
||||
# cpu_percent:
|
||||
# warning: 80.0
|
||||
# critical: 90.0
|
||||
# memory_monitor:
|
||||
# percent:
|
||||
# warning: 85.0
|
||||
# critical: 95.0
|
||||
# disk_monitor:
|
||||
# partitions:
|
||||
# /:
|
||||
# percent:
|
||||
# warning: 80.0
|
||||
# critical: 90.0
|
||||
|
||||
@@ -0,0 +1,296 @@
|
||||
# ==============================================================================
|
||||
# Heartbeat Daemon Multi-Threshold Configuration Example
|
||||
# ==============================================================================
|
||||
# This file demonstrates the new multi-threshold configuration feature that allows
|
||||
# different threshold settings for different hosts/clients.
|
||||
#
|
||||
# Features:
|
||||
# - Define multiple named threshold configurations
|
||||
# - Map specific hosts to specific threshold configurations
|
||||
# - Set a default configuration for unmapped hosts
|
||||
# - Backward compatible with single threshold configuration
|
||||
# ==============================================================================
|
||||
|
||||
# Global threshold settings
|
||||
threshold_renotify_interval: 3600 # Re-notify every hour for ongoing alerts (seconds)
|
||||
|
||||
# Optional: Set default threshold config (defaults to "default" if not specified)
|
||||
default_threshold_config: "default"
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Multiple Named Threshold Configurations
|
||||
# ----------------------------------------------------------------------------
|
||||
# Define multiple threshold configurations with different sensitivity levels
|
||||
threshold_configs:
|
||||
|
||||
# Default configuration - moderate thresholds for most servers
|
||||
default:
|
||||
thresholds:
|
||||
cpu_monitor:
|
||||
cpu_percent:
|
||||
warning: 80.0
|
||||
critical: 90.0
|
||||
operator: ">"
|
||||
load_1min:
|
||||
warning: 4.0
|
||||
critical: 8.0
|
||||
operator: ">"
|
||||
|
||||
memory_monitor:
|
||||
percent:
|
||||
warning: 85.0
|
||||
critical: 95.0
|
||||
operator: ">"
|
||||
|
||||
disk_monitor:
|
||||
partitions:
|
||||
/:
|
||||
percent:
|
||||
warning: 85.0
|
||||
critical: 95.0
|
||||
operator: ">"
|
||||
|
||||
rtt:
|
||||
# RTT thresholds (apply to every host that uses this threshold config)
|
||||
warning: 50.0 # ms
|
||||
critical: 200.0
|
||||
|
||||
# High sensitivity configuration - lower thresholds for critical systems
|
||||
high_sensitivity:
|
||||
thresholds:
|
||||
cpu_monitor:
|
||||
cpu_percent:
|
||||
warning: 60.0 # Alert earlier
|
||||
critical: 75.0
|
||||
operator: ">"
|
||||
hysteresis: 0.15 # More hysteresis to reduce flapping
|
||||
load_1min:
|
||||
warning: 2.0
|
||||
critical: 4.0
|
||||
operator: ">"
|
||||
|
||||
memory_monitor:
|
||||
percent:
|
||||
warning: 75.0 # Alert at lower memory usage
|
||||
critical: 85.0
|
||||
operator: ">"
|
||||
display: "(threshold: {op_symbol} {threshold_value}%, total: {total_gb} GB)"
|
||||
|
||||
disk_monitor:
|
||||
partitions:
|
||||
/:
|
||||
percent:
|
||||
warning: 75.0
|
||||
critical: 85.0
|
||||
operator: ">"
|
||||
/var:
|
||||
percent:
|
||||
warning: 80.0
|
||||
critical: 90.0
|
||||
operator: ">"
|
||||
|
||||
rtt:
|
||||
warning: 30.0
|
||||
critical: 100.0
|
||||
|
||||
# Low sensitivity configuration - higher thresholds for development/test systems
|
||||
low_sensitivity:
|
||||
thresholds:
|
||||
cpu_monitor:
|
||||
cpu_percent:
|
||||
warning: 90.0 # Only alert at very high usage
|
||||
critical: 95.0
|
||||
operator: ">"
|
||||
|
||||
memory_monitor:
|
||||
percent:
|
||||
warning: 90.0
|
||||
critical: 98.0
|
||||
operator: ">"
|
||||
|
||||
disk_monitor:
|
||||
partitions:
|
||||
/:
|
||||
percent:
|
||||
warning: 90.0
|
||||
critical: 95.0
|
||||
operator: ">"
|
||||
|
||||
rtt:
|
||||
warning: 100.0
|
||||
critical: 500.0
|
||||
|
||||
# Production database servers - specialized thresholds
|
||||
database:
|
||||
thresholds:
|
||||
cpu_monitor:
|
||||
cpu_percent:
|
||||
warning: 70.0
|
||||
critical: 85.0
|
||||
operator: ">"
|
||||
|
||||
memory_monitor:
|
||||
percent:
|
||||
warning: 90.0 # Databases can use high memory
|
||||
critical: 97.0
|
||||
operator: ">"
|
||||
display: "(threshold: {op_symbol} {threshold_value}%, total: {total_gb} GB, available: {available_gb} GB)"
|
||||
|
||||
disk_monitor:
|
||||
partitions:
|
||||
/:
|
||||
percent:
|
||||
warning: 80.0
|
||||
critical: 90.0
|
||||
operator: ">"
|
||||
/var/lib/mysql: # Database data partition
|
||||
percent:
|
||||
warning: 75.0 # Alert earlier for DB partition
|
||||
critical: 85.0
|
||||
operator: ">"
|
||||
|
||||
rtt:
|
||||
warning: 20.0 # Stricter latency requirements
|
||||
critical: 50.0
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Host to Threshold Configuration Mapping
|
||||
# ----------------------------------------------------------------------------
|
||||
# Hosts are mapped to threshold configurations via `threshold_config` in the `hosts:` section below
|
||||
# ----------------------------------------------------------------------------
|
||||
# Notification Channels
|
||||
# ----------------------------------------------------------------------------
|
||||
# Define notification providers centrally with their credentials
|
||||
# Each channel has a type (pushover, email, signal, mattermost) and type-specific config
|
||||
notification_channels:
|
||||
# Signal notifications
|
||||
signal_ops:
|
||||
type: signal
|
||||
cli_path: /usr/local/bin/signal-cli
|
||||
user: +1234567890
|
||||
recipient: +1234567890
|
||||
|
||||
signal_oncall:
|
||||
type: signal
|
||||
cli_path: /usr/local/bin/signal-cli
|
||||
user: +1234567890
|
||||
recipient: +0987654321
|
||||
|
||||
# Email notifications
|
||||
email_ops:
|
||||
type: email
|
||||
recipients: [ops@example.com, alerts@example.com]
|
||||
sender: heartbeat@example.com
|
||||
smtp_server: smtp.example.com
|
||||
smtp_port: 587
|
||||
smtp_user: heartbeat@example.com
|
||||
smtp_password: your-smtp-password
|
||||
|
||||
# Pushover notifications
|
||||
pushover_urgent:
|
||||
type: pushover
|
||||
token: your-pushover-app-token
|
||||
user: your-pushover-user-key
|
||||
|
||||
# Mattermost notifications
|
||||
mattermost_devops:
|
||||
type: mattermost
|
||||
host: mattermost.example.com
|
||||
token: your-webhook-token
|
||||
channel: devops-alerts
|
||||
username: heartbeat-bot
|
||||
icon: https://example.com/heartbeat-icon.png
|
||||
|
||||
# Default notification channels (used if host doesn't specify channels)
|
||||
default_notification_channels: [email_ops]
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Host Definitions (New Unified Format)
|
||||
# ----------------------------------------------------------------------------
|
||||
# Define hosts with threshold configs, monitoring, DNS, and notification settings
|
||||
hosts:
|
||||
# Critical production servers - high sensitivity, multiple notification channels
|
||||
prod-web-01:
|
||||
threshold_config: high_sensitivity
|
||||
watch: true
|
||||
notification_channels: [signal_oncall, pushover_urgent, email_ops]
|
||||
dyndns: false
|
||||
|
||||
prod-web-02:
|
||||
threshold_config: high_sensitivity
|
||||
watch: true
|
||||
notification_channels: [signal_oncall, pushover_urgent, email_ops]
|
||||
dyndns: false
|
||||
|
||||
prod-api-01:
|
||||
threshold_config: high_sensitivity
|
||||
watch: true
|
||||
notification_channels: [signal_oncall, email_ops]
|
||||
dyndns: false
|
||||
|
||||
# Database servers - database-specific thresholds
|
||||
prod-db-01:
|
||||
threshold_config: database
|
||||
watch: true
|
||||
notification_channels: [signal_ops, email_ops]
|
||||
dyndns: false
|
||||
|
||||
prod-db-02:
|
||||
threshold_config: database
|
||||
watch: true
|
||||
notification_channels: [signal_ops, email_ops]
|
||||
dyndns: false
|
||||
|
||||
prod-db-replica:
|
||||
threshold_config: database
|
||||
watch: true
|
||||
notification_channels: [email_ops] # Replica gets email only
|
||||
dyndns: false
|
||||
|
||||
# Development servers - low sensitivity, minimal notifications
|
||||
dev-server-01:
|
||||
threshold_config: low_sensitivity
|
||||
watch: false # Don't monitor dev servers closely
|
||||
notification_channels: [email_ops]
|
||||
dyndns: false
|
||||
|
||||
dev-server-02:
|
||||
threshold_config: low_sensitivity
|
||||
watch: false
|
||||
notification_channels: [email_ops]
|
||||
dyndns: false
|
||||
|
||||
# Test servers
|
||||
test-server-01:
|
||||
threshold_config: low_sensitivity
|
||||
watch: false
|
||||
dyndns: false
|
||||
# No notification channels - uses default_notification_channels
|
||||
|
||||
# Home server with dynamic DNS
|
||||
home-server:
|
||||
threshold_config: default
|
||||
watch: true
|
||||
notification_channels: [signal_ops]
|
||||
dyndns: true # Update DNS when IP changes
|
||||
|
||||
# Hosts not listed in the hosts section will use:
|
||||
# - default_threshold_config for thresholds (falls back to "default")
|
||||
# - default_notification_channels for notifications
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Notes on Configuration Structure
|
||||
# ----------------------------------------------------------------------------
|
||||
#
|
||||
# All configuration is centralized in the hosts section. Each host can specify:
|
||||
# - threshold_config: Name of threshold configuration to use
|
||||
# - watch: Whether to monitor this host actively (send notifications)
|
||||
# - notification_channels: List of channels to use for this host
|
||||
# - dyndns: Whether to update DNS when IP address changes
|
||||
#
|
||||
# Notification channels are defined once at the top level and referenced
|
||||
# by name in host definitions, allowing easy reuse and updates.
|
||||
#
|
||||
# For hosts not explicitly listed, the system will still accept heartbeats
|
||||
# and track their state, but won't apply thresholds or send notifications
|
||||
# unless default settings are configured.
|
||||
@@ -0,0 +1,111 @@
|
||||
# Heartbeat Configuration Example with Nagios Plugin Runner
|
||||
|
||||
# This example shows how to configure the Nagios Runner plugin
|
||||
# to execute existing Nagios-compatible monitoring plugins
|
||||
|
||||
# Basic server settings (existing config)
|
||||
hb_port: 50003
|
||||
hbd_port: 50004
|
||||
interval: 20
|
||||
grace: 2
|
||||
|
||||
# Plugin configuration
|
||||
# Each plugin can have its own configuration section
|
||||
|
||||
# CPU Monitor Plugin
|
||||
cpu_monitor:
|
||||
interval: 300 # Collect every 5 minutes (default)
|
||||
per_core: false # Set to true to get per-core CPU usage
|
||||
|
||||
# Nagios Runner Plugin
|
||||
nagios_runner:
|
||||
interval: 300 # Run Nagios plugins every 5 minutes (default)
|
||||
timeout: 30 # Command execution timeout in seconds
|
||||
shell: true # Execute commands via shell
|
||||
|
||||
# List of Nagios plugins to run
|
||||
commands:
|
||||
|
||||
# Example 1: Check disk space
|
||||
- name: check_disk_root
|
||||
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
|
||||
|
||||
# Example 2: Check disk space for /home
|
||||
- name: check_disk_home
|
||||
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /home
|
||||
|
||||
# Example 3: Check system load
|
||||
- name: check_load
|
||||
command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
|
||||
|
||||
# Example 4: Check process count
|
||||
- name: check_procs
|
||||
command: /usr/lib/nagios/plugins/check_procs -w 250 -c 400
|
||||
|
||||
# Example 5: Check SSH service
|
||||
- name: check_ssh
|
||||
command: /usr/lib/nagios/plugins/check_ssh localhost
|
||||
|
||||
# Example 6: Check HTTP service
|
||||
- name: check_http
|
||||
command: /usr/lib/nagios/plugins/check_http -H localhost
|
||||
|
||||
# Example 7: Check swap usage
|
||||
- name: check_swap
|
||||
command: /usr/lib/nagios/plugins/check_swap -w 20% -c 10%
|
||||
|
||||
# Example 8: Custom script (Nagios plugin format)
|
||||
- name: check_custom
|
||||
command: /usr/local/bin/my_custom_check.sh
|
||||
|
||||
# Example 9: Check specific log file
|
||||
- name: check_logs
|
||||
command: /usr/lib/nagios/plugins/check_log -F /var/log/syslog -O /var/tmp/check_log.old -q "ERROR"
|
||||
|
||||
# Notes:
|
||||
#
|
||||
# 1. Nagios Plugin Output Format:
|
||||
# - Single line: STATUS - Message | performance_data
|
||||
# - Performance data format: 'label'=value[UOM];[warn];[crit];[min];[max]
|
||||
#
|
||||
# 2. Exit Codes:
|
||||
# - 0 = OK
|
||||
# - 1 = WARNING
|
||||
# - 2 = CRITICAL
|
||||
# - 3 = UNKNOWN
|
||||
#
|
||||
# 3. Performance Data:
|
||||
# - Automatically parsed and included in heartbeat data
|
||||
# - Metrics are stored as: {plugin_name}_{metric_name}
|
||||
# - Example: check_disk_root_/ will contain the disk usage percentage
|
||||
#
|
||||
# 4. Overall Status:
|
||||
# - The plugin reports the worst status from all commands
|
||||
# - Useful for quick health checks
|
||||
#
|
||||
# 5. Plugin Paths:
|
||||
# Common Nagios plugin directories:
|
||||
# - Debian/Ubuntu: /usr/lib/nagios/plugins/
|
||||
# - RHEL/CentOS: /usr/lib64/nagios/plugins/
|
||||
# - Custom installs: /usr/local/nagios/libexec/
|
||||
#
|
||||
# 6. Installing Nagios Plugins:
|
||||
# Debian/Ubuntu: sudo apt-get install nagios-plugins
|
||||
# RHEL/CentOS: sudo yum install nagios-plugins-all
|
||||
# Arch Linux: sudo pacman -S monitoring-plugins
|
||||
#
|
||||
# 7. Writing Custom Nagios Plugins:
|
||||
# Any script can be a Nagios plugin if it:
|
||||
# - Returns appropriate exit codes (0-3)
|
||||
# - Prints status message to stdout
|
||||
# - Optionally includes performance data after "|"
|
||||
#
|
||||
# Example custom plugin (save as /usr/local/bin/check_example.sh):
|
||||
# #!/bin/bash
|
||||
# if [ "$(who | wc -l)" -gt 50 ]; then
|
||||
# echo "CRITICAL - Too many users | users=52;40;50;0"
|
||||
# exit 2
|
||||
# else
|
||||
# echo "OK - Normal user count | users=25;40;50;0"
|
||||
# exit 0
|
||||
# fi
|
||||
@@ -0,0 +1,279 @@
|
||||
# ==============================================================================
|
||||
# Heartbeat Daemon Threshold Configuration Example
|
||||
# ==============================================================================
|
||||
# This file demonstrates threshold configuration for the Heartbeat monitoring system.
|
||||
# Thresholds can be defined for any metric collected by monitoring plugins.
|
||||
#
|
||||
# Threshold levels:
|
||||
# - WARNING: First level of concern, typically for early notification
|
||||
# - CRITICAL: Severe condition requiring immediate attention
|
||||
#
|
||||
# Alert notifications are sent when:
|
||||
# - A metric crosses from OK to WARNING or CRITICAL
|
||||
# - A metric crosses from WARNING to CRITICAL
|
||||
# - A metric recovers (returns to a lower severity level)
|
||||
#
|
||||
# Re-notifications are sent for ongoing alerts based on threshold_renotify_interval.
|
||||
# ==============================================================================
|
||||
|
||||
# Global threshold settings
|
||||
threshold_renotify_interval: 3600 # Re-notify every hour for ongoing alerts (seconds)
|
||||
|
||||
# Threshold definitions per plugin
|
||||
thresholds:
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# CPU Monitor Thresholds
|
||||
# ----------------------------------------------------------------------------
|
||||
cpu_monitor:
|
||||
# Overall CPU usage percentage (0-100)
|
||||
cpu_percent:
|
||||
warning: 80.0 # Warn when CPU usage exceeds 80%
|
||||
critical: 90.0 # Critical when CPU usage exceeds 90%
|
||||
operator: ">" # Alert when value is GREATER than threshold
|
||||
hysteresis: 0.1 # 10% hysteresis to prevent flapping
|
||||
enabled: true
|
||||
|
||||
# 1-minute load average
|
||||
load_1min:
|
||||
warning: 4.0 # Warn when 1-min load exceeds 4.0
|
||||
critical: 8.0 # Critical when 1-min load exceeds 8.0
|
||||
operator: ">"
|
||||
hysteresis: 0.15 # 15% hysteresis
|
||||
enabled: true
|
||||
|
||||
# 5-minute load average
|
||||
load_5min:
|
||||
warning: 3.0
|
||||
critical: 6.0
|
||||
operator: ">"
|
||||
hysteresis: 0.15
|
||||
enabled: true
|
||||
|
||||
# 15-minute load average
|
||||
load_15min:
|
||||
warning: 2.0
|
||||
critical: 4.0
|
||||
operator: ">"
|
||||
hysteresis: 0.15
|
||||
enabled: true
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Memory Monitor Thresholds
|
||||
# ----------------------------------------------------------------------------
|
||||
memory_monitor:
|
||||
# Memory usage percentage
|
||||
percent:
|
||||
warning: 85.0 # Warn at 85% memory usage
|
||||
critical: 95.0 # Critical at 95% memory usage
|
||||
operator: ">"
|
||||
hysteresis: 0.1
|
||||
enabled: true
|
||||
|
||||
# Available memory in MB (inverse threshold - alert when LOW)
|
||||
available_mb:
|
||||
warning: 1000 # Warn when less than 1GB available
|
||||
critical: 500 # Critical when less than 500MB available
|
||||
operator: "<" # Alert when value is LESS than threshold
|
||||
hysteresis: 0.1
|
||||
enabled: true
|
||||
|
||||
# Swap usage percentage
|
||||
swap_percent:
|
||||
warning: 50.0 # Warn at 50% swap usage
|
||||
critical: 80.0 # Critical at 80% swap usage
|
||||
operator: ">"
|
||||
hysteresis: 0.1
|
||||
enabled: true
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Disk Monitor Thresholds
|
||||
# ----------------------------------------------------------------------------
|
||||
disk_monitor:
|
||||
# Partition-specific thresholds
|
||||
# Use the mount point as the key
|
||||
partitions:
|
||||
# Root filesystem
|
||||
/:
|
||||
percent:
|
||||
warning: 80.0 # Warn at 80% disk usage
|
||||
critical: 90.0 # Critical at 90% disk usage
|
||||
operator: ">"
|
||||
hysteresis: 0.05 # 5% hysteresis for disk (more stable)
|
||||
enabled: true
|
||||
|
||||
free_gb:
|
||||
warning: 10.0 # Warn when less than 10GB free
|
||||
critical: 5.0 # Critical when less than 5GB free
|
||||
operator: "<"
|
||||
hysteresis: 0.1
|
||||
enabled: true
|
||||
|
||||
# Home filesystem (if separate partition)
|
||||
/home:
|
||||
percent:
|
||||
warning: 85.0
|
||||
critical: 95.0
|
||||
operator: ">"
|
||||
hysteresis: 0.05
|
||||
enabled: true
|
||||
|
||||
# Var filesystem (logs, etc.)
|
||||
/var:
|
||||
percent:
|
||||
warning: 80.0
|
||||
critical: 90.0
|
||||
operator: ">"
|
||||
hysteresis: 0.05
|
||||
enabled: true
|
||||
|
||||
free_gb:
|
||||
warning: 5.0 # Var needs space for logs
|
||||
critical: 2.0
|
||||
operator: "<"
|
||||
hysteresis: 0.1
|
||||
enabled: true
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# ZFS Monitor Thresholds
|
||||
# ----------------------------------------------------------------------------
|
||||
zfs_monitor:
|
||||
# Pool health check — built-in default; shown here for reference/override.
|
||||
# status is 0 (ONLINE) or 1 (DEGRADED) or 2 (SUSPENDED, FAULTED, UNAVAIL…).
|
||||
# Use '*' to apply the same rule to every pool, or name a specific pool.
|
||||
pools:
|
||||
'*':
|
||||
status:
|
||||
warning: 1 # Alert WARNING when pool is DEGRADED
|
||||
critical: 2 # Alert CRITICAL when pool is SUSPENDED/FAULTED/UNAVAIL
|
||||
operator: ">="
|
||||
hysteresis: 0.0 # No hysteresis — a degraded pool is always alerting
|
||||
grace: 0 # Fire immediately — don't wait for a second collection
|
||||
display: "ZFS pool {pool_name} is {health}"
|
||||
|
||||
# Per-pool capacity thresholds (optional; add pools you care about)
|
||||
# tank:
|
||||
# capacity:
|
||||
# warning: 75.0 # Warn at 75% used
|
||||
# critical: 90.0 # Critical at 90% used
|
||||
# operator: ">"
|
||||
# hysteresis: 0.05
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Network Monitor Thresholds
|
||||
# ----------------------------------------------------------------------------
|
||||
network_monitor:
|
||||
# Total error count across all interfaces
|
||||
errors_total:
|
||||
warning: 100 # Warn at 100 errors
|
||||
critical: 1000 # Critical at 1000 errors
|
||||
operator: ">"
|
||||
hysteresis: 0.2 # 20% hysteresis for counters
|
||||
enabled: true
|
||||
|
||||
# Total dropped packets
|
||||
dropin_total:
|
||||
warning: 50
|
||||
critical: 200
|
||||
operator: ">"
|
||||
hysteresis: 0.2
|
||||
enabled: true
|
||||
|
||||
dropout_total:
|
||||
warning: 50
|
||||
critical: 200
|
||||
operator: ">"
|
||||
hysteresis: 0.2
|
||||
enabled: true
|
||||
|
||||
# TCP connections in TIME_WAIT state
|
||||
connections_TIME_WAIT:
|
||||
warning: 1000 # Warn at 1000 TIME_WAIT connections
|
||||
critical: 5000 # Critical at 5000 TIME_WAIT connections
|
||||
operator: ">"
|
||||
hysteresis: 0.2
|
||||
enabled: true
|
||||
|
||||
# Total established connections
|
||||
connections_ESTABLISHED:
|
||||
warning: 500
|
||||
critical: 1000
|
||||
operator: ">"
|
||||
hysteresis: 0.1
|
||||
enabled: true
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Nagios Plugin Thresholds (if using nagios_runner)
|
||||
# ----------------------------------------------------------------------------
|
||||
nagios_runner:
|
||||
# Nagios plugins report exit codes:
|
||||
# 0 = OK, 1 = WARNING, 2 = CRITICAL, 3 = UNKNOWN
|
||||
# We can threshold on the exit_code directly
|
||||
exit_code:
|
||||
warning: 1 # Map Nagios WARNING to our WARNING
|
||||
critical: 2 # Map Nagios CRITICAL to our CRITICAL
|
||||
operator: ">=" # Alert when exit code >= threshold
|
||||
hysteresis: 0.0 # No hysteresis for exit codes
|
||||
enabled: true
|
||||
|
||||
# ==============================================================================
|
||||
# Notification Configuration
|
||||
# ==============================================================================
|
||||
# Configure notification methods (email, pushover, etc.)
|
||||
# These are used when threshold violations occur
|
||||
|
||||
# Email notifications
|
||||
toemail:
|
||||
- admin@example.com
|
||||
- oncall@example.com
|
||||
fromemail: heartbeat@example.com
|
||||
smtpserver: smtp.example.com
|
||||
smtpport: 587
|
||||
smtpuser: heartbeat@example.com
|
||||
smtppassword: your-password-here
|
||||
|
||||
# Pushover notifications (optional)
|
||||
# pushover_token: your-pushover-app-token
|
||||
# pushover_user: your-pushover-user-key
|
||||
|
||||
# Mattermost webhook (optional)
|
||||
# mattermost_url: https://mattermost.example.com/hooks/your-webhook-id
|
||||
|
||||
# ==============================================================================
|
||||
# Watched Hosts
|
||||
# ==============================================================================
|
||||
# Hosts in this list will trigger notifications for:
|
||||
# - Heartbeat timeouts/overdue
|
||||
# - Threshold violations
|
||||
# - Boot messages
|
||||
watchhosts:
|
||||
- webserver01
|
||||
- database01
|
||||
- mailserver
|
||||
- critical-app
|
||||
|
||||
# ==============================================================================
|
||||
# Additional Server Settings
|
||||
# ==============================================================================
|
||||
hb_port: 50003 # UDP port for heartbeat messages
|
||||
hbd_port: 50004 # HTTP port for web interface
|
||||
grace: 10 # Grace period for overdue detection (seconds)
|
||||
debug: 0 # Debug level (0-3)
|
||||
verbose: false # Verbose output
|
||||
|
||||
# Journal settings (message logging)
|
||||
journal_enabled: true
|
||||
journal_path: /var/log/heartbeat/messages.journal
|
||||
journal_max_size: 104857600 # 100MB before rotation
|
||||
journal_max_backups: 10
|
||||
|
||||
# ==============================================================================
|
||||
# Example: Production Configuration with Conservative Thresholds
|
||||
# ==============================================================================
|
||||
# For production systems, consider:
|
||||
# - Higher warning thresholds to reduce alert fatigue
|
||||
# - Appropriate hysteresis values (5-15% typical)
|
||||
# - Re-notification intervals matching on-call rotation
|
||||
# - Multiple escalation contacts
|
||||
# - Integration with incident management systems
|
||||
# ==============================================================================
|
||||
-600
@@ -1,600 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# $Id: hbc,v 1.9 2012/03/29 02:08:36 andreas Exp $
|
||||
# NEW
|
||||
import argparse
|
||||
import sys
|
||||
import time
|
||||
import socket
|
||||
import os
|
||||
import signal
|
||||
import select
|
||||
import traceback
|
||||
from hashlib import md5
|
||||
import shutil
|
||||
import zlib
|
||||
import subprocess
|
||||
import syslog
|
||||
import codecs
|
||||
|
||||
from .config import load_config
|
||||
|
||||
# Default UDP port; main() uses it when the config has no "hb_port" entry.
PORT = 50003
# Default seconds between heartbeats; main() uses it when the config has no
# "interval" entry.
INTERVAL = 10
# Conn.sendto() closes and reopens its socket whenever its send counter is a
# multiple of this value.
REOPENC = 6
PIDFILE = "/tmp/hbc.pid"
# Sent as the "ver" field of every outgoing message.
VER = 6
# Buffer size handed to recvfrom() in process().
MAXRECV = 32767

# Main-loop flag; process() returns once this is False.
running = True
# Set by process() after a successful update so that main() calls restart().
dorestart = False
# True once a socket send error has been logged (only the first is reported).
warned1 = False

# Option state, filled in by main() from the command line.
msgonly = False
helpflag = False
verbose = False
fdaemon = False
daemonized = False
# Fields of the initial (boot / one-off) message sent by main().
msgboot = {}
# HOME can be absent from the environment (for example when started by an
# init system); fall back to expanduser() instead of raising KeyError while
# the module is being imported.
home = os.environ.get("HOME")
if home is None:
    home = os.path.expanduser("~")
configfile = "%s/.hbrc" % home
# Arguments replayed by restart() when the program re-executes itself.
cmdargs = []
# Name reported to the daemon; main() replaces it when --name is given.
iam = socket.gethostname()
|
||||
|
||||
|
||||
def log(msg):
    """Emit ``msg``: to syslog (LOG_ERR) in daemon mode, otherwise to stdout."""
    if not fdaemon:
        print(msg)
        return
    syslog.syslog(syslog.LOG_ERR, msg)
|
||||
|
||||
|
||||
def handler(signum, frame):
    """Signal handler: run cleanup() on SIGTERM, ignore every other signal."""
    if signum != signal.SIGTERM:
        return
    cleanup()
|
||||
|
||||
|
||||
class NullDevice:
    """Write-only sink that discards everything written to it."""

    def write(self, s):
        return None
|
||||
|
||||
|
||||
class Conn:
    """One UDP heartbeat channel to a single resolved daemon address.

    Public fields: the connection id and target (``conId``, ``addr``,
    ``port``, ``af``), ACK bookkeeping (``ackcount``, ``lastack``), send
    bookkeeping (``send``, ``lastsend``), the most recent round-trip times in
    milliseconds (``rtts``, trimmed to 10 entries) and the socket (``sock``,
    ``None`` while closed).
    """

    def __init__(self, conId, addr, port, af):
        self.conId, self.addr, self.port, self.af = conId, addr, port, af

        self.ackcount = 0  # number of ACKs received
        self.lastack = 0  # timestamp recorded by the last ack()
        self.send = 0  # number of successful sends
        self.lastsend = 0  # time() of the last successful send
        self.rtts = [0]
        self.sock = None

    def __str__(self):
        return "Con(%s, %s %s)" % (self.addr, self.port, self.af)

    def open(self):
        """Create the datagram socket and switch SO_REUSEADDR on."""
        self.sock = socket.socket(self.af, socket.SOCK_DGRAM)
        current = self.sock.getsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR)
        self.sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, current | 1)

    def sendto(self, msg, ID="HTB"):  # default ID is HearTBeat
        """Stamp ``msg`` with name/id/ver/time and send it, compressed.

        ``msg`` is modified in place. The socket is recycled whenever the
        send counter is a multiple of REOPENC. A socket error is logged once
        per process (``warned1``), closes the socket and leaves the send
        counter untouched.
        """
        global warned1

        if self.send % REOPENC == 0:
            self.close()
        if not self.sock:
            self.open()
        msg.update(name=shortname(iam), id=self.conId, ver=VER, time=time.time())
        payload = dicttos(ID, msg)  # always compress
        if verbose:
            log(
                "conn.send('%s', (%s:%s) %s)"
                % (msg, self.addr, self.port, len(payload))
            )
        try:
            self.sock.sendto(payload, (self.addr, self.port))
        except socket.error as e:
            if not warned1:
                log("socket error: %s %s:%s" % (e, self.addr, self.port))
                warned1 = True
            self.close()
        else:
            self.send += 1
            self.lastsend = time.time()

    def ack(self, msgDict, now):
        """Record an ACK and append the derived round-trip time (ms).

        The timestamp comes from ``msgDict["time"]`` and the difference to
        the last send is doubled; when that field cannot be read, ``now`` is
        used and the difference is taken as is.
        """
        try:
            stamp, factor = msgDict["time"], 2
        except Exception:
            stamp, factor = now, 1
        self.lastack = stamp
        rtt_ms = (stamp - self.lastsend) * factor * 1000.0
        if verbose:
            log("ack RTT: %0.1f ms (now %s)" % (rtt_ms, now))
        self.rtts.append(rtt_ms)
        if len(self.rtts) > 10:
            self.rtts.pop(0)
        self.ackcount += 1

    def close(self):
        """Close the socket if one is open; safe to call repeatedly."""
        if not self.sock:
            return
        self.sock.close()
        self.sock = None
|
||||
|
||||
|
||||
def shortname(name):
    """Return the part of ``name`` before its first dot (all of it if none)."""
    return name.partition(".")[0]
|
||||
|
||||
|
||||
def dicttos(ID, d):
    """Serialise ``d`` into a wire message: ``b"!<ID>:"`` + zlib payload.

    The payload is the ``;``-joined list of ``key=value`` fields, floats
    being written with five decimals, compressed at level 6.
    """
    fields = [
        ("%s=%0.5f" if isinstance(value, float) else "%s=%s") % (key, value)
        for key, value in d.items()
    ]
    packed = zlib.compress(";".join(fields).encode(), 6)
    return ("!" + ID + ":").encode() + packed
|
||||
|
||||
|
||||
def stodict(msg):
    """Parse a received message into a dict.

    Two wire forms are understood:

    * compressed: ``b"!XXX:"`` followed by a zlib stream, where ``XXX`` is a
      three-character ID;
    * plain: ``ID:key=value;key=value...``, given as bytes or str.

    The ID is stored under ``"ID"``. A field without ``=`` maps to ``None``.
    Values are evaluated as Python expressions where possible and kept as
    strings otherwise.

    Raises:
        ValueError: a plain message has no ``:`` separator or is not
            decodable text.
        zlib.error: the compressed payload is corrupt.
    """
    d = {}
    if msg[:1] == b"!":
        pk = zlib.decompress(msg[5:]).decode()
        d["ID"] = msg[1:4].decode()
    else:
        # Datagrams arrive as bytes; splitting bytes on a str separator
        # raises TypeError, so decode before parsing the plain form.
        text = msg.decode() if isinstance(msg, (bytes, bytearray)) else msg
        r0 = text.split(":", 1)
        if len(r0) != 2:
            raise ValueError("message has no ':' separator")
        d["ID"], pk = r0
    for v in pk.split(";"):
        vr = v.split("=", 1)
        k = vr[0].strip()
        if len(vr) == 1:
            d[k] = None
            continue
        v = vr[1].strip()
        try:
            # NOTE(review): eval() on data received from the network can
            # execute arbitrary expressions. ast.literal_eval would be the
            # safe choice but rejects non-literal values - confirm what
            # senders actually emit before switching.
            v = eval(v)
        except Exception:
            pass
        d[k] = v
    if verbose:
        print("msg is %s" % d)
    return d
|
||||
|
||||
|
||||
def XXstodict(msg):
    """Older message parser operating on ``ID:key=value;...`` text.

    Returns ``None`` when ``msg`` contains no ``:``. A leading ``!`` on the
    ID marks a zlib-compressed payload. Fields without ``=`` map to ``None``;
    values that start with a digit are evaluated, everything else stays a
    string.
    """
    head, sep, rest = msg.partition(":")
    if not sep:
        return None
    d = {}
    if head[0] == "!":  # compressed
        pk = zlib.decompress(rest)
        d["ID"] = head[1:]
    else:
        pk = rest
        d["ID"] = head
    for item in pk.split(";"):
        key, eq, raw = item.partition("=")
        key = key.strip()
        if not eq:
            d[key] = None
            continue
        value = raw.strip()
        try:
            if value[0].isdigit():
                value = eval(value)
        except Exception:
            pass
        d[key] = value
    return d
|
||||
|
||||
|
||||
def syslogtrace(note):
    """Report the exception currently being handled.

    The formatted traceback, prefixed with ``note``, goes through log(), is
    written line by line to syslog, and is also printed in verbose mode.
    """
    report = "%s hbc died: \n%s" % (note, traceback.format_exc())
    log(report)
    for tb_line in report.split("\n"):
        syslog.syslog(syslog.LOG_ERR, " tb: %s" % tb_line)
    if not verbose:
        return
    print(report)
|
||||
|
||||
|
||||
# Id handed to the next Conn built by createConnections(), which increments
# it after every connection it creates.
conId = 1
|
||||
|
||||
|
||||
def createConnections(hosts):
    """Resolve every host and register one Conn per resolved address.

    Each address returned by getaddrinfo() gets its own Conn in the
    module-level ``conns`` dict, keyed by a fresh id taken from the
    module-level ``conId`` counter.

    Args:
        hosts: iterable of host names or addresses.

    Returns:
        None. A host that cannot be resolved is skipped (its traceback is
        logged in verbose mode) so the remaining hosts are still tried; an
        address of an unsupported family ends the program with status 1.
    """
    global conId
    for host in hosts:
        if verbose:
            log("createConnections for %s" % host)
        try:
            rs = socket.getaddrinfo(host, hb_port, 0, 0, socket.SOL_UDP)
        except socket.gaierror:
            logm = "%s hbc died: \n%s" % ("createConnections", traceback.format_exc())
            if verbose:
                log(logm)
            # One unresolvable name must not prevent connections to the
            # other hosts; the caller retries while no connection exists.
            continue
        for r in rs:
            if verbose:
                log("address %s" % str(r))
            family = r[0]
            # Compare against the platform's own constants rather than
            # per-OS numeric values.
            if family in (socket.AF_INET6, socket.AF_INET):
                af = family
            else:
                print("dont know this net type: %s" % family)
                sys.exit(1)

            addr = r[4][0]
            conns[conId] = Conn(conId, addr, hb_port, af)
            if verbose:
                print("cons[%s] = %s" % (conId, str(conns[conId])))
            conId += 1
|
||||
|
||||
|
||||
def doexec(conn, data):
    """Run ``data`` through the shell and send the outcome back on ``conn``.

    The reply is a ``service=command`` message whose ``msg`` field is a
    status word followed by the combined stdout/stderr of the command (or a
    description of the failure).

    NOTE(review): ``data`` originates from a received datagram and is
    executed with ``shell=True``; anyone able to send to this socket can run
    commands as this process.
    """
    try:
        raw = subprocess.check_output(data, stderr=subprocess.STDOUT, shell=True)
        ro = raw.decode()
        fail = "OK"
    except subprocess.CalledProcessError as e:
        fail, ro = "CalledProcessError", str(e)
    except Exception as e:
        syslogtrace("System")
        ro = "N/A"
        fail = "cmd failed: %s" % e
    reply = {"service": "command", "msg": fail + " " + ro}
    conns[conn].sendto(reply)
|
||||
|
||||
|
||||
def doupdate(conn, msgDict):
    """Handle an update message and report the result on ``conn``.

    The base64 ``code`` field and the ``csum`` field are taken from
    ``msgDict`` and handed to doupdateone(). A ``service=update`` reply with
    either the error text or ``OK`` is sent back.

    Returns:
        ``None`` on success, otherwise the error text.
    """
    try:
        code = codecs.decode(msgDict["code"], "base64").decode()
        csum = msgDict["csum"]
    except Exception as e:
        fail = "csum/code missing: %s" % e
    else:
        fail = doupdateone(code, csum)

    reply = {"service": "update", "msg": fail if fail else "OK"}
    conns[conn].sendto(reply)
    if not fail:
        log("hc updates, fs = %s" % (len(code)))

    return fail
|
||||
|
||||
|
||||
def doupdateone(code, csum):
    """Replace the running script (``sys.argv[0]``) with ``code``.

    Args:
        code: new program text.
        csum: expected MD5 hex digest of ``code``.

    Returns:
        ``None`` on success, otherwise a short error string. Nothing is
        written unless the checksum matches and a ``<script>.sav`` backup
        copy could be made.
    """
    if md5(code.encode()).hexdigest() != csum:
        return "checksum error"

    fn = sys.argv[0]
    ofn = "%s.sav" % fn
    try:
        shutil.copy2(fn, ofn)
    except Exception as e:
        return "cannot make backup copy: %s" % e

    try:
        # The context manager closes the file even when write() fails.
        with open(fn, "w") as fh:
            fh.write(code)
    except Exception as e:
        return "cannot write new code: %s" % e

    return None
|
||||
|
||||
|
||||
def restart():
    """Re-execute the program with the arguments saved in ``cmdargs``.

    os.execv() does not return on success; reaching the end of this function
    means the exec failed, which is then printed and logged together with
    the reason.
    """
    if verbose:
        print("restart: execv %s %s" % (sys.argv[0], [sys.argv[0]] + cmdargs))
    syslog.syslog(syslog.LOG_ERR, "restart %s" % (sys.argv[0]))
    err = "fallthrough"
    try:
        os.execv(sys.argv[0], [sys.argv[0]] + cmdargs)
    except Exception as exc:
        # Keep the reason; the name bound by "as" is unset when the except
        # block ends, so copy it to a variable that outlives the block.
        err = exc
    print("should not be here:", str(err))
    log("restart failed: %s" % err)
|
||||
|
||||
|
||||
def process():
    """Run the heartbeat loop until ``running`` is cleared.

    Between reports the loop waits in select() on every open connection
    socket and dispatches each received message by its ID: "ACK" updates the
    connection's round-trip bookkeeping, "UPD" goes to doupdate() and, when
    that succeeds, requests a restart, "CMD" passes the "cmd" field to
    doexec(). Once the report time is reached, a heartbeat carrying the ACK
    count and the latest RTT is sent on every connection.
    """
    global running, dorestart

    nextReport = time.time()

    while running:
        while time.time() < nextReport:
            # Rebuild the fd -> socket and fd -> connection-id maps on every
            # pass; sendto() may have closed or reopened sockets meanwhile.
            ifiles = {}
            conIds = {}
            for conn in conns:
                if conns[conn].sock:
                    ifiles[conns[conn].sock.fileno()] = conns[conn].sock
                    conIds[conns[conn].sock.fileno()] = conn

            sleep = nextReport - time.time()
            if sleep <= 0:
                break
            try:
                r = select.select(list(ifiles.keys()), [], [], sleep)
                now = (
                    time.time()
                )  # nb: delay from actual packet arrival to select is ca. 105ms!
            except KeyboardInterrupt:
                running = False
                break
            except SystemExit:
                log("daemon exit, running was %s" % running)
                if running:
                    running = False
                break
            except Exception:
                # NOTE(review): nesting under "if running" reconstructed;
                # either reading ends the loop with running == False.
                if running:
                    syslogtrace("select")
                running = False
                break
            for rfh in r[0]:
                conn = conIds[rfh]
                data, addr = ifiles[rfh].recvfrom(MAXRECV)
                if verbose:
                    print("sock.recvfrom: %s (%s) %s" % (addr, len(data), data[:4]))
                try:
                    msgDict = stodict(data)
                except Exception as e:
                    print(
                        "failed to parse incoming data from %s: %s (%s)"
                        % (addr, data, e)
                    )
                    continue

                if verbose:
                    print(
                        "sock.recvfrom: %s (%s) %s"
                        % (addr, len(data), str(msgDict)[:80])
                    )
                if msgDict is None:
                    print("bad backet from %s (%s) %s" % (addr, len(data), data))
                elif msgDict["ID"] == "ACK":
                    conns[conn].ack(msgDict, now)
                elif msgDict["ID"] == "UPD":
                    # doupdate() returns None on success.
                    if doupdate(conn, msgDict) is None:
                        if verbose:
                            print("process: restart after update")
                        dorestart = True
                        break
                elif msgDict["ID"] == "CMD":
                    doexec(conn, msgDict["cmd"])
                else:
                    # Any other ID: the raw datagram is handed to doexec().
                    doexec(conn, data)  # deprecated until no more VER - hbc
            if dorestart:
                running = False
                break
        if not running:
            break
        for conn in conns:
            msg = {"acks": conns[conn].ackcount, "rtt": conns[conn].rtts[-1]}
            conns[conn].sendto(msg)
            # NOTE(review): placement inside the loop reconstructed from the
            # comment below (a delay between consecutive packets).
            time.sleep(
                0.1
            )  # N.B. Linux (i.e. Rasperry Pi 3 drops the second pkg unless delayed
        # Keep a fixed cadence unless the schedule has already fallen behind,
        # in which case restart it from the current time.
        if nextReport + interval >= time.time():
            nextReport += interval
        else:
            nextReport = time.time() + interval

    if verbose:
        log("process: done running")
|
||||
|
||||
|
||||
def cleanup():
    """Announce shutdown on every connection and close all sockets.

    Does nothing when ``running`` is already False, so it runs at most once
    per process lifetime.
    """
    global running
    if running:
        if verbose:
            log("cleanup")
        running = False
        for c in conns.values():
            c.sendto({"shutdown": 1, "acks": c.ackcount})
            c.close()
        # NOTE(review): a single pause after the loop is assumed; confirm it
        # was not meant to run once per connection.
        time.sleep(1)
        closeall()
|
||||
|
||||
|
||||
def closeall():
    """Close the socket of every registered connection."""
    if verbose:
        syslog.syslog(syslog.LOG_ERR, "closecall")
    for c in conns.values():
        c.close()
|
||||
|
||||
|
||||
def daemonize(
    working_dir="/", stdin="/dev/zero", stdout="/dev/null", stderr="/dev/null"
):
    """
    Detach the process from its parent using the UNIX double-fork
    technique, see Stevens' "Advanced Programming in the UNIX Environment"
    for details (ISBN 0201563177)
    http://www.yendor.com/programming/unix/apue/proc/fork2.c

    Args:
        working_dir: directory to change to after the first fork.
        stdin, stdout, stderr: paths opened and duplicated onto the
            process's standard file descriptors.

    Only the grandchild returns from this function; both intermediate
    parents exit with status 0.
    """

    try:
        # first fork
        pid = os.fork()
        if pid > 0:
            # exit from first parent
            os._exit(0)
    except OSError as e:
        sys.stderr.write("fork #1 failed: %d (%s)\n" % (e.errno, e.strerror))
        os._exit(1)

    # decouple from parent environment
    os.chdir(working_dir)
    os.setsid()
    os.umask(0)
    # second fork
    try:
        pid = os.fork()
        if pid > 0:
            # exit from second parent
            os._exit(0)
    except OSError as e:
        sys.stderr.write("fork #2 failed: %d (%s)\n" % (e.errno, e.strerror))
        # NOTE(review): raises SystemExit here, whereas the first fork's
        # failure path uses os._exit(1) - confirm the difference is intended.
        sys.exit(1)

    # redirects standard file descriptors
    sys.stdout.flush()
    sys.stderr.flush()
    si = open(stdin, "r")
    so = open(stdout, "a+")
    se = open(stderr, "a+")
    os.dup2(si.fileno(), sys.stdin.fileno())
    os.dup2(so.fileno(), sys.stdout.fileno())
    os.dup2(se.fileno(), sys.stderr.fileno())
|
||||
|
||||
|
||||
#
|
||||
# Main program
|
||||
#
|
||||
def build_parser():
    """Build the command-line parser for the hbc client.

    Returns:
        An ``argparse.ArgumentParser`` with the options -b/--boot,
        -c/--config, -m/--message, -n/--name, -f/--daemon, -v/--verbose,
        -x/--debug (repeatable) and one or more positional ``hosts``.
    """
    parser = argparse.ArgumentParser(
        prog="hbc",
        description="HeartBeatClient - send a heatbeat message to a HeartBeatDaemon",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    # (names, keyword arguments) in the order they appear in --help.
    arguments = (
        (("-b", "--boot"), {"action": "store_true", "help": "Send a boot message"}),
        (("-c", "--config"), {"dest": "configfile", "help": "Config file path (YAML)"}),
        (("-m", "--message"), {"dest": "message", "help": "Send a message"}),
        (("-n", "--name"), {"dest": "name", "help": "Name to use in heartbeat message"}),
        (("-f", "--daemon"), {"action": "store_true", "help": "Run in daemon mode"}),
        (("-v", "--verbose"), {"action": "store_true", "help": "Verbose output"}),
        (("-x", "--debug"), {"action": "count", "default": 0, "help": "Increase debug level"}),
        (("hosts",), {"nargs": "+", "help": "Heartbeat daemon hosts to send to"}),
    )
    for names, options in arguments:
        parser.add_argument(*names, **options)
    return parser
|
||||
|
||||
|
||||
def main(argv=None):
    """Entry point of the heartbeat client.

    Parses the command line, loads the configuration, resolves the target
    hosts (retrying every 2 seconds until at least one connection exists),
    sends the optional boot/one-off message and then runs the heartbeat
    loop, optionally as a daemon.

    Args:
        argv: argument list without the program name; ``None`` means
            ``sys.argv[1:]``.

    Exits with status 0 after sending when only a message was requested
    (-m). After the loop ends it cleans up and re-executes the program if an
    update asked for a restart.
    """
    global msgonly, verbose, fdaemon, daemonized, cmdargs, iam, hb_port, conns, interval, hb_hosts
    parser = build_parser()
    args = parser.parse_args(argv)

    config = load_config(args.configfile)

    # Apply CLI overrides
    if args.boot:
        msgboot["boot"] = 1
    if args.message:
        msgboot["service"] = "service"
        msgboot["msg"] = args.message
        msgonly = True
    if args.name:
        iam = args.name
    if args.daemon:
        fdaemon = True
    if args.verbose:
        verbose = True
    if args.debug:
        config.setdefault("debug", 0)
        config["debug"] += args.debug

    # Remember the arguments for restart(). argv is None when called without
    # arguments (as the __main__ guard does); extending a list with None
    # raises TypeError, so fall back to what argparse itself parsed.
    cmdargs += sys.argv[1:] if argv is None else list(argv)
    if verbose:
        print("cmdargs for restart are %s" % cmdargs)

    #
    # set defaults

    hb_hosts = args.hosts
    hb_port = config.get("hb_port", PORT)
    interval = config.get("interval", INTERVAL)

    #
    if verbose:
        print("notice: hb_hosts: %s" % str(hb_hosts))
        print("notice: hb_port: %s" % hb_port)
        print("notice: interval: %s" % interval)
        print("notice: iam: %s" % iam)
        print("notice: msgonly: %s" % msgonly)
        print("notice: msgboot: %s" % msgboot)

    if not msgonly:
        msgboot["interval"] = interval

    conns = {}
    # Retry until name resolution yields at least one connection.
    while True:
        if verbose:
            log("create connections")
        createConnections(hb_hosts)
        if len(conns) != 0:
            break
        if verbose:
            log("no connections yet, sleep a bit")
        time.sleep(2)

    if verbose:
        log("%s connections created" % (len(conns)))

    if len(msgboot) > 0:
        if verbose:
            print("on boot")
        msgboot["acks"] = 0
        for conn in conns:
            conns[conn].sendto(msgboot)

    if msgonly:
        if verbose:
            print("msgboot done msgonly=%s" % msgonly)
        closeall()
        sys.exit(0)

    #
    syslog.openlog("hbc", syslog.LOG_PID, syslog.LOG_DAEMON)
    if fdaemon:
        print("daemoinizing.")
        daemonize()
        daemonized = True
    # NOTE(review): assumed to run in foreground mode as well; confirm it
    # does not belong inside the "if fdaemon" block above.
    syslog.syslog(syslog.LOG_ERR, "starting heartbeat to %s" % ",".join(hb_hosts))

    signal.signal(signal.SIGTERM, handler)
    try:
        process()
    except Exception as e:
        syslogtrace("process")
        if verbose:
            print("err: process exit: %s" % e)

    if verbose:
        log("main: cleanup")
    cleanup()
    if dorestart:
        restart()
|
||||
|
||||
|
||||
# Script entry point: run the client with the process's own command line.
if __name__ == "__main__":
    main()
|
||||
-381
@@ -1,381 +0,0 @@
|
||||
"""
|
||||
host and connection class shared between hbd and
|
||||
the website's heartbeat.py
|
||||
|
||||
"""
|
||||
|
||||
import time
|
||||
import json
|
||||
import copy
|
||||
import queue
|
||||
|
||||
# Running count of named hosts; Host.__init__ increments it and stores the
# current value on each instance.
num = 0

# Maximum number of round-trip samples Connection.newaddr() keeps per
# connection.
MAXRTTS = 10

# Debug level: any non-zero value enables log(); values above 1 also enable
# the DBG prints in Host.buildhosttable().
DEBUG = 2
|
||||
|
||||
|
||||
def log(host, m):
    """Print a debug line for ``host`` when DEBUG is non-zero."""
    if not DEBUG:
        return
    print("class log: %s %s" % (host, m))
|
||||
|
||||
|
||||
class Connection:
    """State of one address family's connection to a monitored host.

    Tracks the current address, recent round-trip times, the time of the
    last heartbeat and a state (one of UNKNOWN, UP, DOWN, OVERDUE) together
    with the time that state was entered.
    """

    # map of addrs to names
    htab = {}
    UNKNOWN = "unknown"
    UP = "up"
    DOWN = "down"
    OVERDUE = "overdue"

    def __init__(self, host, cid, addr, afam):
        """Create a connection in state UNKNOWN.

        An IPv4-mapped IPv6 prefix (``::ffff:``) is stripped from ``addr``.
        When ``host`` is given, the address is registered in ``htab`` and,
        for dynamic-DNS hosts, queued for a DNS update.
        """
        self.host = host
        self.cid = cid
        if addr[0:7] == "::ffff:":
            addr = addr[7:]
        self.addr = addr
        self.afam = afam
        self.rtts = [0]
        self.lastbeat = time.time()
        self.statetime = self.lastbeat
        self.deltastatetime = "computed"
        self.state = Connection.UNKNOWN

        if host:
            Connection.htab[addr] = self.host.name
            if self.host.isDynDns():
                log(self.host.name, "dns update %s" % self.addr)
                Host.dnsQ.put((self.host.name, self.addr))

    def registerDns(self):
        """Queue a DNS update for this connection's host and address."""
        Host.dnsQ.put((self.host.name, self.addr))

    def clearstate(self):
        """Return a state dict with every display field set to ""."""
        d = {}
        d["addr"] = ""
        d["rtt"] = ""
        d["lastbeat"] = ""
        d["state"] = ""
        d["statetime"] = ""
        d["deltastatetime"] = ""
        d["rttstate"] = ""
        return d

    def statedict(self, Null=False):
        """Return the display fields of this connection.

        With ``Null`` true, or for a connection that is still UNKNOWN and
        has been silent for more than ten days, every field is "".
        """
        d = self.clearstate()
        now = time.time()
        if not Null:
            d["addr"] = self.addr
            if self.rtts[-1]:
                d["rtt"] = "%0.1f" % self.rtts[-1]
            elif self.state == Connection.UNKNOWN:
                d["rtt"] = ""
            else:
                d["rtt"] = "?"
            d["lastbeat"] = self.lastbeat
            if self.state == Connection.OVERDUE:
                d["state"] = "<b>%s</b>" % self.state
            else:
                d["state"] = self.state
            # "rttstate" shows the latency while up, nothing while overdue
            # and the state name otherwise.
            if self.state == Connection.UP:
                d["rttstate"] = d["rtt"]
            elif self.state == Connection.OVERDUE:
                d["rttstate"] = ""
            else:
                d["rttstate"] = d["state"]
            d["statetime"] = time.strftime(
                "%Y-%m-%d %H:%M:%S", time.localtime(self.statetime)
            )
            delta = now - self.statetime

            if self.state == Connection.UNKNOWN:
                d["deltastatetime"] = ""
            elif delta > 86400:
                # d['deltastatetime'] = time.strftime("%d %H:%M:%S", time.gmtime(delta))
                d["deltastatetime"] = "%0.1f days" % (delta / 86400.0)
            elif delta > 3600:
                # d['deltastatetime'] = time.strftime("%H:%M:%S", time.gmtime(delta))
                d["deltastatetime"] = time.strftime("%k:%M hrs", time.gmtime(delta))
                # d['deltastatetime'] = "%0.1f hrs" % (delta / 3600.)
            elif delta > 60:
                # d['deltastatetime'] = time.strftime("%M:%S", time.gmtime(delta))
                d["deltastatetime"] = time.strftime("%M:%S mins", time.gmtime(delta))
                # d['deltastatetime'] = "%0.1f mins" % (delta / 60.)
            else:
                # d['deltastatetime'] = time.strftime("%S", time.gmtime(delta))
                d["deltastatetime"] = "%i secs" % (delta)
        if self.state == Connection.UNKNOWN and now - self.lastbeat > 86400 * 10:
            d = self.clearstate()

        return d

    def headerdict(self, afam):
        """Return the column headings matching the keys of statedict()."""
        d = {}
        d["addr"] = "%s Addr" % afam
        d["rtt"] = "Latencey"
        d["lastbeat"] = "Last Contact"
        d["state"] = "State"
        d["statetime"] = "Last State"
        d["rttstate"] = "Reach"
        d["deltastatetime"] = "Last State"
        return d

    def jsons(self):
        """Return this connection's attributes as a JSON string.

        The ``host`` back-pointer is replaced by the host's name: a Host
        object is not JSON-serialisable, so dumping ``__dict__`` directly
        raises TypeError for every connection that belongs to a host.
        """
        state = dict(self.__dict__)
        if state.get("host"):
            state["host"] = state["host"].name
        return json.dumps(state)

    # set new state, return number of secs in previous state
    def newstate(self, state, now, when=0):
        self.state = state
        delta = now - when
        s = delta - self.statetime
        self.statetime = delta
        return s

    def getstate(self):
        """Return the current state."""
        return self.state

    def newaddr(self, addr, rtt, now):
        """Record a heartbeat received from ``addr``.

        Updates the last-beat time and the RTT history (at most MAXRTTS
        samples). When the address differs from the stored one, ``htab`` is
        updated and, for dynamic-DNS hosts, a DNS update is queued.

        Returns:
            ``None`` if the address is unchanged, otherwise a
            "changed from ... to ..." description.
        """
        self.lastbeat = now
        self.rtts.append(rtt)
        if len(self.rtts) > MAXRTTS:
            del self.rtts[0]

        if self.addr == addr:
            r = None
        else:
            r = "changed from %s to %s" % (self.addr, addr)
            try:
                del Connection.htab[self.addr]
            except Exception:
                pass
            self.addr = addr
            Connection.htab[addr] = self.host.name
            if self.host.isDynDns():
                Host.dnsQ.put((self.host.name, self.addr))
        return r
|
||||
|
||||
|
||||
#
|
||||
class Host:
    """A monitored host and its per-address-family connections.

    Named instances register themselves in the class-level ``hosts`` table;
    DNS registration requests are queued on the class-level ``dnsQ``.
    """

    # Table of Hosts
    hosts = {}
    dnsQ = queue.Queue()

    def __init__(self, name):
        """Create a host; a truthy ``name`` registers it in ``Host.hosts``."""
        global num
        self.name = name
        if name:
            num += 1
            Host.hosts[name] = self
        self.num = num
        self.dyn = False
        self.watched = False
        self.upcount = 0
        self.interval = 0
        self.doesack = -1
        self.cmds = []
        self.cver = 0
        self.connections = {}
        self.hdwcounts = [[0, 0], [0, 0], [0, 0]]

    def statedict(self):
        """Return the display fields of the host and both connections.

        Connection fields are prefixed with ``IPv4.`` / ``IPv6.``; a missing
        connection contributes empty fields.
        """
        d = {}
        d["name"] = self.name
        if self.dyn:
            d["name"] += "*"
        if self.watched:
            d["name"] = "<b>%s</b>" % d["name"]
        d["dyn"] = str(self.dyn)
        d["ver"] = str(self.cver)
        d["num"] = self.num
        for c in ["IPv4", "IPv6"]:
            if c in self.connections:
                cs = self.connections[c].statedict()
            else:
                cs = ubConnection.statedict(True)
            for csv in cs:
                d["%s.%s" % (c, csv)] = cs[csv]

        return d

    def headerdict(self):
        """Return the column headings matching the keys of statedict()."""
        d = {}
        d["name"] = "Name"
        d["dyn"] = "Dyn"
        d["ver"] = "Ver"
        d["num"] = "??"
        for c in ["IPv4", "IPv6"]:
            cs = ubConnection.headerdict(c)
            for csv in cs:
                d["%s.%s" % (c, csv)] = cs[csv]
        return d

    def registerDns(self):
        """Queue a DNS update for every connection of this host."""
        for af in self.connections:
            self.connections[af].registerDns()

    def stateinfo(self):
        """Return the host's attributes as a plain dict.

        ``connections`` becomes a list (IPv4 first, then IPv6) of copies of
        each connection's attributes in which the ``host`` back-pointer is
        replaced by the host name.
        """
        ddict = {}
        for d in self.__dict__:
            if d == "connections":
                cl = []
                for c in ["IPv4", "IPv6"]:
                    if c not in self.connections:
                        continue
                    # Copy every attribute except the back-pointer, which
                    # is reduced to the name. Deep-copying the back-pointer
                    # would duplicate this Host and all its connections.
                    conn = self.connections[c]
                    cld = {
                        key: (value.name if key == "host" else copy.deepcopy(value))
                        for key, value in conn.__dict__.items()
                    }
                    cl.append(cld)
                ddict[d] = cl
            else:
                ddict[d] = self.__dict__[d]
        return ddict

    def jsons(self):
        """Return stateinfo() as a JSON string."""
        return json.dumps(self.stateinfo())

    def setcver(self, cver):
        """Store the client version reported by the host."""
        self.cver = cver

    def isDynDns(self):
        """Return whether the host is flagged for dynamic DNS updates."""
        return self.dyn

    def isIPv4(self, addr):
        """Return True if ``addr`` (a string, or a tuple whose first item
        is the address string) contains a dot after its first character."""
        if isinstance(addr, tuple):
            return addr[0].find(".") > 0
        else:
            return addr.find(".") > 0

    def conndata(self, cid, addr, rtt, now):
        """Record a heartbeat from ``addr`` on the matching connection.

        The connection for the address family is created on first use.

        Returns:
            ``(connection, change)`` where ``change`` is the value returned
            by ``Connection.newaddr``.
        """
        if addr[0:7] == "::ffff:":
            addr = addr[7:]
        if self.isIPv4(addr):
            afam = "IPv4"
        else:
            afam = "IPv6"

        if afam not in self.connections:
            self.connections[afam] = Connection(self, cid, addr, afam)

        conn = self.connections[afam]
        res = conn.newaddr(addr, rtt, now)
        return conn, res

    # called when reloading class from pickle, add new fields here
    def fixup(self):
        for c in ["IPv4", "IPv6"]:
            if c in self.connections:
                addr = self.connections[c].addr
                if addr[0:7] == "::ffff:":
                    addr = addr[7:]
                self.connections[c].addr = addr

        pass

    # def dispstate(self):
    # if self.state in ["down", "overdue"]:
    # state = "<b>%s</b>" % self.state
    # elif self.state in ["up", "UP"]:
    # state = ""
    # for x in list(self.connections.keys()):
    # try:
    # state += " %5.1f" % (self.connections[x].rtts[-1])
    # except:
    # state += " %5s" % (self.connections[x].rtts[-1])
    # elif self.state in ["unknown", "UNKNOWN"]:
    # state = ""
    # else:
    # state = "%s" % self.state
    # return state

    def dispstats(self):
        """Return three ``<td>`` cells with acknowledgement statistics.

        For each of the three ``hdwcounts`` baselines the cell holds a
        percentage derived from the ACK and up counters since that
        baseline, "" when it rounds to 0 and "-" when no up-count has
        accumulated. Hosts that never acknowledged show "N/A".
        """
        if self.doesack != -1:
            if self.upcount > 0:
                r = ""
                for v in range(3):
                    a, u = self.hdwcounts[v]
                    if (self.upcount - u) != 0:
                        vs = "%0.0f" % (
                            100.0 - (((self.doesack - a) * 100.0) / (self.upcount - u))
                        )
                        if vs == "0":
                            vs = ""
                    else:
                        vs = "-"
                    r += '<td align="right">%s</td>' % vs
                return r
            else:
                return "<td>(%s)</td><td></td><td></td>" % (self.doesack)
        # Three well-formed cells, matching the other two return paths.
        return '<td align="right">N/A</td><td></td><td></td>'

    hostfields_long = [
        "name",
        "IPv4.addr",
        "IPv4.state",
        ("IPv4.rtt", 'style="text-align: right;"'),
        ("IPv4.statetime", 'style="text-align: right;"'),
        "IPv6.addr",
        "IPv6.state",
        ("IPv6.rtt", 'style="text-align: right;"'),
        ("IPv6.statetime", 'style="text-align: right;"'),
        "ver",
    ]

    hostfields_short = [
        "name",
        ("IPv4.rttstate", 'style="text-align: right;"'),
        ("IPv4.deltastatetime", 'style="text-align: right;"'),
        ("IPv6.rttstate", 'style="text-align: right;"'),
        ("IPv6.deltastatetime", 'style="text-align: right;"'),
    ]

    def gene(self, tag, v, attrib=None):
        """Return ``v`` wrapped in an HTML element ``tag`` with optional
        attribute text."""
        if attrib:
            a = " %s" % attrib
        else:
            a = ""
        return "<%s%s>%s</%s>" % (tag, a, v, tag)

    def htmltable(self, tag, hd, short):
        """Return one ``<tr>`` whose cells (``tag`` = "th" or "td") hold the
        values of ``hd`` for the short or long field list."""
        if short:
            hostfields = Host.hostfields_short
        else:
            hostfields = Host.hostfields_long
        h = []
        for f in hostfields:
            if isinstance(f, tuple):
                h.append(self.gene(tag, hd[f[0]], f[1]))
            else:
                h.append(self.gene(tag, hd[f]))
        return self.gene("tr", "\n".join(h))

    def buildhosttable(self, short=False):
        """Return the HTML lines of the status table of all hosts, sorted by
        name."""
        if DEBUG > 1:
            print("DBG buildhosttable: start")
        res = []
        res.append('<table id="ntable" class="sortable">')
        res.append(ubHost.htmltable("th", ubHost.headerdict(), short))
        hosts_sorted = list(Host.hosts.keys())
        if len(hosts_sorted):
            hosts_sorted.sort()
            for h in hosts_sorted:
                res.append(ubHost.htmltable("td", Host.hosts[h].statedict(), short))
        res.append("</table>")
        if DEBUG > 1:
            print("DBG buildhosttable: %s" % res)
        return res

    def buildmsgtable(self, msgs):
        """Return the HTML lines of the event log: the last
        ``max(40 - number of hosts, 3)`` entries of ``msgs``."""
        res = []
        le = max(40 - len(Host.hosts), 3)
        res.append("<h4>Log of Events</h4>")
        for m in msgs[len(msgs) - le :]:
            res.append("%s<BR>" % m)
        return res
|
||||
|
||||
|
||||
# Shared placeholder instances: methods that need no per-host state (header
# rows, empty state dicts, table builders) are called through these.
ubHost = Host(None)
ubConnection = Connection(None, "", "", "")
|
||||
-221
@@ -1,221 +0,0 @@
|
||||
"""HTTP server implementation using aiohttp and jinja2."""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import time
|
||||
import urllib.parse
|
||||
import os
|
||||
import logging
|
||||
from aiohttp import web
|
||||
import jinja2
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _render_template(html_str: str, **context) -> str:
    """Render ``html_str`` as a Jinja2 template using ``context`` as variables."""
    return jinja2.Template(html_str).render(**context)
|
||||
|
||||
|
||||
async def start(
|
||||
host: str,
|
||||
port: int,
|
||||
config,
|
||||
hbdclass,
|
||||
msgs_getter,
|
||||
log=None,
|
||||
email=None,
|
||||
pushmsg=None,
|
||||
msg_to_websockets=None,
|
||||
tcss=None,
|
||||
DEBUG=0,
|
||||
verbose=False,
|
||||
get_now=None,
|
||||
VER="",
|
||||
):
|
||||
"""Start an aiohttp web server and block until cancelled.
|
||||
|
||||
This function is intended to be awaited inside the main asyncio event loop.
|
||||
"""
|
||||
get_now = get_now or (lambda: time.time())
|
||||
|
||||
async def index(request):
|
||||
res = []
|
||||
res.append('<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">')
|
||||
res.append("<html>")
|
||||
res.append("<head>")
|
||||
res.append("<title>Heartbeat</title>")
|
||||
if tcss:
|
||||
res.append(tcss)
|
||||
res.append("</head>")
|
||||
res.append('<body BGCOLOR = "#FFFFFF" LINK = "#008000" VLINK = "#008000">')
|
||||
res.append(f"<H2>Heartbeat status {VER}</h2>")
|
||||
res += hbdclass.ubHost.buildhosttable()
|
||||
res += hbdclass.ubHost.buildmsgtable(msgs_getter())
|
||||
res.append(
|
||||
"<p> %s (%s)</p>"
|
||||
% (
|
||||
time.strftime("%H:%M:%S", time.localtime(get_now())),
|
||||
config.get("tz", "CET-1CDT"),
|
||||
)
|
||||
)
|
||||
res.append("</body></html>")
|
||||
body = "\n".join(res)
|
||||
return web.Response(text=body, content_type="text/html")
|
||||
|
||||
async def api_hosts(request):
|
||||
lst = [hbdclass.Host.hosts[h].jsons() for h in hbdclass.Host.hosts]
|
||||
return web.json_response(json.loads("[" + ",".join(lst) + "]"))
|
||||
|
||||
async def api_messages(request):
|
||||
lst = msgs_getter()[-30:]
|
||||
return web.json_response(lst)
|
||||
|
||||
async def cmd(request):
|
||||
qa = request.rel_url.query
|
||||
uname = qa.get("h")
|
||||
ucmd = qa.get("c")
|
||||
if not ucmd or not uname:
|
||||
return web.Response(status=400, text="need h= and c= arguments")
|
||||
if uname not in hbdclass.Host.hosts:
|
||||
return web.Response(status=400, text=f"h={uname} not found")
|
||||
hbdclass.Host.hosts[uname].cmds.append(
|
||||
("CMD", {"cmd": urllib.parse.unquote(ucmd)})
|
||||
)
|
||||
return web.Response(text=f"cmd {uname} queued")
|
||||
|
||||
async def drop(request):
|
||||
qa = request.rel_url.query
|
||||
uname = qa.get("h")
|
||||
if not uname:
|
||||
return web.Response(status=400, text="need h= argument")
|
||||
if uname not in hbdclass.Host.hosts:
|
||||
return web.Response(status=400, text=f"h={uname} not found")
|
||||
if log:
|
||||
log(uname, "dropped")
|
||||
del hbdclass.Host.hosts[uname]
|
||||
return web.Response(text="Done")
|
||||
|
||||
async def register(request):
|
||||
qa = request.rel_url.query
|
||||
uname = qa.get("h")
|
||||
if not uname:
|
||||
return web.Response(status=400, text="need h= argument")
|
||||
if uname not in hbdclass.Host.hosts:
|
||||
return web.Response(status=400, text=f"h={uname} not found")
|
||||
ll = hbdclass.Host.hosts[uname].registerDns()
|
||||
if log:
|
||||
log(uname, ll)
|
||||
return web.Response(text=str(ll))
|
||||
|
||||
async def update(request):
|
||||
qa = request.rel_url.query
|
||||
uname = urllib.parse.unquote(qa.get("h", ""))
|
||||
ucode = qa.get("c")
|
||||
if not ucode or not uname:
|
||||
return web.Response(status=400, text="need h= and c= arguments")
|
||||
if uname != "All" and uname not in hbdclass.Host.hosts:
|
||||
return web.Response(status=400, text=f"h={uname} not found")
|
||||
if uname != "All":
|
||||
names = [uname]
|
||||
else:
|
||||
names = [n for n in hbdclass.Host.hosts if hbdclass.Host.hosts[n].cver >= 2]
|
||||
out = []
|
||||
for n in names:
|
||||
err = None
|
||||
try:
|
||||
r = {"csum": None, "code": ucode}
|
||||
hbdclass.Host.hosts[n].cmds.append(("UPD", r))
|
||||
except Exception as e:
|
||||
err = str(e)
|
||||
out.append(f"update started for {n}: {err if err else 'OK'}")
|
||||
return web.Response(text="\n".join(out))
|
||||
|
||||
async def restart(request):
|
||||
# signal main application to perform restart if needed
|
||||
# not implemented here - return OK
|
||||
if log:
|
||||
log(None, "restart request")
|
||||
return web.Response(text="restart request")
|
||||
|
||||
async def live(request):
    """Render ``live.html`` (Jinja2) with the current hosts and recent messages.

    The templates directory comes from ``config['templates_dir']`` and falls
    back to the ``templates`` folder next to this module. The websocket URL
    handed to the template reuses the host name the client addressed this
    server with, combined with the configured ``wss_port`` (preferred) or
    ``ws_port`` (default 50005).
    """
    pkg_dir = os.path.dirname(__file__)
    templates_dir = config.get("templates_dir", os.path.join(pkg_dir, "templates"))
    # NOTE(review): the Environment is created without autoescape, so template
    # variables are rendered as-is - confirm live.html escapes what it must.
    env = jinja2.Environment(loader=jinja2.FileSystemLoader(templates_dir))
    extra_scripts = config.get("http_extra_scripts", "")

    # Strip the port from the Host header. A bracketed IPv6 literal
    # ("[::1]:8080") must keep its brackets; a naive split(":") would
    # reduce it to "[" and produce an unusable websocket URL.
    raw_host = request.host
    if raw_host.startswith("["):
        closing = raw_host.find("]")
        ws_host = raw_host[: closing + 1] if closing != -1 else raw_host
    else:
        ws_host = raw_host.split(":")[0]

    if config.get("wss_port"):
        heartbeat_ws_url = f"wss://{ws_host}:{config['wss_port']}/hbd"
    else:
        heartbeat_ws_url = f"ws://{ws_host}:{config.get('ws_port', 50005)}/hbd"

    tmpl = env.get_template("live.html")
    body = tmpl.render(
        title="Heartbeat",
        header="Heartbeat",
        request=request,
        heartbeat_ws_url=heartbeat_ws_url,
        extra_scripts=extra_scripts,
        hosts=[
            hbdclass.Host.hosts[h].stateinfo() for h in sorted(hbdclass.Host.hosts)
        ],
        # Only the 30 most recent messages are handed to the template.
        messages=msgs_getter()[-30:],
    )
    return web.Response(text=body, content_type="text/html")
|
||||
|
||||
async def static(request):
    """Serve a file from the ``static`` directory next to this module.

    URL form: /static/<path>

    Returns 403 when the normalised path falls outside the static directory
    and 404 when it does not name an existing regular file.
    """
    rel_path = request.match_info.get("path", "")
    logger.debug("static file requested: %s", rel_path)

    static_root = os.path.abspath(
        os.path.join(os.path.dirname(__file__), "static")
    )
    # Normalise first so that ".." segments cannot climb out of static_root.
    resolved = os.path.abspath(os.path.normpath(os.path.join(static_root, rel_path)))
    inside_root = resolved == static_root or resolved.startswith(static_root + os.sep)
    if not inside_root:
        return web.Response(status=403, text="Forbidden")

    if not (os.path.exists(resolved) and os.path.isfile(resolved)):
        return web.Response(status=404, text="Not Found")

    logger.info("serving static file: %s", resolved)
    return web.FileResponse(path=resolved)
|
||||
|
||||
async def favicon(request):
    """Serve ``static/images/favicon.ico`` from the package, or 404 if absent."""
    images_dir = os.path.abspath(
        os.path.join(os.path.dirname(__file__), "static/images")
    )
    icon_path = os.path.join(images_dir, "favicon.ico")
    if os.path.exists(icon_path) and os.path.isfile(icon_path):
        return web.FileResponse(path=icon_path)
    return web.Response(status=404, text="Not Found")
|
||||
|
||||
app = web.Application()
|
||||
app.add_routes(
|
||||
[
|
||||
web.get("/", index),
|
||||
web.get("/api/0/hosts", api_hosts),
|
||||
web.get("/api/0/messages", api_messages),
|
||||
web.get("/c", cmd),
|
||||
web.get("/d", drop),
|
||||
web.get("/n", register),
|
||||
web.get("/u", update),
|
||||
web.get("/r", restart),
|
||||
web.get("/live", live),
|
||||
web.get("/static/{path:.*}", static),
|
||||
web.get("/favicon.ico", favicon),
|
||||
]
|
||||
)
|
||||
|
||||
runner = web.AppRunner(app)
|
||||
await runner.setup()
|
||||
site = web.TCPSite(runner, host, port)
|
||||
await site.start()
|
||||
|
||||
if verbose:
|
||||
print(f"HTTP server started on {host}:{port}")
|
||||
|
||||
try:
|
||||
await asyncio.Future()
|
||||
finally:
|
||||
await runner.cleanup()
|
||||
@@ -1,50 +0,0 @@
|
||||
"""monitor helper and thread for heartbeat daemon."""
|
||||
|
||||
from __future__ import annotations
|
||||
import asyncio
|
||||
import time
|
||||
|
||||
DROPOVERDUE = 7 * 24 * 3600
|
||||
|
||||
|
||||
def checkoverdue(
    config: dict,
    hbdclass,
    log: callable,
    pushmsg: callable,
    msg_to_websockets: callable,
):
    """Scan all hosts and advance connections whose heartbeat is late.

    For every connection that is not DOWN:
      * UP and silent for longer than ``host.interval + config['grace']``
        (grace defaults to 10) -> moved to OVERDUE;
      * OVERDUE and silent for longer than ``DROPOVERDUE`` -> moved to UNKNOWN.

    When at least one connection of a host became overdue in this pass, the
    event is logged and the host state is sent to the websocket clients; a
    push message is sent as well if the host is listed in
    ``config['watchhosts']``.

    NOTE(review): the nesting of the trailing ``log`` / ``msg_to_websockets``
    calls was reconstructed from a copy without indentation - confirm
    against the original file.
    """
    now = time.time()
    # Iterate over a snapshot of the keys rather than the live dict.
    for h in list(hbdclass.Host.hosts.keys()):
        pmsg = []  # address families (conn.afam) that just became overdue
        for c in hbdclass.Host.hosts[h].connections:
            conn = hbdclass.Host.hosts[h].connections[c]
            if conn.state == hbdclass.Connection.DOWN:
                continue
            timeout = hbdclass.Host.hosts[h].interval + config.get("grace", 10)
            if conn.state == hbdclass.Connection.UP and (now - conn.lastbeat) > timeout:
                conn.newstate(hbdclass.Connection.OVERDUE, now, config.get("grace", 10))
                pmsg.append(conn.afam)
            if (
                conn.state == hbdclass.Connection.OVERDUE
                and (now - conn.lastbeat) > DROPOVERDUE
            ):
                conn.newstate(hbdclass.Connection.UNKNOWN, conn.lastbeat)
        if pmsg != []:
            if h in config.get("watchhosts", []):
                pushmsg("%s %s overdue" % (h, " and ".join(pmsg)))
            log(h, "%s overdue" % " and ".join(pmsg))
            msg_to_websockets("host", hbdclass.Host.hosts[h].stateinfo())
|
||||
|
||||
|
||||
async def start(
    config: dict,
    hbdclass: callable,
    log=None,
    pushmsg=None,
    msg_to_websockets=None,
):
    """Run the monitor loop: call ``checkoverdue`` every 15 seconds, forever.

    The coroutine never returns on its own; it ends only when the task that
    runs it is cancelled. All arguments are forwarded unchanged to
    ``checkoverdue``.
    """
    while True:
        await asyncio.sleep(15)  # 15 seconds between checks
        checkoverdue(config, hbdclass, log, pushmsg, msg_to_websockets)
|
||||
-202
@@ -1,202 +0,0 @@
|
||||
"""Notification helpers: email, pushover, mattermost, signal and dispatcher."""
|
||||
|
||||
import logging
|
||||
from typing import Optional
|
||||
import http.client
|
||||
import urllib.parse
|
||||
import subprocess
|
||||
import smtplib
|
||||
import time
|
||||
|
||||
DEFAULT_PUSHPROVIDERS = ["all", "pushover", "mattermost", "signal"]
|
||||
|
||||
# module-level configuration set via setup()
|
||||
_config = {}
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def setup(cfg: dict):
    """Store a shallow copy of ``cfg`` as the module-level notifier configuration.

    The copy is what ``send_email``, ``email`` and ``pushmsg_from_config``
    read later; changes made to ``cfg`` after this call are not picked up.
    """
    global _config
    _config = dict(cfg)
|
||||
|
||||
|
||||
def send_email(toaddrs, smtpserver, sender, subject, body, debug=0):
    """Send a pre-formatted message via SMTP and report success as a bool.

    Port, user and password are read from the module configuration
    (``smtpport`` defaults to 587, in which case STARTTLS is issued).
    ``subject`` is accepted but not used here; ``body`` is sent as given.
    Any failure is logged as a warning and reported as ``False``.
    """
    connection = None
    try:
        port = _config.get("smtpport", 587)
        connection = smtplib.SMTP(smtpserver, port)
        if debug > 0:
            connection.set_debuglevel(1)
        if port == 587:
            connection.starttls()
        connection.ehlo()
        login_name = _config.get("smtpuser", None)
        login_secret = _config.get("smtppassword", None)
        if login_name and login_secret:
            connection.login(login_name, login_secret)
        connection.sendmail(sender, toaddrs, body)
        return True
    except Exception as e:
        logger.warning("email send failed: %s", e)
        return False
    finally:
        # Best-effort goodbye; a failing QUIT must not change the result.
        if connection is not None:
            try:
                connection.quit()
            except Exception:
                pass
|
||||
|
||||
|
||||
def email(subject: str, msg: str, debug: int = 0) -> bool:
    """Compose a plain-text mail from module configuration and send it.

    Reads ``toemail``, ``fromemail`` and ``smtpserver`` from the module
    configuration; if any of them is missing a warning is logged and
    ``False`` is returned. The ``To:`` header carries the first element of
    ``toemail``, while the full value is passed to ``send_email`` as the
    recipient argument.
    """
    recipients = _config.get("toemail")
    sender = _config.get("fromemail")
    server_name = _config.get("smtpserver")
    if not (recipients and sender and server_name):
        logger.warning(
            "email config incomplete: toemail=%s, fromemail=%s, smtpserver=%s",
            recipients,
            sender,
            server_name,
        )
        return False

    stamp = time.strftime("%a, %d %b %Y %H:%M:%S %z", time.localtime())
    header_lines = [
        "To: %s" % (recipients[0],),
        "From: %s" % (sender,),
        "Subject: %s" % (subject,),
        "Date: %s" % (stamp,),
    ]
    # A blank line separates the headers from the message text.
    body = "\n".join(header_lines) + "\n\n" + "%s" % (msg,)
    return send_email(recipients, server_name, sender, subject, body, debug=debug)
|
||||
|
||||
|
||||
def pushover(token: str, user: str, msg: str, debug: int = 0) -> bool:
    """Send ``msg`` through the Pushover HTTPS API.

    Args:
        token: Pushover application token.
        user: Pushover user key.
        msg: message text.
        debug: accepted for signature parity with the other senders; unused.

    Returns:
        True when the API answers with HTTP 200, False on any other status
        or on any exception (which is logged).
    """
    conn = http.client.HTTPSConnection("api.pushover.net:443")
    try:
        conn.request(
            "POST",
            "/1/messages.json",
            urllib.parse.urlencode({"token": token, "user": user, "message": msg}),
            {"Content-type": "application/x-www-form-urlencoded"},
        )
        r = conn.getresponse()
        logger.debug("pushover response: %s %s", r.status, r.reason)
        return r.status == 200
    except Exception as e:
        logger.error("pushover error: %s", e)
        return False
    finally:
        # Always release the socket; previously the connection was never closed.
        conn.close()
|
||||
|
||||
|
||||
def pushmattermost(
    host: str,
    token: str,
    channel: str,
    msg: str,
    username: str = "hbd",
    icon: Optional[str] = None,
    debug: int = 0,
) -> bool:
    """Post ``msg`` to a Mattermost incoming webhook via ``mattermostdriver``.

    Returns False when ``mattermostdriver`` cannot be imported or the webhook
    call raises. Returns True when the call returns ``None`` or an empty
    string. ``debug`` is accepted but unused.
    """
    try:
        from mattermostdriver import Driver
    except Exception:
        return False

    driver = Driver(
        {"url": host, "scheme": "http", "basepath": "/api/v4", "port": 8065}
    )
    message = {"text": msg, "channel": channel, "username": username}
    if icon:
        message["icon_url"] = icon

    try:
        rc = driver.webhooks.call_webhook(token, message)
    except Exception as e:
        logger.error("mattermost error: %s", e)
        return False
    logger.debug("mattermost rc: %s", rc)
    return bool(rc is None or rc == "")
|
||||
|
||||
|
||||
def pushsignal(
    signal_cli_bin: str, user: str, recipient: str, msg: str, debug: int = 0
) -> bool:
    """Send a message via signal-cli (requires a local installation).

    Runs ``<signal_cli_bin> -u <user> send -m <msg> <recipient>`` as a
    subprocess (argument list, no shell).

    Returns:
        True when the command exits with status 0; False when it exits
        non-zero (stderr is logged) or cannot be run at all.
    """
    CLI = [signal_cli_bin, "-u", user, "send", "-m", msg, recipient]
    logger.debug("signal cli: %s", CLI)
    try:
        res = subprocess.run(CLI, capture_output=True)
        if res.returncode != 0:
            # Was `"...%s".res.stderr...` (attribute access on a str), which
            # raised AttributeError instead of logging the real failure.
            logger.error(
                "signal failed: %s", res.stderr.decode(errors="replace")
            )
            return False
        logger.debug("signal sent: %s", res.stdout.decode(errors="replace"))
        return True
    except Exception as e:
        logger.exception("signal exception: %s", e)
        return False
|
||||
|
||||
|
||||
def pushmsg(cfg: dict, msg: str, debug: int = 0):
    """Send ``msg`` through the provider(s) selected by ``cfg['pushsrv']``.

    ``pushsrv`` defaults to ``'pushover'``. The values handled here are
    ``'all'``, ``'pushover'``, ``'mattermost'``, ``'signal'`` and ``'email'``;
    ``'all'`` fans out to every provider. Provider settings are read from:

    - cfg['pushover_token'], cfg['pushover_user']
    - cfg['matter_host'], cfg['matter_token'], cfg['matter_channel'],
      cfg['matter_username'], cfg['matter_icon']
    - cfg['signal_cli'], cfg['signal_user'], cfg['signal_recipient']

    Returns a dict mapping each attempted provider name to its bool result.
    """
    selection = cfg.get("pushsrv", "pushover")

    def chosen(provider):
        return selection in ("all", provider)

    results = {}
    if chosen("pushover"):
        results["pushover"] = pushover(
            cfg.get("pushover_token", ""),
            cfg.get("pushover_user", ""),
            msg,
            debug=debug,
        )
    if chosen("mattermost"):
        results["mattermost"] = pushmattermost(
            cfg.get("matter_host", ""),
            cfg.get("matter_token", ""),
            cfg.get("matter_channel", ""),
            msg,
            username=cfg.get("matter_username", "hbd"),
            icon=cfg.get("matter_icon"),
            debug=debug,
        )
    if chosen("signal"):
        results["signal"] = pushsignal(
            cfg.get("signal_cli", "/usr/local/bin/signal-cli"),
            cfg.get("signal_user", ""),
            cfg.get("signal_recipient", ""),
            msg,
            debug=debug,
        )
    if chosen("email"):
        results["email"] = email("Heartbeat notification", msg, debug=debug)

    logger.debug("push results: %s", results)
    return results
|
||||
|
||||
|
||||
def pushmsg_from_config(msg: str, debug: int = 0) -> dict:
    """Dispatch ``msg`` via ``pushmsg`` using the configuration stored by ``setup``.

    Returns the per-provider result dict produced by ``pushmsg``.
    """
    return pushmsg(_config, msg, debug=debug)
|
||||
@@ -1,82 +0,0 @@
|
||||
"""Message encoding/decoding utilities for hbd protocol."""
|
||||
|
||||
from typing import Dict, Any
|
||||
import zlib
|
||||
|
||||
|
||||
def dicttos(ID: str, d: Dict[str, Any], compress: bool = False):
    """Encode ``d`` as a protocol message and return it as bytes.

    Pairs are rendered as ``key=value`` joined by ``;``; float values are
    written with five decimals. Uncompressed output is ``ID:payload``.
    With ``compress`` the payload is zlib-compressed (level 6) and the
    header becomes ``!ID:``.
    """

    def render(key):
        value = d[key]
        if isinstance(value, float):
            return f"{key}={value:0.5f}"
        return f"{key}={value}"

    payload = ";".join(render(key) for key in d)
    if not compress:
        return (ID + ":" + payload).encode()
    header = ("!" + ID + ":").encode()
    return header + zlib.compress(payload.encode(), 6)
|
||||
|
||||
|
||||
def stodict(msg: bytes):
    """Decode a protocol message into a dict.

    Two wire forms are accepted:
      * ``b'ID:key=value;...'`` - plain text;
      * ``b'!ID:' + zlib_payload`` - compressed, where ID is exactly three
        bytes (``msg[1:4]``) and the payload starts at byte 5.

    The result has key ``'ID'`` plus one entry per ``key=value`` pair. A key
    without ``=`` maps to ``None``. Values starting with a digit are turned
    into Python literals (int, float, ...) when they parse as one; anything
    else stays a string. Malformed input yields ``{}``.

    Values are parsed with ``ast.literal_eval``, never ``eval``: messages
    arrive from the network, and ``eval`` would execute arbitrary
    expressions such as ``1+__import__('os').system(...)``.
    """
    import ast

    d = {}
    if len(msg) > 0 and chr(msg[0]) == "!":
        # Layout: '!' + 3-byte ID + ':' + compressed payload.
        try:
            pk = zlib.decompress(msg[5:]).decode()
            d["ID"] = msg[1:4].decode()
        except Exception:
            # malformed compressed payload or undecodable ID
            return {}
    else:
        try:
            r0 = msg.split(b":", 1)
            pk = r0[1].decode()
            d["ID"] = r0[0].decode()
        except Exception:
            return {}
    if not pk:
        return d
    for v in pk.split(";"):
        if not v:
            continue
        vr = v.split("=", 1)
        k = vr[0].strip()
        if len(vr) == 1:
            d[k] = None
            continue
        val = vr[1].strip()
        if val and val[0].isdigit():
            try:
                d[k] = ast.literal_eval(val)
            except Exception:
                # Not a literal (e.g. an IP address): keep the raw text.
                d[k] = val
        else:
            d[k] = val
    return d
|
||||
|
||||
|
||||
def oldmtodict(msg: bytes):
    """Decode an old-style message that carries no ID prefix.

    The payload is prefixed with ``b"HTB:"`` and handed to ``stodict``, so
    the resulting dict has ``'ID'`` set to ``'HTB'``.
    """
    return stodict(b"HTB:" + msg)
|
||||
-370
@@ -1,370 +0,0 @@
|
||||
"""Server runtime: starts UDP listener, HTTP server and websocket stubs."""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import socket
|
||||
import time
|
||||
import signal
|
||||
import sys
|
||||
import ssl
|
||||
from . import __version__
|
||||
|
||||
from . import udp
|
||||
from . import hbdclass
|
||||
|
||||
from . import ws as ws_mod
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
msg_to_websockets = ws_mod.broadcast
|
||||
|
||||
logf = None
|
||||
lastfm = ["", "", ""]
|
||||
|
||||
# shared runtime collections and helpers
|
||||
msgs = []
|
||||
|
||||
|
||||
def initlog(logfile):
    """Open ``logfile`` for appending and return the file object.

    If the file cannot be opened, a diagnostic is written to stderr and
    ``sys.stderr`` itself is returned so that callers can always write to
    the result.
    """
    import sys

    try:
        return open(logfile, "a+")
    except Exception as e:
        # Diagnostics belong on stderr; also fixes the "loffile" typo.
        print(
            "cannot open logfile %s, using STDERR: %s" % (logfile, e),
            file=sys.stderr,
        )
        return sys.stderr
|
||||
|
||||
|
||||
def log(host, m, service=None):
    """Record one timestamped message everywhere the daemon keeps messages.

    The line ``"<YYYY-mm-dd HH:MM:SS> <host> <m>"`` is appended to ``msgs``,
    sent to the module logger, written to the logfile (when one is open) and
    broadcast to websocket clients as a ``"message"`` event. ``service`` is
    accepted but unused.
    """
    stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
    line = f"{stamp} {host or ''} {m}"
    msgs.append(line)
    logger.info(line)
    if logf:
        try:
            logf.write(line + "\n")
            logf.flush()
        except Exception as e:
            logger.warning("failed to write to logfile: %s", e)
    msg_to_websockets("message", line)
|
||||
|
||||
|
||||
def cleanup_function(config):
    """Persist runtime state to the pickle file on program exit.

    Writes, in this order and through one shared ``Pickler``:
    ``hbdclass.Host.hosts``, ``msgs`` and ``lastfm`` to
    ``config['pickfile']`` (default ``hbd.pickle``). The order and the single
    Pickler must match what ``load_pickled_hosts`` reads back.
    """
    logger.info("Running cleanup function...")
    import pickle

    pickfile = config.get("pickfile", "hbd.pickle")

    # `with` guarantees the handle is closed even if a dump raises.
    with open(pickfile, "wb") as pickf:
        pick = pickle.Pickler(pickf)
        pick.dump(hbdclass.Host.hosts)
        pick.dump(msgs)
        pick.dump(lastfm)

    logger.info("Cleanup complete.")
|
||||
|
||||
|
||||
async def _run_async(config):
    """Start all daemon services and run until SIGINT/SIGTERM is received.

    Started here: the dual-stack UDP heartbeat listener, the HTTP server,
    the DNS update worker, the websocket server(s) and the overdue monitor.
    On shutdown the UDP transport is closed, the HTTP/websocket/monitor
    tasks are cancelled (waiting at most 2 s) and the DNS worker is asked to
    exit through its queue (also waiting at most 2 s).

    The task variables are pre-initialised to ``None`` so that a service
    that failed to start cannot cause a NameError in the shutdown path.
    """
    loop = asyncio.get_running_loop()
    shutdown_event = asyncio.Event()

    # Filled in below; stay None when the corresponding service fails to start.
    http_task = None
    ws_task = None
    monitor_task = None

    # Signal handlers for graceful shutdown
    def signal_handler(signum, frame):
        sig_name = signal.Signals(signum).name if hasattr(signal, "Signals") else signum
        logger.info(f"Received {sig_name}, initiating shutdown...")
        loop.call_soon_threadsafe(shutdown_event.set)

    # Register signal handlers
    loop.add_signal_handler(signal.SIGINT, signal_handler, signal.SIGINT, None)
    loop.add_signal_handler(signal.SIGTERM, signal_handler, signal.SIGTERM, None)

    from . import http as http_mod
    from . import dns as dns_mod
    from . import notify as notify_mod
    from . import monitor as monitor_mod

    notify_mod.setup(config)

    pushmsg = notify_mod.pushmsg_from_config

    sock = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM)
    # Disable IPV6_V6ONLY option to enable dual-stack (listen on IPv4 as well)
    # This option is system-dependent; on many systems, setting it to False enables
    # the socket to handle both IPv4 and IPv6 traffic.
    try:
        sock.setsockopt(socket.IPPROTO_IPV6, socket.IPV6_V6ONLY, False)
    except OSError as e:
        logger.warning(
            f"Warning: Could not reset IPV6_V6ONLY not supported or dual-stack is unavailable. Error: {e}"
        )

    # Bind to all interfaces (::) on the configured heartbeat port.
    # UDP server endpoint (handler wired to handle_datagram with context)
    bind_addr = ("::", config.get("hb_port", 50003))
    sock.bind(bind_addr)
    logger.info("Starting UDP server on %s:%s", *bind_addr)

    def udp_handler(msg, addr, transport):
        # A fresh context dict is built for every datagram.
        ctx = dict(
            config=config,
            hbdclass=hbdclass,
            log=log,
            pushmsg=pushmsg,
            msg_to_websockets=msg_to_websockets,
            DEBUG=config.get("debug", 0),
            verbose=config.get("verbose", False),
        )
        udp.handle_datagram(msg, addr, transport, ctx)

    transport, protocol = await loop.create_datagram_endpoint(
        lambda: udp.EchoServerProtocol(config=config, handler=udp_handler),
        sock=sock,
    )

    # HTTP server (asyncio-based via aiohttp)
    try:
        http_task = asyncio.create_task(
            http_mod.start(
                host=config.get("hbd_host", ""),
                port=config.get("hbd_port", 50004),
                config=config,
                hbdclass=hbdclass,
                msgs_getter=lambda: msgs,
                log=log,
                pushmsg=pushmsg,
                msg_to_websockets=msg_to_websockets,
                tcss=None,
                DEBUG=config.get("debug", 0),
                verbose=config.get("verbose", False),
                get_now=lambda: time.time(),
                VER="",
            )
        )
        logger.info(
            "HTTP server started on %s:%s",
            config.get("hbd_host", ""),
            config.get("hbd_port", 50004),
        )
    except Exception as e:
        logger.exception("failed to start HTTP server: %s", e)

    # start dns update worker (async)
    dns_task = None
    try:
        dns_task = dns_mod.start_dns_worker(
            hbdclass, config, log=log, pushmsg=pushmsg, loop=loop
        )
        logger.info("dns update worker started")
    except Exception as e:
        logger.exception("dns worker failed to start: %s", e)

    # Start the websocket servers as a background task
    if config.get("wss_port", None):
        ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
        # cert_path is used as a plain string prefix for both file names.
        ssl_path = config.get("cert_path", "")
        wss_pem = ssl_path + config.get("wss_pem", "")
        wss_key = ssl_path + config.get("wss_key", "")
        try:
            ssl_context.load_cert_chain(wss_pem, keyfile=wss_key)
        except FileNotFoundError:
            logger.error("error: missing SSL keys %s or %s", wss_pem, wss_key)
            sys.exit(1)
        logger.info(
            "Starting secure WebSocket server on port %s with cert %s",
            config.get("wss_port", None),
            wss_pem,
        )
    else:
        ssl_context = None

    try:
        ws_task = asyncio.create_task(
            ws_mod.start(
                host=config.get("hbd_host", ""),
                ws_port=config.get("ws_port", None),
                wss_port=config.get("wss_port", None),
                ssl_context=ssl_context,
                get_hosts=lambda: [
                    hbdclass.Host.hosts[h].stateinfo()
                    for h in sorted(hbdclass.Host.hosts)
                ],
                get_msgs=lambda: msgs,
                verbose=config.get("verbose", False),
            )
        )
        logger.info("WebSocket task started")
    except Exception as e:
        logger.exception("websocket server failed to start: %s", e)

    # Start the monitor loop as a background task
    try:
        monitor_task = asyncio.create_task(
            monitor_mod.start(
                config=config,
                hbdclass=hbdclass,
                log=log,
                pushmsg=pushmsg,
                msg_to_websockets=msg_to_websockets,
            )
        )
        logger.info("Monitor task started")
    except Exception as e:
        logger.exception("monitor task failed to start: %s", e)

    try:
        # run forever until shutdown event is set
        await shutdown_event.wait()
        logger.info("Shutdown signal received, stopping services...")
    except Exception as e:
        logger.exception("Error in main loop: %s", e)
    finally:
        # Cancel all running tasks
        logger.info("Cancelling tasks...")
        try:
            transport.close()
        except Exception as e:
            logger.warning("Error closing UDP transport: %s", e)

        tasks_to_cancel = [http_task, ws_task, monitor_task]
        for task in tasks_to_cancel:
            if task:
                try:
                    task.cancel()
                    logger.debug("Cancelled task: %s", task)
                except Exception as e:
                    logger.warning("Error cancelling task: %s", e)

        # Wait for tasks to finish cancellation with timeout
        remaining_tasks = [t for t in tasks_to_cancel if t]
        if remaining_tasks:
            try:
                await asyncio.wait_for(
                    asyncio.gather(*remaining_tasks, return_exceptions=True),
                    timeout=2.0,
                )
            except asyncio.TimeoutError:
                logger.warning("Timeout waiting for tasks to cancel")
            except Exception as e:
                logger.debug("Exception during task cancellation: %s", e)

        # Signal DNS worker to exit and await it
        try:
            if "dns_task" in locals() and dns_task:
                try:
                    # None is put on the queue as the worker's exit signal.
                    hbdclass.Host.dnsQ.put(None)
                except Exception:
                    pass
                try:
                    await asyncio.wait_for(dns_task, timeout=2.0)
                    logger.info("DNS worker finished")
                except asyncio.TimeoutError:
                    logger.warning("Timeout waiting for DNS worker to finish")
                    dns_task.cancel()
                except asyncio.CancelledError:
                    logger.info("DNS worker was cancelled")
                except Exception as e:
                    logger.warning("Error awaiting DNS worker: %s", e)
                finally:
                    # Clear queue bridge to release any held references
                    hbdclass.Host.dnsQ = None
        except Exception as e:
            logger.warning("Error stopping DNS worker: %s", e)

        logger.info("All tasks cancelled")
|
||||
|
||||
|
||||
def load_pickled_hosts(config, hbdclass):
    """Restore hosts, messages and ``lastfm`` from the pickle file, if present.

    Reads ``config['pickfile']`` (default ``hbd.pickle``) with one shared
    ``Unpickler``: first the hosts dict, then ``msgs``, then ``lastfm``
    (which may be absent and then falls back to three empty strings). If
    unpickling fails the error is logged and the pickle file is deleted.

    Afterwards ``Connection.htab`` is reset, every host gets its ``dyn`` and
    ``watched`` flags refreshed from ``dyndnshosts`` / ``watchhosts`` and its
    ``fixup()`` called, and hosts listed in ``drophosts`` are removed.

    NOTE(review): pickle can execute arbitrary code while loading; the
    pickle file must only ever be writable by the daemon's own user.
    """
    global lastfm, msgs
    import os
    import pickle

    pickfile = config.get("pickfile", "hbd.pickle")
    dyndnshosts = config.get("dyndnshosts", [])
    watchhosts = config.get("watchhosts", [])
    drophosts = config.get("drophosts", [])
    verbose = config.get("verbose", False)

    if not os.path.exists(pickfile):
        if verbose:
            logger.info("no pickled data")
        return

    if verbose:
        logger.info("opening pickls %s", pickfile)

    load_failed = False
    # `with` closes the handle on every path, so the file is no longer open
    # when it is unlinked after a failed load.
    with open(pickfile, "rb") as pickf:
        pick = pickle.Unpickler(pickf)
        try:
            hbdclass.Host.hosts = pick.load()
            msgs = pick.load()
            try:
                lastfm = pick.load()
            except Exception:
                # Files without the third object are still accepted.
                lastfm = ["", "", ""]
        except Exception as e:
            logger.exception("load pickled failed: %s", e)
            load_failed = True
    if load_failed:
        os.unlink(pickfile)

    hbdclass.Connection.htab = {}
    for h in list(hbdclass.Host.hosts.keys()):
        hbdclass.Host.hosts[h].dyn = h in dyndnshosts
        hbdclass.Host.hosts[h].watched = h in watchhosts
        hbdclass.Host.hosts[h].fixup()
    for h in drophosts:
        if h in hbdclass.Host.hosts:
            del hbdclass.Host.hosts[h]
    if verbose:
        logger.info("%s pickled hosts loaded", len(hbdclass.Host.hosts))
|
||||
|
||||
|
||||
def run(config):
    """Start the hbd service (blocking).

    Manually manages the event loop to ensure clean shutdown: configures
    logging, restores pickled state, opens the logfile, runs ``_run_async``
    to completion, then saves state via ``cleanup_function``, closes the
    logfile, cancels leftover tasks, closes the loop and finally ends the
    process with ``os._exit(0)``.

    NOTE(review): the position of the final ``os._exit(0)`` relative to the
    ``finally`` block was reconstructed from a copy without indentation -
    confirm against the original file.
    """
    global logf
    import os

    logging.basicConfig(
        level=logging.DEBUG if config.get("debug", 0) > 0 else logging.INFO
    )
    load_pickled_hosts(config, hbdclass)

    logf = initlog(logfile=config.get("logfile", "messages.log"))
    log(None, f"hbd version {__version__} starting up")

    # Create and set the event loop manually
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    try:
        loop.run_until_complete(_run_async(config))
    except KeyboardInterrupt:
        logger.info("Received KeyboardInterrupt, shutting down...")
    except Exception as e:
        logger.exception("Unhandled exception in main: %s", e)
    finally:
        # State is persisted before the loop is torn down.
        cleanup_function(config)
        logger.info("hbd shutdown complete")
        # sys.stderr is what initlog returns as a fallback; never close it.
        if logf and logf != sys.stderr:
            try:
                logf.close()
            except Exception:
                pass
        # Explicitly close the loop
        try:
            # Cancel all remaining tasks
            pending = asyncio.all_tasks(loop)
            for task in pending:
                task.cancel()
            # Run one more cycle to process cancellations
            if pending:
                loop.run_until_complete(
                    asyncio.gather(*pending, return_exceptions=True)
                )
        except Exception:
            pass
        finally:
            loop.close()

    # Exit
    os._exit(0)
|
||||
@@ -0,0 +1,3 @@
|
||||
"""HeartBeat Daemon (hbd) - Server/daemon component."""
|
||||
|
||||
from hbd import __version__
|
||||
@@ -0,0 +1,302 @@
|
||||
"""Command line interface for hbd package."""
|
||||
|
||||
import argparse
|
||||
import getpass
|
||||
import sys
|
||||
|
||||
from .config import load_config
|
||||
from .main import run as run_server
|
||||
|
||||
PUSHSRVS = ["all", "pushover", "mattermost"]


def build_parser():
    """Build the ``hbd`` argument parser with its sub-commands.

    Sub-commands: ``serve`` (default), ``passwd``, ``notify``, ``stop``,
    ``reload`` and ``restart``. The legacy top-level flags (-c/-f/-v/-p/-x)
    are kept for invocations without a sub-command.

    Options that exist both at top level and on a sub-command use
    ``default=argparse.SUPPRESS`` on the sub-command. argparse copies every
    attribute of the sub-parser's namespace over the main one, so a plain
    default there would silently erase a value given before the sub-command
    (``hbd -c x.yaml serve`` would end up with ``configfile=None``).
    """
    keep = argparse.SUPPRESS  # "do not overwrite the top-level value"

    parser = argparse.ArgumentParser(
        prog="hbd",
        description="HeartBeatDaemon - Wait for heartbeat messages and act on them (or their absence)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    subparsers = parser.add_subparsers(dest="command")

    # --- serve (default) ---
    serve_p = subparsers.add_parser("serve", help="Start the hbd server (default)")
    serve_p.add_argument("-c", "--config", dest="configfile", default=keep,
                         help="Config file path (YAML)")
    serve_p.add_argument("-f", "--foreground", action="store_true", default=keep,
                         help="Run in foreground")
    serve_p.add_argument("-v", "--verbose", action="store_true", default=keep,
                         help="Verbose output")
    serve_p.add_argument("-p", "--pushsrv", dest="pushsrv", choices=PUSHSRVS,
                         default=keep, help="Push service to use")
    serve_p.add_argument("-x", "--debug", action="count", default=keep,
                         help="Increase debug level")

    # Legacy top-level flags (no subcommand) — kept for backward compatibility.
    # These carry the real defaults for the shared options.
    parser.add_argument("-c", "--config", dest="configfile", help="Config file path (YAML)")
    parser.add_argument("-f", "--foreground", action="store_true", help="Run in foreground")
    parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output")
    parser.add_argument("-p", "--pushsrv", dest="pushsrv", choices=PUSHSRVS,
                        help="Push service to use")
    parser.add_argument("-x", "--debug", action="count", default=0, help="Increase debug level")

    # --- passwd ---
    passwd_p = subparsers.add_parser(
        "passwd",
        help="Generate a password hash for use in the config file",
    )
    passwd_p.add_argument(
        "username",
        nargs="?",
        help="Username (informational only, for display)",
    )

    # --- notify ---
    notify_p = subparsers.add_parser(
        "notify",
        help="Send a test message via a configured notification channel",
    )
    notify_p.add_argument("-c", "--config", dest="configfile", default=keep,
                          help="Config file path (YAML)")
    notify_p.add_argument(
        "channel",
        help="Channel name as defined in notification_channels",
    )
    notify_p.add_argument(
        "message",
        nargs="?",
        default="Test notification from hbd",
        help="Message body (default: 'Test notification from hbd')",
    )
    notify_p.add_argument(
        "--level",
        default="WARNING",
        choices=["INFO", "WARNING", "CRITICAL", "RECOVER"],
        help="Notification level (default: WARNING)",
    )
    notify_p.add_argument(
        "--title",
        default=None,
        help="Notification title (default: '[LEVEL] test')",
    )

    # --- stop ---
    stop_p = subparsers.add_parser("stop", help="Stop the running hbd instance")
    stop_p.add_argument("-c", "--config", dest="configfile", default=keep,
                        help="Config file path (YAML)")

    # --- reload ---
    reload_p = subparsers.add_parser("reload", help="Reload configuration (SIGHUP)")
    reload_p.add_argument("-c", "--config", dest="configfile", default=keep,
                          help="Config file path (YAML)")

    # --- restart ---
    restart_p = subparsers.add_parser("restart", help="Restart the running hbd instance")
    restart_p.add_argument("-c", "--config", dest="configfile", default=keep,
                           help="Config file path (YAML)")
    restart_p.add_argument("-f", "--foreground", action="store_true", default=keep,
                           help="Run in foreground after restart")
    restart_p.add_argument("-v", "--verbose", action="store_true", default=keep,
                           help="Verbose output after restart")

    return parser
|
||||
|
||||
|
||||
def cmd_passwd(args):
    """Prompt for a password twice and print its hash for the config file.

    Keeps prompting until a non-empty password has been entered and
    confirmed. ``args.username`` only affects the wording of the prompt and
    of the printed hint.
    """
    from .users import hash_password

    username = args.username or ""
    if username:
        prompt = f"New password for {username}: "
    else:
        prompt = "New password: "

    pw = None
    while pw is None:
        candidate = getpass.getpass(prompt)
        if not candidate:
            print("Password must not be empty.", file=sys.stderr)
        elif candidate != getpass.getpass("Confirm password: "):
            print("Passwords do not match, try again.", file=sys.stderr)
        else:
            pw = candidate

    hashed = hash_password(pw)
    if username:
        hint = f"\nAdd the following to your config under users: -> {username}:"
    else:
        hint = "\nPassword hash (paste into config file under the user's 'password' key):"
    print(hint)
    print(f"  password: {hashed}")
|
||||
|
||||
|
||||
def cmd_notify(args):
    """Send one test notification through a single configured channel.

    Loads the configuration, looks up ``args.channel`` in
    ``notification_channels`` and sends a ``Notification`` built from
    ``args.message``, ``args.level`` and ``args.title``. Channel types
    ``matrix`` and ``sms_voipms`` are sent through their async senders;
    every other type is looked up in ``_DRIVERS``. Exits with status 1 when
    the channel or its type is unknown, or when sending fails.
    """
    from .config import load_config
    from .notify import Notification, _dispatch_to_channel, setup

    config = load_config(args.configfile)
    setup(config)

    channels = config.get("notification_channels", {})
    if args.channel not in channels:
        available = ", ".join(channels.keys()) if channels else "(none)"
        print(f"Error: channel '{args.channel}' not found in notification_channels.", file=sys.stderr)
        print(f"Available channels: {available}", file=sys.stderr)
        sys.exit(1)

    channel_cfg = channels[args.channel]
    level = args.level.upper()
    title = args.title or f"[{level}] test"
    base_url = config.get("base_url", "").rstrip("/")
    link = f"{base_url}/plugins" if base_url else ""

    notif = Notification(title=title, body=args.message, level=level, url=link)

    import asyncio
    from .notify import _send_matrix_async, _send_sms_voipms_async, _DRIVERS

    ch_type = channel_cfg.get("type", "")
    print(f"Sending via {args.channel} ({ch_type}): {title} — {args.message}")

    # Channel types that must be driven through an event loop.
    async_senders = {
        "matrix": _send_matrix_async,
        "sms_voipms": _send_sms_voipms_async,
    }
    if ch_type in async_senders:
        ok = asyncio.run(async_senders[ch_type](channel_cfg, notif))
    else:
        driver = _DRIVERS.get(ch_type)
        if driver is None:
            print(f"Error: unknown channel type '{ch_type}'", file=sys.stderr)
            sys.exit(1)
        ok = driver(channel_cfg, notif)

    if not ok:
        print("FAILED — check logs for details", file=sys.stderr)
        sys.exit(1)
    print("OK")
|
||||
|
||||
|
||||
def _read_pid(configfile) -> int | None:
    """Return the PID from the pidfile, or None if not found / not running.

    Args:
        configfile: Path handed to ``load_config``; the ``pidfile`` setting
            of the resulting config names the file to read.

    Returns:
        The PID when the pidfile holds a positive integer and signal 0 can be
        delivered to that process; otherwise None, after printing the reason
        to stderr.
    """
    import os

    config = load_config(configfile)
    pidfile = config.get("pidfile", "")
    if not pidfile:
        print("Error: no pidfile configured.", file=sys.stderr)
        return None

    try:
        with open(pidfile) as f:
            pid = int(f.read().strip())
    except FileNotFoundError:
        print(f"PID file not found ({pidfile}). Is hbd running?", file=sys.stderr)
        return None
    except Exception as e:
        print(f"Error reading pidfile: {e}", file=sys.stderr)
        return None

    # kill() treats 0 and negative values as process-group / broadcast targets,
    # so a corrupt pidfile must never reach the callers' os.kill(pid, SIGTERM).
    if pid <= 0:
        print(f"Error reading pidfile: invalid PID {pid} in {pidfile}", file=sys.stderr)
        return None

    try:
        # Signal 0 performs the existence/permission check without delivering anything.
        os.kill(pid, 0)
    except ProcessLookupError:
        print(f"PID file exists but process {pid} is not running.", file=sys.stderr)
        return None
    except Exception as e:
        print(f"Error checking process {pid}: {e}", file=sys.stderr)
        return None
    return pid
|
||||
|
||||
|
||||
def cmd_stop(args):
    """Send SIGTERM to the running hbd and wait for it to exit.

    The process is polled with signal 0 every 0.5 s, at most 20 times
    (about 10 seconds).  Exits with status 1 when no running process is
    found or when it is still alive after the last poll.
    """
    import os
    import signal as _signal
    import time

    pid = _read_pid(args.configfile)
    if pid is None:
        sys.exit(1)

    print(f"Stopping hbd (pid {pid})...")
    os.kill(pid, _signal.SIGTERM)

    polls_left = 20
    while polls_left > 0:
        polls_left -= 1
        time.sleep(0.5)
        try:
            os.kill(pid, 0)
        except ProcessLookupError:
            # The PID no longer exists: shutdown completed.
            print("hbd stopped.")
            return

    print("Warning: hbd did not stop within 10 seconds.", file=sys.stderr)
    sys.exit(1)
|
||||
|
||||
|
||||
def cmd_reload(args):
    """Send SIGHUP to the running hbd process.

    Exits with status 1 when ``_read_pid`` cannot find a running process.
    """
    import os
    import signal as _signal

    target = _read_pid(args.configfile)
    if target is None:
        sys.exit(1)

    print(f"Sending SIGHUP to hbd (pid {target})...")
    os.kill(target, _signal.SIGHUP)
    print("Reload signal sent.")
|
||||
|
||||
|
||||
def cmd_restart(args):
    """Stop a running hbd (if any) and launch a new ``serve`` process.

    A running instance gets SIGTERM and is polled every 0.5 s up to 20 times;
    if it is still alive afterwards the command exits with status 1 without
    starting a new one.  The new server is started through
    ``python -m hbd.server.cli serve`` with the same ``-c`` / ``-f`` / ``-v``
    options.  With ``foreground`` set the current process is replaced via
    ``os.execv``; otherwise the server is spawned in a new session.
    """
    import os
    import signal as _signal
    import subprocess
    import time

    pid = _read_pid(args.configfile)
    if pid is None:
        print("hbd does not appear to be running — starting fresh.")
    else:
        print(f"Stopping hbd (pid {pid})...")
        os.kill(pid, _signal.SIGTERM)
        stopped = False
        for _attempt in range(20):
            time.sleep(0.5)
            try:
                os.kill(pid, 0)
            except ProcessLookupError:
                print("hbd stopped.")
                stopped = True
                break
        if not stopped:
            print("Warning: hbd did not stop within 10 seconds.", file=sys.stderr)
            sys.exit(1)

    foreground = getattr(args, "foreground", False)
    verbose = getattr(args, "verbose", False)

    # Re-launch hbd with the same config
    relaunch = [sys.executable, "-m", "hbd.server.cli", "serve"]
    if args.configfile:
        relaunch.extend(["-c", args.configfile])
    if foreground:
        relaunch.append("-f")
    if verbose:
        relaunch.append("-v")

    if foreground:
        # Run in foreground: replace the current process image.
        os.execv(sys.executable, relaunch)
    else:
        subprocess.Popen(relaunch, start_new_session=True)
        print("hbd restarted.")
|
||||
|
||||
|
||||
def main(argv=None):
    """CLI entry point: dispatch a sub-command or start the server.

    Args:
        argv: Argument list for the parser; None lets argparse use
            ``sys.argv``.
    """
    args = build_parser().parse_args(argv)

    subcommands = {
        "passwd": cmd_passwd,
        "notify": cmd_notify,
        "stop": cmd_stop,
        "reload": cmd_reload,
        "restart": cmd_restart,
    }
    handler = subcommands.get(args.command)
    if handler is not None:
        handler(args)
        return

    # Default: run the server (supports both `hbd serve ...` and `hbd ...`)
    config = load_config(args.configfile)

    # Apply CLI overrides: a flag only overrides the file value when it was given.
    if args.foreground:
        config["foreground"] = True
    if args.verbose:
        config["verbose"] = True
    if args.pushsrv:
        config["pushsrv"] = args.pushsrv
    if args.debug > 0:
        config["debug"] = args.debug

    # Pass config_path for reloading support
    run_server(config, config_path=args.configfile)
|
||||
|
||||
|
||||
# Run the CLI when this module is executed as a script or via `python -m`.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,332 @@
|
||||
"""Configuration loader and defaults for hbd (HeartBeat Daemon/Server)."""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
|
||||
try:
|
||||
import yaml
|
||||
except Exception:
|
||||
yaml = None
|
||||
|
||||
# Built-in default for every server setting.  load_config() starts from a copy
# of this mapping and overlays the top-level keys found in the YAML file.
SERVER_DEFAULTS = {
    # Network settings
    "hb_port": 50003,  # Port to listen for heartbeats
    "hbd_port": 50004,  # HTTP API port
    "hbd_host": "",  # Bind address (empty = all interfaces)

    # Persistence
    "pickfile": os.path.join(os.path.expanduser("~"), ".hb.pick"),  # File to store host state between restarts
    "pidfile": os.path.join(os.path.expanduser("~"), ".hb.pid"),  # PID file for stop/restart/reload

    # Logging
    "logfile": os.path.join(os.path.expanduser("~"), ".hb.log"),
    # Notification channels
    "notification_channels": {},  # Named channels with type and credentials
    "base_url": "",  # Base URL for notification links (e.g. https://hbd.example.com)

    # Monitoring settings
    "interval": 20,  # Expected heartbeat interval (for server checks)
    "grace": 2,  # Grace period (extra seconds before notifying after a missed heartbeat)
    "threshold_renotify_interval": 3600,  # Seconds between threshold re-notifications

    # User management
    "users": {},  # username -> {full_name, avatar, password, admin, notification_channels}
    "default_owner": None,  # Username that owns hosts with no explicit owner

    # OAuth2 providers
    "oauth": {},  # oauth.gitea.{url,client_id,client_secret}

    # Host management
    "hosts": {},  # Unified host definitions
    "dyndomains": ["example.org"],  # Domains to update via nsupdate when a host with dyndns: true is updated

    # DNS updates
    "nsupdate_bin": "/usr/bin/nsupdate",  # Path to nsupdate binary

    # WebSocket settings
    "ws_port": 50005,
    "wss_port": None,
    "cert_path": "/usr/local/etc/ssl/",
    "wss_pem": "fullchain.pem",
    "wss_key": "privkey.pem",

    # Message journal configuration
    "journal_enabled": True,
    "journal_dir": "/var/log/heartbeat",
    "journal_file": "messages.journal",
    "journal_max_size": 100 * 1024 * 1024,  # 100MB
    "journal_max_backups": 10,

    # Runtime flags (main() overrides these from the command line)
    "foreground": False,
    "verbose": False,
    "debug": 0,

    # Plugin/threshold configs (for clients reporting to this server)
    "plugins": {},
    "thresholds": {},
}
|
||||
|
||||
# Default warning/critical limits, nested under a single 'thresholds' key and
# grouped by what look like plugin names (cpu_monitor, memory_monitor, ...).
# NOTE(review): nothing in the visible part of this module reads this mapping;
# confirm where it is consumed before changing its shape.
THRESHOLD_DEFAULTS = {
    'thresholds': {
        'cpu_monitor': {
            'cpu_percent': {
                'warning': 80.0,
                'critical': 90.0
            }
        },
        'memory_monitor': {
            'memory_percent': {
                'warning': 85.0,
                'critical': 95.0
            },
            'swap_percent': {
                'warning': 40.0,
                'critical': 75.0
            }
        },
        'disk_monitor': {
            'partitions': {
                # Keyed by mount point.
                '/': {
                    'percent': {
                        'warning': 85.0,
                        'critical': 90.0
                    }
                }
            }
        },
        'rtt': {
            'warning': 200,
            'critical': 250.0,
            'count': 3  # Optional: number of consecutive breaches before alerting
        },
        'nagios_runner': {
            'status_code': {
                'display': '{check_name} {output}',
                'operator': "nagios"
            }
        },
        'zfs_monitor': {
            'pools': {
                # '*' presumably acts as a wildcard for every pool (TODO confirm).
                '*': {
                    'status': {
                        'warning': 1,
                        'critical': 2,
                        'operator': '>=',
                        'hysteresis': 0.0,
                        'grace': 0,
                        'display': 'ZFS pool {pool_name} is {health}'
                    },
                    'capacity': {
                        'warning': 80.0,
                        'critical': 90.0,
                    }
                }
            }
        },
    }
}
|
||||
|
||||
|
||||
def load_config(path=None):
    """Load configuration from a YAML file and merge with server defaults.

    If YAML is not available, the file does not exist, or the file is empty,
    the defaults are returned unchanged.  Only top-level keys are merged: a
    key present in the file replaces the default value wholesale.

    Args:
        path: Path to YAML config file (default: ~/.hb.yaml)

    Returns:
        Dictionary with configuration.  It is an independent deep copy, so
        mutating nested values never alters ``SERVER_DEFAULTS``.

    Raises:
        ValueError: if the file's top-level YAML node is not a mapping.
    """
    import copy

    # A shallow copy would share the nested default dicts/lists ("hosts",
    # "users", "dyndomains", ...) between every returned config.
    cfg = copy.deepcopy(SERVER_DEFAULTS)
    if not path:
        # default path (~/.hb.yaml)
        path = os.path.join(os.path.expanduser("~"), ".hb.yaml")

    # yaml not installed: do not attempt to parse; defaults are all we have.
    if not os.path.exists(path) or not yaml:
        return cfg

    with open(path, encoding="utf-8") as fh:
        data = yaml.safe_load(fh)

    # An empty (or comment-only) file parses to None.
    if data is None:
        return cfg
    if not isinstance(data, dict):
        raise ValueError(
            f"Config file {path} must contain a mapping at the top level, "
            f"got {type(data).__name__}"
        )

    # Keep all keys from YAML to support plugin configs and future extensions
    cfg.update(data)
    return cfg
|
||||
|
||||
|
||||
class ReloadableConfig:
    """Configuration wrapper that supports runtime reloading.

    This class wraps the configuration dictionary and provides:
    - ``reload()`` to re-read the config file (intended for SIGHUP handling)
    - Backward-compatible read-only dict-like access
    - An asyncio lock so that concurrent ``reload()`` coroutines are serialised
    """

    def __init__(self, initial_config, config_path=None):
        """Initialize with initial configuration.

        Args:
            initial_config: Initial configuration dictionary
            config_path: Path to config file for reloading (optional)
        """
        self._config = initial_config
        self._config_path = config_path
        self._lock = asyncio.Lock()
        self._logger = logging.getLogger(__name__)

    async def reload(self, config_path=None):
        """Reload configuration from file.

        Args:
            config_path: Path to config file (uses stored path if not provided)

        Returns:
            New configuration dictionary

        Raises:
            ValueError: if neither ``config_path`` nor a stored path is set.
            Exception: whatever ``load_config`` raised; the previous
                configuration stays in place in that case.
        """
        path = config_path or self._config_path
        if not path:
            raise ValueError("No config path specified for reload")

        async with self._lock:
            try:
                new_config = load_config(path)
            except Exception as e:
                # self._config has not been touched yet, so the old config survives.
                self._logger.error("Failed to reload config from %s: %s", path, e, exc_info=True)
                raise
            self._config = new_config
            self._logger.info("Configuration reloaded from %s", path)
            return new_config

    def get(self, key, default=None):
        """Get a config value (dict-compatible)."""
        return self._config.get(key, default)

    def __getitem__(self, key):
        """Get a config value via subscript (dict-compatible)."""
        return self._config[key]

    def __contains__(self, key):
        """Check if key exists (dict-compatible)."""
        return key in self._config

    def __iter__(self):
        """Iterate over config keys (dict-compatible).

        Without this, ``for key in cfg`` would fall back to the legacy
        sequence protocol and call ``__getitem__(0)``, raising KeyError.
        """
        return iter(self._config)

    def __len__(self):
        """Return the number of top-level config keys (dict-compatible)."""
        return len(self._config)

    def keys(self):
        """Return config keys (dict-compatible)."""
        return self._config.keys()

    def items(self):
        """Return config items (dict-compatible)."""
        return self._config.items()

    def values(self):
        """Return config values (dict-compatible)."""
        return self._config.values()

    @property
    def config(self):
        """Get the underlying config dict (for components that need full dict)."""
        return self._config
|
||||
|
||||
|
||||
def get_watchhosts(config):
    """Extract watched hostnames from the ``hosts`` section of the config.

    A host counts as watched when its entry is a dict whose ``watch`` value
    is truthy; a missing ``watch`` key defaults to True.  Entries that are
    not dicts are ignored, as is a ``hosts`` section that is not a dict.

    Returns:
        List of hostnames to watch, in config order.
    """
    hosts = config.get("hosts", {})
    if not isinstance(hosts, dict):
        return []
    return [
        name
        for name, attrs in hosts.items()
        if isinstance(attrs, dict) and attrs.get("watch", True)
    ]
|
||||
|
||||
|
||||
def get_dyndnshosts(config):
    """Return hostnames whose ``hosts`` entry has a truthy ``dyndns`` setting.

    Non-dict host entries are skipped; a non-dict ``hosts`` section yields
    an empty list.
    """
    hosts = config.get("hosts", {})
    selected = []
    if isinstance(hosts, dict):
        for hostname, attrs in hosts.items():
            if not isinstance(attrs, dict):
                continue
            if attrs.get("dyndns"):
                selected.append(hostname)
    return selected
|
||||
|
||||
|
||||
def get_host_config(config, hostname):
    """Look up one host's attributes in the ``hosts`` section.

    Returns:
        The host's attribute dict, or an empty dict when the section is not
        a dict, the host is not listed, or its entry is not a dict.
    """
    hosts = config.get("hosts", {})
    if not isinstance(hosts, dict):
        return {}
    if hostname not in hosts:
        return {}
    entry = hosts[hostname]
    if isinstance(entry, dict):
        return entry
    return {}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# User / host-access helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def get_default_owner(config) -> str | None:
|
||||
"""Return the configured default_owner username, or the first admin user, or None."""
|
||||
explicit = config.get("default_owner")
|
||||
if explicit:
|
||||
return explicit
|
||||
# Fall back to first admin user found in config
|
||||
users_cfg = config.get("users", {})
|
||||
if isinstance(users_cfg, dict):
|
||||
for username, attrs in users_cfg.items():
|
||||
if isinstance(attrs, dict) and attrs.get("admin", False):
|
||||
return username
|
||||
return None
|
||||
|
||||
|
||||
def get_host_access(config, hostname) -> dict:
    """Return the access dict for *hostname*: owner, managers, monitors.

    ``owner`` is returned exactly as configured on the host (None when
    absent); the fallback to ``get_default_owner`` is currently disabled
    here, so callers that want it must apply it themselves.  ``managers``
    and ``monitors`` are always returned as new lists: a single string is
    wrapped, and a missing or null value becomes an empty list.

    Returns:
        {
            "owner": str | None,
            "managers": list[str],
            "monitors": list[str],
        }
    """

    def _as_list(value):
        """Normalise a YAML scalar/sequence/null into a fresh list."""
        if value is None:
            # `managers:` with no value loads as None; list(None) would raise.
            return []
        if isinstance(value, str):
            return [value]
        return list(value)

    host_cfg = get_host_config(config, hostname)

    return {
        "owner": host_cfg.get("owner"),  # or get_default_owner(config)
        "managers": _as_list(host_cfg.get("managers")),
        "monitors": _as_list(host_cfg.get("monitors")),
    }
|
||||
@@ -0,0 +1,136 @@
|
||||
"""YAML round-trip read/write for .hb.yaml, with backup and atomic writes."""
|
||||
|
||||
import glob
|
||||
import os
|
||||
import threading
|
||||
from datetime import datetime
|
||||
|
||||
from ruamel.yaml import YAML
|
||||
|
||||
_write_lock = threading.Lock()
|
||||
|
||||
|
||||
def _make_yaml() -> YAML:
    """Create a fresh ruamel ``YAML`` instance with ``preserve_quotes`` enabled."""
    handler = YAML()
    handler.preserve_quotes = True
    return handler
|
||||
|
||||
# Top-level keys managed by the 'server' logical section.
# apply_structured_section() only copies keys listed here; anything else in the
# submitted values is ignored.
# NOTE(review): "journal_file" has a default in the server config but is not
# listed here; confirm whether that omission is intentional.
_SERVER_KEYS = [
    "hbd_port", "hbd_host", "ws_port", "wss_port", "hb_port",
    "interval", "grace", "base_url", "threshold_renotify_interval",
    "logfile", "pidfile", "pickfile", "journal_enabled", "journal_dir",
    "journal_max_size", "journal_max_backups", "default_owner",
    "default_threshold_config",
]

# Top-level keys managed by the 'dns' logical section
_DNS_KEYS = ["nsupdate_bin", "rndc_key", "dyndomains"]
|
||||
|
||||
|
||||
def read_roundtrip(path: str):
    """Parse the YAML file at *path* with ruamel.yaml so that comments and key
    order survive a later dump.

    Returns whatever ``YAML.load`` produces for the document.
    """
    with open(path, "r", encoding="utf-8") as stream:
        document = _make_yaml().load(stream)
    return document
|
||||
|
||||
|
||||
def write_config(path: str, data) -> None:
    """Backup current file then atomically write data.

    Backup naming: {path}.bak.YYYYMMDD-HHMMSS (a ``-N`` suffix is appended
    when that name is already taken).
    Rotation: keep the 10 most recent backups, delete older ones.
    Atomic write: write to {path}.tmp then os.replace({path}.tmp, path).
    Acquires _write_lock for the full backup+write sequence.

    Args:
        path: Destination config file.
        data: Document to serialise with ruamel.yaml.

    Raises:
        Exception: any error from dumping or replacing the file is re-raised
            after the temporary file has been removed.
    """
    with _write_lock:
        ts = datetime.now().strftime("%Y%m%d-%H%M%S")
        backup_path = f"{path}.bak.{ts}"
        n = 0
        # Several writes within the same second must not overwrite each other.
        while os.path.exists(backup_path):
            n += 1
            backup_path = f"{path}.bak.{ts}-{n}"

        orig_mode = None
        if os.path.exists(path):
            orig_mode = os.stat(path).st_mode
            with open(path, "rb") as src, open(backup_path, "wb") as dst:
                dst.write(src.read())
            os.chmod(backup_path, orig_mode)

            # Escape the path so glob metacharacters in it ("[", "*", "?") are
            # matched literally; only the trailing "*" is a wildcard.
            pattern = f"{glob.escape(path)}.bak.*"
            backups = sorted(glob.glob(pattern), reverse=True)
            for old in backups[10:]:
                os.unlink(old)

        tmp = f"{path}.tmp"
        try:
            with open(tmp, "w", encoding="utf-8") as f:
                _make_yaml().dump(data, f)
            if orig_mode is not None:
                # Carry the original permissions over to the replacement file.
                os.chmod(tmp, orig_mode)
            os.replace(tmp, path)
        except Exception:
            # Best-effort cleanup of the partial temp file; keep the original error.
            try:
                os.unlink(tmp)
            except OSError:
                pass
            raise
|
||||
|
||||
|
||||
def list_backups(path: str) -> list:
    """Return backup paths sorted newest-first.

    Backups are the files named ``{path}.bak.*``.  The path itself is escaped
    so that glob metacharacters in it are matched literally.
    """
    return sorted(glob.glob(f"{glob.escape(path)}.bak.*"), reverse=True)
|
||||
|
||||
|
||||
def apply_structured_section(data, section: str, values: dict) -> None:
    """Merge *values* into *data* for the named logical section.

    - ``server``: every key of ``_SERVER_KEYS`` present in *values* is
      assigned individually; keys absent from *values* are left untouched.
    - ``dns``: every key of ``_DNS_KEYS`` is assigned when present in
      *values* and removed from *data* when absent.
    - ``users`` / ``hosts``: the whole section is replaced by *values*.

    Raises:
        ValueError: for any other section name.
    """
    if section in ("users", "hosts"):
        data[section] = values
        return

    if section == "server":
        for key in _SERVER_KEYS:
            if key not in values:
                continue
            data[key] = values[key]
        return

    if section == "dns":
        for key in _DNS_KEYS:
            if key in values:
                data[key] = values[key]
            else:
                data.pop(key, None)
        return

    raise ValueError(f"Unknown structured section: {section!r}")
|
||||
|
||||
|
||||
def apply_channel(data, name: str, channel_cfg: dict) -> None:
    """Add or overwrite the notification channel *name*, leaving others as-is.

    A missing, null or empty ``notification_channels`` entry is replaced by a
    new dict before the channel is stored.
    """
    channels = data.get("notification_channels")
    if not channels:
        channels = {}
        data["notification_channels"] = channels
    channels[name] = channel_cfg
|
||||
|
||||
|
||||
def delete_channel(data, name: str) -> None:
    """Drop the notification channel *name* from *data*.

    Does nothing when the channel, or the whole ``notification_channels``
    section, is missing or empty.
    """
    channels = data.get("notification_channels")
    if channels:
        channels.pop(name, None)
|
||||
|
||||
|
||||
def apply_yaml_section(data, section: str, yaml_text: str) -> None:
    """Parse *yaml_text* and store the result as the named logical section.

    - ``notification_channels`` and ``hosts`` replace the key of the same name.
    - ``thresholds`` is stored under the ``threshold_configs`` key.
    - ``dns``: when the parsed document is non-empty, each ``_DNS_KEYS`` entry
      it contains is copied into *data*; when it is empty, all ``_DNS_KEYS``
      are removed from *data*.

    The text is parsed before the section name is checked.

    Raises:
        ValueError: for any other section name.
    """
    parsed = _make_yaml().load(yaml_text)

    # Sections stored wholesale: logical section name -> top-level key.
    whole_section_keys = {
        "notification_channels": "notification_channels",
        "thresholds": "threshold_configs",
        "hosts": "hosts",
    }
    if section in whole_section_keys:
        data[whole_section_keys[section]] = parsed
        return

    if section != "dns":
        raise ValueError(f"Unknown YAML section: {section!r}")

    if not parsed:
        for key in _DNS_KEYS:
            data.pop(key, None)
        return

    for key in _DNS_KEYS:
        if key in parsed:
            data[key] = parsed[key]
|
||||
@@ -0,0 +1,12 @@
|
||||
msgs = [] # in-memory list of recent messages for new websocket clients; also logged to file via notify.eventlog
|
||||
class Data:
    """Small holder pairing a config object with a mutable key/value store."""

    def __init__(self, config):
        """Keep *config* and start with an empty data dict."""
        self.config, self.data = config, {}

    def update(self, new_data):
        """Merge *new_data* into the stored dict (same semantics as dict.update)."""
        store = self.data
        store.update(new_data)

    def get(self, key, default=None):
        """Return the stored value for *key*, or *default* when absent."""
        store = self.data
        return store.get(key, default)
|
||||
@@ -4,6 +4,9 @@ from __future__ import annotations
|
||||
from subprocess import Popen, PIPE, STDOUT
|
||||
from typing import Optional
|
||||
import asyncio
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def create_nsupdate_payload(
|
||||
@@ -123,7 +126,6 @@ async def dns_update_worker(
|
||||
pass
|
||||
continue
|
||||
|
||||
m = f"changed address to {addr}"
|
||||
for dyndomain in cfg.get("dyndomains", []):
|
||||
err = await loop.run_in_executor(
|
||||
None,
|
||||
@@ -135,43 +137,34 @@ async def dns_update_worker(
|
||||
cfg.get("rndc_key", "/etc/dhcpc/rndc-key"),
|
||||
)
|
||||
if err:
|
||||
m += f", DNS update failed: {err}"
|
||||
if pushmsg:
|
||||
m = f"DNS update failed for {addr} ({dyndomain}): {err}"
|
||||
logger.error("DNS update failed for %s: %s", name, err)
|
||||
if log:
|
||||
try:
|
||||
await loop.run_in_executor(
|
||||
None,
|
||||
pushmsg,
|
||||
"error: nsupdate failed",
|
||||
f"{name}.dy.{dyndomain}: {m}",
|
||||
)
|
||||
await loop.run_in_executor(None, log, name, "ERROR", m)
|
||||
except Exception:
|
||||
pass
|
||||
else:
|
||||
m += ", DNS updated."
|
||||
m = f"DNS updated {name}.dy.{dyndomain} → {addr}"
|
||||
if log:
|
||||
try:
|
||||
await loop.run_in_executor(None, log, name, "INFO", m)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if not cfg.get("dyndomains"):
|
||||
logger.warning("DNS update triggered for %s but no dyndomains configured", name)
|
||||
|
||||
try:
|
||||
dnsq.task_done()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if log:
|
||||
try:
|
||||
await loop.run_in_executor(None, log, name, m)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if log:
|
||||
try:
|
||||
await loop.run_in_executor(None, log, None, "dns_update_worker exiting")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
def start_dns_worker(
|
||||
hbdclass,
|
||||
cfg: dict,
|
||||
log: Optional[callable] = None,
|
||||
pushmsg: Optional[callable] = None,
|
||||
loop: Optional[asyncio.AbstractEventLoop] = None,
|
||||
):
|
||||
"""Start the async DNS worker and return the Task.
|
||||
@@ -218,7 +211,7 @@ def start_dns_worker(
|
||||
|
||||
task = loop.create_task(
|
||||
dns_update_worker(
|
||||
hbdclass, cfg, async_queue=async_q, log=log, pushmsg=pushmsg, loop=loop
|
||||
hbdclass, cfg, async_queue=async_q, log=log, loop=loop
|
||||
)
|
||||
)
|
||||
return task
|
||||
@@ -0,0 +1,638 @@
|
||||
"""
|
||||
host and connection class shared between hbd and
|
||||
the website's heartbeat.py
|
||||
|
||||
"""
|
||||
|
||||
import time
|
||||
import json
|
||||
import copy
|
||||
import queue
|
||||
|
||||
# Running count of named Host objects created; Host.__init__ increments it
# and stores the current value on each host as ``host.num``.
num = 0

# Maximum number of RTT samples kept per Connection (older ones are dropped).
MAXRTTS = 10

# Any truthy value makes log() print to stdout.
DEBUG = 2
|
||||
|
||||
|
||||
def log(host, m):
    """Print a debug line for *host* with message *m* when DEBUG is truthy."""
    if not DEBUG:
        return
    print("class log: %s %s" % (host, m))
|
||||
|
||||
|
||||
class Connection:
    """State of one address-family connection (e.g. IPv4 or IPv6) of a host.

    Tracks the current address, recent round-trip times, the reachability
    state and an optional timer that fires when heartbeats stop arriving.
    """

    # map of addrs to names
    htab = {}

    UNKNOWN = "unknown"
    UP = "up"
    DOWN = "down"
    OVERDUE = "overdue"

    def __init__(self, host, cid, addr, afam):
        """Create a connection in the UNKNOWN state.

        Args:
            host: Owning Host object, or a falsy value for a detached
                connection (no address registration, no DNS update).
            cid: Connection id, stored as given.
            addr: Peer address; a leading ``::ffff:`` (IPv4-mapped IPv6
                prefix) is stripped.
            afam: Address family label, stored as given.
        """
        self.host = host
        self.cid = cid
        if addr.startswith("::ffff:"):
            addr = addr[7:]
        self.addr = addr
        self.afam = afam
        self.rtts = [0]
        self.lastbeat = time.time()
        self.statetime = self.lastbeat
        self.deltastatetime = "computed"
        self.state = Connection.UNKNOWN

        # Timer-based reachability monitoring
        self.overdue_timer = None
        self.overdue_callback = None
        self.timeout_duration = None

        if host:
            Connection.htab[addr] = self.host.name
            if self.host.isDynDns():
                log(self.host.name, "dns update %s" % self.addr)
                Host.dnsQ.put((self.host.name, self.addr))

    def __getstate__(self):
        """Prepare Connection for pickling by excluding non-serializable timer objects."""
        state = self.__dict__.copy()
        # Remove asyncio timer objects that can't be pickled
        # These will be recreated when the next HTB arrives after unpickling
        state['overdue_timer'] = None
        state['overdue_callback'] = None
        state['timeout_duration'] = None
        return state

    def __setstate__(self, state):
        """Restore Connection from pickle, reinitializing timer fields."""
        self.__dict__.update(state)
        # Pickles written before the timer fields existed lack these attributes.
        for field in ('overdue_timer', 'overdue_callback', 'timeout_duration'):
            if not hasattr(self, field):
                setattr(self, field, None)

    def registerDns(self):
        """Queue a DNS update for this connection's host name and address."""
        Host.dnsQ.put((self.host.name, self.addr))

    def clearstate(self):
        """Return a state dict with every display field set to an empty string."""
        return {
            "addr": "",
            "rtt": "",
            "lastbeat": "",
            "state": "",
            "statetime": "",
            "deltastatetime": "",
            "rttstate": "",
        }

    def statedict(self, Null=False):
        """Return the display fields of this connection as a dict.

        Args:
            Null: When true, return the all-empty dict from ``clearstate``.

        Returns:
            Dict with keys addr, rtt, lastbeat, state, statetime,
            deltastatetime and rttstate.  A connection that is UNKNOWN and
            whose last beat is more than ten days old is reported empty.
        """
        d = self.clearstate()
        now = time.time()
        if Null:
            return d

        d["addr"] = self.addr
        if self.rtts[-1]:
            d["rtt"] = "%d" % round(self.rtts[-1])
        elif self.state == Connection.UNKNOWN:
            d["rtt"] = ""
        else:
            d["rtt"] = "?"
        d["lastbeat"] = self.lastbeat

        if self.state == Connection.OVERDUE:
            d["state"] = "<b>%s</b>" % self.state
        else:
            d["state"] = self.state

        if self.state == Connection.UP:
            d["rttstate"] = d["rtt"]
        elif self.state == Connection.OVERDUE:
            d["rttstate"] = ""
        else:
            d["rttstate"] = d["state"]

        d["statetime"] = time.strftime(
            "%Y-%m-%d %H:%M:%S", time.localtime(self.statetime)
        )
        delta = now - self.statetime

        if self.state == Connection.UNKNOWN:
            d["deltastatetime"] = ""
        elif delta > 86400:
            d["deltastatetime"] = "%0.1f days" % (delta / 86400.0)
        elif delta > 3600:
            # Space-padded hour built by hand: the "%k" strftime directive is a
            # glibc extension and is not available on every platform.
            elapsed = time.gmtime(delta)
            d["deltastatetime"] = "%2d:%02d hrs" % (elapsed.tm_hour, elapsed.tm_min)
        elif delta > 60:
            d["deltastatetime"] = time.strftime("%M:%S mins", time.gmtime(delta))
        else:
            d["deltastatetime"] = "%i secs" % (delta)

        if self.state == Connection.UNKNOWN and now - self.lastbeat > 86400 * 10:
            d = self.clearstate()

        return d

    def headerdict(self, afam):
        """Return column header labels for the fields of ``statedict``.

        Args:
            afam: Address family name used in the address column label.
        """
        d = {}
        d["addr"] = "%s Addr" % afam
        # NOTE(review): "Latencey" looks like a typo for "Latency"; left
        # unchanged in case a consumer matches on the exact label.
        d["rtt"] = "Latencey"
        d["lastbeat"] = "Last Contact"
        d["state"] = "State"
        d["statetime"] = "Last State"
        d["rttstate"] = "Reach"
        d["deltastatetime"] = "Last State"
        return d

    def jsons(self):
        """Serialize connection to JSON, excluding non-serializable timer objects."""
        data = {}
        for key, value in self.__dict__.items():
            # Skip timer-related fields that can't be serialized
            if key in ('overdue_timer', 'overdue_callback', 'timeout_duration'):
                continue
            # Handle host backpointer by converting to name
            if key == 'host':
                data[key] = value.name if value else None
            else:
                data[key] = value
        return json.dumps(data)

    def newstate(self, state, now, when=0):
        """Set a new state and return the number of seconds spent in the previous one.

        Args:
            state: New state value.
            now: Current timestamp.
            when: Seconds to subtract from *now* to get the state-change time.
        """
        self.state = state
        changed_at = now - when
        seconds_in_previous = changed_at - self.statetime
        self.statetime = changed_at
        return seconds_in_previous

    def getstate(self):
        """Return the current state value."""
        return self.state

    def newaddr(self, addr, rtt, now):
        """Record a heartbeat and handle a change of the peer address.

        Args:
            addr: Address the heartbeat came from.
            rtt: Round-trip time sample, or None to record no sample.
            now: Timestamp stored as the last beat.

        Returns:
            None when the address is unchanged, otherwise a
            "changed from X to Y" message.
        """
        self.lastbeat = now
        if rtt is not None:
            self.rtts.append(rtt)
            if len(self.rtts) > MAXRTTS:
                del self.rtts[0]

        if self.addr == addr:
            return None

        r = "changed from %s to %s" % (self.addr, addr)
        # The old address may already be gone from the table; that is fine.
        Connection.htab.pop(self.addr, None)
        self.addr = addr
        Connection.htab[addr] = self.host.name
        if self.host.isDynDns():
            Host.dnsQ.put((self.host.name, self.addr))
        return r

    def reset_overdue_timer(self, timeout_seconds, callback):
        """Reset the overdue timer for this connection.

        Cancels any existing timer and sets a new one that will invoke
        *callback* if it is not reset or cancelled before the timeout.

        Args:
            timeout_seconds: Seconds before the callback is scheduled
            callback: Async function called with this connection when the
                timer expires
        """
        import asyncio

        # Cancel existing timer if any
        if self.overdue_timer and not self.overdue_timer.cancelled():
            self.overdue_timer.cancel()

        # Store parameters for later reference
        self.timeout_duration = timeout_seconds
        self.overdue_callback = callback

        async def timer_expired():
            await callback(self)

        try:
            loop = asyncio.get_event_loop()
            # call_later needs a plain callable, so wrap the coroutine in a task.
            self.overdue_timer = loop.call_later(
                timeout_seconds, lambda: asyncio.create_task(timer_expired())
            )
        except RuntimeError:
            # No event loop available yet
            pass

    def cancel_overdue_timer(self):
        """Cancel the overdue timer if it exists and clear all timer references."""
        if self.overdue_timer:
            try:
                if not self.overdue_timer.cancelled():
                    self.overdue_timer.cancel()
            except Exception:
                # Best effort: the references are cleared below regardless.
                pass
        # Clear all timer-related references
        self.overdue_timer = None
        self.overdue_callback = None
        self.timeout_duration = None

    def get_avg_rtt(self):
        """Get average RTT from recent samples, ignoring non-positive ones (0 if none)."""
        valid_rtts = [r for r in self.rtts if r > 0]
        if valid_rtts:
            return sum(valid_rtts) / len(valid_rtts)
        return 0

    def get_current_rtt(self):
        """Get most recent RTT value (0 when there are no samples)."""
        return self.rtts[-1] if self.rtts else 0

    def check_rtt_threshold(self, warning_threshold=None, critical_threshold=None):
        """Check if the most recent RTT exceeds thresholds.

        Args:
            warning_threshold: RTT in ms for warning level
            critical_threshold: RTT in ms for critical level

        Returns:
            Tuple of (level, rtt_value) where level is None, 'WARNING', or 'CRITICAL'
        """
        rtt = self.get_current_rtt()
        if rtt <= 0:
            return (None, rtt)

        if critical_threshold and rtt > critical_threshold:
            return ('CRITICAL', rtt)
        elif warning_threshold and rtt > warning_threshold:
            return ('WARNING', rtt)

        return (None, rtt)
|
||||
|
||||
|
||||
#
|
||||
class Host:
|
||||
# Table of Hosts
|
||||
hosts = {}
|
||||
dnsQ = queue.Queue()
|
||||
|
||||
def __init__(self, name):
    """Create a host record; a named host is also registered in ``Host.hosts``.

    Args:
        name: Host name.  When truthy, the module-level counter ``num`` is
            incremented and the host is stored in the class-level table.
            A falsy name creates an unregistered host that just takes the
            current counter value.
    """
    global num

    if name:
        num += 1
        Host.hosts[name] = self

    # Identity
    self.name = name
    self.num = num

    # Flags and counters
    self.dyn = False
    self.watched = False
    self.upcount = 0
    self.interval = 0
    self.doesack = -1
    self.cmds = []
    self.connections = {}

    # Plugin data storage: {plugin_name: [(timestamp, data), ...]}
    self.plugin_data = {}
    self.plugin_retention = 100  # Keep last N samples per plugin

    # Alert state tracking: {metric_path: AlertState}
    self.alert_states = {}

    # User access control
    self.owner: str | None = None  # username of owner
    self.managers: list = []  # usernames with manager role
    self.monitors: list = []  # usernames with monitor role
|
||||
|
||||
def statedict(self):
|
||||
d = {}
|
||||
d["raw_name"] = self.name
|
||||
d["name"] = self.name
|
||||
if self.dyn:
|
||||
d["name"] += "*"
|
||||
if self.watched:
|
||||
d["name"] = "<b>%s</b>" % d["name"]
|
||||
d["dyn"] = str(self.dyn)
|
||||
d["num"] = self.num
|
||||
|
||||
# Add alert counts (split by acknowledged status)
|
||||
warning_unacked = 0
|
||||
warning_acked = 0
|
||||
critical_unacked = 0
|
||||
critical_acked = 0
|
||||
for metric_path, alert_state in self.alert_states.items():
|
||||
# Import AlertLevel here to avoid circular imports
|
||||
from .threshold import AlertLevel
|
||||
if alert_state.level == AlertLevel.WARNING:
|
||||
if alert_state.acknowledged:
|
||||
warning_acked += 1
|
||||
else:
|
||||
warning_unacked += 1
|
||||
elif alert_state.level == AlertLevel.CRITICAL:
|
||||
if alert_state.acknowledged:
|
||||
critical_acked += 1
|
||||
else:
|
||||
critical_unacked += 1
|
||||
|
||||
d["alert_warning_unacked"] = warning_unacked
|
||||
d["alert_warning_acked"] = warning_acked
|
||||
d["alert_critical_unacked"] = critical_unacked
|
||||
d["alert_critical_acked"] = critical_acked
|
||||
|
||||
for c in ["IPv4", "IPv6"]:
|
||||
if c in self.connections:
|
||||
cs = self.connections[c].statedict()
|
||||
else:
|
||||
cs = ubConnection.statedict(True)
|
||||
for csv in cs:
|
||||
d["%s.%s" % (c, csv)] = cs[csv]
|
||||
|
||||
return d
|
||||
|
||||
def headerdict(self):
|
||||
d = {}
|
||||
d["name"] = "Name"
|
||||
d["dyn"] = "Dyn"
|
||||
d["num"] = "??"
|
||||
for c in ["IPv4", "IPv6"]:
|
||||
cs = ubConnection.headerdict(c)
|
||||
for csv in cs:
|
||||
d["%s.%s" % (c, csv)] = cs[csv]
|
||||
return d
|
||||
|
||||
def registerDns(self):
|
||||
for af in self.connections:
|
||||
self.connections[af].registerDns()
|
||||
|
||||
def stateinfo(self):
|
||||
ddict = {}
|
||||
for d in self.__dict__:
|
||||
if d in ["alert_states", "plugin_data"]:
|
||||
continue
|
||||
if d == "connections":
|
||||
cl = []
|
||||
for c in ["IPv4", "IPv6"]:
|
||||
if c not in self.connections:
|
||||
continue
|
||||
# Create connection dict, excluding non-serializable timer objects
|
||||
conn = self.connections[c]
|
||||
cld = {}
|
||||
for key, value in conn.__dict__.items():
|
||||
# Skip timer-related fields that can't be serialized
|
||||
if key in ['overdue_timer', 'overdue_callback', 'timeout_duration']:
|
||||
continue
|
||||
# Handle host backpointer by converting to name
|
||||
if key == 'host':
|
||||
cld[key] = value.name if value else None
|
||||
else:
|
||||
# Safe copy for serializable values
|
||||
try:
|
||||
cld[key] = copy.deepcopy(value)
|
||||
except Exception:
|
||||
# If deepcopy fails, use shallow copy
|
||||
cld[key] = value
|
||||
cl.append(cld)
|
||||
ddict[d] = cl
|
||||
else:
|
||||
ddict[d] = self.__dict__[d]
|
||||
|
||||
# Add alert counts (computed from alert_states)
|
||||
warning_unacked = 0
|
||||
warning_acked = 0
|
||||
critical_unacked = 0
|
||||
critical_acked = 0
|
||||
if hasattr(self, 'alert_states'):
|
||||
from .threshold import AlertLevel
|
||||
for metric_path, alert_state in self.alert_states.items():
|
||||
if alert_state.level == AlertLevel.WARNING:
|
||||
if alert_state.acknowledged:
|
||||
warning_acked += 1
|
||||
else:
|
||||
warning_unacked += 1
|
||||
elif alert_state.level == AlertLevel.CRITICAL:
|
||||
if alert_state.acknowledged:
|
||||
critical_acked += 1
|
||||
else:
|
||||
critical_unacked += 1
|
||||
|
||||
ddict["alert_warning_unacked"] = warning_unacked
|
||||
ddict["alert_warning_acked"] = warning_acked
|
||||
ddict["alert_critical_unacked"] = critical_unacked
|
||||
ddict["alert_critical_acked"] = critical_acked
|
||||
|
||||
# User access
|
||||
ddict["owner"] = getattr(self, "owner", None)
|
||||
ddict["managers"] = list(getattr(self, "managers", []))
|
||||
ddict["monitors"] = list(getattr(self, "monitors", []))
|
||||
|
||||
# hbc version from latest os_info plugin data
|
||||
hbc_version = None
|
||||
latest_os = self.get_latest_plugin_data("os_info")
|
||||
if latest_os:
|
||||
_, os_data = latest_os
|
||||
hbc_version = os_data.get("hbc_version")
|
||||
ddict["hbc_version"] = hbc_version
|
||||
|
||||
return ddict
|
||||
|
||||
def jsons(self):
|
||||
return json.dumps(self.stateinfo())
|
||||
|
||||
def isDynDns(self):
|
||||
return self.dyn
|
||||
|
||||
def isIPv4(self, addr):
|
||||
if isinstance(addr, tuple):
|
||||
return addr[0].find(".") > 0
|
||||
else:
|
||||
return addr.find(".") > 0
|
||||
|
||||
def conndata(self, cid, addr, rtt, now):
|
||||
if addr[0:7] == "::ffff:":
|
||||
addr = addr[7:]
|
||||
if self.isIPv4(addr):
|
||||
afam = "IPv4"
|
||||
else:
|
||||
afam = "IPv6"
|
||||
|
||||
if afam not in self.connections:
|
||||
self.connections[afam] = Connection(self, cid, addr, afam)
|
||||
|
||||
conn = self.connections[afam]
|
||||
res = conn.newaddr(addr, rtt, now)
|
||||
return conn, res
|
||||
|
||||
# called when reloading class from pickle, add new fields here
|
||||
def fixup(self):
|
||||
for c in ["IPv4", "IPv6"]:
|
||||
if c in self.connections:
|
||||
addr = self.connections[c].addr
|
||||
if addr[0:7] == "::ffff:":
|
||||
addr = addr[7:]
|
||||
self.connections[c].addr = addr
|
||||
|
||||
# Add plugin_data if missing (for backward compatibility)
|
||||
if not hasattr(self, "plugin_data"):
|
||||
self.plugin_data = {}
|
||||
if not hasattr(self, "plugin_retention"):
|
||||
self.plugin_retention = 100
|
||||
if not hasattr(self, "alert_states"):
|
||||
self.alert_states = {}
|
||||
# User access fields (added in user-management feature)
|
||||
if not hasattr(self, "owner"):
|
||||
self.owner = None
|
||||
if not hasattr(self, "managers"):
|
||||
self.managers = []
|
||||
if not hasattr(self, "monitors"):
|
||||
self.monitors = []
|
||||
|
||||
pass
|
||||
|
||||
def add_plugin_data(self, plugin_name, data, timestamp=None):
|
||||
"""Store plugin data with timestamp.
|
||||
|
||||
Args:
|
||||
plugin_name: Name of the plugin (e.g., "cpu_monitor")
|
||||
data: Dict of plugin data
|
||||
timestamp: Optional timestamp (default: current time)
|
||||
"""
|
||||
if timestamp is None:
|
||||
timestamp = time.time()
|
||||
|
||||
if plugin_name not in self.plugin_data:
|
||||
self.plugin_data[plugin_name] = []
|
||||
|
||||
# Add new data
|
||||
self.plugin_data[plugin_name].append((timestamp, data))
|
||||
|
||||
# Enforce retention limit (keep last N samples)
|
||||
if len(self.plugin_data[plugin_name]) > self.plugin_retention:
|
||||
self.plugin_data[plugin_name] = self.plugin_data[plugin_name][-self.plugin_retention:]
|
||||
|
||||
def get_plugin_data(self, plugin_name, limit=None):
|
||||
"""Retrieve plugin data for a specific plugin.
|
||||
|
||||
Args:
|
||||
plugin_name: Name of the plugin
|
||||
limit: Optional limit on number of recent samples to return
|
||||
|
||||
Returns:
|
||||
List of (timestamp, data) tuples, most recent last
|
||||
"""
|
||||
data = self.plugin_data.get(plugin_name, [])
|
||||
if limit and len(data) > limit:
|
||||
return data[-limit:]
|
||||
return data
|
||||
|
||||
def get_latest_plugin_data(self, plugin_name):
|
||||
"""Get the most recent plugin data for a plugin.
|
||||
|
||||
Args:
|
||||
plugin_name: Name of the plugin
|
||||
|
||||
Returns:
|
||||
(timestamp, data) tuple or None if no data
|
||||
"""
|
||||
data = self.plugin_data.get(plugin_name, [])
|
||||
return data[-1] if data else None
|
||||
|
||||
def get_all_plugin_data(self):
|
||||
"""Get all plugin data for this host.
|
||||
|
||||
Returns:
|
||||
Dict of {plugin_name: [(timestamp, data), ...]}
|
||||
"""
|
||||
return self.plugin_data
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# User-role helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def apply_access(self, owner, managers, monitors):
|
||||
"""Set owner/managers/monitors on this host (called from config load)."""
|
||||
self.owner = owner
|
||||
self.managers = list(managers)
|
||||
self.monitors = list(monitors)
|
||||
|
||||
def is_owner(self, username: str) -> bool:
|
||||
return self.owner == username
|
||||
|
||||
def is_manager(self, username: str) -> bool:
|
||||
return username in self.managers or self.is_owner(username)
|
||||
|
||||
def is_monitor(self, username: str) -> bool:
|
||||
return username in self.monitors or self.is_manager(username)
|
||||
|
||||
def access_dict(self) -> dict:
|
||||
return {
|
||||
"owner": self.owner,
|
||||
"managers": list(self.managers),
|
||||
"monitors": list(self.monitors),
|
||||
}
|
||||
|
||||
hostfields_long = [
|
||||
"name",
|
||||
"IPv4.addr",
|
||||
"IPv4.state",
|
||||
("IPv4.rtt", 'style="text-align: right;"'),
|
||||
("IPv4.statetime", 'style="text-align: right;"'),
|
||||
"IPv6.addr",
|
||||
"IPv6.state",
|
||||
("IPv6.rtt", 'style="text-align: right;"'),
|
||||
("IPv6.statetime", 'style="text-align: right;"'),
|
||||
]
|
||||
|
||||
hostfields_short = [
|
||||
"name",
|
||||
("IPv4.rttstate", 'style="text-align: right;"'),
|
||||
("IPv4.deltastatetime", 'style="text-align: right;"'),
|
||||
("IPv6.rttstate", 'style="text-align: right;"'),
|
||||
("IPv6.deltastatetime", 'style="text-align: right;"'),
|
||||
]
|
||||
|
||||
def gene(self, tag, v, attrib=None):
|
||||
if attrib:
|
||||
a = " %s" % attrib
|
||||
else:
|
||||
a = ""
|
||||
return "<%s%s>%s</%s>" % (tag, a, v, tag)
|
||||
|
||||
def htmltable(self, tag, hd, short):
|
||||
if short:
|
||||
hostfields = Host.hostfields_short
|
||||
else:
|
||||
hostfields = Host.hostfields_long
|
||||
h = []
|
||||
for f in hostfields:
|
||||
if isinstance(f, tuple):
|
||||
h.append(self.gene(tag, hd[f[0]], f[1]))
|
||||
else:
|
||||
h.append(self.gene(tag, hd[f]))
|
||||
return self.gene("tr", "\n".join(h))
|
||||
|
||||
def buildhosttable(self, short=False):
|
||||
if DEBUG > 1:
|
||||
print("DBG buildhosttable: start")
|
||||
res = []
|
||||
res.append('<table id="ntable" class="sortable">')
|
||||
res.append(ubHost.htmltable("th", ubHost.headerdict(), short))
|
||||
hosts_sorted = list(Host.hosts.keys())
|
||||
if len(hosts_sorted):
|
||||
hosts_sorted.sort()
|
||||
for h in hosts_sorted:
|
||||
res.append(ubHost.htmltable("td", Host.hosts[h].statedict(), short))
|
||||
res.append("</table>")
|
||||
if DEBUG > 1:
|
||||
print("DBG buildhosttable: %s" % res)
|
||||
return res
|
||||
|
||||
def buildmsgtable(self, msgs):
|
||||
res = []
|
||||
le = max(40 - len(Host.hosts), 3)
|
||||
res.append("<h4>Log of Events</h4>")
|
||||
for m in msgs[len(msgs) - le :]:
|
||||
res.append("%s<BR>" % m)
|
||||
return res
|
||||
|
||||
|
||||
# Placeholder ("unbound") instances, a leftover from pre-Python-3 code.
# Host.statedict / Host.headerdict / Host.buildhosttable call methods on
# them when no real host or connection object is at hand. Host(None) has a
# falsy name, so it is not entered into Host.hosts.
ubHost = Host(None)
ubConnection = Connection(None, "", "", "")
|
||||
+1738
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,342 @@
|
||||
"""
|
||||
Journal logging for heartbeat messages.
|
||||
|
||||
Provides size-based rotating log files for all received heartbeat messages.
|
||||
Messages are logged in JSON format for easy parsing and analysis.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import asyncio
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MessageJournal:
    """
    Journal logger for heartbeat messages with size-based rotation.

    Features:
    - Logs received messages (except ``ID == 'HTB'``) as one JSON object per line
    - Automatic rotation when the file would grow past the size threshold
    - Keeps a configurable number of rotated logs
    - Writes are serialized with an ``asyncio.Lock``; this protects
      coroutines sharing one event loop, it is NOT thread-safe
    - Configurable log directory and file naming

    Configuration keys (as read by ``__init__``):
        journal_dir: Directory for journal files (default: /var/log/heartbeat)
        journal_file: Base filename (default: messages.journal)
        journal_max_size: Maximum file size in bytes before rotation (default: 100MB)
        journal_max_backups: Number of backup files to keep (default: 10)
        journal_enabled: Enable/disable journaling (default: True)
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Initialize the message journal.

        Args:
            config: Configuration dictionary with journal settings
        """
        self.config = config or {}

        # Configuration options
        self.journal_dir = Path(self.config.get('journal_dir', '/var/log/heartbeat'))
        self.journal_file = self.config.get('journal_file', 'messages.journal')
        self.max_size = self.config.get('journal_max_size', 100 * 1024 * 1024)  # 100MB default
        self.max_backups = self.config.get('journal_max_backups', 10)
        self.enabled = self.config.get('journal_enabled', True)

        # Runtime state
        self._file_handle = None
        self._current_size = 0
        self._lock = asyncio.Lock()
        self._initialized = False

        # Full path to current journal file
        self.journal_path = self.journal_dir / self.journal_file

    async def initialize(self) -> bool:
        """
        Initialize the journal.

        Creates journal directory if needed and opens the journal file.
        On failure the journal disables itself.

        Returns:
            True if initialization successful (or journaling is disabled by
            configuration), False otherwise
        """
        if not self.enabled:
            logger.info("Message journal disabled in configuration")
            return True

        try:
            # Create journal directory if it doesn't exist
            self.journal_dir.mkdir(parents=True, exist_ok=True)

            # Open journal file in append mode
            self._file_handle = open(self.journal_path, 'a', encoding='utf-8')

            # Get current file size
            try:
                self._current_size = os.path.getsize(self.journal_path)
            except OSError:
                self._current_size = 0

            self._initialized = True
            logger.info(f"Message journal initialized: {self.journal_path} "
                        f"(current size: {self._current_size:,} bytes, "
                        f"max: {self.max_size:,} bytes)")
            return True

        except Exception as e:
            logger.error(f"Failed to initialize message journal: {e}")
            self.enabled = False
            return False

    async def _append(self, line: str) -> bool:
        """Write one already-terminated line; the caller must hold ``_lock``.

        Rotates first when the line would push a non-empty journal past
        ``max_size``. An empty journal is never rotated, because that would
        only produce an empty backup.

        Returns:
            True if the line was written, False if no file is open.
        """
        size = len(line.encode('utf-8'))
        if self._current_size > 0 and self._current_size + size > self.max_size:
            await self._rotate()

        # _rotate() leaves the handle as None when it could not reopen.
        if not self._file_handle:
            return False

        self._file_handle.write(line)
        self._file_handle.flush()  # Ensure data is written
        self._current_size += size
        return True

    async def log_message(
        self,
        msg: Dict[str, Any],
        addr: tuple,
        timestamp: Optional[float] = None
    ):
        """
        Log a received message to the journal.

        Args:
            msg: Parsed message dictionary
            addr: Source address (ip, port) tuple
            timestamp: Message timestamp (defaults to current time)
        """
        if not self.enabled or not self._initialized:
            return

        # Skip HTB (heartbeat) messages - too verbose
        msg_id = msg.get('ID', '')
        if msg_id == 'HTB':
            return

        async with self._lock:
            try:
                # Prepare journal entry
                if timestamp is None:
                    import time
                    timestamp = time.time()

                is_sequence = isinstance(addr, (tuple, list))
                source_ip = addr[0] if is_sequence else str(addr)
                source_port = addr[1] if is_sequence and len(addr) > 1 else None

                entry = {
                    'timestamp': timestamp,
                    'datetime': datetime.fromtimestamp(timestamp).isoformat(),
                    'source_ip': source_ip,
                    'source_port': source_port,
                    'message': msg
                }

                # Serialize to JSON (one line per entry)
                json_line = json.dumps(entry, separators=(',', ':')) + '\n'

                if await self._append(json_line):
                    logger.debug(f"Logged message from {source_ip}: {msg.get('ID', 'UNKNOWN')}")

            except Exception as e:
                logger.error(f"Error writing to journal: {e}")

    def _next_backup_path(self) -> Path:
        """Return an unused backup path ``<journal_file>.<YYYYmmdd-HHMMSS>``.

        The timestamp only has one-second resolution, so a zero-padded serial
        suffix is added when that name is taken; renaming onto an existing
        backup would otherwise replace it. The suffixed names still sort
        after the plain one, which ``_cleanup_old_backups`` relies on.
        """
        stamp = datetime.now().strftime('%Y%m%d-%H%M%S')
        candidate = self.journal_dir / f"{self.journal_file}.{stamp}"
        serial = 0
        while candidate.exists():
            serial += 1
            candidate = self.journal_dir / f"{self.journal_file}.{stamp}.{serial:03d}"
        return candidate

    async def _rotate(self):
        """
        Rotate the journal file.

        Renames current file with timestamp, opens new file, and removes
        old backups exceeding max_backups limit. If anything fails, tries to
        reopen the journal; if that fails too, journaling is disabled.
        """
        try:
            # Close current file
            if self._file_handle:
                self._file_handle.close()
                self._file_handle = None

            # Rename current file to backup
            if self.journal_path.exists():
                backup_path = self._next_backup_path()
                self.journal_path.rename(backup_path)
                logger.info(f"Rotated journal: {backup_path} "
                            f"(size: {self._current_size:,} bytes)")

            # Open new journal file
            self._file_handle = open(self.journal_path, 'a', encoding='utf-8')
            self._current_size = 0

            # Clean up old backups
            await self._cleanup_old_backups()

        except Exception as e:
            logger.error(f"Error rotating journal: {e}")
            # Try to reopen the file even if rotation failed
            try:
                self._file_handle = open(self.journal_path, 'a', encoding='utf-8')
            except Exception as e2:
                logger.error(f"Failed to reopen journal after rotation error: {e2}")
                self.enabled = False

    async def _cleanup_old_backups(self):
        """
        Remove old backup files exceeding max_backups limit.

        Keeps only the most recent backups based on filename (which includes timestamp).
        """
        try:
            # Find all backup files
            backup_pattern = f"{self.journal_file}.*"
            backup_files = sorted(self.journal_dir.glob(backup_pattern))

            # Remove oldest backups if we have too many
            if len(backup_files) > self.max_backups:
                files_to_remove = backup_files[:len(backup_files) - self.max_backups]
                for backup_file in files_to_remove:
                    try:
                        backup_file.unlink()
                        logger.info(f"Removed old backup: {backup_file.name}")
                    except Exception as e:
                        logger.warning(f"Failed to remove old backup {backup_file}: {e}")

        except Exception as e:
            logger.error(f"Error cleaning up old backups: {e}")

    async def log_threshold_event(
        self,
        host_name: str,
        metric_path: str,
        old_level: str,
        new_level: str,
        value: Any,
        timestamp: Optional[float] = None
    ):
        """
        Log a threshold state change event.

        Args:
            host_name: Name of the host
            metric_path: Full metric path (e.g., "cpu_monitor.cpu_percent")
            old_level: Previous alert level
            new_level: New alert level
            value: Current metric value
            timestamp: Event timestamp (default: current time)
        """
        if not self.enabled or not self._initialized:
            return

        try:
            if timestamp is None:
                import time
                timestamp = time.time()

            event = {
                'timestamp': timestamp,
                'iso_time': datetime.fromtimestamp(timestamp).isoformat(),
                'event_type': 'threshold',
                'host': host_name,
                'metric': metric_path,
                'old_level': old_level,
                'new_level': new_level,
                'value': value,
            }
            line = json.dumps(event) + '\n'

            async with self._lock:
                if not self._file_handle:
                    return
                # Same rotate-then-write path as log_message().
                await self._append(line)

        except Exception as e:
            logger.error(f"Error logging threshold event: {e}")

    async def close(self):
        """
        Close the journal and release resources.

        Should be called during shutdown.
        """
        async with self._lock:
            if self._file_handle:
                try:
                    self._file_handle.close()
                    logger.info("Message journal closed")
                except Exception as e:
                    logger.error(f"Error closing journal: {e}")
                finally:
                    self._file_handle = None
                    self._initialized = False

    def get_stats(self) -> Dict[str, Any]:
        """
        Get journal statistics.

        Returns:
            Dictionary with journal stats. ``rotation_threshold`` is the fill
            level as a percentage string, or ``"n/a"`` when ``max_size`` is
            not positive (a ratio cannot be computed).
        """
        if self.max_size > 0:
            fill = f"{(self._current_size / self.max_size * 100):.1f}%"
        else:
            fill = "n/a"
        return {
            'enabled': self.enabled,
            'initialized': self._initialized,
            'current_file': str(self.journal_path),
            'current_size': self._current_size,
            'max_size': self.max_size,
            'max_backups': self.max_backups,
            'rotation_threshold': fill
        }
|
||||
|
||||
|
||||
# Module-wide journal object; stays None until get_journal() builds it.
_journal_instance: Optional[MessageJournal] = None


def get_journal(config: Optional[Dict[str, Any]] = None) -> MessageJournal:
    """
    Return the shared MessageJournal, constructing it on the first call.

    Args:
        config: Settings handed to MessageJournal when the instance is
            created; ignored on every later call.

    Returns:
        The single MessageJournal instance of this module
    """
    global _journal_instance
    if _journal_instance is not None:
        return _journal_instance
    _journal_instance = MessageJournal(config)
    return _journal_instance
|
||||
|
||||
|
||||
async def log_message(msg: Dict[str, Any], addr: tuple, timestamp: Optional[float] = None):
    """
    Forward one message to the shared journal obtained from get_journal().

    Args:
        msg: Parsed message dictionary
        addr: Source address (ip, port) tuple
        timestamp: Message timestamp (defaults to current time)
    """
    await get_journal().log_message(msg, addr, timestamp)
|
||||
@@ -0,0 +1,535 @@
|
||||
"""Server runtime: starts UDP listener, HTTP server and websocket stubs."""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import socket
|
||||
import time
|
||||
import signal
|
||||
import sys
|
||||
import ssl
|
||||
from . import __version__
|
||||
|
||||
from . import udp
|
||||
from . import hbdclass
|
||||
|
||||
from . import ws as ws_mod
|
||||
from . import notify as notify_mod
|
||||
from . import data
|
||||
from . import users as users_mod
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
msg_to_websockets = ws_mod.broadcast
|
||||
eventlog = notify_mod.eventlog
|
||||
|
||||
# shared runtime collections and helpers
|
||||
|
||||
def save_state(config, hbdclass):
    """Save current state to pickle file. Safe to call at any time.

    Three objects are pickled in sequence into ``<pickfile>.tmp``: the host
    table, ``data.msgs`` and the result of ``users_mod.save_sessions()``.
    The temporary file then replaces the pickfile via ``os.replace``.
    Failures are logged, not raised.

    The timer-related connection fields cannot be pickled, so they are set
    to None only while the pickle is written and are put back afterwards.
    The timers themselves are not cancelled: this function also runs from
    the periodic autosave, and cancelling there would switch off overdue
    detection for every live connection.

    Args:
        config: Mapping-like object; ``pickfile`` selects the target path
            (default ``"hbd.pickle"``).
        hbdclass: Module/object exposing ``Host.hosts``.
    """
    import pickle
    import os

    # Detach timer references before pickling (they can't be serialized)
    timer_attrs = ('overdue_timer', 'overdue_callback', 'timeout_duration')
    detached = []
    for host in list(hbdclass.Host.hosts.values()):
        for conn in host.connections.values():
            for attr in timer_attrs:
                if hasattr(conn, attr):
                    detached.append((conn, attr, getattr(conn, attr)))
                    setattr(conn, attr, None)

    try:
        pickfile = config.get("pickfile", "hbd.pickle")
        tmpfile = pickfile + ".tmp"

        try:
            with open(tmpfile, "wb") as pickf:
                pick = pickle.Pickler(pickf)
                pick.dump(hbdclass.Host.hosts)
                pick.dump(data.msgs)
                pick.dump(users_mod.save_sessions())
            os.replace(tmpfile, pickfile)
        except Exception as e:
            logger.error("Failed to save state: %s", e)
            try:
                os.unlink(tmpfile)
            except Exception:
                # Best effort: the temp file may never have been created.
                pass
    finally:
        # Re-attach the live timers no matter how the save went.
        for conn, attr, value in detached:
            setattr(conn, attr, value)
|
||||
|
||||
|
||||
def cleanup_function(config, hbdclass):
    """Exit hook: persist the current state through save_state(), logging
    a line before and after."""
    logger.info("Running cleanup function...")
    save_state(config, hbdclass)
    logger.info("Cleanup complete.")
|
||||
|
||||
|
||||
async def reload_configuration(config_obj, config_path, components):
    """Re-read the config file and push the new settings to the components.

    Steps, in order: reload the config object, hand the new config to the
    notify module, reload users, refresh the dyn / watched flags and access
    lists of every known host, then (if present) reload the threshold
    checker and let it purge stale alerts. Any exception is logged and
    turned into a False result.

    Args:
        config_obj: ReloadableConfig instance
        config_path: Path to config file
        components: Dict with threshold_checker and other components

    Returns:
        True if reload succeeded, False otherwise
    """
    try:
        logger.info("Starting configuration reload...")

        # Reload config file
        fresh = await config_obj.reload(config_path)

        # Update notify module
        notify_mod.reload_config(fresh)

        # Reload users
        users_mod.load_users(fresh)

        # Re-apply host attributes from updated config to all known hosts
        from . import config as config_mod
        dyn_names = config_mod.get_dyndnshosts(fresh)
        watched_names = config_mod.get_watchhosts(fresh)
        for hostname, host in hbdclass.Host.hosts.items():
            host.dyn = hostname in dyn_names
            host.watched = hostname in watched_names
            access = config_mod.get_host_access(fresh, hostname)
            host.apply_access(access["owner"], access["managers"], access["monitors"])

        # Reload threshold checker and prune alerts orphaned by the new config
        if 'threshold_checker' in components:
            checker = components['threshold_checker']
            checker.reload(fresh)
            checker.purge_stale_alerts(hbdclass)

        # Settings that still need a restart to change:
        #   - hb_port, hbd_port, ws_port (already bound)
        #   - SSL certificates (already loaded)
        #   - pickfile (already opened)
        #   - journal settings (journal already initialized)
        #
        # Settings that take effect right away:
        #   - notification_channels
        #   - threshold_configs
        #   - hosts (watchhosts, dyndns, notification_channels)
        #   - grace period (used on next heartbeat)
        #   - debug/verbose flags (used on next message)

        logger.info("Configuration reload completed successfully")
        return True

    except Exception as e:
        banner = "=" * 60
        logger.error(banner)
        logger.error(f"Failed to reload configuration: {e}", exc_info=True)
        logger.error("Keeping previous configuration")
        logger.error(banner)
        return False
|
||||
|
||||
|
||||
async def _run_async(config, config_path=None):
|
||||
from .config import ReloadableConfig
|
||||
if not isinstance(config, ReloadableConfig):
|
||||
config = ReloadableConfig(config, config_path)
|
||||
|
||||
loop = asyncio.get_running_loop()
|
||||
shutdown_event = asyncio.Event()
|
||||
reload_event = asyncio.Event()
|
||||
|
||||
# Signal handlers for graceful shutdown and reload
|
||||
def signal_handler(signum, frame):
|
||||
sig_name = signal.Signals(signum).name if hasattr(signal, "Signals") else signum
|
||||
logger.info(f"Received {sig_name}, initiating shutdown...")
|
||||
loop.call_soon_threadsafe(shutdown_event.set)
|
||||
|
||||
def reload_handler(signum, frame):
|
||||
sig_name = signal.Signals(signum).name if hasattr(signal, "Signals") else signum
|
||||
logger.info(f"Received {sig_name}, initiating config reload...")
|
||||
loop.call_soon_threadsafe(reload_event.set)
|
||||
|
||||
# Register signal handlers
|
||||
loop.add_signal_handler(signal.SIGINT, signal_handler, signal.SIGINT, None)
|
||||
loop.add_signal_handler(signal.SIGTERM, signal_handler, signal.SIGTERM, None)
|
||||
loop.add_signal_handler(signal.SIGHUP, reload_handler, signal.SIGHUP, None)
|
||||
|
||||
from . import http as http_mod
|
||||
from . import dns as dns_mod
|
||||
from . import notify as notify_mod
|
||||
from . import journal as journal_mod
|
||||
from . import threshold as threshold_mod
|
||||
|
||||
notify_mod.setup(config, loop=loop)
|
||||
|
||||
# Initialize message journal
|
||||
msg_journal = journal_mod.get_journal(config)
|
||||
await msg_journal.initialize()
|
||||
|
||||
# Initialize threshold checker
|
||||
threshold_checker = threshold_mod.ThresholdChecker(
|
||||
config=config,
|
||||
renotify_interval=config.get("threshold_renotify_interval", 3600),
|
||||
journal=msg_journal,
|
||||
)
|
||||
logger.info("Threshold checker initialized")
|
||||
|
||||
# Components dict for reload orchestration
|
||||
components = {
|
||||
'threshold_checker': threshold_checker,
|
||||
'msg_journal': msg_journal,
|
||||
}
|
||||
|
||||
sock = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM)
|
||||
# Disable IPV6_V6ONLY option to enable dual-stack (listen on IPv4 as well)
|
||||
# This option is system-dependent; on many systems, setting it to False enables
|
||||
# the socket to handle both IPv4 and IPv6 traffic.
|
||||
try:
|
||||
sock.setsockopt(socket.IPPROTO_IPV6, socket.IPV6_V6ONLY, False)
|
||||
except OSError as e:
|
||||
logger.warning(
|
||||
f"Warning: Could not reset IPV6_V6ONLY not supported or dual-stack is unavailable. Error: {e}"
|
||||
)
|
||||
|
||||
bind_addr = ("::", config.get("hb_port", 50003))
|
||||
sock.bind(bind_addr)
|
||||
logger.info("Starting UDP server on %s:%s", *bind_addr)
|
||||
|
||||
# Try to enable kernel receive timestamps (Linux SO_TIMESTAMP).
|
||||
# If supported, read datagrams via recvmsg() so RTT uses the kernel
|
||||
# timestamp rather than the time.time() call after asyncio scheduling.
|
||||
use_kernel_ts = udp.enable_kernel_timestamps(sock)
|
||||
if use_kernel_ts:
|
||||
logger.info("SO_TIMESTAMP enabled: using kernel receive timestamps for RTT")
|
||||
else:
|
||||
logger.info("SO_TIMESTAMP not available: using time.time() for RTT")
|
||||
|
||||
def udp_handler(msg, addr, transport, recv_ts=None):
|
||||
ctx = dict(
|
||||
config=config,
|
||||
hbdclass=hbdclass,
|
||||
msg_to_websockets=msg_to_websockets,
|
||||
msg_journal=msg_journal,
|
||||
threshold_checker=threshold_checker,
|
||||
DEBUG=config.get("debug", 0),
|
||||
verbose=config.get("verbose", False),
|
||||
recv_ts=recv_ts,
|
||||
)
|
||||
udp.handle_datagram(msg, addr, transport, ctx)
|
||||
|
||||
if use_kernel_ts:
|
||||
# recvmsg path: manage the socket ourselves with loop.add_reader()
|
||||
sock.setblocking(False)
|
||||
transport = udp.RecvmsgTransport(loop, sock)
|
||||
reader = udp.make_recvmsg_reader(sock, udp_handler, transport)
|
||||
loop.add_reader(sock.fileno(), reader)
|
||||
protocol = None
|
||||
else:
|
||||
transport, protocol = await loop.create_datagram_endpoint(
|
||||
lambda: udp.EchoServerProtocol(config=config, handler=udp_handler),
|
||||
sock=sock,
|
||||
)
|
||||
|
||||
# Restore connection timers for hosts loaded from pickle
|
||||
restore_ctx = dict(
|
||||
config=config,
|
||||
hbdclass=hbdclass,
|
||||
msg_to_websockets=msg_to_websockets,
|
||||
threshold_checker=threshold_checker,
|
||||
)
|
||||
udp.restore_connection_timers(hbdclass, restore_ctx)
|
||||
|
||||
# Drop alert states that no longer have a matching threshold (stale after
|
||||
# upgrade or config change between runs).
|
||||
threshold_checker.purge_stale_alerts(hbdclass)
|
||||
|
||||
async def _http_reload_callback():
|
||||
await reload_configuration(config, config_path, components)
|
||||
|
||||
# HTTP server (asyncio-based via aiohttp)
|
||||
try:
|
||||
http_task = asyncio.create_task(
|
||||
http_mod.start(
|
||||
host=config.get("hbd_host", ""),
|
||||
port=config.get("hbd_port", 50004),
|
||||
config=config,
|
||||
hbdclass=hbdclass,
|
||||
tcss=None,
|
||||
threshold_checker=threshold_checker,
|
||||
verbose=config.get("verbose", False),
|
||||
get_now=lambda: time.time(),
|
||||
VER="",
|
||||
reload_callback=_http_reload_callback,
|
||||
)
|
||||
)
|
||||
logger.info(
|
||||
"HTTP server started on %s:%s",
|
||||
config.get("hbd_host", ""),
|
||||
config.get("hbd_port", 50004),
|
||||
)
|
||||
except Exception as e:
|
||||
logger.exception("failed to start HTTP server: %s", e)
|
||||
|
||||
# start dns update worker (async)
|
||||
dns_task = None
|
||||
try:
|
||||
dns_task = dns_mod.start_dns_worker(
|
||||
hbdclass, config, log=eventlog, loop=loop
|
||||
)
|
||||
logger.info("dns update worker started")
|
||||
except Exception as e:
|
||||
logger.exception("dns worker failed to start: %s", e)
|
||||
|
||||
# Register WebSocket state — connections are now served through /ws on the HTTP port
|
||||
ws_task = None
|
||||
ws_mod.setup(
|
||||
loop=loop,
|
||||
get_hosts=lambda: [
|
||||
hbdclass.Host.hosts[h].stateinfo()
|
||||
for h in sorted(hbdclass.Host.hosts)
|
||||
],
|
||||
verbose=config.get("verbose", False),
|
||||
)
|
||||
logger.info("WebSocket handler registered on /ws (HTTP port %s)", config.get("hbd_port", 50004))
|
||||
|
||||
# Periodic autosave task
|
||||
autosave_interval = config.get("autosave_interval", 300) # default: 5 minutes
|
||||
|
||||
async def autosave_task():
|
||||
while True:
|
||||
await asyncio.sleep(autosave_interval)
|
||||
logger.debug("Autosaving state...")
|
||||
save_state(config, hbdclass)
|
||||
logger.debug("Autosave complete (%d hosts)", len(hbdclass.Host.hosts))
|
||||
|
||||
autosave = asyncio.create_task(autosave_task())
|
||||
logger.info("Autosave task started (interval: %ds)", autosave_interval)
|
||||
|
||||
# Main event loop - monitor shutdown and reload events
|
||||
try:
|
||||
while True:
|
||||
# Wait for either shutdown or reload event
|
||||
done, pending = await asyncio.wait(
|
||||
[
|
||||
asyncio.create_task(shutdown_event.wait()),
|
||||
asyncio.create_task(reload_event.wait()),
|
||||
],
|
||||
return_when=asyncio.FIRST_COMPLETED
|
||||
)
|
||||
|
||||
# Check which event was triggered
|
||||
if shutdown_event.is_set():
|
||||
logger.info("Shutdown signal received, stopping services...")
|
||||
# Cancel pending wait tasks
|
||||
for task in pending:
|
||||
task.cancel()
|
||||
break
|
||||
|
||||
if reload_event.is_set():
|
||||
# Clear the event for next reload
|
||||
reload_event.clear()
|
||||
|
||||
# Cancel pending wait tasks
|
||||
for task in pending:
|
||||
task.cancel()
|
||||
|
||||
# Perform reload if config_path is available
|
||||
if config_path:
|
||||
await reload_configuration(config, config_path, components)
|
||||
else:
|
||||
logger.warning("Cannot reload: no config path available")
|
||||
|
||||
# Continue main loop
|
||||
continue
|
||||
|
||||
except Exception as e:
|
||||
logger.exception("Error in main loop: %s", e)
|
||||
finally:
|
||||
# Cancel all running tasks
|
||||
logger.info("Cancelling tasks...")
|
||||
try:
|
||||
transport.close()
|
||||
except Exception as e:
|
||||
logger.warning("Error closing UDP transport: %s", e)
|
||||
|
||||
tasks_to_cancel = [http_task, autosave]
|
||||
for task in tasks_to_cancel:
|
||||
if task:
|
||||
try:
|
||||
task.cancel()
|
||||
logger.debug("Cancelled task: %s", task)
|
||||
except Exception as e:
|
||||
logger.warning("Error cancelling task: %s", e)
|
||||
|
||||
# Wait for tasks to finish cancellation with timeout
|
||||
remaining_tasks = [t for t in tasks_to_cancel if t]
|
||||
if remaining_tasks:
|
||||
try:
|
||||
await asyncio.wait_for(
|
||||
asyncio.gather(*remaining_tasks, return_exceptions=True),
|
||||
timeout=2.0,
|
||||
)
|
||||
except asyncio.TimeoutError:
|
||||
logger.warning("Timeout waiting for tasks to cancel")
|
||||
except Exception as e:
|
||||
logger.debug("Exception during task cancellation: %s", e)
|
||||
|
||||
# Close message journal
|
||||
try:
|
||||
await msg_journal.close()
|
||||
except Exception as e:
|
||||
logger.warning("Error closing message journal: %s", e)
|
||||
|
||||
# Signal DNS worker to exit and await it
|
||||
try:
|
||||
if "dns_task" in locals() and dns_task:
|
||||
try:
|
||||
hbdclass.Host.dnsQ.put(None)
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
await asyncio.wait_for(dns_task, timeout=2.0)
|
||||
logger.info("DNS worker finished")
|
||||
except asyncio.TimeoutError:
|
||||
logger.warning("Timeout waiting for DNS worker to finish")
|
||||
dns_task.cancel()
|
||||
except asyncio.CancelledError:
|
||||
logger.info("DNS worker was cancelled")
|
||||
except Exception as e:
|
||||
logger.warning("Error awaiting DNS worker: %s", e)
|
||||
finally:
|
||||
# Clear queue bridge to release any held references
|
||||
hbdclass.Host.dnsQ = None
|
||||
except Exception as e:
|
||||
logger.warning("Error stopping DNS worker: %s", e)
|
||||
|
||||
# Save state (hosts + sessions) on clean shutdown
|
||||
try:
|
||||
save_state(config, hbdclass)
|
||||
logger.info("State saved on shutdown")
|
||||
except Exception as e:
|
||||
logger.warning("Error saving state on shutdown: %s", e)
|
||||
|
||||
logger.info("All tasks cancelled")
|
||||
|
||||
|
||||
def load_pickled_hosts(config, hbdclass):
    """Restore host state from the pickle file named by ``config["pickfile"]``.

    The file is read as three consecutive pickles: the ``Host.hosts`` mapping,
    the message list stored in ``data.msgs`` and, when present, the user
    sessions handed to ``users_mod.load_sessions``.  Afterwards every loaded
    host gets its ``dyn``/``watched`` flags and access lists refreshed from
    the current configuration.

    Args:
        config: Configuration mapping (read through ``.get``).
        hbdclass: Namespace exposing ``Host`` and ``Connection``.

    A file that cannot be unpickled is logged and then deleted.
    """
    import os
    import pickle
    from . import config as config_mod
    from . import users as users_mod

    pickfile = config.get("pickfile", "hbd.pickle")
    dyndnshosts = config_mod.get_dyndnshosts(config)
    watchhosts = config_mod.get_watchhosts(config)

    if not os.path.exists(pickfile):
        if config.get("verbose", False):
            logger.info("no pickled data")
        return

    if config.get("verbose", False):
        logger.info("opening pickls %s", pickfile)

    # NOTE(review): pickle can execute arbitrary code while loading; this is
    # only safe as long as the pickle file is written by this daemon alone.
    load_failed = False
    # The context manager guarantees the handle is closed on every path,
    # including before the unlink below.
    with open(pickfile, "rb") as pickf:
        pick = pickle.Unpickler(pickf)
        try:
            hbdclass.Host.hosts = pick.load()
            data.msgs = pick.load()
            try:
                users_mod.load_sessions(pick.load())
            except Exception:
                pass  # older pickle without sessions — fine
        except Exception as e:
            logger.exception("load pickled failed: %s", e)
            load_failed = True

    if load_failed:
        # Remove the unreadable file only after it has been closed.
        os.unlink(pickfile)

    hbdclass.Connection.htab = {}
    for h in list(hbdclass.Host.hosts.keys()):
        host = hbdclass.Host.hosts[h]
        host.dyn = h in dyndnshosts
        host.watched = h in watchhosts
        host.fixup()
        access = config_mod.get_host_access(config, h)
        host.apply_access(access["owner"], access["managers"], access["monitors"])

    if config.get("verbose", False):
        logger.info("%s pickled hosts loaded", len(hbdclass.Host.hosts))
|
||||
|
||||
|
||||
def run(config, config_path=None):
    """Run the hbd service until it shuts down (blocking).

    Configures logging, restores pickled state, opens the event log, loads
    users, writes the pidfile and then drives ``_run_async`` on an event loop
    that is created and closed here by hand.

    Args:
        config: Configuration dictionary
        config_path: Path to config file (for reload support)
    """
    import os

    # Pick the log level: debug wins over verbose, default is WARNING.
    debug = config.get("debug", 0)
    verbose = config.get("verbose", False)
    if debug > 0:
        log_level = logging.DEBUG
    elif verbose:
        log_level = logging.INFO
    else:
        log_level = logging.WARNING
    logging.basicConfig(level=log_level)
    if not debug:
        logging.getLogger("aiohttp.access").propagate = False

    load_pickled_hosts(config, hbdclass)
    notify_mod.initlog(logfile=config.get("logfile", "messages.log"))
    users_mod.load_users(config)

    # Write pidfile (best effort).
    pidfile = config.get("pidfile", "")
    if pidfile:
        try:
            with open(pidfile, "w") as fh:
                fh.write(str(os.getpid()))
        except Exception as err:
            logger.warning("Failed to write pidfile %s: %s", pidfile, err)

    eventlog(None, "INFO", f"hbd version {__version__} starting up")

    if not config_path:
        logger.warning("No config path provided - reload via SIGHUP disabled")
    else:
        logger.info(f"Config file: {config_path} (reload with SIGHUP)")

    # The loop is managed manually so it can be drained and closed below.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    try:
        loop.run_until_complete(_run_async(config, config_path=config_path))
    except KeyboardInterrupt:
        logger.info("Received KeyboardInterrupt, shutting down...")
    except Exception as err:
        logger.exception("Unhandled exception in main: %s", err)
    finally:
        cleanup_function(config, hbdclass)
        logger.info("hbd shutdown complete")
        eventlog(None, "INFO", f"hbd version {__version__} shutdown")
        notify_mod.closelog()

        # Remove pidfile; failure to do so is ignored.
        if pidfile:
            try:
                os.unlink(pidfile)
            except Exception:
                pass

        # Cancel whatever is still scheduled, let the cancellations run once,
        # then close the loop no matter what happened.
        try:
            leftovers = asyncio.all_tasks(loop)
            for leftover in leftovers:
                leftover.cancel()
            if leftovers:
                loop.run_until_complete(
                    asyncio.gather(*leftovers, return_exceptions=True)
                )
        except Exception:
            pass
        finally:
            loop.close()

    # NOTE(review): placement after the try/finally is inferred (indentation
    # was lost in the source); confirm against the real file.
    os._exit(0)
|
||||
@@ -0,0 +1,28 @@
|
||||
"""Monitor helper for heartbeat daemon.
|
||||
|
||||
This module provides monitoring tasks for the heartbeat daemon.
|
||||
The primary reachability monitoring is now event-driven (timers set/reset
|
||||
on HTB arrival in udp.py) rather than periodic polling.
|
||||
|
||||
This module can be extended for additional monitoring tasks.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
import asyncio
|
||||
import time
|
||||
from . import notify as notify_mod
|
||||
|
||||
DROPOVERDUE = 7 * 24 * 3600
|
||||
eventlog = notify_mod.eventlog
|
||||
|
||||
|
||||
async def cleanup_connections(hbdclass):
    """Cancel the overdue timer of every connection of every known host.

    Walks a snapshot of ``hbdclass.Host.hosts`` and, for each entry in a
    host's ``connections`` mapping, calls ``cancel_overdue_timer()`` when the
    connection object provides that method.  Intended for shutdown.
    """
    known_hosts = list(hbdclass.Host.hosts.items())
    for _name, host in known_hosts:
        for _kind, connection in host.connections.items():
            if not hasattr(connection, 'cancel_overdue_timer'):
                continue
            connection.cancel_overdue_timer()
|
||||
|
||||
@@ -0,0 +1,495 @@
|
||||
"""Notification helpers: email, pushover, matrix, mattermost, signal, sms and dispatcher.
|
||||
|
||||
Channel types supported:
|
||||
pushover - Pushover app notifications
|
||||
email - SMTP email
|
||||
matrix - Matrix (via matrix-nio)
|
||||
mattermost - Mattermost webhook
|
||||
signal - Signal via signal-cli subprocess
|
||||
sms_voipms - SMS via voip.ms REST API
|
||||
|
||||
Each channel can specify ``min_level: WARNING|CRITICAL`` (default: WARNING).
|
||||
|
||||
Notifications are dispatched to the owner + managers of the host, each via
|
||||
their own ``notification_channels`` list. When no users are configured the
|
||||
server runs silently (no notifications sent).
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import smtplib
|
||||
import subprocess
|
||||
import time
|
||||
import sys
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
from . import data
|
||||
from . import ws as ws_mod
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
msg_to_websockets = ws_mod.broadcast
|
||||
|
||||
# Module-level state set via setup()
|
||||
_config: dict = {}
|
||||
|
||||
# Tracks which channels fired a WARNING/CRITICAL per host.
|
||||
# {host_name: set of channel_names} — used to route RECOVER to the same channels.
|
||||
_alerted_channels: dict = {}
|
||||
|
||||
logf = None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Level ordering
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Numeric rank per level name; RECOVER and INFO share the lowest rank.
_LEVEL_ORDER = {"RECOVER": 0, "INFO": 0, "WARNING": 1, "CRITICAL": 2}


def _level_value(level: str) -> int:
    """Return the numeric rank of *level* (case-insensitive); unknown names rank 0."""
    try:
        return _LEVEL_ORDER[level.upper()]
    except KeyError:
        return 0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Notification dataclass
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@dataclass
class Notification:
    """Payload handed to the channel drivers for one notification."""

    # Headline, e.g. "[CRITICAL] webserver01".
    title: str
    # Detail message.
    body: str
    # One of RECOVER | WARNING | CRITICAL | INFO.
    level: str
    # Link to the plugin metrics page; empty when not set.
    url: str = ""
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Module setup
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def setup(cfg: dict, loop: Optional[asyncio.AbstractEventLoop] = None):
|
||||
"""Initialize notifier from configuration dict."""
|
||||
global _config
|
||||
_config = dict(cfg)
|
||||
|
||||
|
||||
def reload_config(cfg: dict):
    """Replace the module-level notifier configuration with a shallow copy of *cfg*.

    Logs an informational message once the new configuration is in place.
    """
    global _config
    snapshot = dict(cfg)
    _config = snapshot
    logger.info("Notification configuration reloaded")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Event log (websocket + file + in-memory)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def initlog(logfile):
    """Open *logfile* for appending and make it the module's event-log handle.

    If the file cannot be opened, a message is printed and ``sys.stderr`` is
    used instead.  Returns the handle that was installed.
    """
    global logf
    try:
        handle = open(logfile, "a+")
    except Exception as err:
        print("cannot open logfile %s, using STDERR: %s" % (logfile, err))
        handle = sys.stderr
    logf = handle
    return handle
|
||||
|
||||
|
||||
def closelog():
    """Close the event-log file, if one is open.

    Nothing happens when no log is set or when the log is ``sys.stderr``.
    After closing a real file the module-level handle is reset to ``None`` so
    that later ``eventlog`` calls skip the file write instead of writing to a
    closed handle.  Errors raised by ``close()`` are ignored.
    """
    global logf
    if not logf or logf == sys.stderr:
        return
    handle, logf = logf, None
    try:
        handle.close()
    except Exception:
        pass  # best effort: nothing useful can be done at shutdown
|
||||
|
||||
|
||||
def eventlog(host, lvl, m, service=None):
    """Record one event in memory, the logger, the log file and the websockets.

    Args:
        host: Host name the event belongs to; falsy values are stored as None.
        lvl: Level label placed in the record and the text line.
        m: Message text.
        service: Optional service name stored in the record.
    """
    now = time.time()
    record = {
        "ts": now,
        "host": host or None,
        "level": lvl,
        "service": service,
        "message": m,
    }
    data.msgs.append(record)

    # Text form: "<timestamp> <level> [<host> ]<message>"
    stamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(now))
    prefix = f"{stamp} {lvl} "
    if host:
        prefix = prefix + f"{host} "
    line = prefix + m
    logger.info(line)

    if logf:
        try:
            logf.write(line + "\n")
            logf.flush()
        except Exception as err:
            logger.warning("failed to write to logfile: %s", err)

    msg_to_websockets("message", record)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Low-level channel drivers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _send_pushover(channel_cfg: dict, notif: Notification) -> bool:
    """Send *notif* through the Pushover HTTPS API.

    Args:
        channel_cfg: Needs ``token`` and ``user``; ``sound`` is optional.
        notif: Notification to deliver; ``notif.url`` is attached when set.

    Returns:
        True when Pushover answers with HTTP 200, False on missing
        credentials, any other status, or any exception.
    """
    import http.client
    import urllib.parse

    token = channel_cfg.get("token", "")
    user = channel_cfg.get("user", "")
    if not token or not user:
        logger.warning("pushover: missing token or user")
        return False

    params: dict = {"token": token, "user": user, "title": notif.title, "message": notif.body}
    if channel_cfg.get("sound"):
        params["sound"] = channel_cfg["sound"]
    if notif.url:
        params["url"] = notif.url
        params["url_title"] = "Heartbeat"

    # A socket timeout keeps a stalled connection from blocking the calling
    # thread forever.
    conn = http.client.HTTPSConnection("api.pushover.net:443", timeout=15)
    try:
        conn.request(
            "POST",
            "/1/messages.json",
            urllib.parse.urlencode(params),
            {"Content-type": "application/x-www-form-urlencoded"},
        )
        r = conn.getresponse()
        logger.debug("pushover response: %s %s", r.status, r.reason)
        return r.status == 200
    except Exception as e:
        logger.error("pushover error: %s", e)
        return False
    finally:
        # Always release the socket, on success and on failure.
        conn.close()
|
||||
|
||||
|
||||
def _send_email(channel_cfg: dict, notif: Notification) -> bool:
    """Send *notif* as a plain-text e-mail over SMTP.

    Args:
        channel_cfg: Needs ``recipients`` (list or single address), ``sender``
            and ``smtp_server``.  Optional: ``smtp_port`` (default 587),
            ``smtp_user`` and ``smtp_password``.
        notif: Notification to deliver; ``notif.url`` is appended to the body.

    Returns:
        True when the message was handed to the server, False on missing
        configuration or any failure.
    """
    from email.message import EmailMessage
    from email.utils import formatdate

    recipients = channel_cfg.get("recipients", [])
    sender = channel_cfg.get("sender", "")
    smtp_server = channel_cfg.get("smtp_server", "")
    smtp_port = channel_cfg.get("smtp_port", 587)
    smtp_user = channel_cfg.get("smtp_user")
    smtp_password = channel_cfg.get("smtp_password")

    if not recipients or not sender or not smtp_server:
        logger.warning("email: missing recipients, sender, or smtp_server")
        return False

    to_list = [recipients] if isinstance(recipients, str) else list(recipients)

    body_text = notif.body
    if notif.url:
        body_text += f"\n\n{notif.url}"

    server = None
    try:
        # EmailMessage encodes non-ASCII text and headers correctly, which a
        # hand-built header string passed to sendmail() does not.
        msg = EmailMessage()
        msg["To"] = ", ".join(to_list)
        msg["From"] = sender
        # Collapse line breaks so a multi-line title cannot add headers.
        msg["Subject"] = " ".join(str(notif.title).splitlines())
        msg["Date"] = formatdate(localtime=True)
        msg.set_content(body_text)

        server = smtplib.SMTP(smtp_server, smtp_port, timeout=15)
        if smtp_port == 587:
            server.starttls()
            server.ehlo()
        if smtp_user and smtp_password:
            server.login(smtp_user, smtp_password)
        server.send_message(msg, from_addr=sender, to_addrs=to_list)
        server.quit()
        return True
    except Exception as e:
        logger.warning("email send failed: %s", e)
        # server is None when the connection itself could not be created.
        if server is not None:
            try:
                server.quit()
            except Exception:
                pass
        return False
|
||||
|
||||
|
||||
def _send_mattermost(channel_cfg: dict, notif: Notification) -> bool:
    """Post *notif* to a Mattermost incoming webhook via ``mattermostdriver``.

    Args:
        channel_cfg: Needs ``host``, ``token`` (passed to ``call_webhook``)
            and ``channel``; ``username`` (default "hbd") and ``icon`` are
            optional.
        notif: Notification to deliver.

    Returns:
        True when the webhook call returns ``None`` or an empty string,
        False when the driver is missing, configuration is incomplete, or
        the call raises.
    """
    try:
        from mattermostdriver import Driver
    except ImportError:
        logger.error("mattermostdriver not installed")
        return False

    host = channel_cfg.get("host", "")
    token = channel_cfg.get("token", "")
    channel = channel_cfg.get("channel", "")
    if not (host and token and channel):
        logger.warning("mattermost: missing host, token, or channel")
        return False

    text = f"**{notif.title}**\n{notif.body}"
    if notif.url:
        text = text + f"\n[Plugin metrics] {notif.url}"

    # Connection settings are fixed here: plain http on port 8065.
    driver = Driver({"url": host, "scheme": "http", "basepath": "/api/v4", "port": 8065})

    payload: dict = {
        "text": text,
        "channel": channel,
        "username": channel_cfg.get("username", "hbd"),
    }
    icon_url = channel_cfg.get("icon")
    if icon_url:
        payload["icon_url"] = icon_url

    try:
        rc = driver.webhooks.call_webhook(token, payload)
    except Exception as err:
        logger.error("mattermost error: %s", err)
        return False
    return bool(rc is None or rc == "")
|
||||
|
||||
|
||||
def _send_signal(channel_cfg: dict, notif: Notification) -> bool:
    """Send *notif* by running ``signal-cli`` as a subprocess.

    Args:
        channel_cfg: Needs ``user`` and ``recipient``; ``cli_path`` defaults
            to ``/usr/local/bin/signal-cli``.
        notif: Notification to deliver; ``notif.url`` is appended when set.

    Returns:
        True when the command exits with status 0, False on missing
        configuration, non-zero exit, timeout, or any other exception.
    """
    cli = channel_cfg.get("cli_path", "/usr/local/bin/signal-cli")
    user = channel_cfg.get("user", "")
    recipient = channel_cfg.get("recipient", "")
    if not user or not recipient:
        logger.warning("signal: missing user or recipient")
        return False

    msg = f"{notif.title}\n{notif.body}"
    if notif.url:
        msg += f"\n{notif.url}"

    try:
        # The timeout kills a hung signal-cli so the calling thread is freed;
        # TimeoutExpired is handled by the except below.
        res = subprocess.run(
            [cli, "-u", user, "send", "-m", msg, recipient],
            capture_output=True,
            timeout=15,
        )
        if res.returncode != 0:
            # errors="replace": odd bytes in stderr must not mask the failure.
            logger.error("signal failed: %s", res.stderr.decode(errors="replace"))
            return False
        return True
    except Exception as e:
        logger.exception("signal exception: %s", e)
        return False
|
||||
|
||||
|
||||
async def _send_sms_voipms_async(channel_cfg: dict, notif: Notification) -> bool:
    """Send *notif* as an SMS through the voip.ms REST endpoint.

    The request is a multipart form-data POST.  The text is
    ``"<title>: <body>"`` cut to 160 characters (157 plus ``...``).

    Args:
        channel_cfg: Needs ``api_user``, ``api_password``, ``did`` and ``dst``.
        notif: Notification to deliver.

    Returns:
        True when the JSON reply has ``status == "success"``; False on
        missing configuration, a non-200 reply, any other status, or any
        exception.
    """
    import json
    import aiohttp

    api_user = channel_cfg.get("api_user", "")
    api_password = channel_cfg.get("api_password", "")
    did = channel_cfg.get("did", "")
    dst = channel_cfg.get("dst", "")
    if not (api_user and api_password and did and dst):
        logger.warning("sms_voipms: missing api_user, api_password, did, or dst")
        return False

    sms_text = f"{notif.title}: {notif.body}"
    if len(sms_text) > 160:
        sms_text = sms_text[:157] + "..."

    fields = {
        "api_username": api_user,
        "api_password": api_password,
        "method": "sendSMS",
        "did": did,
        "dst": dst,
        "message": sms_text,
    }

    try:
        async with aiohttp.ClientSession() as session:
            with aiohttp.MultipartWriter("form-data") as writer:
                for field_name, field_value in fields.items():
                    piece = writer.append(field_value)
                    piece.set_content_disposition("form-data", name=field_name)
                async with session.post("https://voip.ms/api/v1/rest.php", data=writer) as resp:
                    raw = await resp.text()
                    if resp.status != 200:
                        logger.error("sms_voipms HTTP %s: %s", resp.status, raw)
                        return False
                    reply = json.loads(raw)
                    if reply.get("status") != "success":
                        logger.error("sms_voipms error: %s", reply.get("status"))
                        return False
                    return True
    except Exception as err:
        logger.error("sms_voipms exception: %s", err)
        return False
|
||||
|
||||
|
||||
|
||||
|
||||
async def _send_matrix_async(channel_cfg: dict, notif: Notification) -> bool:
    """Send *notif* to a Matrix room using matrix-nio.

    Args:
        channel_cfg: Needs ``homeserver``, ``access_token`` and ``room_id``.
        notif: Notification to deliver; sent as plain text plus an HTML
            ``formatted_body``.

    Returns:
        True when the send response carries an ``event_id``; False when
        matrix-nio is missing, configuration is incomplete, the response has
        no ``event_id``, or an exception occurs.
    """
    import html as html_mod

    try:
        from nio import AsyncClient
    except ImportError:
        logger.error("matrix-nio not installed; pip install matrix-nio")
        return False

    homeserver = channel_cfg.get("homeserver", "")
    access_token = channel_cfg.get("access_token", "")
    room_id = channel_cfg.get("room_id", "")
    if not homeserver or not access_token or not room_id:
        logger.warning("matrix: missing homeserver, access_token, or room_id")
        return False

    text = f"{notif.title}\n{notif.body}"
    if notif.url:
        text += f"\n{notif.url}"

    # Escape everything that goes into the HTML variant so characters such
    # as "<" or "&" in a title/body cannot break or inject markup.
    formatted = (
        f"<strong>{html_mod.escape(str(notif.title))}</strong>"
        f"<br>{html_mod.escape(str(notif.body))}"
    )
    if notif.url:
        safe_url = html_mod.escape(str(notif.url), quote=True)
        formatted += f'<br><a href="{safe_url}">Plugin metrics</a>'

    client = AsyncClient(homeserver)
    client.access_token = access_token
    try:
        content = {
            "msgtype": "m.text",
            "body": text,
            "format": "org.matrix.custom.html",
            "formatted_body": formatted,
        }
        resp = await client.room_send(room_id, "m.room.message", content)
        if hasattr(resp, "event_id"):
            return True
        logger.error("matrix send failed: %s", resp)
        return False
    except Exception as e:
        logger.error("matrix exception: %s", e)
        return False
    finally:
        await client.close()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Channel dispatcher (all async — sync drivers run in a thread executor)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Sync drivers kept for `hbd notify` CLI usage (asyncio.run wraps them there).
|
||||
_DRIVERS = {
|
||||
"pushover": _send_pushover,
|
||||
"email": _send_email,
|
||||
"mattermost": _send_mattermost,
|
||||
"signal": _send_signal,
|
||||
}
|
||||
|
||||
_TIMEOUT = 15 # seconds per channel send
|
||||
|
||||
|
||||
async def _dispatch_to_channel(channel_name: str, channel_cfg: dict, notif: Notification) -> bool:
    """Deliver *notif* through one configured channel.

    Non-RECOVER notifications below the channel's ``min_level`` (default
    WARNING) are skipped and reported as True.  ``matrix`` and ``sms_voipms``
    are awaited directly; types listed in ``_DRIVERS`` run in a worker thread.
    Every send is limited to ``_TIMEOUT`` seconds.

    Returns:
        The driver's result, True for an intentionally filtered level, or
        False for an unknown channel type or a timeout.
    """
    # Drivers only need delivery settings, not ownership metadata.
    delivery_cfg = {
        key: value for key, value in channel_cfg.items() if key not in ("owner", "private")
    }

    level = notif.level.upper()
    if level != "RECOVER":
        min_level = delivery_cfg.get("min_level", "WARNING").upper()
        if _level_value(level) < _level_value(min_level):
            logger.debug(
                "channel '%s': skipping level %s (min_level=%s)", channel_name, level, min_level
            )
            return True  # filtered intentionally

    ch_type = delivery_cfg.get("type", "")
    if ch_type == "matrix":
        pending = _send_matrix_async(delivery_cfg, notif)
    elif ch_type == "sms_voipms":
        pending = _send_sms_voipms_async(delivery_cfg, notif)
    else:
        sync_driver = _DRIVERS.get(ch_type)
        if sync_driver is None:
            logger.warning("unknown channel type '%s' for channel '%s'", ch_type, channel_name)
            return False
        pending = asyncio.to_thread(sync_driver, delivery_cfg, notif)

    try:
        return await asyncio.wait_for(pending, timeout=_TIMEOUT)
    except asyncio.TimeoutError:
        logger.error("channel '%s' timed out after %ds", channel_name, _TIMEOUT)
        return False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Central dispatch function
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _build_url(host_name: str) -> str:
    """Return the alerts-page URL for *host_name*, or "" without a ``base_url``.

    The URL is ``<base_url>/alerts?filter=<host_name>``, with trailing slashes
    removed from ``base_url`` and the host name percent-encoded so characters
    such as ``&``, ``#`` or spaces cannot corrupt the query string.
    """
    from urllib.parse import quote

    # "or" also covers a base_url that is present but null in the config.
    base_url = (_config.get("base_url") or "").rstrip("/")
    if not base_url:
        return ""
    return f"{base_url}/alerts?filter={quote(str(host_name), safe='')}"
|
||||
|
||||
|
||||
async def send_notification(host_name: str, notif: Notification) -> dict:
    """Dispatch *notif* to the owner and managers of *host_name*.

    Each recipient's ``notification_channels`` are resolved against the
    global ``notification_channels`` config and each channel is used at most
    once.  Nothing is sent when users are disabled, the host is unknown, or
    it has no owner/managers.

    For RECOVER, when alerts were recorded for the host, the notification
    goes only to the channels that actually delivered an alert, and the
    record is cleared afterwards.

    Returns:
        Dict of ``{channel_name: bool}`` results.
    """
    from . import users as users_mod
    from . import hbdclass

    if not users_mod.users_enabled():
        return {}

    # Collect recipient usernames: owner + managers
    host = hbdclass.Host.hosts.get(host_name)
    if host is None:
        logger.debug("send_notification: host '%s' not found", host_name)
        return {}

    recipients: set[str] = set()
    owner = getattr(host, "owner", None)
    if owner:
        recipients.add(owner)
    # "or ()" tolerates a managers attribute that is present but None.
    for m in getattr(host, "managers", None) or ():
        recipients.add(m)

    if not recipients:
        logger.debug("send_notification: no owner/managers for '%s'", host_name)
        return {}

    # Fill url if not already set
    if not notif.url:
        notif.url = _build_url(host_name)

    global_channels: dict = _config.get("notification_channels", {}) or {}
    results: dict = {}
    level = notif.level.upper()
    is_alert = level in ("WARNING", "CRITICAL")
    is_recover = level in ("RECOVER",)

    # For RECOVER: send to every channel that previously fired an alert for
    # this host, regardless of that channel's min_level.
    if is_recover and host_name in _alerted_channels:
        for channel_name in list(_alerted_channels[host_name]):
            channel_cfg = global_channels.get(channel_name)
            if not channel_cfg:
                continue
            try:
                ok = await _dispatch_to_channel(channel_name, channel_cfg, notif)
                results[channel_name] = ok
                if ok:
                    logger.info("recover sent to channel '%s': %s", channel_name, notif.title)
            except Exception as e:
                logger.error("error sending recover to channel '%s': %s", channel_name, e)
                # Record the failure, as the regular path below does.
                results[channel_name] = False
        _alerted_channels.pop(host_name, None)
        return results

    for username in recipients:
        user = users_mod.get_user(username)
        if user is None:
            logger.debug("send_notification: user '%s' not found", username)
            continue
        for channel_name in user.notification_channels:
            if channel_name in results:
                continue
            channel_cfg = global_channels.get(channel_name)
            if not channel_cfg:
                logger.warning("channel '%s' not defined in notification_channels", channel_name)
                results[channel_name] = False
                continue
            try:
                ok = await _dispatch_to_channel(channel_name, channel_cfg, notif)
                results[channel_name] = ok
                if not ok:
                    logger.warning("failed to send notification to channel '%s'", channel_name)
                    continue
                # The dispatcher also returns True when it skipped the send
                # because the level is below the channel's min_level.  Such a
                # channel never alerted, so it must not be logged as sent or
                # remembered for the later RECOVER.
                filtered = level != "RECOVER" and _level_value(level) < _level_value(
                    channel_cfg.get("min_level", "WARNING")
                )
                if filtered:
                    continue
                logger.info("notification sent to channel '%s': %s", channel_name, notif.title)
                if is_alert:
                    _alerted_channels.setdefault(host_name, set()).add(channel_name)
            except Exception as e:
                logger.error("error sending to channel '%s': %s", channel_name, e)
                results[channel_name] = False

    return results
|
||||
@@ -0,0 +1,254 @@
|
||||
"""OAuth2 provider support.
|
||||
|
||||
Config shape (in ~/.hb.yaml):
|
||||
|
||||
oauth:
|
||||
my-gitea: # route slug → /login/oauth/my-gitea
|
||||
type: gitea # "gitea" | "github" | "nextcloud"
|
||||
# omit type to default to "gitea"
|
||||
url: https://git.example.com # required for gitea and nextcloud
|
||||
client_id: <client-id>
|
||||
client_secret: <client-secret>
|
||||
label: "Work Gitea" # optional display name on login button
|
||||
logo: https://example.com/logo.png # optional logo URL
|
||||
|
||||
github:
|
||||
type: github
|
||||
client_id: <client-id>
|
||||
client_secret: <client-secret>
|
||||
|
||||
nextcloud:
|
||||
type: nextcloud
|
||||
url: https://cloud.example.com
|
||||
client_id: <client-id>
|
||||
client_secret: <client-secret>
|
||||
|
||||
Register the OAuth app with each provider and set the redirect URI to:
|
||||
https://<hbd-host>/login/oauth/<name>/callback
|
||||
"""
|
||||
|
||||
import logging
|
||||
import secrets
|
||||
import time
|
||||
import urllib.parse
|
||||
from dataclasses import dataclass
|
||||
|
||||
import aiohttp
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
STATE_TTL = 600 # 10 minutes
|
||||
|
||||
# state_token -> expiry timestamp
|
||||
_states: dict[str, float] = {}
|
||||
|
||||
|
||||
def make_state() -> str:
    """Create a new CSRF state token and remember it for ``STATE_TTL`` seconds.

    Expired tokens are purged first.  The token is 32 random bytes rendered
    as hex; it is stored together with its expiry timestamp and returned.
    """
    _purge_states()
    state = secrets.token_hex(32)
    expires_at = time.time() + STATE_TTL
    _states[state] = expires_at
    return state
|
||||
|
||||
|
||||
def validate_state(state: str) -> bool:
    """Check a CSRF state token; the token is removed whether or not it is valid.

    Returns True only when *state* was stored and has not yet expired.
    """
    expires_at = _states.pop(state, None)
    return expires_at is not None and time.time() < expires_at
|
||||
|
||||
|
||||
def _purge_states() -> None:
    """Drop every stored CSRF state token whose expiry lies in the past."""
    cutoff = time.time()
    # Iterate over a snapshot so entries can be deleted while walking it.
    for token, expires_at in list(_states.items()):
        if expires_at < cutoff:
            del _states[token]
|
||||
|
||||
|
||||
class OAuthError(Exception):
    """Signals a failure at any step of the OAuth2 flow."""

    pass
|
||||
|
||||
|
||||
PROVIDER_DEFS: dict = {
|
||||
"gitea": {
|
||||
"authorize_url_tmpl": "{url}/login/oauth/authorize",
|
||||
"token_url_tmpl": "{url}/login/oauth/access_token",
|
||||
"profile_url_tmpl": "{url}/api/v1/user",
|
||||
"scope": "user:email",
|
||||
"field_map": {"username": "login", "full_name": "full_name", "avatar": "avatar_url"},
|
||||
"profile_data_path": [],
|
||||
"requires_url": True,
|
||||
"default_label": "Gitea",
|
||||
},
|
||||
"github": {
|
||||
"authorize_url_tmpl": "https://github.com/login/oauth/authorize",
|
||||
"token_url_tmpl": "https://github.com/login/oauth/access_token",
|
||||
"profile_url_tmpl": "https://api.github.com/user",
|
||||
"scope": "read:user",
|
||||
"field_map": {"username": "login", "full_name": "name", "avatar": "avatar_url"},
|
||||
"profile_data_path": [],
|
||||
"requires_url": False,
|
||||
"default_label": "GitHub",
|
||||
},
|
||||
"nextcloud": {
|
||||
"authorize_url_tmpl": "{url}/apps/oauth2/authorize",
|
||||
"token_url_tmpl": "{url}/apps/oauth2/api/v1/token",
|
||||
"profile_url_tmpl": "{url}/ocs/v2.php/cloud/user?format=json",
|
||||
"scope": "",
|
||||
"field_map": {"username": "id", "full_name": "display-name", "avatar": None},
|
||||
"profile_data_path": ["ocs", "data"],
|
||||
"requires_url": True,
|
||||
"default_label": "Nextcloud",
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
@dataclass
class ResolvedProvider:
    """One OAuth2 provider entry with its endpoint URLs and credentials filled in."""

    # Identity and presentation
    name: str
    type: str
    label: str
    logo: str
    # Endpoints
    authorize_url: str
    token_url: str
    profile_url: str
    # Request parameters and credentials
    scope: str
    client_id: str
    client_secret: str
    # How to read the profile response
    field_map: dict
    profile_data_path: list
|
||||
|
||||
|
||||
def get_providers(config: dict) -> list[ResolvedProvider]:
    """Return a ResolvedProvider for every valid entry in config['oauth'].

    An entry is skipped with a warning when its type is unknown, when
    ``client_id``/``client_secret`` is missing, or when the provider type
    requires ``url`` and none is set.  Entries that are not mappings are
    skipped silently.  A missing or null ``type`` means "gitea".  Order
    follows the order of ``config['oauth']``.
    """
    result = []
    oauth_cfg = config.get("oauth", {})
    if not isinstance(oauth_cfg, dict):
        return result

    for name, entry in oauth_cfg.items():
        if not isinstance(entry, dict):
            continue

        # "or" also covers keys that are present but null in the config
        # (e.g. an empty "type:" or "url:" line in YAML).
        provider_type = entry.get("type") or "gitea"
        defn = PROVIDER_DEFS.get(provider_type)
        if defn is None:
            logger.warning("OAuth: unknown provider type %r for %r, skipping", provider_type, name)
            continue

        client_id = entry.get("client_id", "")
        client_secret = entry.get("client_secret", "")
        if not client_id or not client_secret:
            logger.warning("OAuth: %r missing client_id or client_secret, skipping", name)
            continue

        url = str(entry.get("url") or "").rstrip("/")
        if defn["requires_url"] and not url:
            logger.warning("OAuth: %r requires url but none configured, skipping", name)
            continue

        label = entry.get("label") or defn["default_label"]
        logo = entry.get("logo") or ""
        result.append(ResolvedProvider(
            name=name,
            type=provider_type,
            label=label,
            logo=logo,
            authorize_url=defn["authorize_url_tmpl"].format(url=url),
            token_url=defn["token_url_tmpl"].format(url=url),
            profile_url=defn["profile_url_tmpl"].format(url=url),
            scope=defn["scope"],
            client_id=client_id,
            client_secret=client_secret,
            # Copies, so callers cannot mutate the shared definitions.
            field_map=dict(defn["field_map"]),
            profile_data_path=list(defn["profile_data_path"]),
        ))
    return result
|
||||
|
||||
|
||||
def is_enabled(config: dict) -> bool:
    """Tell whether OAuth login is usable.

    True as soon as ``get_providers(config)`` yields at least one provider,
    False otherwise.
    """
    providers = get_providers(config)
    return len(providers) > 0
|
||||
|
||||
|
||||
def build_auth_url(provider: ResolvedProvider, state: str, redirect_uri: str) -> str:
    """Build the authorization URL the browser is redirected to.

    The query string carries ``client_id``, ``redirect_uri``,
    ``response_type=code`` and ``state``; ``scope`` is appended only when the
    provider defines a non-empty scope.
    """
    query = [
        ("client_id", provider.client_id),
        ("redirect_uri", redirect_uri),
        ("response_type", "code"),
        ("state", state),
    ]
    if provider.scope:
        query.append(("scope", provider.scope))
    encoded = urllib.parse.urlencode(query)
    return provider.authorize_url + "?" + encoded
|
||||
|
||||
|
||||
async def exchange_code(provider: ResolvedProvider, code: str, redirect_uri: str) -> str:
    """Exchange an authorization *code* for an access token.

    POSTs the client credentials, *code* and *redirect_uri* to
    ``provider.token_url`` and returns the ``access_token`` field of the
    JSON reply.

    Args:
        provider: Resolved provider whose ``token_url``, ``client_id`` and
            ``client_secret`` are used.
        code: Authorization code received on the OAuth callback.
        redirect_uri: Redirect URI sent along with the token request.

    Returns:
        The access token string.

    Raises:
        OAuthError: On a non-200 reply, a network error, a timeout, a reply
            that is not a JSON object, or a reply without ``access_token``.
    """
    # Local import: only needed to name the timeout exception below.
    import asyncio

    payload = {
        "client_id": provider.client_id,
        "client_secret": provider.client_secret,
        "code": code,
        "grant_type": "authorization_code",
        "redirect_uri": redirect_uri,
    }
    timeout = aiohttp.ClientTimeout(total=10)
    try:
        async with aiohttp.ClientSession(timeout=timeout) as session:
            # NOTE(review): the payload is sent as a JSON body; RFC 6749
            # describes a form-encoded token request. Confirm that every
            # configured provider accepts JSON here.
            async with session.post(
                provider.token_url,
                json=payload,
                headers={"Accept": "application/json"},
            ) as resp:
                if resp.status != 200:
                    text = await resp.text()
                    raise OAuthError(f"Token exchange failed ({resp.status}): {text}")
                try:
                    data = await resp.json()
                except ValueError as exc:
                    # Body claimed to be JSON but could not be decoded.
                    raise OAuthError("Token exchange returned invalid JSON") from exc
    except asyncio.TimeoutError as exc:
        # The total timeout is not reported as an aiohttp.ClientError.
        raise OAuthError("Token exchange timed out") from exc
    except aiohttp.ClientError as exc:
        raise OAuthError(f"Token exchange network error: {exc}") from exc

    if not isinstance(data, dict):
        raise OAuthError(f"Unexpected token response: {data}")
    token = data.get("access_token")
    if not token:
        raise OAuthError(f"No access_token in response: {data}")
    return token
|
||||
|
||||
|
||||
async def fetch_user(provider: ResolvedProvider, token: str) -> dict:
    """Fetch the authenticated user's profile from the provider.

    Sends a GET to ``provider.profile_url`` with *token* as a Bearer
    credential, descends into the reply along ``provider.profile_data_path``
    and maps provider field names through ``provider.field_map``.

    Args:
        provider: Resolved provider describing the profile endpoint and the
            field mapping.
        token: Access token obtained from the token exchange.

    Returns:
        A dict with keys ``login``, ``full_name`` and ``avatar_url``.  A
        field that is missing or null in the reply is returned as ``""``.

    Raises:
        OAuthError: On a non-200 reply, a network error, a timeout, an
            undecodable JSON body, or an unexpected reply structure.
    """
    # Local import: only needed to name the timeout exception below.
    import asyncio

    timeout = aiohttp.ClientTimeout(total=10)
    try:
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(
                provider.profile_url,
                headers={
                    "Authorization": f"Bearer {token}",
                    "Accept": "application/json",
                },
            ) as resp:
                if resp.status != 200:
                    text = await resp.text()
                    raise OAuthError(f"User fetch failed ({resp.status}): {text}")
                try:
                    data = await resp.json()
                except ValueError as exc:
                    # Body claimed to be JSON but could not be decoded.
                    raise OAuthError("User fetch returned invalid JSON") from exc
    except asyncio.TimeoutError as exc:
        # The total timeout is not reported as an aiohttp.ClientError.
        raise OAuthError("User fetch timed out") from exc
    except aiohttp.ClientError as exc:
        raise OAuthError(f"User fetch network error: {exc}") from exc

    try:
        # Walk down to the object that actually holds the profile fields.
        for key in provider.profile_data_path:
            data = data.get(key, {})
        avatar_field = provider.field_map.get("avatar")
        # "or ''" turns JSON null values into empty strings.
        return {
            "login": data.get(provider.field_map["username"], "") or "",
            "full_name": data.get(provider.field_map["full_name"], "") or "",
            "avatar_url": (data.get(avatar_field, "") or "") if avatar_field else "",
        }
    except AttributeError as exc:
        # Raised when the reply (or a nested level) is not a mapping.
        raise OAuthError(f"Unexpected profile response structure from {provider.type}") from exc
|
||||
@@ -0,0 +1,498 @@
|
||||
"""Settings descriptor: maps config keys to display metadata.
|
||||
|
||||
``get_settings_sections(config)`` returns an ordered list of sections, each
|
||||
containing a list of field descriptors. The template iterates this structure
|
||||
generically, so adding editability later is a matter of:
|
||||
|
||||
1. Setting ``"editable": True`` on a field.
|
||||
2. Adding the matching ``<input>``/``<select>`` in the template
|
||||
(guided by ``"type"``).
|
||||
3. Wiring a POST handler in http.py.
|
||||
|
||||
Field descriptor keys
|
||||
---------------------
|
||||
key str Config key (for future form POST matching)
|
||||
label str Human-readable label
|
||||
description str One-line help text shown below the value
|
||||
value any Sanitized display value (secrets replaced with "•••")
|
||||
type str One of: text | number | port | boolean | path | duration |
|
||||
list | secret | size | select
|
||||
editable bool True for fields created with ``editable=True`` in get_settings_sections, False otherwise
|
||||
sensitive bool True when the raw value must never be shown
|
||||
"""
|
||||
|
||||
# Credential field names that should always be masked.
# Matching is by key name only, independent of the channel type.
# NOTE(review): the Pushover schema in CHANNEL_TYPE_SCHEMAS declares "user" as
# a secret, but "user" is not listed here; conversely "smtp_user" is listed
# although the e-mail schema types it as plain text. Confirm which is intended.
_SECRET_KEYS = frozenset({
    "password", "token", "user_key", "api_key", "secret",
    "smtp_password", "smtp_user", "api_password", "access_token",
})
|
||||
|
||||
# Schema of every known notification channel type, keyed by the channel's
# "type" value.  Each schema has a display "label" and an ordered "fields"
# list; every field gives its config "key", a display "label", a "type"
# ("text", "secret", "list" or "port" in this table) and whether it is
# "required".
CHANNEL_TYPE_SCHEMAS = {
    "pushover": {
        "label": "Pushover",
        "fields": [
            {"key": "token", "label": "App token", "type": "secret", "required": True},
            {"key": "user", "label": "User key", "type": "secret", "required": True},
            {"key": "sound", "label": "Sound", "type": "text", "required": False},
        ],
    },
    "email": {
        "label": "E-mail",
        "fields": [
            {"key": "recipients", "label": "Recipients (comma-separated)", "type": "list", "required": True},
            {"key": "sender", "label": "From address", "type": "text", "required": True},
            {"key": "smtp_server", "label": "SMTP server", "type": "text", "required": True},
            {"key": "smtp_port", "label": "SMTP port", "type": "port", "required": False},
            {"key": "smtp_user", "label": "SMTP username", "type": "text", "required": False},
            {"key": "smtp_password", "label": "SMTP password", "type": "secret", "required": False},
        ],
    },
    "signal": {
        "label": "Signal",
        "fields": [
            {"key": "user", "label": "Sender number", "type": "text", "required": True},
            {"key": "recipient", "label": "Recipient number", "type": "text", "required": True},
            {"key": "cli_path", "label": "signal-cli path", "type": "text", "required": False},
        ],
    },
    "matrix": {
        "label": "Matrix",
        "fields": [
            {"key": "homeserver", "label": "Homeserver URL", "type": "text", "required": True},
            {"key": "access_token", "label": "Access token", "type": "secret", "required": True},
            {"key": "room_id", "label": "Room ID", "type": "text", "required": True},
        ],
    },
    "sms_voipms": {
        "label": "SMS (voip.ms)",
        "fields": [
            {"key": "api_user", "label": "API username", "type": "text", "required": True},
            {"key": "api_password", "label": "API password", "type": "secret", "required": True},
            {"key": "did", "label": "DID (from)", "type": "text", "required": True},
            {"key": "dst", "label": "Destination", "type": "text", "required": True},
        ],
    },
    "mattermost": {
        "label": "Mattermost",
        "fields": [
            {"key": "host", "label": "Host", "type": "text", "required": True},
            {"key": "token", "label": "Webhook token", "type": "secret", "required": True},
            {"key": "channel", "label": "Channel", "type": "text", "required": True},
            {"key": "username", "label": "Bot username", "type": "text", "required": False},
            {"key": "icon", "label": "Icon URL", "type": "text", "required": False},
        ],
    },
}

# Channel type -> display label, derived from the schemas above.
_CHANNEL_TYPE_LABELS = {k: v["label"] for k, v in CHANNEL_TYPE_SCHEMAS.items()}
|
||||
|
||||
|
||||
def _mask(value):
|
||||
"""Return a masked placeholder for sensitive values."""
|
||||
if not value:
|
||||
return ""
|
||||
return "•••"
|
||||
|
||||
|
||||
def _fmt_size(n):
|
||||
"""Format a byte count as a human-readable string."""
|
||||
try:
|
||||
n = int(n)
|
||||
except (TypeError, ValueError):
|
||||
return str(n)
|
||||
for unit in ("B", "KB", "MB", "GB"):
|
||||
if n < 1024:
|
||||
return f"{n} {unit}"
|
||||
n //= 1024
|
||||
return f"{n} TB"
|
||||
|
||||
|
||||
def _fmt_duration(seconds):
|
||||
"""Format seconds into a human-readable duration string."""
|
||||
try:
|
||||
s = int(seconds)
|
||||
except (TypeError, ValueError):
|
||||
return str(seconds)
|
||||
if s < 60:
|
||||
return f"{s}s"
|
||||
if s < 3600:
|
||||
m, sec = divmod(s, 60)
|
||||
return f"{m}m {sec}s" if sec else f"{m}m"
|
||||
h, rem = divmod(s, 3600)
|
||||
m = rem // 60
|
||||
return f"{h}h {m}m" if m else f"{h}h"
|
||||
|
||||
|
||||
def _sanitize_channel(name, cfg):
    """Return a sanitized copy of a notification channel config.

    A value is replaced by the mask placeholder when its key is listed in
    ``_SECRET_KEYS`` or when the schema for the channel's ``type`` in
    ``CHANNEL_TYPE_SCHEMAS`` declares that field as ``"secret"``.  All other
    values are copied through unchanged.

    Args:
        name: Channel name.  Not used by this function; kept so existing
            callers keep working.
        cfg: Channel configuration mapping.

    Returns:
        A new dict with the same keys as *cfg*.
    """
    ch_type = cfg.get("type")
    # Only string types can be schema keys; anything else has no schema.
    schema = CHANNEL_TYPE_SCHEMAS.get(ch_type, {}) if isinstance(ch_type, str) else {}
    schema_secrets = {
        f["key"] for f in schema.get("fields", ()) if f.get("type") == "secret"
    }
    return {
        k: _mask(v) if (k in _SECRET_KEYS or k in schema_secrets) else v
        for k, v in cfg.items()
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def get_settings_sections(config: dict, threshold_checker=None) -> list:
    """Return ordered list of setting sections for the settings page.

    Args:
        config: Configuration mapping the values are read from.
        threshold_checker: Optional object exposing ``threshold_configs``,
            ``threshold_raw_configs`` and ``thresholds``; when ``None`` the
            thresholds section carries an empty config list.

    Each section:
        {
            "id":           str,
            "title":        str,
            "description":  str,
            "section_mode": str,          # "form", "channels", "hosts", "thresholds"
            "api_section":  str or None,
            "fields":       [ field_descriptor, ... ],
            # plus, depending on the section, one of:
            # "users", "providers", "channels", "hosts", "threshold_configs"
        }

    Each field_descriptor:
        {
            "key":         str,
            "label":       str,
            "description": str,
            "value":       display_value,
            "raw":         raw_config_value,   # None for sensitive
            "type":        str,
            "editable":    bool,
            "sensitive":   bool,
        }
    """
    def field(key, label, ftype, description="", editable=False, sensitive=False):
        """Build one field descriptor for config[key], formatted per *ftype*."""
        raw = config.get(key)
        if sensitive:
            display = _mask(raw)
            raw_out = None
        elif ftype == "size":
            display = _fmt_size(raw)
            raw_out = raw
        elif ftype == "duration":
            display = _fmt_duration(raw)
            raw_out = raw
        elif ftype == "boolean":
            display = bool(raw)
            raw_out = raw
        elif ftype == "list":
            val = raw or []
            if isinstance(val, list):
                display = val
            elif isinstance(val, (str, bytes)):
                # A single string is one item, not a sequence of characters.
                display = [val]
            else:
                try:
                    display = list(val)
                except TypeError:
                    # Non-iterable scalar: show it as a one-item list.
                    display = [val]
            raw_out = display
        else:
            display = raw if raw is not None else ""
            raw_out = raw
        return {
            "key": key,
            "label": label,
            "description": description,
            "value": display,
            "raw": raw_out,
            "type": ftype,
            "editable": editable,
            "sensitive": sensitive,
        }

    def _schema_secret_keys(ch_type):
        """Return the field keys the schema of *ch_type* declares as secret."""
        schema = CHANNEL_TYPE_SCHEMAS.get(ch_type) if isinstance(ch_type, str) else None
        if not schema:
            return frozenset()
        return frozenset(
            f["key"] for f in schema.get("fields", ()) if f.get("type") == "secret"
        )

    # ---- Notification channels (complex, built separately) ----------------
    _METADATA_KEYS = {"type", "owner", "private", "min_level"}
    notif_channels = []
    for ch_name, ch_cfg in sorted((config.get("notification_channels") or {}).items()):
        if not isinstance(ch_cfg, dict):
            continue
        ch_type = ch_cfg.get("type", "")
        # Mask by key name and by what the channel type's schema calls secret.
        schema_secrets = _schema_secret_keys(ch_type)
        fields = []
        for k, v in ch_cfg.items():
            if k in _METADATA_KEYS:
                continue
            sensitive = k in _SECRET_KEYS or k in schema_secrets
            if sensitive:
                shown = _mask(v)
            elif isinstance(v, list):
                # str() each item so non-string entries cannot break the join.
                shown = ", ".join(str(item) for item in v)
            else:
                shown = str(v)
            fields.append({
                "key": k,
                "label": k.replace("_", " ").title(),
                "value": shown,
                "sensitive": sensitive,
            })
        notif_channels.append({
            "name": ch_name,
            "type": ch_type,
            "type_label": _CHANNEL_TYPE_LABELS.get(ch_type, ch_type.title()),
            "owner": ch_cfg.get("owner"),
            "private": bool(ch_cfg.get("private", False)),
            "min_level": ch_cfg.get("min_level", "WARNING"),
            "fields": fields,
        })

    # ---- Users (show metadata only, never password hashes) ----------------
    users_list = []
    for username, attrs in (config.get("users") or {}).items():
        if not isinstance(attrs, dict):
            continue
        users_list.append({
            "username": username,
            "full_name": attrs.get("full_name", ""),
            "admin": bool(attrs.get("admin", False)),
            "avatar": attrs.get("avatar", ""),
            "notification_channels": attrs.get("notification_channels", []),
        })

    # ---- Threshold configurations -----------------------------------------
    def _tc_to_row(tc):
        """Flatten one threshold config object into a plain dict row."""
        return {
            "metric": tc.metric_path,
            "operator": tc.operator.value,
            "warning": tc.warning,
            "critical": tc.critical,
            "hysteresis": tc.hysteresis,
            "count": tc.count,
            "enabled": tc.enabled,
            "display": tc.display or "",
            "grace": tc.grace,
        }

    threshold_config_list = []
    if threshold_checker is not None:
        if threshold_checker.threshold_configs:
            for cfg_name, cfg_metrics in sorted(threshold_checker.threshold_configs.items()):
                # For the default config use the merged effective set;
                # for named overrides use only the explicitly defined metrics
                # (threshold_raw_configs) so inherited defaults are not repeated.
                if cfg_name == "default":
                    display_metrics = cfg_metrics
                else:
                    display_metrics = threshold_checker.threshold_raw_configs.get(cfg_name, cfg_metrics)
                metrics = sorted(
                    [_tc_to_row(tc) for tc in display_metrics.values()],
                    key=lambda m: m["metric"],
                )
                threshold_config_list.append({"name": cfg_name, "metrics": metrics})
        elif threshold_checker.thresholds:
            metrics = sorted(
                [_tc_to_row(tc) for tc in threshold_checker.thresholds.values()],
                key=lambda m: m["metric"],
            )
            threshold_config_list.append({"name": "default", "metrics": metrics})

    # ---- Hosts summary ----------------------------------------------------
    hosts_list = []
    for hname, hcfg in sorted((config.get("hosts") or {}).items()):
        if not isinstance(hcfg, dict):
            continue
        hosts_list.append({
            "name": hname,
            "watch": bool(hcfg.get("watch", True)),
            "dyndns": bool(hcfg.get("dyndns", False)),
            "owner": hcfg.get("owner", ""),
            "managers": hcfg.get("managers", []),
            "monitors": hcfg.get("monitors", []),
            # "threshold_config" may be a single name or a list of names.
            "threshold_configs": (
                list(v) if isinstance(v := hcfg.get("threshold_config"), list)
                else ([v] if v else [])
            ),
            "notification_channels": hcfg.get("notification_channels", []),
        })

    # ---- OAuth providers -------------------------------------------------------
    oauth_providers = []
    for pname, pattrs in (config.get("oauth") or {}).items():
        if not isinstance(pattrs, dict):
            continue
        oauth_providers.append({
            "name": pname,
            "type": pattrs.get("type", "gitea"),
            "url": pattrs.get("url", ""),
            "client_id": pattrs.get("client_id", ""),
            "client_secret": _mask(pattrs.get("client_secret", "")),
            "label": pattrs.get("label", ""),
            "logo": pattrs.get("logo", ""),
        })

    return [
        {
            "id": "network",
            "title": "Network",
            "description": "Ports and bind addresses for all server sockets.",
            "section_mode": "form",
            "api_section": "server",
            "fields": [
                field("hb_port", "Heartbeat UDP port", "port",
                      "UDP port the server listens on for heartbeat datagrams.", editable=True),
                field("hbd_host", "HTTP bind address", "text",
                      "Interface to bind the HTTP server to. Empty = all interfaces.", editable=True),
                field("hbd_port", "HTTP API port", "port",
                      "TCP port for the HTTP API and web UI.", editable=True),
                field("ws_port", "WebSocket port", "port",
                      "TCP port for the plain WebSocket server.", editable=True),
                field("wss_port", "Secure WebSocket port", "port",
                      "TCP port for WSS (TLS WebSocket). Leave empty to disable.", editable=True),
            ],
        },
        {
            "id": "tls",
            "title": "TLS / WebSocket Security",
            "description": "Certificate paths used when wss_port is set.",
            "section_mode": "form",
            "api_section": None,
            "fields": [
                field("cert_path", "Certificate directory", "path",
                      "Directory containing the TLS certificate and key files."),
                field("wss_pem", "Certificate file", "text",
                      "Filename of the TLS certificate chain (PEM format)."),
                field("wss_key", "Key file", "text",
                      "Filename of the TLS private key (PEM format)."),
            ],
        },
        {
            "id": "monitoring",
            "title": "Monitoring",
            "description": "Heartbeat timing and alert re-notification behaviour.",
            "section_mode": "form",
            "api_section": "server",
            "fields": [
                field("interval", "Heartbeat interval", "duration",
                      "Expected time between heartbeat messages from each client.", editable=True),
                field("grace", "Grace period", "number",
                      "Extra seconds to wait after a missed heartbeat before sending notifications.", editable=True),
                field("threshold_renotify_interval", "Re-notify interval", "duration",
                      "How often to re-send notifications for ongoing threshold alerts.", editable=True),
                field("autosave_interval", "Autosave interval", "duration",
                      "How often the server saves its state to disk."),
                field("base_url", "Base URL", "text",
                      "Base URL for notification links.", editable=True),
            ],
        },
        {
            "id": "persistence",
            "title": "Persistence & Logging",
            "description": "State file and event log settings.",
            "section_mode": "form",
            "api_section": "server",
            "fields": [
                field("pickfile", "State file", "path",
                      "Path to the pickle file used to persist host state across restarts.", editable=True),
                field("logfile", "Event log", "path",
                      "Path to the event log file.", editable=True),
            ],
        },
        {
            "id": "journal",
            "title": "Message Journal",
            "description": "All received heartbeat and plugin messages are journalled here.",
            "section_mode": "form",
            "api_section": "server",
            "fields": [
                field("journal_enabled", "Enabled", "boolean",
                      "Turn journalling on or off.", editable=True),
                field("journal_dir", "Journal directory", "path",
                      "Directory where journal files are written.", editable=True),
                field("journal_file", "Journal filename", "text",
                      "Base filename for the journal (rotated copies get a numeric suffix)."),
                field("journal_max_size", "Max file size", "size",
                      "Rotate the journal when it exceeds this size.", editable=True),
                field("journal_max_backups", "Backup count", "number",
                      "Number of rotated journal files to keep.", editable=True),
            ],
        },
        {
            "id": "dns",
            "title": "Dynamic DNS",
            "description": "nsupdate-based DNS registration via nsupdate(8).",
            "section_mode": "form",
            "api_section": "dns",
            "fields": [
                field("nsupdate_bin", "nsupdate binary", "path",
                      "Path to the nsupdate binary.", editable=True),
                field("rndc_key", "RNDC key file", "path",
                      "Path to the rndc key file used to authenticate DNS updates.", editable=True),
                field("dyndomains", "Dynamic domains", "list",
                      "Domains updated via nsupdate when a host with dyndns: true reports in.",
                      editable=True),
            ],
        },
        {
            "id": "users",
            "title": "Users",
            "description": "Accounts defined in the config file. Password hashes are never shown.",
            "section_mode": "form",
            "api_section": "users",
            "users": users_list,
            "fields": [
                field("default_owner", "Default owner", "text",
                      "Username that owns hosts with no explicit owner. "
                      "Falls back to the first admin user.", editable=True),
            ],
        },
        {
            "id": "oauth",
            "title": "OAuth Providers",
            "description": "OAuth2 login providers. Client secrets are masked.",
            "section_mode": "form",
            "api_section": "oauth",
            "providers": oauth_providers,
            "fields": [],
        },
        {
            "id": "channels",
            "title": "Notification Channels",
            "description": "Named notification providers. Credentials are masked.",
            "section_mode": "channels",
            "api_section": "notification_channels",
            "channels": notif_channels,
            "fields": [
                field("default_notification_channels", "Default channels", "list",
                      "Channels used when a host does not specify its own."),
            ],
        },
        {
            "id": "hosts",
            "title": "Hosts",
            "description": "Host definitions loaded from the config file.",
            "section_mode": "hosts",
            "api_section": "hosts",
            "hosts": hosts_list,
            "fields": [],
        },
        {
            "id": "thresholds",
            "title": "Threshold Configurations",
            "description": "Named alert threshold sets. Each defines warning/critical levels per metric.",
            "section_mode": "thresholds",
            "api_section": "thresholds",
            "threshold_configs": threshold_config_list,
            "fields": [
                field("default_threshold_config", "Default config", "text",
                      "Threshold config used for hosts with no explicit mapping.", editable=True),
            ],
        },
        {
            "id": "runtime",
            "title": "Runtime",
            "description": "Flags set at startup (require restart to change).",
            "section_mode": "form",
            "api_section": None,
            "fields": [
                field("foreground", "Foreground mode", "boolean",
                      "Run in the foreground instead of daemonising."),
                field("verbose", "Verbose logging", "boolean",
                      "Enable verbose log output."),
                field("debug", "Debug level", "number",
                      "0 = off. Higher values increase log verbosity."),
            ],
        },
    ]
|
||||
|
||||
|
||||
def get_settings_data(config: dict, threshold_checker=None) -> dict:
    """Bundle everything the settings template needs.

    Besides the section list from ``get_settings_sections`` the result holds
    the sorted names of all configured notification channels, users and
    threshold configs.
    """
    def _sorted_names(section_key):
        # A missing or empty section counts as having no names.
        return sorted((config.get(section_key) or {}).keys())

    return {
        "sections": get_settings_sections(config, threshold_checker=threshold_checker),
        "all_channel_names": _sorted_names("notification_channels"),
        "all_usernames": _sorted_names("users"),
        "all_threshold_configs": _sorted_names("threshold_configs"),
    }
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 181 KiB |
@@ -140,3 +140,68 @@
|
||||
float: left;
|
||||
}
|
||||
|
||||
/* ── Responsive / mobile ── */
|
||||
|
||||
/* Suppress the global transition on mobile to avoid sluggish feel */
|
||||
@media (max-width: 640px) {
|
||||
* { transition: none !important; }
|
||||
|
||||
html, body {
|
||||
overflow: auto;
|
||||
height: auto;
|
||||
font-size: 16px; /* prevent iOS auto-zoom on inputs */
|
||||
}
|
||||
|
||||
/* Pages that use flex-column full-viewport layout need to relax on mobile */
|
||||
body[style*="height: 100vh"],
|
||||
body {
|
||||
height: auto !important;
|
||||
min-height: 100vh;
|
||||
}
|
||||
|
||||
/* Containers: full width, no fixed heights */
|
||||
.container {
|
||||
max-width: 100% !important;
|
||||
max-height: none !important;
|
||||
overflow: visible !important;
|
||||
padding: 8px !important;
|
||||
}
|
||||
|
||||
/* Log section: fixed reasonable height instead of flex-grow */
|
||||
.log-section {
|
||||
flex: none !important;
|
||||
max-height: 40vh !important;
|
||||
overflow-y: auto !important;
|
||||
}
|
||||
|
||||
/* Table section: allow vertical scroll, cap height */
|
||||
.table-section {
|
||||
max-height: 55vh !important;
|
||||
overflow-y: auto !important;
|
||||
overflow-x: auto !important;
|
||||
padding: 8px !important;
|
||||
}
|
||||
|
||||
/* Slightly larger tap targets in tables */
|
||||
#ntable td, #ntable th {
|
||||
padding: 4px 6px !important;
|
||||
font-size: 1.00em !important;
|
||||
}
|
||||
|
||||
/* Cards on plugin/alerts pages */
|
||||
.host-card, .alert-card, .card {
|
||||
padding: 10px !important;
|
||||
margin-bottom: 8px !important;
|
||||
}
|
||||
|
||||
/* Settings page tables */
|
||||
table { width: 100%; }
|
||||
|
||||
h1 { font-size: 1.2em !important; }
|
||||
h2 { font-size: 1em !important; }
|
||||
}
|
||||
|
||||
/* Suppress nav-username text on very narrow screens — avatar/initials is enough */
|
||||
@media (max-width: 400px) {
|
||||
.nav-username { display: none; }
|
||||
}
|
||||
@@ -0,0 +1,212 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
{% include 'head.html' %}
|
||||
|
||||
<style>
|
||||
html, body { overflow: visible; }
|
||||
|
||||
.container {
|
||||
max-width: 700px;
|
||||
margin: 0 auto;
|
||||
}
|
||||
|
||||
h1 {
|
||||
color: #333;
|
||||
margin-bottom: 4px;
|
||||
font-size: 1.5em;
|
||||
}
|
||||
|
||||
.subtitle {
|
||||
color: #666;
|
||||
margin-bottom: 24px;
|
||||
font-size: 0.9em;
|
||||
}
|
||||
|
||||
.section {
|
||||
background: #fff;
|
||||
border-radius: 8px;
|
||||
box-shadow: 0 1px 6px rgba(0,0,0,0.1);
|
||||
padding: 20px 24px;
|
||||
margin-bottom: 20px;
|
||||
}
|
||||
|
||||
.section h2 {
|
||||
font-size: 1em;
|
||||
font-weight: 700;
|
||||
color: #333;
|
||||
margin: 0 0 16px;
|
||||
padding-bottom: 10px;
|
||||
border-bottom: 1px solid #eee;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.5px;
|
||||
}
|
||||
|
||||
.info-row {
|
||||
display: flex;
|
||||
align-items: baseline;
|
||||
padding: 8px 0;
|
||||
border-bottom: 1px solid #f5f5f5;
|
||||
font-size: 0.9em;
|
||||
}
|
||||
.info-row:last-child { border-bottom: none; }
|
||||
|
||||
.info-label {
|
||||
width: 160px;
|
||||
flex-shrink: 0;
|
||||
color: #666;
|
||||
font-size: 0.88em;
|
||||
}
|
||||
|
||||
.info-value {
|
||||
color: #222;
|
||||
word-break: break-all;
|
||||
}
|
||||
|
||||
.info-value a {
|
||||
color: #0066cc;
|
||||
text-decoration: none;
|
||||
}
|
||||
.info-value a:hover { text-decoration: underline; }
|
||||
|
||||
.version-badge {
|
||||
display: inline-block;
|
||||
padding: 3px 12px;
|
||||
background: #e8f0fe;
|
||||
color: #1a73e8;
|
||||
border-radius: 12px;
|
||||
font-size: 1.00em;
|
||||
font-weight: 600;
|
||||
font-family: monospace;
|
||||
}
|
||||
|
||||
.hb-logo {
|
||||
font-size: 2.5em;
|
||||
font-weight: 700;
|
||||
color: #0066cc;
|
||||
letter-spacing: -1px;
|
||||
margin-bottom: 6px;
|
||||
}
|
||||
|
||||
.hb-tagline {
|
||||
color: #555;
|
||||
font-size: 0.95em;
|
||||
}
|
||||
|
||||
.logo-section {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 20px;
|
||||
padding: 8px 0 4px;
|
||||
}
|
||||
|
||||
.logo-text { flex: 1; }
|
||||
|
||||
/* ── Dark mode ── */
|
||||
html[data-theme="dark"] h1 { color: var(--text); }
|
||||
html[data-theme="dark"] .subtitle { color: var(--text-sec); }
|
||||
html[data-theme="dark"] .section { background: var(--surface); box-shadow: 0 1px 6px var(--shadow); }
|
||||
html[data-theme="dark"] .section h2 { color: var(--text); border-bottom-color: var(--border); }
|
||||
html[data-theme="dark"] .info-row { border-bottom-color: var(--border-4); }
|
||||
html[data-theme="dark"] .info-label { color: var(--text-sec); }
|
||||
html[data-theme="dark"] .info-value { color: var(--text); }
|
||||
html[data-theme="dark"] .info-value a { color: var(--link); }
|
||||
html[data-theme="dark"] .hb-logo { color: var(--link); }
|
||||
html[data-theme="dark"] .hb-tagline { color: var(--text-sec); }
|
||||
html[data-theme="dark"] .version-badge { background: #1a3255; color: #60a5fa; }
|
||||
</style>
|
||||
|
||||
<body>
|
||||
{% include 'nav.html' %}
|
||||
|
||||
<div class="container">
|
||||
<h1>{{ header }}</h1>
|
||||
<p class="subtitle">Heartbeat monitoring system</p>
|
||||
|
||||
<div class="section">
|
||||
<div class="logo-section">
|
||||
<div class="logo-text">
|
||||
<div class="hb-logo">Heartbeat</div>
|
||||
<div class="hb-tagline">Lightweight host monitoring over UDP</div>
|
||||
</div>
|
||||
<span class="version-badge">v{{ hbd_version }}</span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="section">
|
||||
<h2>Version</h2>
|
||||
<div class="info-row">
|
||||
<span class="info-label">Server version</span>
|
||||
<span class="info-value">{{ hbd_version }}</span>
|
||||
</div>
|
||||
<div class="info-row">
|
||||
<span class="info-label">Python</span>
|
||||
<span class="info-value">{{ python_version }}</span>
|
||||
</div>
|
||||
<div class="info-row">
|
||||
<span class="info-label">License</span>
|
||||
<span class="info-value">MIT</span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="section">
|
||||
<h2>Runtime</h2>
|
||||
<div class="info-row">
|
||||
<span class="info-label">Host</span>
|
||||
<span class="info-value">{{ server_hostname }}</span>
|
||||
</div>
|
||||
<div class="info-row">
|
||||
<span class="info-label">Started</span>
|
||||
<span class="info-value">{{ start_time_str }}</span>
|
||||
</div>
|
||||
<div class="info-row">
|
||||
<span class="info-label">Uptime</span>
|
||||
<span class="info-value" id="uptime-value">{{ uptime_str }}</span>
|
||||
</div>
|
||||
<div class="info-row">
|
||||
<span class="info-label">Hosts monitored</span>
|
||||
<span class="info-value">{{ host_count }}</span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="section">
|
||||
<h2>Contact & Source</h2>
|
||||
<div class="info-row">
|
||||
<span class="info-label">Author</span>
|
||||
<span class="info-value">Andreas Wrede</span>
|
||||
</div>
|
||||
<div class="info-row">
|
||||
<span class="info-label">Email</span>
|
||||
<span class="info-value"><a href="mailto:aew.hbd@wrede.ca">aew.hbd@wrede.ca</a></span>
|
||||
</div>
|
||||
<div class="info-row">
|
||||
<span class="info-label">Repository</span>
|
||||
<span class="info-value"><a href="https://git.wrede.ca/andreas/heartbeat" target="_blank" rel="noopener">git.wrede.ca/andreas/heartbeat</a></span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
<script>
|
||||
(function() {
|
||||
var startEpoch = {{ start_epoch }};
|
||||
var el = document.getElementById('uptime-value');
|
||||
if (!el) return;
|
||||
function fmt(s) {
|
||||
var d = Math.floor(s / 86400);
|
||||
var h = Math.floor((s % 86400) / 3600);
|
||||
var m = Math.floor((s % 3600) / 60);
|
||||
var sec = s % 60;
|
||||
if (d > 0) return d + 'd ' + h + 'h ' + m + 'm';
|
||||
if (h > 0) return h + 'h ' + m + 'm ' + sec + 's';
|
||||
return m + 'm ' + sec + 's';
|
||||
}
|
||||
function tick() {
|
||||
var up = Math.floor(Date.now() / 1000 - startEpoch);
|
||||
el.textContent = fmt(up);
|
||||
}
|
||||
tick();
|
||||
setInterval(tick, 1000);
|
||||
})();
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
@@ -0,0 +1,623 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
{% include 'head.html' %}
|
||||
|
||||
<style>
|
||||
|
||||
html, body {
|
||||
height: auto;
|
||||
overflow-y: auto;
|
||||
}
|
||||
|
||||
.container {
|
||||
max-width: 1400px;
|
||||
margin: 0 auto;
|
||||
}
|
||||
|
||||
h1 { color: #333; margin-bottom: 5px; margin-top: 15px; font-size: 1.5em; }
|
||||
|
||||
.subtitle {
|
||||
color: #666;
|
||||
margin-bottom: 30px;
|
||||
}
|
||||
|
||||
.summary-cards {
|
||||
display: flex;
|
||||
flex-wrap: wrap;
|
||||
gap: 10px;
|
||||
margin-bottom: 16px;
|
||||
}
|
||||
|
||||
.summary-card {
|
||||
background: white;
|
||||
border-radius: 6px;
|
||||
padding: 6px 14px;
|
||||
box-shadow: 0 1px 4px rgba(0,0,0,0.1);
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 8px;
|
||||
border-left: 4px solid #ddd;
|
||||
}
|
||||
|
||||
.summary-card.critical { border-left-color: #ea1e0f; }
|
||||
.summary-card.warning { border-left-color: #ff9800; }
|
||||
.summary-card.ok { border-left-color: #4caf50; }
|
||||
|
||||
.summary-number {
|
||||
font-size: 1.4em;
|
||||
font-weight: bold;
|
||||
line-height: 1;
|
||||
}
|
||||
|
||||
.summary-number.critical { color: #ea1e0f; }
|
||||
.summary-number.warning { color: #ff9800; }
|
||||
.summary-number.ok { color: #4caf50; }
|
||||
|
||||
.summary-label {
|
||||
color: #666;
|
||||
font-size: 1.00em;
|
||||
}
|
||||
|
||||
.filters {
|
||||
background: white;
|
||||
border-radius: 8px;
|
||||
padding: 15px;
|
||||
margin-bottom: 20px;
|
||||
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
|
||||
display: flex;
|
||||
gap: 15px;
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
.filter-label {
|
||||
font-weight: bold;
|
||||
color: #555;
|
||||
}
|
||||
|
||||
.filter-button {
|
||||
padding: 8px 16px;
|
||||
border: 2px solid #ddd;
|
||||
background: white;
|
||||
border-radius: 20px;
|
||||
cursor: pointer;
|
||||
transition: all 0.2s;
|
||||
font-size: 0.9em;
|
||||
}
|
||||
|
||||
.filter-button:hover {
|
||||
border-color: #2196f3;
|
||||
}
|
||||
|
||||
.filter-button.active {
|
||||
background: #2196f3;
|
||||
color: white;
|
||||
border-color: #2196f3;
|
||||
}
|
||||
|
||||
.filter-input {
|
||||
padding: 7px 12px;
|
||||
border: 2px solid #ddd;
|
||||
border-radius: 20px;
|
||||
font-size: 0.9em;
|
||||
outline: none;
|
||||
width: 200px;
|
||||
transition: border-color 0.2s;
|
||||
}
|
||||
|
||||
.filter-input:focus {
|
||||
border-color: #2196f3;
|
||||
}
|
||||
|
||||
.filter-input.invalid {
|
||||
border-color: #f44336;
|
||||
}
|
||||
|
||||
.alerts-container {
|
||||
background: white;
|
||||
border-radius: 8px;
|
||||
padding: 20px;
|
||||
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
|
||||
}
|
||||
|
||||
.alert-item {
|
||||
border-left: 5px solid #ddd;
|
||||
padding: 15px;
|
||||
margin-bottom: 15px;
|
||||
background: #fafafa;
|
||||
border-radius: 4px;
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
transition: all 0.2s;
|
||||
}
|
||||
|
||||
.alert-item.acknowledged {
|
||||
opacity: 0.8;
|
||||
background: #f0f0f0;
|
||||
}
|
||||
|
||||
.alert-item:hover {
|
||||
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
|
||||
transform: translateX(5px);
|
||||
}
|
||||
|
||||
.alert-item.critical {
|
||||
border-left-color: #f44336;
|
||||
background: #ffebee;
|
||||
}
|
||||
|
||||
.alert-item.warning {
|
||||
border-left-color: #ff9800;
|
||||
background: #fff3e0;
|
||||
}
|
||||
|
||||
.alert-item.unknown {
|
||||
border-left-color: #9e9e9e;
|
||||
background: #f5f5f5;
|
||||
}
|
||||
|
||||
.alert-main {
|
||||
flex: 1;
|
||||
}
|
||||
|
||||
.alert-header {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 15px;
|
||||
margin-bottom: 8px;
|
||||
}
|
||||
|
||||
.alert-level {
|
||||
padding: 4px 12px;
|
||||
border-radius: 12px;
|
||||
font-size: 0.75em;
|
||||
font-weight: bold;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.5px;
|
||||
}
|
||||
|
||||
.alert-level.critical {
|
||||
background: #f44336;
|
||||
color: white;
|
||||
}
|
||||
|
||||
.alert-level.warning {
|
||||
background: #ff9800;
|
||||
color: white;
|
||||
}
|
||||
|
||||
.alert-level.unknown {
|
||||
background: #9e9e9e;
|
||||
color: white;
|
||||
}
|
||||
|
||||
.alert-hostname {
|
||||
font-weight: bold;
|
||||
color: #0066cc;
|
||||
font-size: 1.1em;
|
||||
text-decoration: none;
|
||||
}
|
||||
.alert-hostname:hover {
|
||||
text-decoration: underline;
|
||||
}
|
||||
|
||||
.alert-metric {
|
||||
color: #0066cc;
|
||||
font-size: 1.1em;
|
||||
font-weight: normal;
|
||||
}
|
||||
|
||||
.alert-details {
|
||||
display: flex;
|
||||
gap: 20px;
|
||||
color: #666;
|
||||
font-size: 0.9em;
|
||||
}
|
||||
|
||||
.alert-value {
|
||||
font-weight: bold;
|
||||
color: #333;
|
||||
}
|
||||
|
||||
.alert-duration {
|
||||
color: #999;
|
||||
font-size: 1.00em;
|
||||
}
|
||||
|
||||
.alert-actions {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 8px;
|
||||
margin-left: 15px;
|
||||
}
|
||||
|
||||
.acknowledge-btn {
|
||||
padding: 8px 16px;
|
||||
background: #2196f3;
|
||||
color: white;
|
||||
border: none;
|
||||
border-radius: 4px;
|
||||
cursor: pointer;
|
||||
font-size: 1.00em;
|
||||
transition: all 0.2s;
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
.acknowledge-btn:hover {
|
||||
background: #1976d2;
|
||||
transform: scale(1.05);
|
||||
}
|
||||
|
||||
.acknowledge-btn:disabled {
|
||||
background: #ccc;
|
||||
cursor: not-allowed;
|
||||
transform: none;
|
||||
}
|
||||
|
||||
.acknowledged-badge {
|
||||
padding: 4px 8px;
|
||||
background: #4caf50;
|
||||
color: white;
|
||||
border-radius: 4px;
|
||||
font-size: 0.75em;
|
||||
text-align: center;
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
.no-alerts {
|
||||
text-align: center;
|
||||
padding: 60px 20px;
|
||||
color: #999;
|
||||
}
|
||||
|
||||
.no-alerts-icon {
|
||||
font-size: 4em;
|
||||
margin-bottom: 20px;
|
||||
}
|
||||
|
||||
.loading {
|
||||
text-align: center;
|
||||
padding: 40px;
|
||||
color: #666;
|
||||
}
|
||||
|
||||
.error {
|
||||
background: #ffebee;
|
||||
border-left: 4px solid #f44336;
|
||||
padding: 20px;
|
||||
margin: 20px 0;
|
||||
border-radius: 4px;
|
||||
color: #c62828;
|
||||
}
|
||||
|
||||
.refresh-info {
|
||||
text-align: center;
|
||||
color: #999;
|
||||
font-size: 1.00em;
|
||||
margin-top: 20px;
|
||||
padding-top: 20px;
|
||||
border-top: 1px solid #e0e0e0;
|
||||
}
|
||||
|
||||
.last-update {
|
||||
color: #666;
|
||||
font-size: 0.9em;
|
||||
text-align: right;
|
||||
margin-bottom: 15px;
|
||||
}
|
||||
|
||||
/* ── Dark mode ── */
|
||||
html[data-theme="dark"] h1 { color: var(--text); }
|
||||
html[data-theme="dark"] .subtitle { color: var(--text-sec); }
|
||||
html[data-theme="dark"] .summary-card { background: var(--surface); }
|
||||
html[data-theme="dark"] .summary-label { color: var(--text-sec); }
|
||||
html[data-theme="dark"] .filters { background: var(--surface); }
|
||||
html[data-theme="dark"] .filter-label { color: var(--text-sec); }
|
||||
html[data-theme="dark"] .filter-button { background: var(--surface-2); border-color: var(--border); color: var(--text); }
|
||||
html[data-theme="dark"] .filter-button.active { background: #2196f3; color: #fff; border-color: #2196f3; }
|
||||
html[data-theme="dark"] .filter-input { background: var(--input-bg); border-color: var(--input-border); color: var(--text); }
|
||||
html[data-theme="dark"] .alerts-container { background: var(--surface); }
|
||||
html[data-theme="dark"] .alert-item { background: var(--surface-2); }
|
||||
html[data-theme="dark"] .alert-item.acknowledged { background: var(--surface-3); }
|
||||
html[data-theme="dark"] .alert-item.critical { background: #2e0a0a; border-left-color: #f44336; }
|
||||
html[data-theme="dark"] .alert-item.warning { background: #2e1a00; border-left-color: #ff9800; }
|
||||
html[data-theme="dark"] .alert-item.unknown { background: var(--surface-2); }
|
||||
html[data-theme="dark"] .alert-hostname { color: var(--link); }
|
||||
html[data-theme="dark"] .alert-details { color: var(--text-sec); }
|
||||
html[data-theme="dark"] .alert-value { color: var(--text); }
|
||||
html[data-theme="dark"] .alert-duration { color: var(--text-muted); }
|
||||
html[data-theme="dark"] .last-update { color: var(--text-sec); }
|
||||
html[data-theme="dark"] .refresh-info { color: var(--text-muted); border-top-color: var(--border); }
|
||||
html[data-theme="dark"] .no-alerts,
|
||||
html[data-theme="dark"] .loading { color: var(--text-muted); }
|
||||
</style>
|
||||
|
||||
<body>
|
||||
{% include 'nav.html' %}
|
||||
|
||||
<div class="container">
|
||||
<h1>{{ header }}</h1>
|
||||
<p class="subtitle">Real-time monitoring alerts and threshold violations</p>
|
||||
|
||||
<div class="summary-cards" id="summary-cards">
|
||||
<div class="summary-card critical">
|
||||
<div class="summary-label">Critical</div>
|
||||
<div class="summary-number critical" id="critical-count">-</div>
|
||||
</div>
|
||||
<div class="summary-card warning">
|
||||
<div class="summary-label">Warning</div>
|
||||
<div class="summary-number warning" id="warning-count">-</div>
|
||||
</div>
|
||||
<div class="summary-card ok">
|
||||
<div class="summary-label">Total Hosts</div>
|
||||
<div class="summary-number ok" id="host-count">-</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="filters">
|
||||
<span class="filter-label">Show:</span>
|
||||
<button class="filter-button active" onclick="filterAlerts('all')">All</button>
|
||||
<button class="filter-button" onclick="filterAlerts('critical')">Critical Only</button>
|
||||
<button class="filter-button" onclick="filterAlerts('warning')">Warning Only</button>
|
||||
<input id="host-filter" class="filter-input" type="text" placeholder="host filter (regex)" oninput="onHostFilterInput(this)">
|
||||
</div>
|
||||
|
||||
<div class="alerts-container">
|
||||
<div class="last-update">Last updated: <span id="last-update-time">Never</span></div>
|
||||
<div id="alerts-list">
|
||||
<div class="loading">Loading alerts...</div>
|
||||
</div>
|
||||
<div class="refresh-info">
|
||||
Auto-refreshing every 15 seconds
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
let currentFilter = 'all';
|
||||
let allAlerts = [];
|
||||
let hostFilterRe = null;
|
||||
|
||||
/**
 * Fetch the current alerts from /api/0/alerts, refresh the summary cards and
 * the "last updated" stamp, then re-render the alert list.
 * On any failure (network, HTTP status, bad JSON) an error box replaces the list.
 */
async function loadAlerts() {
    try {
        const response = await fetch('/api/0/alerts');
        if (!response.ok) {
            throw new Error(`HTTP ${response.status}`);
        }

        const data = await response.json();
        // Tolerate a payload with missing sections so later re-renders
        // (filter clicks, host filter input) never see an undefined list.
        const summary = data.summary || {};
        allAlerts = Array.isArray(data.alerts) ? data.alerts : [];

        // Update summary cards
        document.getElementById('critical-count').textContent = summary.critical || 0;
        document.getElementById('warning-count').textContent = summary.warning || 0;
        document.getElementById('host-count').textContent = data.host_count || 0;

        // Update last update time
        document.getElementById('last-update-time').textContent = new Date().toLocaleTimeString();

        // Render alerts
        renderAlerts(allAlerts);
    } catch (error) {
        // Use textContent so the error message is never parsed as HTML.
        const box = document.createElement('div');
        box.className = 'error';
        box.textContent = `Failed to load alerts: ${error.message}`;
        const list = document.getElementById('alerts-list');
        list.innerHTML = '';
        list.appendChild(box);
    }
}
|
||||
|
||||
/**
 * Render the given alerts into #alerts-list, applying the active level
 * filter (currentFilter) and the optional host regex (hostFilterRe).
 * Shows a placeholder when nothing is left to display.
 */
function renderAlerts(alerts) {
    const container = document.getElementById('alerts-list');
    const source = alerts || [];

    // Filter alerts based on current level filter and host regex
    let filteredAlerts = source;
    if (currentFilter !== 'all') {
        filteredAlerts = filteredAlerts.filter(alert =>
            alert.level.toLowerCase() === currentFilter
        );
    }
    if (hostFilterRe) {
        filteredAlerts = filteredAlerts.filter(alert => hostFilterRe.test(alert.hostname));
    }

    if (filteredAlerts.length === 0) {
        if (currentFilter === 'all' && source.length === 0) {
            container.innerHTML = `
                <div class="no-alerts">
                    <div class="no-alerts-icon">✓</div>
                    <h2>All Systems Normal</h2>
                    <p>No active alerts at this time</p>
                </div>
            `;
        } else {
            // With the level filter on "all", only the host filter can have
            // emptied the list; avoid the nonsensical "No all alerts".
            const message = currentFilter === 'all'
                ? 'No alerts match the host filter'
                : `No ${currentFilter} alerts`;
            container.innerHTML = `
                <div class="no-alerts">
                    <p>${message}</p>
                </div>
            `;
        }
        return;
    }

    container.innerHTML = filteredAlerts.map(alert => renderAlert(alert)).join('');
}
|
||||
|
||||
/**
 * Build the HTML for one alert row: level badge, host link, metric name,
 * current value with threshold details, active duration, and either an
 * "Acknowledged" badge or an Acknowledge button.
 *
 * All alert-supplied text is HTML-escaped before being placed in the markup.
 * NOTE(review): formatted_message is treated as plain text here; confirm the
 * API never sends intentional markup in that field.
 *
 * @param {Object} alert - alert record as returned by the alerts API.
 * @returns {string} HTML fragment for the row.
 */
function renderAlert(alert) {
    // Escape text for HTML text nodes and double-quoted attribute values.
    function esc(text) {
        return String(text)
            .replace(/&/g, '&amp;')
            .replace(/</g, '&lt;')
            .replace(/>/g, '&gt;')
            .replace(/"/g, '&quot;')
            .replace(/'/g, '&#39;');
    }

    // Encode a value as a JS string literal that is safe inside an inline
    // handler attribute (JSON quoting first, then attribute escaping).
    function jsArg(text) {
        return esc(JSON.stringify(String(text)));
    }

    const level = alert.level.toLowerCase();
    const duration = getDuration(alert.since);
    const acknowledged = alert.acknowledged || false;

    // Use formatted message if available, otherwise build from individual fields
    let valueText = `Value: <span class="alert-value">${esc(formatValue(alert.last_value))}</span>`;
    if (alert.formatted_message) {
        valueText += ` <span class="threshold-info">${esc(alert.formatted_message)}</span>`;
    } else if (alert.threshold_value !== undefined && alert.threshold_value !== null && alert.operator) {
        valueText += ` <span class="threshold-info">(threshold: ${esc(alert.operator)} ${esc(formatValue(alert.threshold_value))})</span>`;
    }
    if (alert.recovery_threshold !== undefined && alert.recovery_threshold !== null) {
        // Recovery direction is the opposite of the alerting comparison.
        const recOp = (alert.operator === '>' || alert.operator === '>=') ? '&lt;' : '&gt;';
        valueText += ` <span class="threshold-info" style="color:#888">(recovers ${recOp} ${esc(formatValue(alert.recovery_threshold))})</span>`;
    }

    // Build actions section
    let actionsHtml = '';
    if (acknowledged) {
        actionsHtml = `
            <div class="alert-actions">
                <div class="acknowledged-badge">✓ Acknowledged</div>
            </div>
        `;
    } else {
        actionsHtml = `
            <div class="alert-actions">
                <button class="acknowledge-btn" onclick="acknowledgeAlert(${jsArg(alert.hostname)}, ${jsArg(alert.metric_path)}, event)">
                    Acknowledge
                </button>
            </div>
        `;
    }

    // Display name: drop everything up to the first dot and a trailing "_status_code".
    const metricName = (alert.metric_path.includes('.')
        ? alert.metric_path.slice(alert.metric_path.indexOf('.') + 1)
        : alert.metric_path).replace(/_status_code$/, '');

    return `
        <div class="alert-item ${esc(level)} ${acknowledged ? 'acknowledged' : ''}">
            <div class="alert-main">
                <div class="alert-header">
                    <span class="alert-level ${esc(level)}">${esc(alert.level)}</span>
                    <a class="alert-hostname" href="/plugins#${esc(alert.hostname)}">${esc(alert.hostname)}</a>
                    <span class="alert-metric">${esc(metricName)}</span>
                </div>
                <div class="alert-details">
                    <span>${valueText}</span>
                    <span class="alert-duration">Active for ${esc(duration)}</span>
                </div>
            </div>
            ${actionsHtml}
        </div>
    `;
}
|
||||
|
||||
/**
 * Format a metric value for display.
 * Numbers above 1000 use the locale's grouping; other numbers get two
 * decimals; anything that is not a number is returned unchanged.
 */
function formatValue(value) {
    if (typeof value !== 'number') {
        return value;
    }
    return value > 1000 ? value.toLocaleString() : value.toFixed(2);
}
|
||||
|
||||
/**
 * Describe how long ago `timestamp` (epoch seconds) was, relative to now:
 * "Ns" under a minute, "Nm" under an hour, "Nh Nm" under a day, else "Nd Nh".
 */
function getDuration(timestamp) {
    const elapsed = Math.floor(Date.now() / 1000 - timestamp);

    if (elapsed < 60) {
        return `${elapsed}s`;
    }
    if (elapsed < 3600) {
        return `${Math.floor(elapsed / 60)}m`;
    }
    if (elapsed < 86400) {
        const hh = Math.floor(elapsed / 3600);
        const mm = Math.floor((elapsed % 3600) / 60);
        return `${hh}h ${mm}m`;
    }
    const dd = Math.floor(elapsed / 86400);
    const remHours = Math.floor((elapsed % 86400) / 3600);
    return `${dd}d ${remHours}h`;
}
|
||||
|
||||
/**
 * Switch the active level filter and re-render the list.
 *
 * @param {string} filter - 'all', 'critical' or 'warning'.
 * @param {Event} [evt] - the click event; when omitted, falls back to the
 *   legacy window.event so existing inline onclick="filterAlerts('x')" callers
 *   keep working.
 */
function filterAlerts(filter, evt) {
    currentFilter = filter;

    // Update active button. Skipped when no triggering element is known
    // (e.g. a programmatic call), instead of throwing on a missing event.
    const trigger = evt || window.event;
    const clicked = trigger && trigger.target;
    if (clicked && clicked.classList) {
        document.querySelectorAll('.filter-button').forEach(btn => {
            btn.classList.remove('active');
        });
        clicked.classList.add('active');
    }

    // Re-render with new filter
    renderAlerts(allAlerts);
}
|
||||
|
||||
/**
 * Acknowledge one alert via POST /api/0/alerts/acknowledge, then mark it as
 * acknowledged in the local list and re-render.
 *
 * @param {string} hostname - host the alert belongs to.
 * @param {string} metricPath - metric path identifying the alert on that host.
 * @param {Event} [event] - click event from the Acknowledge button; optional.
 */
async function acknowledgeAlert(hostname, metricPath, event) {
    // Prevent event bubbling
    if (event) {
        event.stopPropagation();
    }

    // Disable the button while the request is in flight (if we know which one)
    const button = event ? event.target : null;
    if (button) {
        button.disabled = true;
        button.textContent = 'Acknowledging...';
    }

    try {
        const response = await fetch('/api/0/alerts/acknowledge', {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
            },
            body: JSON.stringify({
                hostname: hostname,
                metric_path: metricPath,
            }),
        });

        if (!response.ok) {
            throw new Error(`HTTP ${response.status}`);
        }

        const result = await response.json();

        // Update the alert in our local data. Named `acked` so it does not
        // shadow window.alert, which the error path below relies on.
        const acked = allAlerts.find(a => a.hostname === hostname && a.metric_path === metricPath);
        if (acked) {
            acked.acknowledged = true;
            acked.acknowledged_at = result.acknowledged_at;
        }

        // Re-render alerts
        renderAlerts(allAlerts);
    } catch (error) {
        window.alert(`Failed to acknowledge alert: ${error.message}`);
        if (button) {
            button.disabled = false;
            button.textContent = 'Acknowledge';
        }
    }
}
|
||||
|
||||
/**
 * React to edits of the host filter box: compile the trimmed text as a
 * case-insensitive regex, flag the input as invalid when it does not
 * compile, and re-render the alert list.
 */
function onHostFilterInput(input) {
    const pattern = input.value.trim();
    let compiled = null;
    let malformed = false;

    if (pattern) {
        try {
            compiled = new RegExp(pattern, 'i');
        } catch (_) {
            malformed = true;
        }
    }

    hostFilterRe = compiled;
    if (malformed) {
        input.classList.add('invalid');
    } else {
        input.classList.remove('invalid');
    }
    renderAlerts(allAlerts);
}
|
||||
|
||||
// Auto-refresh every 15 seconds
|
||||
setInterval(loadAlerts, 15000);
|
||||
|
||||
// Initialise filter from URL query string (?filter=...)
|
||||
(function () {
    // Pre-fill the host filter box from ?filter=... and apply it right away.
    const initial = new URLSearchParams(window.location.search).get('filter');
    if (!initial) {
        return;
    }
    const box = document.getElementById('host-filter');
    box.value = initial;
    onHostFilterInput(box);
})();
|
||||
|
||||
// Initial load
|
||||
loadAlerts();
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
@@ -1,5 +1,5 @@
|
||||
<footer>
|
||||
<div id="copyright">
|
||||
©2002-2021 <A HREF="mailto:andreas@wrede.ca">Andreas Wrede</A> All Rights Reserved.</p>
|
||||
©2002-2026 <A HREF="mailto:aew.hbd@wrede.ca">Andreas Wrede</A> All Rights Reserved.
|
||||
</div>
|
||||
</footer>
|
||||
@@ -0,0 +1,386 @@
|
||||
<head>
|
||||
<meta http-equiv="content-type" content="text/html; charset=utf-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||
<link rel="stylesheet" href="/static/style.css" type="text/css" />
|
||||
<link rel="icon" href="/static/images/favicon.ico" sizes="32x32" />
|
||||
<title>{{ title }}</title>
|
||||
{% if extra_scripts %}<script src="{{ extra_scripts }}"></script>{% endif %}
|
||||
<script>
|
||||
/* Apply saved theme before first paint to avoid flash */
|
||||
(function() {
    try {
        // Stored preference: 'dark', 'auto' (follow the system), or anything else for light.
        var pref = localStorage.getItem('hbd_theme') || 'auto';
        var wantsDark = (pref === 'dark');
        if (!wantsDark && pref === 'auto') {
            wantsDark = window.matchMedia('(prefers-color-scheme: dark)').matches;
        }
        if (wantsDark) {
            document.documentElement.setAttribute('data-theme', 'dark');
        }
    } catch(e) {}
})();
|
||||
</script>
|
||||
<style>
|
||||
/* ── Theme variables ── */
|
||||
:root {
|
||||
--bg: #f5f5f5;
|
||||
--surface: #ffffff;
|
||||
--surface-2: #f8f8f8;
|
||||
--surface-3: #f5f5f5;
|
||||
--text: #222222;
|
||||
--text-2: #333333;
|
||||
--text-3: #555555;
|
||||
--text-sec: #666666;
|
||||
--text-muted: #888888;
|
||||
--text-dim: #aaaaaa;
|
||||
--text-ghost: #cccccc;
|
||||
--border: #e0e0e0;
|
||||
--border-2: #eeeeee;
|
||||
--border-3: #f0f0f0;
|
||||
--border-4: #f5f5f5;
|
||||
--link: #0066cc;
|
||||
--nav-bg: #ffffff;
|
||||
--input-bg: #ffffff;
|
||||
--input-border: #cccccc;
|
||||
--shadow-sm: rgba(0,0,0,.08);
|
||||
--shadow: rgba(0,0,0,.10);
|
||||
--shadow-nav: rgba(0,0,0,.10);
|
||||
}
|
||||
html[data-theme="dark"] {
|
||||
color-scheme: dark;
|
||||
--bg: #111827;
|
||||
--surface: #1f2937;
|
||||
--surface-2: #283447;
|
||||
--surface-3: #374151;
|
||||
--text: #e5e7eb;
|
||||
--text-2: #d1d5db;
|
||||
--text-3: #9ca3af;
|
||||
--text-sec: #9ca3af;
|
||||
--text-muted: #6b7280;
|
||||
--text-dim: #4b5563;
|
||||
--text-ghost: #374151;
|
||||
--border: #374151;
|
||||
--border-2: #2d3748;
|
||||
--border-3: #253040;
|
||||
--border-4: #1e2a38;
|
||||
--link: #60a5fa;
|
||||
--nav-bg: #1f2937;
|
||||
--input-bg: #283447;
|
||||
--input-border: #4b5563;
|
||||
--shadow-sm: rgba(0,0,0,.30);
|
||||
--shadow: rgba(0,0,0,.40);
|
||||
--shadow-nav: rgba(0,0,0,.40);
|
||||
}
|
||||
|
||||
/* ── Reset / shared baseline ── */
|
||||
*, *::before, *::after { box-sizing: border-box; }
|
||||
html {
|
||||
font-family: 'Segoe UI', system-ui, -apple-system, sans-serif;
|
||||
font-size: 14px;
|
||||
}
|
||||
body {
|
||||
margin: 0;
|
||||
padding: 10px;
|
||||
padding-top: 60px;
|
||||
background: var(--bg);
|
||||
color: var(--text);
|
||||
}
|
||||
h1 { font-size: 1.5em; color: var(--text-2); margin: 0 0 5px; }
|
||||
h2 { font-size: 1.1em; color: var(--text-2); margin: 0 0 8px; }
|
||||
p { margin: 0; }
|
||||
|
||||
/* Navigation bar — shared across all pages */
|
||||
.nav {
|
||||
position: fixed;
|
||||
top: 0;
|
||||
left: 0;
|
||||
right: 0;
|
||||
z-index: 200;
|
||||
background: var(--nav-bg);
|
||||
padding: 6px 12px;
|
||||
box-shadow: 0 2px 4px var(--shadow-nav);
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: space-between;
|
||||
flex-wrap: wrap;
|
||||
gap: 8px;
|
||||
}
|
||||
.nav-links { display: flex; align-items: center; flex-wrap: wrap; gap: 4px; }
|
||||
.nav a {
|
||||
margin-right: 20px;
|
||||
text-decoration: none;
|
||||
color: var(--link);
|
||||
font-weight: 500;
|
||||
font-size: 0.9em;
|
||||
}
|
||||
.nav a:hover { text-decoration: underline; }
|
||||
.nav a.active { color: var(--text-2); font-weight: bold; }
|
||||
.nav-user {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 8px;
|
||||
text-decoration: none;
|
||||
color: var(--text-2);
|
||||
font-size: 0.9em;
|
||||
font-weight: 500;
|
||||
padding: 4px 8px;
|
||||
border-radius: 20px;
|
||||
transition: background 0.15s;
|
||||
}
|
||||
.nav-user:hover { background: var(--surface-2); text-decoration: none; }
|
||||
.nav-username {
|
||||
max-width: 0;
|
||||
overflow: hidden;
|
||||
white-space: nowrap;
|
||||
opacity: 0;
|
||||
transition: max-width 0.2s ease, opacity 0.2s ease;
|
||||
}
|
||||
.nav-user:hover .nav-username {
|
||||
max-width: 160px;
|
||||
opacity: 1;
|
||||
}
|
||||
.nav-avatar {
|
||||
width: 28px; height: 28px;
|
||||
border-radius: 50%;
|
||||
object-fit: cover;
|
||||
flex-shrink: 0;
|
||||
}
|
||||
.nav-initials {
|
||||
width: 28px; height: 28px;
|
||||
border-radius: 50%;
|
||||
background: var(--link);
|
||||
color: #fff;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
font-size: 0.75em;
|
||||
font-weight: 700;
|
||||
flex-shrink: 0;
|
||||
}
|
||||
|
||||
/* ── Mobile nav: hamburger toggle ── */
|
||||
.nav-hamburger {
|
||||
display: none;
|
||||
flex-direction: column;
|
||||
justify-content: space-between;
|
||||
width: 26px; height: 20px;
|
||||
cursor: pointer;
|
||||
flex-shrink: 0;
|
||||
background: none;
|
||||
border: none;
|
||||
padding: 0;
|
||||
}
|
||||
.nav-hamburger span {
|
||||
display: block;
|
||||
height: 3px;
|
||||
background: var(--text-muted);
|
||||
border-radius: 2px;
|
||||
}
|
||||
|
||||
@media (max-width: 640px) {
|
||||
.nav-hamburger { display: flex; }
|
||||
.nav-links {
|
||||
display: none;
|
||||
width: 100%;
|
||||
flex-direction: column;
|
||||
align-items: flex-start;
|
||||
padding-top: 8px;
|
||||
border-top: 1px solid var(--border-2);
|
||||
order: 3;
|
||||
}
|
||||
.nav-links.nav-open { display: flex; }
|
||||
.nav-links a { margin-right: 0; padding: 6px 0; font-size: 1em; }
|
||||
}
|
||||
|
||||
/* ── Global dark-mode: inputs ── */
|
||||
html[data-theme="dark"] input:not([type=checkbox]):not([type=radio]),
|
||||
html[data-theme="dark"] select,
|
||||
html[data-theme="dark"] textarea {
|
||||
background-color: var(--input-bg);
|
||||
border-color: var(--input-border);
|
||||
color: var(--text);
|
||||
}
|
||||
|
||||
/* Pending config publish button */
|
||||
.nav-publish-btn {
|
||||
background: #e65100;
|
||||
color: #fff;
|
||||
border: none;
|
||||
border-radius: 4px;
|
||||
padding: 4px 10px;
|
||||
font-size: 0.82em;
|
||||
font-weight: 600;
|
||||
cursor: pointer;
|
||||
flex-shrink: 0;
|
||||
white-space: nowrap;
|
||||
margin-left: auto;
|
||||
}
|
||||
.nav-publish-btn:hover { background: #bf360c; }
|
||||
.nav-publish-btn:disabled { opacity: 0.7; cursor: default; }
|
||||
|
||||
/* Swiss railway clock — nav */
|
||||
.nav-pie {
|
||||
flex-shrink: 0;
|
||||
line-height: 0;
|
||||
margin-left: auto;
|
||||
padding: 4px 4px 4px 0;
|
||||
}
|
||||
#alert-pie { display: block; cursor: default; }
|
||||
.nav-clock {
|
||||
flex-shrink: 0;
|
||||
line-height: 0;
|
||||
padding: 4px 4px 4px 0;
|
||||
cursor: pointer;
|
||||
}
|
||||
#swiss-clock { display: block; }
|
||||
|
||||
/* Swiss railway clock — full-page overlay */
|
||||
#clock-overlay {
|
||||
display: none;
|
||||
position: fixed;
|
||||
inset: 0;
|
||||
z-index: 9999;
|
||||
background: #1a1a1a;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
cursor: pointer;
|
||||
}
|
||||
#clock-overlay.visible { display: flex; }
|
||||
#swiss-clock-overlay { display: block; }
|
||||
</style>
|
||||
<script>
|
||||
/* ── Swiss Federal Railway (SBB) clock ── */
|
||||
|
||||
/* Draw one frame of the clock onto any canvas element. */
|
||||
function drawSwissClock(canvas) {
    /* Paints the current local time as an analogue clock filling the
       (square) canvas; every dimension is derived from canvas.width. */
    var side = canvas.width;
    var radius = side / 2;
    var g = canvas.getContext('2d');

    var t = new Date();
    var hours = t.getHours() % 12;
    var minutes = t.getMinutes();
    var secs = t.getSeconds() + t.getMilliseconds() / 1000;

    /* The second hand completes its sweep in 58.5 s and then waits at 12;
       while it waits, the minute hand already shows the next minute. */
    var parked = secs >= 58.5;
    var secondAngle = parked ? 0 : (secs / 58.5) * Math.PI * 2;

    /* Stroke one straight hand from `tail` to `tip` along the rotated x axis. */
    function strokeHand(angle, tip, tail, width, color) {
        g.save();
        g.translate(radius, radius);
        g.rotate(angle);
        g.beginPath();
        g.moveTo(tail, 0);
        g.lineTo(tip, 0);
        g.strokeStyle = color;
        g.lineWidth = width;
        g.lineCap = 'square';
        g.stroke();
        g.restore();
    }

    g.clearRect(0, 0, side, side);

    /* dial */
    g.beginPath();
    g.arc(radius, radius, radius - 1, 0, Math.PI * 2);
    g.fillStyle = '#fff';
    g.fill();
    g.strokeStyle = '#333';
    g.lineWidth = side * 0.018;
    g.stroke();

    /* 60 tick marks; every fifth one is a longer, thicker hour mark */
    for (var n = 0; n < 60; n++) {
        var theta = (n / 60) * Math.PI * 2 - Math.PI / 2;
        var major = (n % 5 === 0);
        var inner = major ? radius * 0.72 : radius * 0.88;
        g.beginPath();
        g.moveTo(radius + Math.cos(theta) * inner,
                 radius + Math.sin(theta) * inner);
        g.lineTo(radius + Math.cos(theta) * radius * 0.94,
                 radius + Math.sin(theta) * radius * 0.94);
        g.strokeStyle = '#222';
        g.lineWidth = major ? side * 0.027 : side * 0.011;
        g.lineCap = 'butt';
        g.stroke();
    }

    /* minute, hour, then second hand (drawn last so it sits on top) */
    var shownMinute = parked ? minutes + 1 : minutes;
    strokeHand(shownMinute / 60 * Math.PI * 2 - Math.PI / 2,
               radius * 0.88, -radius * 0.12, side * 0.027, '#222');
    strokeHand((hours + minutes / 60) / 12 * Math.PI * 2 - Math.PI / 2,
               radius * 0.58, -radius * 0.12, side * 0.039, '#222');
    strokeHand(secondAngle - Math.PI / 2,
               radius * 0.78, -radius * 0.22, side * 0.013, '#e00');

    /* red disc at the tip of the second hand */
    var discRadius = side * 0.028;
    g.save();
    g.translate(radius, radius);
    g.rotate(secondAngle - Math.PI / 2);
    g.beginPath();
    g.arc(radius * 0.78, 0, discRadius, 0, Math.PI * 2);
    g.fillStyle = '#e00';
    g.fill();
    g.restore();

    /* hub */
    g.beginPath();
    g.arc(radius, radius, radius * 0.04, 0, Math.PI * 2);
    g.fillStyle = '#222';
    g.fill();
}
|
||||
|
||||
/* Resize the overlay canvas to fit the viewport, keeping it square. */
|
||||
function resizeOverlayClock() {
    /* Square side = 88% of the smaller viewport dimension, in whole pixels. */
    var canvas = document.getElementById('swiss-clock-overlay');
    if (canvas) {
        var side = Math.floor(Math.min(window.innerWidth, window.innerHeight) * 0.88);
        canvas.width = side;
        canvas.height = side;
    }
}
|
||||
|
||||
/* Main tick — redraws both nav clock and (if visible) overlay clock. */
|
||||
function clockTick() {
    /* Always repaint the small nav clock when it exists. */
    var navCanvas = document.getElementById('swiss-clock');
    if (navCanvas) {
        drawSwissClock(navCanvas);
    }

    /* Repaint the large clock only while its overlay is showing. */
    var overlayBox = document.getElementById('clock-overlay');
    var overlayShown = overlayBox && overlayBox.classList.contains('visible');
    if (overlayShown) {
        var bigCanvas = document.getElementById('swiss-clock-overlay');
        if (bigCanvas) {
            drawSwissClock(bigCanvas);
        }
    }

    /* Re-arm so the next tick lands on the next 100 ms boundary. */
    setTimeout(clockTick, 100 - (Date.now() % 100));
}
|
||||
|
||||
/* Keep auto-theme in sync with system setting changes */
|
||||
try {
    window.matchMedia('(prefers-color-scheme: dark)').addEventListener('change', function(e) {
        /* Only follow the system when the stored preference is 'auto' (or unset). */
        if ((localStorage.getItem('hbd_theme') || 'auto') !== 'auto') {
            return;
        }
        var root = document.documentElement;
        if (e.matches) {
            root.setAttribute('data-theme', 'dark');
        } else {
            root.removeAttribute('data-theme');
        }
    });
} catch(e) {}
|
||||
|
||||
document.addEventListener('DOMContentLoaded', function() {
    /* Kick off the self-rescheduling draw loop. */
    clockTick();

    /* The overlay needs both the nav clock (opener) and the overlay element. */
    var opener = document.querySelector('.nav-clock');
    var overlayBox = document.getElementById('clock-overlay');
    if (!opener || !overlayBox) {
        return;
    }

    /* Clicking the nav clock sizes the big canvas and shows the overlay. */
    opener.addEventListener('click', function() {
        resizeOverlayClock();
        overlayBox.classList.add('visible');
    });

    /* Clicking anywhere on the overlay hides it again. */
    overlayBox.addEventListener('click', function() {
        overlayBox.classList.remove('visible');
    });

    /* Keep the big canvas fitted to the viewport while it is shown. */
    window.addEventListener('resize', function() {
        if (overlayBox.classList.contains('visible')) {
            resizeOverlayClock();
        }
    });
});
|
||||
</script>
|
||||
<script src="static/sorttable.js"></script>
|
||||
</head>
|
||||
@@ -0,0 +1,693 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
{% include 'head.html' %}
|
||||
|
||||
<style>
|
||||
body {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
height: 100vh;
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
@media (max-width: 640px) {
|
||||
body {
|
||||
height: auto;
|
||||
min-height: 100vh;
|
||||
overflow: auto;
|
||||
flex-direction: column;
|
||||
}
|
||||
.container {
|
||||
max-height: none;
|
||||
overflow: visible;
|
||||
}
|
||||
.table-section {
|
||||
max-height: 55vh;
|
||||
}
|
||||
.log-section {
|
||||
flex: none;
|
||||
max-height: 40vh;
|
||||
}
|
||||
}
|
||||
|
||||
.container {
|
||||
flex: 1;
|
||||
min-height: 0;
|
||||
max-width: 1600px;
|
||||
width: 100%;
|
||||
margin: 0 auto;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 15px;
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
h1 {
|
||||
color: #333;
|
||||
margin-bottom: 5px;
|
||||
margin-top: 15px;
|
||||
font-size: 1.5em;
|
||||
}
|
||||
|
||||
h2 {
|
||||
color: #333;
|
||||
margin-bottom: 10px;
|
||||
font-size: 1.2em;
|
||||
padding: 10px 15px;
|
||||
background: white;
|
||||
border-radius: 6px;
|
||||
box-shadow: 0 1px 4px rgba(0,0,0,0.1);
|
||||
}
|
||||
|
||||
.subtitle {
|
||||
color: #666;
|
||||
margin-bottom: 15px;
|
||||
font-size: 0.9em;
|
||||
}
|
||||
|
||||
.content {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 15px;
|
||||
}
|
||||
|
||||
.table-section {
|
||||
background: white;
|
||||
border-radius: 6px;
|
||||
padding: 15px;
|
||||
box-shadow: 0 1px 4px rgba(0,0,0,0.1);
|
||||
overflow-x: auto;
|
||||
overflow-y: auto;
|
||||
max-height: 60vh;
|
||||
}
|
||||
|
||||
.log-section {
|
||||
flex: 1;
|
||||
min-height: 0;
|
||||
background: white;
|
||||
border-radius: 6px;
|
||||
padding: 15px;
|
||||
box-shadow: 0 1px 4px rgba(0,0,0,0.1);
|
||||
overflow-y: auto;
|
||||
}
|
||||
|
||||
#ntable {
|
||||
border-collapse: collapse;
|
||||
width: 100%;
|
||||
font-size: 0.9em;
|
||||
}
|
||||
|
||||
#ntable td,
|
||||
#ntable th {
|
||||
border: 1px solid #e0e0e0;
|
||||
text-align: left;
|
||||
padding: 2px 4px;
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
#ntable tr:nth-child(even) {
|
||||
background-color: #fafafa;
|
||||
}
|
||||
|
||||
#ntable tr:hover {
|
||||
background-color: #e3f2fd;
|
||||
}
|
||||
|
||||
#ntable tbody tr.row-warning {
|
||||
background-color: #fff8c5;
|
||||
}
|
||||
|
||||
#ntable tbody tr.row-critical {
|
||||
background-color: #fde8e8;
|
||||
}
|
||||
|
||||
#ntable tbody tr.row-warning:hover {
|
||||
background-color: #fff0a0;
|
||||
}
|
||||
|
||||
#ntable tbody tr.row-critical:hover {
|
||||
background-color: #f9c8c8;
|
||||
}
|
||||
|
||||
#ntable th {
|
||||
padding: 6px 8px;
|
||||
background-color: #2196f3;
|
||||
color: white;
|
||||
font-weight: 600;
|
||||
position: sticky;
|
||||
top: 0;
|
||||
z-index: 10;
|
||||
}
|
||||
|
||||
#ntable
|
||||
th:not(.sorttable_sorted):not(.sorttable_sorted_reverse):not(.sorttable_nosort):after {
|
||||
content: " ⇅";
|
||||
opacity: 0.5;
|
||||
}
|
||||
|
||||
/* Alert count column styling */
|
||||
#ntable td.alert-warning {
|
||||
color: #ff9800;
|
||||
font-weight: bold;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
#ntable td.alert-critical {
|
||||
color: #f44336;
|
||||
font-weight: bold;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
/* Scrollbar styling */
|
||||
.log-section::-webkit-scrollbar {
|
||||
width: 8px;
|
||||
}
|
||||
|
||||
.log-section::-webkit-scrollbar-track {
|
||||
background: #f1f1f1;
|
||||
border-radius: 4px;
|
||||
}
|
||||
|
||||
.log-section::-webkit-scrollbar-thumb {
|
||||
background: #888;
|
||||
border-radius: 4px;
|
||||
}
|
||||
|
||||
.log-section::-webkit-scrollbar-thumb:hover {
|
||||
background: #555;
|
||||
}
|
||||
|
||||
/* Message styling */
|
||||
#messages {
|
||||
font-size: 1.00em;
|
||||
line-height: 1.0;
|
||||
}
|
||||
|
||||
#messages .log-entry {
|
||||
padding: 5px 0;
|
||||
border-bottom: 1px solid #f0f0f0;
|
||||
display: flex;
|
||||
gap: 0.5em;
|
||||
align-items: baseline;
|
||||
}
|
||||
|
||||
.log-ts { color: #888; white-space: nowrap; }
|
||||
.log-level { font-weight: bold; min-width: 6em; }
|
||||
.log-host { font-weight: 600; }
|
||||
.log-service { color: #888; }
|
||||
|
||||
.log-warning .log-level { color: #b8860b; }
|
||||
.log-critical .log-level { color: #c00; }
|
||||
.log-recover .log-level { color: #2a7a2a; }
|
||||
.log-info .log-level { color: #555; }
|
||||
|
||||
.log-section-header {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 12px;
|
||||
flex-wrap: wrap;
|
||||
margin-bottom: 10px;
|
||||
background: white;
|
||||
border-radius: 6px;
|
||||
box-shadow: 0 1px 4px rgba(0,0,0,0.1);
|
||||
padding: 8px 15px;
|
||||
}
|
||||
|
||||
.log-section-title {
|
||||
font-size: 1.2em;
|
||||
font-weight: bold;
|
||||
color: #333;
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
.log-filter-bar {
|
||||
display: flex;
|
||||
gap: 6px;
|
||||
align-items: center;
|
||||
flex-wrap: wrap;
|
||||
}
|
||||
|
||||
.log-filter-bar input[type="text"],
|
||||
.log-filter-bar select {
|
||||
padding: 3px 7px;
|
||||
border: 1px solid #ccc;
|
||||
border-radius: 4px;
|
||||
font-size: 1.00em;
|
||||
color: #333;
|
||||
}
|
||||
|
||||
.log-filter-bar input[type="text"] { width: 110px; }
|
||||
|
||||
/* Modal for connection status messages */
|
||||
.connection-modal {
|
||||
display: none;
|
||||
position: fixed;
|
||||
z-index: 1000;
|
||||
left: 0;
|
||||
top: 0;
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
background-color: rgba(0, 0, 0, 0.5);
|
||||
}
|
||||
|
||||
.connection-modal.show {
|
||||
display: flex;
|
||||
justify-content: center;
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
.connection-modal-content {
|
||||
background-color: white;
|
||||
padding: 30px 40px;
|
||||
border-radius: 8px;
|
||||
text-align: center;
|
||||
box-shadow: 0 4px 20px rgba(0, 0, 0, 0.3);
|
||||
min-width: 300px;
|
||||
}
|
||||
|
||||
.connection-modal-content p {
|
||||
margin: 0;
|
||||
font-size: 16px;
|
||||
color: #333;
|
||||
}
|
||||
|
||||
/* State indicators */
|
||||
.state-up {
|
||||
color: #4caf50;
|
||||
font-weight: 600;
|
||||
}
|
||||
|
||||
.state-down {
|
||||
color: #f44336;
|
||||
font-weight: 700;
|
||||
}
|
||||
|
||||
.state-overdue {
|
||||
color: #ff9800;
|
||||
font-weight: 700;
|
||||
}
|
||||
#ntable a.host-link { color: inherit; text-decoration: none; }
|
||||
#ntable a.host-link:hover { text-decoration: underline; }
|
||||
|
||||
/* ── Dark mode ── */
|
||||
html[data-theme="dark"] h1,
|
||||
html[data-theme="dark"] h2 { color: var(--text); }
|
||||
html[data-theme="dark"] .subtitle { color: var(--text-sec); }
|
||||
html[data-theme="dark"] h2,
|
||||
html[data-theme="dark"] .table-section,
|
||||
html[data-theme="dark"] .log-section,
|
||||
html[data-theme="dark"] .log-section-header { background: var(--surface); }
|
||||
html[data-theme="dark"] .log-section-title { color: var(--text); }
|
||||
html[data-theme="dark"] #ntable td,
|
||||
html[data-theme="dark"] #ntable th { border-color: var(--border); }
|
||||
html[data-theme="dark"] #ntable tr:nth-child(even) { background: var(--surface-2); }
|
||||
html[data-theme="dark"] #ntable tr:hover { background: #1e3a5f; }
|
||||
html[data-theme="dark"] #ntable tbody tr.row-warning { background: #3a2800; }
|
||||
html[data-theme="dark"] #ntable tbody tr.row-critical { background: #3a0a0a; }
|
||||
html[data-theme="dark"] #ntable tbody tr.row-warning:hover { background: #4a3200; }
|
||||
html[data-theme="dark"] #ntable tbody tr.row-critical:hover { background: #4a1010; }
|
||||
html[data-theme="dark"] #messages .log-entry { border-bottom-color: var(--border-3); }
|
||||
html[data-theme="dark"] .log-ts,
|
||||
html[data-theme="dark"] .log-service { color: var(--text-muted); }
|
||||
html[data-theme="dark"] .log-info .log-level { color: var(--text-sec); }
|
||||
html[data-theme="dark"] .log-filter-bar input,
|
||||
html[data-theme="dark"] .log-filter-bar select { color: var(--text); }
|
||||
html[data-theme="dark"] .connection-modal-content { background: var(--surface); color: var(--text); }
|
||||
</style>
|
||||
<script type="text/javascript">
|
||||
var cnt = 0;
|
||||
var nTable = document;
|
||||
var name_idx = {};
|
||||
var c = 0;
|
||||
var HBD_VERSION = "{{ hbd_version }}";
|
||||
|
||||
/**
 * Build the anchor markup shown in a host's name cell.
 *
 * The link points at `/plugins#<target>`, where <target> is `data.raw_name`
 * when that is truthy, otherwise `data.name` with anything that looks like a
 * tag and the first `*` stripped. A wilted-flower marker is appended to the
 * label when `data.hbc_version` is missing or differs from HBD_VERSION, and
 * the label is wrapped in <b> when `data.dyn` is truthy.
 *
 * @param {Object} data host record (reads name, raw_name, hbc_version, dyn)
 * @returns {string} HTML for the name cell
 */
function hostNameHtml(data) {
    var anchorTarget = data.raw_name;
    if (!anchorTarget) {
        // Fall back to the display name with markup removed.
        anchorTarget = data.name.replace(/<[^>]+>/g, '').replace('*', '').trim();
    }

    var label = data.name;
    var versionMatches = data.hbc_version && data.hbc_version === HBD_VERSION;
    if (!versionMatches) {
        label += ' 🥀';
    }
    if (data.dyn) {
        label = '<b>' + label + '</b>';
    }

    return '<a class="host-link" href="/plugins#' + encodeURIComponent(anchorTarget) + '">' + label + '</a>';
}
|
||||
|
||||
/**
 * Rebuild the global host-name -> table-row index (`name_idx`).
 *
 * Looks up the "ntable" element, stores it in the global `nTable`, and maps
 * every row after the header row to its host name. The name comes from the
 * first cell's `data-name` attribute when set, otherwise from the cell text
 * with a trailing version marker removed.
 */
function setup() {
    name_idx = {};
    nTable = document.getElementById("ntable");

    var rows = nTable.rows;
    // Row 0 is the header row; only the rows after it are indexed.
    for (var r = 1; r < rows.length; r++) {
        var firstCell = rows[r].cells[0];
        var hostName = firstCell.dataset.name;
        if (!hostName) {
            hostName = firstCell.innerText.replace(/\s*🥀\s*$/, '').trim();
        }
        name_idx[hostName] = rows[r];
    }
}
|
||||
|
||||
/**
 * Apply the alert highlight class to a table row.
 *
 * Any existing 'row-warning' / 'row-critical' class is removed first. The row
 * then gets 'row-critical' if it has any critical alerts (acked or not),
 * otherwise 'row-warning' if it has any warning alerts, otherwise no class.
 *
 * @param {HTMLTableRowElement} row  row whose classList is updated
 * @param {Object} data              host record with alert_* counters
 */
function updateRowAlert(row, data) {
    function anyPositive(first, second) {
        return (first || 0) > 0 || (second || 0) > 0;
    }

    var hasCritical = anyPositive(data.alert_critical_unacked, data.alert_critical_acked);
    var hasWarning = anyPositive(data.alert_warning_unacked, data.alert_warning_acked);

    row.classList.remove('row-warning', 'row-critical');

    // Critical outranks warning.
    var highlight = hasCritical ? 'row-critical' : (hasWarning ? 'row-warning' : null);
    if (highlight !== null) {
        row.classList.add(highlight);
    }
}
|
||||
|
||||
/**
 * Create a table row for a host that is not yet in the table and append it
 * to the "ntablebody" element.
 *
 * The row has eleven cells: name, warning count, critical count, then
 * address / state / latency / last-state for IPv4 and the same four for IPv6.
 * Only name, alert counts, and address/state are filled in here; the latency
 * and last-state cells are created empty.
 *
 * Fixes relative to the previous version:
 *  - the row is registered in `name_idx` under the host name (`data.name`);
 *    previously the <td> element itself was used as the key, which stringifies
 *    to the same "[object ...]" key for every host;
 *  - a host with no connections no longer throws on `data.connections[0]`.
 *
 * @param {Object} data host record (name, alert_* counters, connections[])
 */
function createRow(data) {
    var row = document.createElement("tr");

    // Create a <td>, apply optional inline styles, and append it to the row.
    function addCell(styles) {
        var td = document.createElement("td");
        if (styles) {
            Object.keys(styles).forEach(function (prop) {
                td.style[prop] = styles[prop];
            });
        }
        row.appendChild(td);
        return td;
    }

    // Alert counts render as "unacked/acked", just "unacked" when nothing is
    // acked, and empty when both are zero.
    function alertCountText(unacked, acked) {
        if (unacked > 0 || acked > 0) {
            return acked > 0 ? unacked + "/" + acked : unacked;
        }
        return "";
    }

    var c_name = addCell();
    var c_warning = addCell({textAlign: "center", color: "#ff9800", fontWeight: "bold"});
    var c_critical = addCell({textAlign: "center", color: "#f44336", fontWeight: "bold"});
    var c_ipv4addr = addCell();
    var c_ipv4state = addCell();
    addCell({textAlign: "right"}); // IPv4 latency, left empty here
    addCell({textAlign: "right"}); // IPv4 last state, left empty here
    var c_ipv6addr = addCell();
    var c_ipv6state = addCell();
    addCell({textAlign: "right"}); // IPv6 latency, left empty here
    addCell({textAlign: "right"}); // IPv6 last state, left empty here

    c_name.dataset.name = data.name;
    c_name.innerHTML = hostNameHtml(data);

    c_warning.innerHTML = alertCountText(data.alert_warning_unacked || 0, data.alert_warning_acked || 0);
    c_critical.innerHTML = alertCountText(data.alert_critical_unacked || 0, data.alert_critical_acked || 0);

    var connections = data.connections || [];
    if (connections.length > 0) {
        c_ipv4addr.innerHTML = connections[0].addr;
        c_ipv4state.innerHTML = connections[0].state;
    }
    if (connections.length > 1) {
        c_ipv6addr.innerHTML = connections[1].addr;
        c_ipv6state.innerHTML = connections[1].state;
    }

    document.getElementById("ntablebody").appendChild(row);

    // Index the new row by host name so update_table() can find it.
    name_idx[data.name] = row;
    updateRowAlert(row, data);
}
|
||||
|
||||
/**
 * Format a Unix timestamp (seconds) relative to the current local date.
 *
 *  - same calendar day as now:      "hh:mm:ss"
 *  - fewer than 8 calendar days:    "-Nd hh:mm:ss"
 *  - anything else:                 "YYYY-MM-DD"
 *
 * @param {number} ts seconds since the Unix epoch
 * @returns {string} formatted timestamp in local time
 */
function formatTS(ts) {
    function two(n) {
        return String(n).padStart(2, '0');
    }

    var now = new Date();
    var when = new Date(ts * 1000);
    var clock = [when.getHours(), when.getMinutes(), when.getSeconds()].map(function (part) {
        return two(part);
    }).join(':');

    var sameDay = when.getFullYear() === now.getFullYear() &&
        when.getMonth() === now.getMonth() &&
        when.getDate() === now.getDate();
    if (sameDay) {
        return clock;
    }

    // Compare local midnights so the difference is a whole number of days.
    var midnightToday = new Date(now.getFullYear(), now.getMonth(), now.getDate());
    var midnightThen = new Date(when.getFullYear(), when.getMonth(), when.getDate());
    var daysAgo = Math.round((midnightToday - midnightThen) / 86400000);
    if (daysAgo < 8) {
        return '-' + daysAgo + 'd ' + clock;
    }

    return when.getFullYear() + '-' + two(when.getMonth() + 1) + '-' + two(when.getDate());
}
|
||||
|
||||
/**
 * Refresh (or create) the table row for one host from a "host" update.
 *
 * Column layout: 0 name, 1 warning count, 2 critical count, then four cells
 * per connection (address, state, latency, last-state time) starting at
 * column 3 for connection 0 and column 7 for connection 1.
 *
 * Fixes relative to the previous version:
 *  - `state` and `latency` are now local variables; they used to be assigned
 *    without a declaration, which creates globals and throws in strict mode;
 *  - a missing or unparseable `rtts[0]` on an "up" connection renders "-"
 *    instead of throwing or showing "NaN";
 *  - a missing `connections` array is treated as empty.
 *
 * @param {Object} data host record (name, alert_* counters, connections[])
 */
function update_table(data) {
    if (!(data.name in name_idx)) {
        createRow(data);
        setup();
    }

    var row = name_idx[data.name];
    var cells = row.cells;

    // Alert counts render as "unacked/acked", just "unacked" when nothing is
    // acked, and empty when both are zero.
    function alertCountText(unacked, acked) {
        if (unacked > 0 || acked > 0) {
            return acked > 0 ? unacked + "/" + acked : unacked;
        }
        return "";
    }

    // Name cell (also carries the version-mismatch marker).
    cells[0].dataset.name = data.name;
    cells[0].innerHTML = hostNameHtml(data);

    cells[1].innerHTML = alertCountText(data.alert_warning_unacked || 0, data.alert_warning_acked || 0);
    cells[2].innerHTML = alertCountText(data.alert_critical_unacked || 0, data.alert_critical_acked || 0);

    var connections = data.connections || [];
    for (var i = 0; i < connections.length; i++) {
        var conn = connections[i];
        // Three leading columns (name, warning, critical), four per connection.
        var base = 3 + i * 4;
        var addr = conn.addr;
        var since = formatTS(conn.statetime);
        var state;
        var latency;

        if (conn.state == "up") {
            var rtt = Number.parseFloat((conn.rtts || [])[0]);
            state = '<span class="state-up">up</span>';
            latency = Number.isNaN(rtt) ? "-" : String(Math.round(rtt));
        } else if (conn.state == "unknown") {
            // Unknown connections are shown as a completely blank group.
            state = "";
            latency = "";
            addr = "";
            since = "";
        } else if (conn.state == "down") {
            state = '<span class="state-down">down</span>';
            latency = "-";
        } else if (conn.state == "overdue") {
            state = '<span class="state-overdue">overdue</span>';
            latency = "-";
        } else {
            state = "<b>" + conn.state + "</b>";
            latency = "-";
        }

        cells[base].innerHTML = addr;
        cells[base + 1].innerHTML = state;
        cells[base + 2].innerHTML = latency;
        cells[base + 3].innerHTML = since;
    }

    updateRowAlert(row, data);
}
|
||||
|
||||
/**
 * Show or hide every log entry under #messages according to the three
 * filter controls (host substring, exact level, message substring).
 *
 * Host and message matching is case-insensitive; an empty filter matches
 * everything. Entries that fail any active filter get `display: none`.
 */
function applyLogFilters() {
    function controlValue(id) {
        return document.getElementById(id).value;
    }

    var wantedHost = controlValue('filter-host').toLowerCase().trim();
    var wantedLevel = controlValue('filter-level');
    var wantedText = controlValue('filter-msg').toLowerCase().trim();

    document.querySelectorAll('#messages .log-entry').forEach(function (entry) {
        var hostOk = !wantedHost ||
            (entry.dataset.host || '').toLowerCase().includes(wantedHost);
        var levelOk = !wantedLevel || entry.dataset.level === wantedLevel;

        var textOk = true;
        if (wantedText) {
            var body = entry.querySelector('.log-msg');
            textOk = Boolean(body) && body.textContent.toLowerCase().includes(wantedText);
        }

        entry.style.display = (hostOk && levelOk && textOk) ? '' : 'none';
    });
}
|
||||
|
||||
/**
 * Open the heartbeat WebSocket and wire up its handlers.
 *
 *  - onopen:    hides the connection modal and sends "heartbeat_web".
 *  - onmessage: "host" messages go to update_table(); "message" messages are
 *               rendered as a log entry in #messages (appended when
 *               `state.history` is truthy, prepended otherwise) and the log
 *               filters are re-applied. Every message increments `cnt`.
 *  - onclose:   shows the connection modal and reconnects after 3 seconds.
 *
 * Fixes relative to the previous version:
 *  - the host and level values written into HTML attributes are now escaped;
 *    the old `replace(/"/g, '"')` was a no-op, so a host containing `"` could
 *    break out of the `data-host` attribute;
 *  - the zero-padding helper is declared at function scope instead of inside
 *    an `else if` block.
 *
 * NOTE(review): msg.host, msg.service and msg.message are still inserted as
 * raw HTML, as before. If the server does not guarantee they are safe markup
 * they should be escaped too — confirm against the sender.
 */
function WS_Connect() {
    if (!("WebSocket" in window)) {
        // The browser doesn't support WebSocket
        console.log("WebSocket NOT supported by your Browser!");
        return;
    }

    function pad2(n) {
        return n < 10 ? '0' + n : '' + n;
    }

    // Escape a value for use inside a double-quoted HTML attribute.
    function escapeAttr(value) {
        return String(value)
            .replace(/&/g, '&amp;')
            .replace(/"/g, '&quot;')
            .replace(/</g, '&lt;')
            .replace(/>/g, '&gt;');
    }

    function setModalVisible(visible) {
        var modal = document.getElementById("connectionModal");
        if (!modal) return;
        if (visible) {
            modal.classList.add("show");
        } else {
            modal.classList.remove("show");
        }
    }

    //N.B: subprotocol field causes chrome to error 1006
    var ws_hbd = new WebSocket("{{heartbeat_ws_url}}");

    ws_hbd.onopen = function () {
        // Web Socket is connected, send data using send()
        console.log("ws connect {{heartbeat_ws_url}}");
        setModalVisible(false);
        ws_hbd.send("heartbeat_web");
    };

    ws_hbd.onerror = function (event) {
        console.log(event);
    };

    ws_hbd.onmessage = function (event) {
        var state = JSON.parse(event.data);
        if (state.type == "host") {
            update_table(state.data);
        } else if (state.type == "message") {
            var msgs = document.getElementById("messages");
            var msg = state.data;
            var when = new Date(msg.ts * 1000);
            var ts_str = when.getFullYear() + '-' + pad2(when.getMonth() + 1) + '-' + pad2(when.getDate())
                + ' ' + pad2(when.getHours()) + ':' + pad2(when.getMinutes()) + ':' + pad2(when.getSeconds());
            var lvl = escapeAttr((msg.level || "INFO").toLowerCase());
            var hostAttr = escapeAttr(msg.host || '');

            var html = '<div class="log-entry log-' + lvl + '" data-level="' + lvl + '" data-host="' + hostAttr + '">';
            html += '<span class="log-ts">' + ts_str + '</span>';
            html += '<span class="log-level">' + (msg.level || "") + '</span>';
            if (msg.host) html += '<span class="log-host">' + msg.host + '</span>';
            if (msg.service) html += '<span class="log-service">' + msg.service + '</span>';
            html += '<span class="log-msg">' + msg.message + '</span>';
            html += '</div>';

            // History replay arrives newest-first, so it is appended; live
            // messages go on top.
            // NOTE(review): ordering rationale inferred from the flag name — confirm.
            msgs.insertAdjacentHTML(state.history ? "beforeend" : "afterbegin", html);
            applyLogFilters();
        }
        cnt++;
    };

    ws_hbd.onclose = function (event) {
        console.log("Connection is closed, reopening");
        setModalVisible(true);
        setTimeout(function () {
            WS_Connect();
        }, 3000);
    };
}
|
||||
WS_Connect();
|
||||
</script>
|
||||
<body>
|
||||
{% include 'nav.html' %}
|
||||
|
||||
{% include 'menu.html' %}
|
||||
|
||||
<div class="container">
|
||||
<div>
|
||||
<h1>{{ header }}</h1>
|
||||
<p class="subtitle">Real-time host monitoring and event log</p>
|
||||
</div>
|
||||
|
||||
<div class="table-section">
|
||||
<table id="ntable" class="sortable">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Name</th>
|
||||
<th style="text-align: center" title="Warning Alerts">⚠️</th>
|
||||
<th style="text-align: center" title="Critical Alerts">🔴</th>
|
||||
<th>IPv4 Addr</th>
|
||||
<th>State</th>
|
||||
<th style="text-align: right">Latency</th>
|
||||
<th style="text-align: right">Last State</th>
|
||||
<th>IPv6 Addr</th>
|
||||
<th>State</th>
|
||||
<th style="text-align: right">Latency</th>
|
||||
<th style="text-align: right">Last State</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody id="ntablebody">
|
||||
{% for host in hosts %}
|
||||
<tr class="{% if host.alert_critical_unacked > 0 or host.alert_critical_acked > 0 %}row-critical{% elif host.alert_warning_unacked > 0 or host.alert_warning_acked > 0 %}row-warning{% endif %}">
|
||||
<td data-name="{{ host.name }}"><a class="host-link" href="/plugins#{{ host.raw_name | urlencode }}">{{ host.name }}{% if not host.hbc_version or host.hbc_version != hbd_version %} 🥀{% endif %}</a></td>
|
||||
<td style="text-align: center; color: #ff9800; font-weight: bold;">
|
||||
{%- set warning_unacked = host.alert_warning_unacked -%}
|
||||
{%- set warning_acked = host.alert_warning_acked -%}
|
||||
{%- if warning_unacked > 0 or warning_acked > 0 -%}
|
||||
{{ warning_unacked }}{% if warning_acked > 0 %}/{{ warning_acked }}{% endif %}
|
||||
{%- endif -%}
|
||||
</td>
|
||||
<td style="text-align: center; color: #f44336; font-weight: bold;">
|
||||
{%- set critical_unacked = host.alert_critical_unacked -%}
|
||||
{%- set critical_acked = host.alert_critical_acked -%}
|
||||
{%- if critical_unacked > 0 or critical_acked > 0 -%}
|
||||
{{ critical_unacked }}{% if critical_acked > 0 %}/{{ critical_acked }}{% endif %}
|
||||
{%- endif -%}
|
||||
</td>
|
||||
{% for conn in host.connections %}
|
||||
<td>{{ conn.addr if conn.addr else '' }}</td>
|
||||
<td>{{ conn.state if conn.state else '' }}</td>
|
||||
<td style="text-align: right">{{ conn.latency if conn.latency else '' }}</td>
|
||||
<td style="text-align: right">{{ conn.last_state_ts if conn.last_state_ts else '' }}</td>
|
||||
{% endfor %}
|
||||
{% if host.connections|length == 0 %}
|
||||
<td></td><td></td><td></td><td></td>
|
||||
<td></td><td></td><td></td><td></td>
|
||||
{% elif host.connections|length == 1 %}
|
||||
<td></td><td></td><td></td><td></td>
|
||||
{% endif %}
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
|
||||
<div class="log-section">
|
||||
<div class="log-section-header">
|
||||
<span class="log-section-title">Log of Events</span>
|
||||
<div class="log-filter-bar">
|
||||
<input type="text" id="filter-host" placeholder="Host…" title="Filter by host" />
|
||||
<select id="filter-level" title="Filter by level">
|
||||
<option value="">All levels</option>
|
||||
<option value="info">INFO</option>
|
||||
<option value="warning">WARNING</option>
|
||||
<option value="critical">CRITICAL</option>
|
||||
<option value="recover">RECOVER</option>
|
||||
<option value="unknown">UNKNOWN</option>
|
||||
</select>
|
||||
<input type="text" id="filter-msg" placeholder="Message…" title="Filter by message text" />
|
||||
</div>
|
||||
</div>
|
||||
<div id="messages"></div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{% include 'foot.html' %}
|
||||
|
||||
<!-- Connection status modal -->
|
||||
<div id="connectionModal" class="connection-modal">
|
||||
<div class="connection-modal-content">
|
||||
<p>⚠️ Connection is closed, reopening...</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
setup();
|
||||
document.getElementById('filter-host').addEventListener('input', applyLogFilters);
|
||||
document.getElementById('filter-level').addEventListener('change', applyLogFilters);
|
||||
document.getElementById('filter-msg').addEventListener('input', applyLogFilters);
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
@@ -0,0 +1,2 @@
|
||||
<!-- <label for="drawer-toggle" id="drawer-toggle-label"></label>
|
||||
s<header>{{ header }}</header> -->
|
||||
@@ -0,0 +1,134 @@
|
||||
<div class="nav">
|
||||
<button class="nav-hamburger" id="nav-hamburger-btn" aria-label="Menu" aria-expanded="false">
|
||||
<span></span><span></span><span></span>
|
||||
</button>
|
||||
<div class="nav-links" id="nav-links">
|
||||
<a href="/live"{% if active_page == "live" %} class="active"{% endif %}>Live Dashboard</a>
|
||||
<a href="/plugins"{% if active_page == "plugins" %} class="active"{% endif %}>Host Overview</a>
|
||||
<a href="/alerts"{% if active_page == "alerts" %} class="active"{% endif %}>Alerts</a>
|
||||
{% if current_user and current_user.admin %}
|
||||
<a href="/settings"{% if active_page == "settings" %} class="active"{% endif %}>Settings</a>
|
||||
{% endif %}
|
||||
<a href="/about"{% if active_page == "about" %} class="active"{% endif %}>About</a>
|
||||
</div>
|
||||
{% if current_user and current_user.admin %}
|
||||
<button id="nav-publish-btn" class="nav-publish-btn" onclick="navPublishConfig()" style="display:none" title="Publish pending config changes to .hb.yaml">⚠ Publish Config</button>
|
||||
{% endif %}
|
||||
<div class="nav-pie" title="Host alert status">
|
||||
<canvas id="alert-pie" width="44" height="44"></canvas>
|
||||
</div>
|
||||
<div class="nav-clock" title="Click for full-screen clock">
|
||||
<canvas id="swiss-clock" width="44" height="44"></canvas>
|
||||
</div>
|
||||
{% if current_user %}
|
||||
<a href="/profile" class="nav-user{% if active_page == 'profile' %} active{% endif %}" title="{{ current_user.full_name or current_user.username }}">
|
||||
{% if current_user.avatar %}
|
||||
<img class="nav-avatar" src="{{ current_user.avatar_url }}" alt="{{ current_user.full_name or current_user.username }}">
|
||||
{% else %}
|
||||
<span class="nav-initials">{{ (current_user.full_name or current_user.username)[:1] | upper }}</span>
|
||||
{% endif %}
|
||||
<span class="nav-username">{{ current_user.full_name or current_user.username }}</span>
|
||||
</a>
|
||||
{% endif %}
|
||||
</div>
|
||||
|
||||
<!-- Full-page clock overlay (click anywhere to dismiss) -->
|
||||
<div id="clock-overlay">
|
||||
<canvas id="swiss-clock-overlay" width="400" height="400"></canvas>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
// Wire the hamburger button to open/close the nav links and keep its
// aria-expanded attribute in sync. Does nothing if either element is missing.
(function () {
    var toggleBtn = document.getElementById('nav-hamburger-btn');
    var linkBox = document.getElementById('nav-links');
    if (!toggleBtn || !linkBox) {
        return;
    }
    toggleBtn.addEventListener('click', function () {
        // classList.toggle() returns true when the class is now present.
        var isOpen = linkBox.classList.toggle('nav-open');
        toggleBtn.setAttribute('aria-expanded', String(isOpen));
    });
})();
|
||||
|
||||
/**
 * Draw the host-status pie chart on the 'alert-pie' canvas.
 *
 * Slices are drawn clockwise from 12 o'clock in the order critical (red),
 * warning (amber), ok (green); zero-valued slices are skipped. When all
 * three counts are zero a plain grey disc is drawn instead. Returns without
 * drawing if the canvas is not on the page.
 *
 * @param {number} critical number of hosts with critical alerts
 * @param {number} warning  number of hosts with warning alerts
 * @param {number} ok       number of hosts without alerts
 */
function drawAlertPie(critical, warning, ok) {
    var canvas = document.getElementById('alert-pie');
    if (!canvas) return;

    var ctx = canvas.getContext('2d');
    var size = canvas.width;
    var radius = size / 2;
    ctx.clearRect(0, 0, size, size);

    var total = critical + warning + ok;
    if (total === 0) {
        ctx.beginPath();
        ctx.arc(radius, radius, radius - 1, 0, Math.PI * 2);
        ctx.fillStyle = '#ccc';
        ctx.fill();
        return;
    }

    var values = [critical, warning, ok];
    var colors = ['#e53935', '#ffb300', '#43a047'];
    var angle = -Math.PI / 2;
    for (var i = 0; i < values.length; i++) {
        if (values[i] === 0) continue;
        var sweep = (values[i] / total) * Math.PI * 2;
        ctx.beginPath();
        ctx.moveTo(radius, radius);
        ctx.arc(radius, radius, radius - 1, angle, angle + sweep);
        ctx.closePath();
        ctx.fillStyle = colors[i];
        ctx.fill();
        angle += sweep;
    }
}
|
||||
|
||||
/**
 * Fetch the alert summary and redraw the pie chart.
 *
 * A non-OK response or any fetch/parse error is ignored silently, leaving
 * the previous drawing in place. Missing counts default to 0.
 */
function updateAlertPie() {
    function parseIfOk(response) {
        return response.ok ? response.json() : undefined;
    }

    function render(summary) {
        if (!summary) return;
        drawAlertPie(summary.critical || 0, summary.warning || 0, summary.ok || 0);
    }

    // Best-effort refresh: failures are deliberately swallowed.
    function ignoreError() {}

    fetch('/api/0/alert_summary').then(parseIfOk).then(render).catch(ignoreError);
}
|
||||
|
||||
// Once the DOM is parsed: draw the alert pie and refresh it every 30 seconds,
// then show/hide the "Publish Config" button and re-check it whenever a
// 'storage' event fires on the window.
document.addEventListener('DOMContentLoaded', function onNavReady() {
    var PIE_REFRESH_MS = 30000;

    updateAlertPie();
    setInterval(updateAlertPie, PIE_REFRESH_MS);

    navCheckPendingConfig();
    window.addEventListener('storage', navCheckPendingConfig);
});
|
||||
|
||||
/**
 * Show the "Publish Config" nav button only while localStorage holds a
 * non-empty 'hbd_pending_config' entry. Does nothing when the button is not
 * on the page.
 */
function navCheckPendingConfig() {
    var publishBtn = document.getElementById('nav-publish-btn');
    if (publishBtn) {
        var hasPending = Boolean(localStorage.getItem('hbd_pending_config'));
        // An empty display value falls back to the stylesheet's default.
        publishBtn.style.display = hasPending ? '' : 'none';
    }
}
|
||||
|
||||
/**
 * POST the staged config held in localStorage ('hbd_pending_config') to
 * /api/0/config.
 *
 * Returns early when nothing is staged or the staged text is not valid JSON.
 * While the request is in flight the publish button (if present) is disabled
 * and labelled "Saving…". On success the staged entry is removed and the page
 * reloads; on an error response or a thrown error an alert is shown and the
 * button is restored.
 *
 * @returns {Promise<void>}
 */
async function navPublishConfig() {
    var publishBtn = document.getElementById('nav-publish-btn');
    var payload = localStorage.getItem('hbd_pending_config');
    if (!payload) return;

    // Only validates that the staged text parses; the raw text is what gets sent.
    try {
        JSON.parse(payload);
    } catch (parseError) {
        return;
    }

    function setBusy(busy) {
        if (!publishBtn) return;
        publishBtn.disabled = busy;
        publishBtn.textContent = busy ? 'Saving…' : '⚠ Publish Config';
    }

    setBusy(true);
    try {
        var response = await fetch('/api/0/config', {
            method: 'POST',
            headers: {'Content-Type': 'application/json'},
            body: payload
        });
        if (!response.ok) {
            // The error body may not be JSON; fall back to the status text.
            var failure = await response.json().catch(function () { return {}; });
            alert('Error: ' + (failure.error || response.statusText));
            setBusy(false);
            return;
        }
        localStorage.removeItem('hbd_pending_config');
        window.location.reload();
    } catch (networkError) {
        alert('Network error: ' + networkError.message);
        setBusy(false);
    }
}
|
||||
</script>
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,842 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
{% include 'head.html' %}
|
||||
|
||||
<style>
|
||||
html, body { overflow: visible; }
|
||||
|
||||
.container {
|
||||
max-width: 900px;
|
||||
margin: 0 auto;
|
||||
}
|
||||
|
||||
h1 {
|
||||
color: #333;
|
||||
margin-bottom: 4px;
|
||||
font-size: 1.5em;
|
||||
}
|
||||
|
||||
.subtitle {
|
||||
color: #666;
|
||||
margin-bottom: 24px;
|
||||
font-size: 0.9em;
|
||||
}
|
||||
|
||||
/* ---- Profile card ---- */
|
||||
.profile-card {
|
||||
background: #fff;
|
||||
border-radius: 8px;
|
||||
box-shadow: 0 1px 6px rgba(0,0,0,0.1);
|
||||
padding: 28px 32px;
|
||||
margin-bottom: 24px;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 28px;
|
||||
}
|
||||
|
||||
.avatar-large {
|
||||
width: 80px;
|
||||
height: 80px;
|
||||
border-radius: 50%;
|
||||
object-fit: cover;
|
||||
flex-shrink: 0;
|
||||
box-shadow: 0 2px 8px rgba(0,0,0,0.15);
|
||||
}
|
||||
|
||||
.avatar-initials-large {
|
||||
width: 80px;
|
||||
height: 80px;
|
||||
border-radius: 50%;
|
||||
background: #0066cc;
|
||||
color: #fff;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
font-size: 2em;
|
||||
font-weight: 700;
|
||||
flex-shrink: 0;
|
||||
box-shadow: 0 2px 8px rgba(0,0,0,0.15);
|
||||
}
|
||||
|
||||
.profile-info { flex: 1; }
|
||||
|
||||
.profile-name {
|
||||
font-size: 1.4em;
|
||||
font-weight: 700;
|
||||
color: #222;
|
||||
margin-bottom: 4px;
|
||||
}
|
||||
|
||||
.profile-username {
|
||||
font-size: 0.9em;
|
||||
color: #666;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
|
||||
.badge {
|
||||
display: inline-block;
|
||||
padding: 2px 10px;
|
||||
border-radius: 12px;
|
||||
font-size: 0.78em;
|
||||
font-weight: 600;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.4px;
|
||||
}
|
||||
|
||||
.badge-admin { background: #e8f0fe; color: #1a73e8; }
|
||||
.badge-user { background: #f1f3f4; color: #555; }
|
||||
|
||||
.profile-logout {
|
||||
margin-top: 14px;
|
||||
}
|
||||
|
||||
.btn-logout {
|
||||
display: inline-block;
|
||||
padding: 6px 16px;
|
||||
border-radius: 4px;
|
||||
background: #f44336;
|
||||
color: #fff;
|
||||
font-size: 1.00em;
|
||||
font-weight: 500;
|
||||
text-decoration: none;
|
||||
transition: background 0.15s;
|
||||
}
|
||||
.btn-logout:hover { background: #d32f2f; text-decoration: none; }
|
||||
|
||||
/* ---- Section cards ---- */
|
||||
.section {
|
||||
background: #fff;
|
||||
border-radius: 8px;
|
||||
box-shadow: 0 1px 6px rgba(0,0,0,0.1);
|
||||
padding: 20px 24px;
|
||||
margin-bottom: 20px;
|
||||
}
|
||||
|
||||
.section h2 {
|
||||
font-size: 1em;
|
||||
font-weight: 700;
|
||||
color: #333;
|
||||
margin: 0 0 16px;
|
||||
padding-bottom: 10px;
|
||||
border-bottom: 1px solid #eee;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.5px;
|
||||
}
|
||||
|
||||
/* ---- Settings rows ---- */
|
||||
.settings-row {
|
||||
display: flex;
|
||||
align-items: baseline;
|
||||
padding: 8px 0;
|
||||
border-bottom: 1px solid #f5f5f5;
|
||||
font-size: 0.9em;
|
||||
}
|
||||
.settings-row:last-child { border-bottom: none; }
|
||||
|
||||
.settings-label {
|
||||
width: 180px;
|
||||
flex-shrink: 0;
|
||||
color: #666;
|
||||
font-size: 0.88em;
|
||||
}
|
||||
|
||||
.settings-value { color: #222; }
|
||||
|
||||
.settings-empty { color: #aaa; font-style: italic; }
|
||||
|
||||
/* ---- Host lists ---- */
|
||||
.host-grid {
|
||||
display: flex;
|
||||
flex-wrap: wrap;
|
||||
gap: 8px;
|
||||
}
|
||||
|
||||
.host-chip {
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
gap: 6px;
|
||||
padding: 4px 12px;
|
||||
border-radius: 16px;
|
||||
font-size: 1.00em;
|
||||
font-weight: 500;
|
||||
text-decoration: none;
|
||||
}
|
||||
|
||||
.host-chip.owner { background: #e8f5e9; color: #2e7d32; }
|
||||
.host-chip.manager { background: #e3f2fd; color: #1565c0; }
|
||||
.host-chip.monitor { background: #f3e5f5; color: #6a1b9a; }
|
||||
|
||||
.host-chip-dot {
|
||||
width: 7px; height: 7px; border-radius: 50%;
|
||||
}
|
||||
.owner .host-chip-dot { background: #2e7d32; }
|
||||
.manager .host-chip-dot { background: #1565c0; }
|
||||
.monitor .host-chip-dot { background: #6a1b9a; }
|
||||
|
||||
.no-hosts {
|
||||
color: #aaa;
|
||||
font-size: 0.9em;
|
||||
font-style: italic;
|
||||
}
|
||||
|
||||
/* ---- Notification channels ---- */
|
||||
.channel-row {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 10px;
|
||||
padding: 6px 0;
|
||||
border-bottom: 1px solid #f5f5f5;
|
||||
font-size: 0.9em;
|
||||
}
|
||||
.channel-row:last-child { border-bottom: none; }
|
||||
|
||||
.channel-type {
|
||||
display: inline-block;
|
||||
padding: 2px 8px;
|
||||
border-radius: 10px;
|
||||
font-size: 0.78em;
|
||||
font-weight: 600;
|
||||
text-transform: uppercase;
|
||||
background: #f1f3f4;
|
||||
color: #555;
|
||||
min-width: 70px;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.channel-name { color: #333; }
|
||||
|
||||
.edit-section { margin-top: 20px; }
|
||||
.edit-section h4 { font-size: .88em; font-weight: 600; color: #333; margin: 0 0 10px; text-transform: uppercase; letter-spacing: .04em; border-bottom: 1px solid #eee; padding-bottom: 6px; }
|
||||
.edit-field { margin-bottom: 10px; }
|
||||
.edit-field label { display: block; font-size: .82em; color: #666; margin-bottom: 3px; }
|
||||
.edit-input { width: 100%; border: 1px solid #ccc; border-radius: 4px; padding: 5px 8px; font-size: .88em; box-sizing: border-box; }
|
||||
.edit-input:focus { border-color: #0066cc; outline: none; }
|
||||
.status-msg { font-size: .82em; margin-left: 8px; }
|
||||
.save-row { display: flex; align-items: center; margin-top: 8px; }
|
||||
.btn-save { background: #0066cc; color: #fff; border: none; border-radius: 4px; padding: 5px 14px; font-size: .85em; cursor: pointer; }
|
||||
.btn-save:hover { background: #0055aa; }
|
||||
/* ---- Channel chip picker ---- */
|
||||
.ch-picker { }
|
||||
.ch-picker-label { font-size: .8em; font-weight: 600; color: #888; text-transform: uppercase; letter-spacing: .04em; margin-bottom: 6px; }
|
||||
.ch-chips { display: flex; flex-wrap: wrap; gap: 6px; min-height: 32px; margin-bottom: 10px; }
|
||||
.ch-chip {
|
||||
display: inline-flex; align-items: center; gap: 5px;
|
||||
padding: 4px 10px; border-radius: 14px; font-size: .85em; font-weight: 500; cursor: pointer;
|
||||
border: none; font-family: inherit;
|
||||
}
|
||||
.ch-chip.selected { background: #e3f2fd; color: #1565c0; }
|
||||
.ch-chip.selected:hover { background: #bbdefb; }
|
||||
.ch-chip.available { background: #f1f3f4; color: #555; }
|
||||
.ch-chip.available:hover { background: #e8eaf6; color: #283593; }
|
||||
.ch-chip-x { font-size: .9em; line-height: 1; color: inherit; opacity: .7; }
|
||||
|
||||
/* ---- My Channels card list ---- */
|
||||
.my-ch-card {
|
||||
border: 1px solid #e8eaf6; border-radius: 6px; margin-bottom: 8px; overflow: hidden;
|
||||
}
|
||||
.my-ch-header {
|
||||
display: flex; align-items: center; gap: 8px; padding: 8px 12px;
|
||||
background: #f8f9ff; border-bottom: 1px solid #e8eaf6;
|
||||
}
|
||||
.my-ch-name { font-weight: 600; font-size: .9em; color: #222; }
|
||||
.my-ch-type { padding: 2px 7px; border-radius: 8px; font-size: .72em; font-weight: 600; background: #e8eaf6; color: #3949ab; }
|
||||
.my-ch-private { padding: 2px 7px; border-radius: 8px; font-size: .72em; font-weight: 600; background: #fce4ec; color: #c62828; }
|
||||
.my-ch-actions { margin-left: auto; display: flex; gap: 5px; }
|
||||
.btn-sm-edit { background: #888; color: #fff; border: none; border-radius: 4px; padding: 2px 8px; font-size: .78em; cursor: pointer; }
|
||||
.btn-sm-edit:hover { background: #666; }
|
||||
.btn-sm-del { background: transparent; color: #c62828; border: 1px solid #e0e0e0; border-radius: 4px; padding: 2px 7px; font-size: .78em; cursor: pointer; }
|
||||
.btn-sm-del:hover { background: #fce4ec; }
|
||||
|
||||
/* ---- Theme picker ---- */
|
||||
.theme-btns { display: flex; gap: 6px; }
|
||||
.theme-btn {
|
||||
padding: 5px 14px;
|
||||
border: 1px solid var(--border, #e0e0e0);
|
||||
border-radius: 4px;
|
||||
background: var(--surface-3, #f5f5f5);
|
||||
color: var(--text-sec, #666);
|
||||
cursor: pointer;
|
||||
font-size: .88em;
|
||||
font-family: inherit;
|
||||
}
|
||||
.theme-btn:hover { border-color: var(--link, #0066cc); color: var(--link, #0066cc); }
|
||||
.theme-btn.active { background: var(--link, #0066cc); color: #fff; border-color: var(--link, #0066cc); }
|
||||
|
||||
/* ── Dark mode ── */
|
||||
html[data-theme="dark"] h1 { color: var(--text); }
|
||||
html[data-theme="dark"] .subtitle { color: var(--text-sec); }
|
||||
html[data-theme="dark"] .profile-card { background: var(--surface); box-shadow: 0 1px 6px var(--shadow); }
|
||||
html[data-theme="dark"] .profile-name { color: var(--text); }
|
||||
html[data-theme="dark"] .profile-username { color: var(--text-sec); }
|
||||
html[data-theme="dark"] .badge-admin { background: #1a3255; color: #7aa8f0; }
|
||||
html[data-theme="dark"] .badge-user { background: var(--surface-3); color: var(--text-sec); }
|
||||
html[data-theme="dark"] .section { background: var(--surface); box-shadow: 0 1px 6px var(--shadow); }
|
||||
html[data-theme="dark"] .section h2 { color: var(--text); border-bottom-color: var(--border); }
|
||||
html[data-theme="dark"] .settings-row { border-bottom-color: var(--border-4); }
|
||||
html[data-theme="dark"] .settings-label { color: var(--text-sec); }
|
||||
html[data-theme="dark"] .settings-value { color: var(--text); }
|
||||
html[data-theme="dark"] .settings-empty { color: var(--text-dim); }
|
||||
html[data-theme="dark"] .edit-section h4 { color: var(--text); border-bottom-color: var(--border); }
|
||||
html[data-theme="dark"] .edit-field label { color: var(--text-sec); }
|
||||
html[data-theme="dark"] .edit-input { background: var(--input-bg); border-color: var(--input-border); color: var(--text); }
|
||||
html[data-theme="dark"] .channel-row { border-bottom-color: var(--border-4); }
|
||||
html[data-theme="dark"] .channel-name { color: var(--text); }
|
||||
html[data-theme="dark"] .ch-picker-label { color: var(--text-sec); }
|
||||
html[data-theme="dark"] .ch-chip.selected { background: #1a3255; color: #60a5fa; }
|
||||
html[data-theme="dark"] .ch-chip.available { background: var(--surface-3); color: var(--text-sec); }
|
||||
html[data-theme="dark"] .ch-chip.available:hover { background: var(--border); color: var(--link); }
|
||||
html[data-theme="dark"] .my-ch-card { border-color: var(--border); }
|
||||
html[data-theme="dark"] .my-ch-header { background: var(--surface-2); border-bottom-color: var(--border); }
|
||||
html[data-theme="dark"] .my-ch-name { color: var(--text); }
|
||||
html[data-theme="dark"] .host-chip.owner { background: #0d2e17; color: #66bb6a; }
|
||||
html[data-theme="dark"] .host-chip.manager { background: #0d1f40; color: #64b5f6; }
|
||||
html[data-theme="dark"] .host-chip.monitor { background: #1e0d30; color: #ba68c8; }
|
||||
html[data-theme="dark"] .no-hosts { color: var(--text-dim); }
|
||||
html[data-theme="dark"] .ch-modal-box { background: var(--surface); color: var(--text); }
|
||||
html[data-theme="dark"] .ch-modal-box h3 { color: var(--text); }
|
||||
html[data-theme="dark"] .ch-form-row label { color: var(--text-sec); }
|
||||
html[data-theme="dark"] .ch-form-divider { color: var(--text-muted); border-top-color: var(--border); }
|
||||
|
||||
/* ---- Channel modal (for My Channels CRUD) ---- */
|
||||
.ch-modal-overlay {
|
||||
position: fixed; inset: 0; background: rgba(0,0,0,.4);
|
||||
display: flex; align-items: center; justify-content: center; z-index: 1001;
|
||||
}
|
||||
.ch-modal-box {
|
||||
background: #fff; border-radius: 8px; padding: 24px;
|
||||
min-width: 360px; max-width: 520px; width: 95%;
|
||||
box-shadow: 0 8px 32px rgba(0,0,0,.2);
|
||||
}
|
||||
.ch-modal-box h3 { margin: 0 0 16px; font-size: 1em; }
|
||||
.ch-form-row { margin-bottom: 12px; }
|
||||
.ch-form-row label { display: block; font-size: .83em; font-weight: 600; color: #555; margin-bottom: 3px; }
|
||||
.ch-form-row input[type=text], .ch-form-row input[type=password], .ch-form-row select {
|
||||
width: 100%; border: 1px solid #ccc; border-radius: 4px; padding: 5px 8px;
|
||||
font-size: .88em; box-sizing: border-box; font-family: inherit;
|
||||
}
|
||||
.ch-form-row input:focus, .ch-form-row select:focus { border-color: #0066cc; outline: none; }
|
||||
.ch-form-divider { font-size: .78em; font-weight: 700; text-transform: uppercase; letter-spacing: .05em; color: #888; margin: 14px 0 8px; border-top: 1px solid #eee; padding-top: 10px; }
|
||||
.ch-modal-footer { display: flex; justify-content: flex-end; gap: 8px; margin-top: 18px; }
|
||||
.ch-modal-status { font-size: .83em; margin-top: 8px; }
|
||||
</style>
|
||||
|
||||
<body>
|
||||
{% include 'nav.html' %}
|
||||
|
||||
<div class="container">
|
||||
<h1>{{ header }}</h1>
|
||||
<p class="subtitle">Your account settings and host access</p>
|
||||
|
||||
<!-- Profile card -->
|
||||
<div class="profile-card">
|
||||
{% if current_user and current_user.avatar %}
|
||||
<img class="avatar-large" src="{{ current_user.avatar_url }}" alt="">
|
||||
{% else %}
|
||||
<div class="avatar-initials-large">
|
||||
{{ ((current_user.full_name if current_user else '') or (current_user.username if current_user else '?'))[:1] | upper }}
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
<div class="profile-info">
|
||||
<div class="profile-name">{{ current_user.full_name if current_user and current_user.full_name else (current_user.username if current_user else '—') }}</div>
|
||||
<div class="profile-username">@{{ current_user.username if current_user else '—' }}</div>
|
||||
{% if current_user and current_user.admin %}
|
||||
<span class="badge badge-admin">Admin</span>
|
||||
{% else %}
|
||||
<span class="badge badge-user">User</span>
|
||||
{% endif %}
|
||||
<div class="profile-logout">
|
||||
<a href="/logout" class="btn-logout">Sign out</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Account settings -->
|
||||
<div class="section">
|
||||
<h2>Account</h2>
|
||||
<div class="settings-row">
|
||||
<span class="settings-label">Username</span>
|
||||
<span class="settings-value">{{ current_user.username if current_user else '—' }}</span>
|
||||
</div>
|
||||
<div class="settings-row">
|
||||
<span class="settings-label">Full name</span>
|
||||
{% if current_user and current_user.full_name %}
|
||||
<span class="settings-value">{{ current_user.full_name }}</span>
|
||||
{% else %}
|
||||
<span class="settings-empty">Not set</span>
|
||||
{% endif %}
|
||||
</div>
|
||||
<div class="settings-row">
|
||||
<span class="settings-label">Role</span>
|
||||
<span class="settings-value">{{ 'Administrator' if current_user and current_user.admin else 'User' }}</span>
|
||||
</div>
|
||||
<div class="settings-row">
|
||||
<span class="settings-label">Avatar</span>
|
||||
{% if current_user and current_user.avatar %}
|
||||
<span class="settings-value" style="word-break:break-all;">{{ current_user.avatar }}</span>
|
||||
{% else %}
|
||||
<span class="settings-empty">Not set (initials used)</span>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{% if current_user %}
|
||||
<!-- ---- Editable identity ---- -->
|
||||
<div class="section edit-section">
|
||||
<h4>Identity</h4>
|
||||
<div class="edit-field">
|
||||
<label for="profile-fullname">Display name</label>
|
||||
{# `or ''` keeps an unset full_name from rendering as the literal text "None". #}
<input id="profile-fullname" class="edit-input" type="text" value="{{ (current_user.full_name or '') | e }}" placeholder="Full name">
|
||||
</div>
|
||||
<div class="edit-field">
|
||||
<label for="profile-avatar">Avatar URL or path</label>
|
||||
{# `or ''` keeps an unset avatar from rendering as the literal text "None". #}
<input id="profile-avatar" class="edit-input" type="text" value="{{ (current_user.avatar or '') | e }}" placeholder="/path/to/avatar.png or https://…">
|
||||
</div>
|
||||
<div class="save-row">
|
||||
<button class="btn-save" onclick="saveIdentity()">Save</button>
|
||||
<span id="identity-status" class="status-msg"></span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- ---- Change password ---- -->
|
||||
<div class="section edit-section">
|
||||
<h4>Change password</h4>
|
||||
<div class="edit-field">
|
||||
<label for="profile-current-pw">Current password</label>
|
||||
<input id="profile-current-pw" class="edit-input" type="password" autocomplete="current-password">
|
||||
</div>
|
||||
<div class="edit-field">
|
||||
<label for="profile-new-pw">New password</label>
|
||||
<input id="profile-new-pw" class="edit-input" type="password" autocomplete="new-password">
|
||||
</div>
|
||||
<div class="save-row">
|
||||
<button class="btn-save" onclick="changePassword()">Change password</button>
|
||||
<span id="password-status" class="status-msg"></span>
|
||||
</div>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
<!-- Notification channels — chip picker -->
|
||||
<div class="section">
|
||||
<h2>Notification Channels</h2>
|
||||
{% if current_user %}
|
||||
<p style="font-size:.82em;color:#888;margin:0 0 12px">Click a channel to add or remove it from your alert list.</p>
|
||||
{% if all_channels %}
|
||||
<div class="ch-picker">
|
||||
<div class="ch-picker-label">Selected</div>
|
||||
<div id="selected-chips" class="ch-chips">
|
||||
{% for ch in all_channels %}
|
||||
{% if ch.name in (current_user.notification_channels or []) %}
|
||||
<button class="ch-chip selected" data-ch="{{ ch.name | e }}" onclick="toggleChip(this)">
|
||||
{{ ch.name | e }} <span class="ch-chip-x">×</span>
|
||||
</button>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% set selected_set = current_user.notification_channels or [] %}
|
||||
{% set has_selected = selected_set | length > 0 %}
|
||||
{% if not has_selected %}
|
||||
<span style="font-size:.83em;color:#bbb;font-style:italic;align-self:center">None selected</span>
|
||||
{% endif %}
|
||||
</div>
|
||||
<div class="ch-picker-label">Available</div>
|
||||
<div id="available-chips" class="ch-chips">
|
||||
{% for ch in all_channels %}
|
||||
{% if ch.name not in (current_user.notification_channels or []) %}
|
||||
<button class="ch-chip available" data-ch="{{ ch.name | e }}" onclick="toggleChip(this)">
|
||||
+ {{ ch.name | e }}
|
||||
</button>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
{% else %}
|
||||
<p style="font-size:.83em;color:#bbb;font-style:italic">No notification channels available. You can create your own below.</p>
|
||||
{% endif %}
|
||||
<div class="save-row">
|
||||
<button class="btn-save" onclick="saveChannels()">Save channels</button>
|
||||
<span id="channels-status" class="status-msg"></span>
|
||||
</div>
|
||||
{% else %}
|
||||
<span class="no-hosts">Log in to manage notification channels.</span>
|
||||
{% endif %}
|
||||
</div>
|
||||
|
||||
<!-- My Channels — create/edit/delete own channels -->
|
||||
{% if current_user %}
|
||||
<div class="section">
|
||||
<h2>My Channels</h2>
|
||||
<p style="font-size:.82em;color:#888;margin:0 0 12px">Channels you own. Public channels are available to all users; private channels are visible only to you.</p>
|
||||
<div id="my-channels-list">
|
||||
{% set my_channels = all_channels | selectattr('owner', 'equalto', current_user.username) | list %}
|
||||
{% for ch in my_channels %}
|
||||
<div class="my-ch-card" id="mychcard-{{ ch.name | e }}">
|
||||
<div class="my-ch-header">
|
||||
<span class="my-ch-name">{{ ch.name | e }}</span>
|
||||
<span class="my-ch-type">{{ ch.type | e }}</span>
|
||||
{% if ch.private %}<span class="my-ch-private">private</span>{% endif %}
|
||||
<span class="my-ch-actions">
|
||||
{# The channel name travels in a data attribute, not inside the inline script:
   HTML-escaping does not protect a JS string literal in an onclick attribute,
   because the browser decodes entities before running the script. #}
<button class="btn-sm-edit" data-ch="{{ ch.name | e }}" onclick="openMyChModal(this.dataset.ch)">Edit</button>
<button class="btn-sm-del" data-ch="{{ ch.name | e }}" onclick="deleteMyChannel(this.dataset.ch)">✕</button>
|
||||
</span>
|
||||
</div>
|
||||
</div>
|
||||
{% endfor %}
|
||||
{% if not my_channels %}
|
||||
<p id="my-channels-empty" style="font-size:.83em;color:#bbb;font-style:italic">No channels yet.</p>
|
||||
{% endif %}
|
||||
</div>
|
||||
<div class="save-row" style="margin-top:8px">
|
||||
<button class="btn-save" onclick="openMyChModal()">+ New channel</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- My Channels modal -->
|
||||
<div id="my-ch-modal" class="ch-modal-overlay" style="display:none" onclick="if(event.target===this)closeMyChModal()">
|
||||
<div class="ch-modal-box">
|
||||
<h3 id="my-ch-modal-title">New Channel</h3>
|
||||
<div class="ch-form-row">
|
||||
<label>Channel name</label>
|
||||
<input type="text" id="my-ch-name" placeholder="e.g. my_pushover" autocomplete="off">
|
||||
</div>
|
||||
<div class="ch-form-row">
|
||||
<label>Type</label>
|
||||
<select id="my-ch-type" onchange="onMyChTypeChange()">
|
||||
<option value="">— select —</option>
|
||||
</select>
|
||||
</div>
|
||||
<div id="my-ch-type-fields"></div>
|
||||
<div class="ch-form-divider">Options</div>
|
||||
<div class="ch-form-row">
|
||||
<label>Minimum alert level</label>
|
||||
<select id="my-ch-min-level">
|
||||
<option value="WARNING">WARNING (and above)</option>
|
||||
<option value="CRITICAL">CRITICAL only</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="ch-form-row">
|
||||
<label style="display:flex;align-items:center;gap:6px;cursor:pointer">
|
||||
<input type="checkbox" id="my-ch-private"> Private — visible only to you
|
||||
</label>
|
||||
</div>
|
||||
<div id="my-ch-modal-status" class="ch-modal-status"></div>
|
||||
<div class="ch-modal-footer">
|
||||
<button class="btn-save" style="background:#888" onclick="closeMyChModal()">Cancel</button>
|
||||
<button class="btn-save" onclick="saveMyChannel()">Save</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
<!-- Appearance -->
|
||||
<div class="section">
|
||||
<h2>Appearance</h2>
|
||||
<div class="settings-row">
|
||||
<span class="settings-label">Theme</span>
|
||||
<div class="theme-btns">
|
||||
<button class="theme-btn" data-theme-val="auto" onclick="setTheme('auto')">Auto</button>
|
||||
<button class="theme-btn" data-theme-val="light" onclick="setTheme('light')">Light</button>
|
||||
<button class="theme-btn" data-theme-val="dark" onclick="setTheme('dark')">Dark</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Host access -->
|
||||
<div class="section">
|
||||
<h2>Host Access</h2>
|
||||
|
||||
<div class="settings-row" style="align-items: flex-start; padding-bottom: 14px;">
|
||||
<span class="settings-label" style="padding-top: 2px;">Owner</span>
|
||||
<div class="host-grid">
|
||||
{% if owned_hosts %}
|
||||
{% for h in owned_hosts %}
|
||||
<span class="host-chip owner"><span class="host-chip-dot"></span>{{ h }}</span>
|
||||
{% endfor %}
|
||||
{% else %}
|
||||
<span class="no-hosts">None</span>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="settings-row" style="align-items: flex-start; padding-bottom: 14px;">
|
||||
<span class="settings-label" style="padding-top: 2px;">Manager</span>
|
||||
<div class="host-grid">
|
||||
{% if managed_hosts %}
|
||||
{% for h in managed_hosts %}
|
||||
<span class="host-chip manager"><span class="host-chip-dot"></span>{{ h }}</span>
|
||||
{% endfor %}
|
||||
{% else %}
|
||||
<span class="no-hosts">None</span>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="settings-row" style="align-items: flex-start; padding-bottom: 4px;">
|
||||
<span class="settings-label" style="padding-top: 2px;">Monitor</span>
|
||||
<div class="host-grid">
|
||||
{% if monitored_hosts %}
|
||||
{% for h in monitored_hosts %}
|
||||
<span class="host-chip monitor"><span class="host-chip-dot"></span>{{ h }}</span>
|
||||
{% endfor %}
|
||||
{% else %}
|
||||
<span class="no-hosts">None</span>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<script>
|
||||
// ---- Theme ----
|
||||
/**
 * Apply a theme preference to the document root.
 *
 * 'dark' always selects dark mode; 'auto' selects it when the
 * prefers-color-scheme: dark media query matches; any other value selects
 * light mode. Dark mode is data-theme="dark" on <html>; light mode is the
 * absence of that attribute.
 */
function applyTheme(pref) {
    var useDark;
    if (pref === 'dark') {
        useDark = true;
    } else if (pref === 'auto') {
        useDark = window.matchMedia('(prefers-color-scheme: dark)').matches;
    } else {
        useDark = false;
    }

    if (useDark) {
        document.documentElement.setAttribute('data-theme', 'dark');
    } else {
        document.documentElement.removeAttribute('data-theme');
    }
}
|
||||
/**
 * Store the chosen theme under 'hbd_theme', apply it, and mark the
 * matching .theme-btn as active.
 */
function setTheme(pref) {
    try {
        localStorage.setItem('hbd_theme', pref);
    } catch (ignored) {
        // A storage failure is ignored; the theme is still applied below.
    }

    applyTheme(pref);

    var buttons = document.querySelectorAll('.theme-btn');
    for (var i = 0; i < buttons.length; i++) {
        var isCurrent = buttons[i].dataset.themeVal === pref;
        buttons[i].classList.toggle('active', isCurrent);
    }
}
|
||||
// On load: highlight the theme button that matches the stored preference
// ('auto' when nothing is stored or storage cannot be read).
(function () {
    var stored = null;
    try {
        stored = localStorage.getItem('hbd_theme');
    } catch (ignored) {
        // Fall through with no stored preference.
    }
    var pref = stored || 'auto';

    var buttons = document.querySelectorAll('.theme-btn');
    for (var i = 0; i < buttons.length; i++) {
        buttons[i].classList.toggle('active', buttons[i].dataset.themeVal === pref);
    }
})();
|
||||
|
||||
// ---- Identity ----
|
||||
/**
 * Save the display name and avatar fields via PUT /api/0/users/me and
 * report the outcome in #identity-status.
 *
 * A rejected fetch (network failure) is reported in the same status area
 * rather than surfacing as an unhandled promise rejection.
 */
async function saveIdentity() {
    const full_name = document.getElementById('profile-fullname').value;
    const avatar = document.getElementById('profile-avatar').value;
    try {
        const resp = await fetch('/api/0/users/me', {
            method: 'PUT',
            headers: {'Content-Type': 'application/json'},
            body: JSON.stringify({full_name, avatar}),
        });
        if (resp.ok) {
            showStatus('identity-status', 'Saved', '#2e7d32');
            return;
        }
        // The error body may be missing or not JSON; fall back to a generic message.
        const err = await resp.json().catch(() => ({}));
        showStatus('identity-status', (err && err.error) || 'Error saving', '#c62828');
    } catch (e) {
        showStatus('identity-status', 'Network error: ' + e.message, '#c62828');
    }
}
|
||||
|
||||
// ---- Password ----
|
||||
/**
 * Change the password via PUT /api/0/users/me, sending
 * {password: {current, new}}, and report the outcome in #password-status.
 *
 * Both fields must be non-empty. On success both inputs are cleared.
 * A rejected fetch (network failure) is reported in the status area rather
 * than surfacing as an unhandled promise rejection.
 */
async function changePassword() {
    const currentInput = document.getElementById('profile-current-pw');
    const newInput = document.getElementById('profile-new-pw');
    const current = currentInput.value;
    const newpw = newInput.value;
    if (!current || !newpw) {
        showStatus('password-status', 'Both fields are required', '#c62828');
        return;
    }
    try {
        const resp = await fetch('/api/0/users/me', {
            method: 'PUT',
            headers: {'Content-Type': 'application/json'},
            body: JSON.stringify({password: {current, new: newpw}}),
        });
        if (resp.ok) {
            currentInput.value = '';
            newInput.value = '';
            showStatus('password-status', 'Password changed', '#2e7d32');
            return;
        }
        // The error body may be missing or not JSON; fall back to a generic message.
        const err = await resp.json().catch(() => ({}));
        showStatus('password-status', (err && err.error) || 'Error', '#c62828');
    } catch (e) {
        showStatus('password-status', 'Network error: ' + e.message, '#c62828');
    }
}
|
||||
|
||||
// ---- Channel chip picker ----
|
||||
/**
 * Move a channel chip between the "Selected" and "Available" lists.
 *
 * @param {HTMLElement} btn  The chip button; its channel name is read from
 *                           data-ch.
 *
 * The chip's label is rebuilt with textContent and DOM nodes, so the
 * channel name is never parsed as HTML. The "None selected" placeholder is
 * shown when the selected list has no chips and hidden otherwise; it is
 * created here if the page was rendered without one.
 */
function toggleChip(btn) {
    const name = btn.dataset.ch;
    const selectedBox = document.getElementById('selected-chips');
    const availableBox = document.getElementById('available-chips');
    const wasSelected = btn.classList.contains('selected');

    btn.classList.toggle('selected', !wasSelected);
    btn.classList.toggle('available', wasSelected);

    if (wasSelected) {
        // Selected -> available: "+ name".
        btn.textContent = '+ ' + name;
        availableBox.appendChild(btn);
    } else {
        // Available -> selected: "name ×".
        btn.textContent = name + ' ';
        const cross = document.createElement('span');
        cross.className = 'ch-chip-x';
        cross.textContent = '×';
        btn.appendChild(cross);
        selectedBox.appendChild(btn);
    }

    // The placeholder is the only span in this container carrying an inline
    // style; the × spans inside chips have none.
    let placeholder = selectedBox.querySelector('span[style]');
    const hasChips = selectedBox.querySelector('.ch-chip.selected') !== null;
    if (!placeholder && !hasChips) {
        placeholder = document.createElement('span');
        placeholder.style.cssText = 'font-size:.83em;color:#bbb;font-style:italic;align-self:center';
        placeholder.textContent = 'None selected';
        selectedBox.appendChild(placeholder);
    }
    if (placeholder) placeholder.style.display = hasChips ? 'none' : '';
}
|
||||
|
||||
/**
 * Save the names of the chips currently in the "Selected" list as
 * `notification_channels` via PUT /api/0/users/me, and report the outcome
 * in #channels-status.
 *
 * A rejected fetch (network failure) is reported in the status area rather
 * than surfacing as an unhandled promise rejection.
 */
async function saveChannels() {
    const chips = document.querySelectorAll('#selected-chips .ch-chip.selected');
    const notification_channels = Array.from(chips, b => b.dataset.ch);
    try {
        const resp = await fetch('/api/0/users/me', {
            method: 'PUT',
            headers: {'Content-Type': 'application/json'},
            body: JSON.stringify({notification_channels}),
        });
        if (resp.ok) {
            showStatus('channels-status', 'Saved', '#2e7d32');
            return;
        }
        // The error body may be missing or not JSON; fall back to a generic message.
        const err = await resp.json().catch(() => ({}));
        showStatus('channels-status', (err && err.error) || 'Error saving', '#c62828');
    } catch (e) {
        showStatus('channels-status', 'Network error: ' + e.message, '#c62828');
    }
}
|
||||
|
||||
// ---- My Channels CRUD ----
|
||||
// Shared state for the My Channels modal:
//   _myChSchemas  - channel-type schemas keyed by type, filled by _loadMyChSchemas()
//   _myChEditName - name of the channel being edited, or null when creating one
let _myChSchemas = {},
    _myChEditName = null;
|
||||
|
||||
/**
 * Fetch the channel-type schemas and add one <option> per type to the
 * #my-ch-type select.
 *
 * The response is used only when the request succeeded and the body is an
 * object, so an error payload never ends up listed as a channel type.
 * Failures are logged and otherwise ignored; the select then keeps only
 * its options from the markup.
 */
async function _loadMyChSchemas() {
    try {
        const r = await fetch('/api/0/notification_channel_types');
        if (!r.ok) throw new Error('HTTP ' + r.status);
        const schemas = await r.json();
        if (!schemas || typeof schemas !== 'object') {
            throw new Error('unexpected response body');
        }
        _myChSchemas = schemas;

        const sel = document.getElementById('my-ch-type');
        if (!sel) return;
        Object.entries(_myChSchemas).forEach(([k, v]) => {
            const opt = document.createElement('option');
            opt.value = k;
            opt.textContent = v.label;
            sel.appendChild(opt);
        });
    } catch (e) {
        console.warn('Could not load channel schemas', e);
    }
}
|
||||
|
||||
/**
 * Rebuild the type-specific inputs in #my-ch-type-fields for the type
 * currently chosen in #my-ch-type.
 *
 * The container is always cleared first. When the chosen type has a schema
 * in _myChSchemas, a divider heading is added followed by one labelled
 * input per schema field (id 'mychf-' + key; fields of type 'secret'
 * become password inputs).
 */
function onMyChTypeChange() {
    const selectedType = document.getElementById('my-ch-type').value;
    const fieldsBox = document.getElementById('my-ch-type-fields');
    fieldsBox.innerHTML = '';

    const schema = selectedType && _myChSchemas[selectedType];
    if (!schema) { return; }

    const heading = document.createElement('div');
    heading.className = 'ch-form-divider';
    heading.textContent = schema.label + ' settings';
    fieldsBox.appendChild(heading);

    for (const sf of (schema.fields || [])) {
        const label = document.createElement('label');
        label.textContent = sf.label + (sf.required ? ' *' : '');

        const input = document.createElement('input');
        input.type = sf.type === 'secret' ? 'password' : 'text';
        input.id = 'mychf-' + sf.key;
        input.placeholder = sf.required ? '(required)' : '(optional)';
        input.autocomplete = 'off';

        const row = document.createElement('div');
        row.className = 'ch-form-row';
        row.appendChild(label);
        row.appendChild(input);
        fieldsBox.appendChild(row);
    }
}
|
||||
|
||||
/**
 * Open the My Channels modal.
 *
 * @param {string} [name]  Channel to edit; omit to create a new one.
 *
 * The form is reset first. In edit mode the name input is disabled and the
 * form is prefilled from GET /api/0/notification_channels. If that request
 * fails, returns a non-list, or lacks the channel, the modal still opens
 * but shows a warning in its status line, so the user does not mistake a
 * blank form for the channel's current settings.
 */
async function openMyChModal(name) {
    _myChEditName = name || null;

    const statusEl = document.getElementById('my-ch-modal-status');
    statusEl.textContent = '';
    document.getElementById('my-ch-modal-title').textContent = name ? 'Edit Channel' : 'New Channel';
    document.getElementById('my-ch-name').value = name || '';
    document.getElementById('my-ch-name').disabled = !!name;
    document.getElementById('my-ch-type').value = '';
    document.getElementById('my-ch-type-fields').innerHTML = '';
    document.getElementById('my-ch-min-level').value = 'WARNING';
    document.getElementById('my-ch-private').checked = false;

    if (name) {
        try {
            const r = await fetch('/api/0/notification_channels');
            if (!r.ok) throw new Error('HTTP ' + r.status);
            const channels = await r.json();
            const ch = Array.isArray(channels)
                ? channels.find(c => c && c.name === name)
                : null;
            if (!ch) throw new Error('channel not found in response');

            document.getElementById('my-ch-type').value = ch.type;
            // Build the type-specific inputs before filling them in.
            onMyChTypeChange();
            document.getElementById('my-ch-min-level').value = ch.min_level || 'WARNING';
            document.getElementById('my-ch-private').checked = ch.private || false;
            (ch.fields || []).forEach(f => {
                const inp = document.getElementById('mychf-' + f.key);
                if (inp) inp.value = f.value || '';
            });
        } catch (e) {
            console.warn('Failed to load channel', e);
            statusEl.textContent = 'Could not load the current settings for this channel.';
            statusEl.style.color = '#c62828';
        }
    }

    document.getElementById('my-ch-modal').style.display = 'flex';
}
|
||||
|
||||
/** Hide the My Channels modal overlay. */
function closeMyChModal() {
    const overlay = document.getElementById('my-ch-modal');
    overlay.style.display = 'none';
}
|
||||
|
||||
/**
 * Create or update a channel from the modal form.
 *
 * Requires a name and a type. The request body carries name, type,
 * min_level and private, plus one entry per field of the selected type's
 * schema. When _myChEditName is set the channel is updated with PUT on its
 * own URL; otherwise it is created with POST. On success the modal closes
 * and the page reloads; failures are shown in the modal's status line.
 */
async function saveMyChannel() {
    const byId = (id) => document.getElementById(id);

    const name = byId('my-ch-name').value.trim();
    const type = byId('my-ch-type').value;
    const minLevel = byId('my-ch-min-level').value;
    const isPrivate = byId('my-ch-private').checked;
    const statusEl = byId('my-ch-modal-status');
    statusEl.textContent = '';

    // Every failure path renders the same way: message in red.
    const fail = (message) => {
        statusEl.textContent = message;
        statusEl.style.color = '#c62828';
    };

    if (!name) { fail('Name is required.'); return; }
    if (!type) { fail('Please select a type.'); return; }

    const body = { name, type, min_level: minLevel, private: isPrivate };
    const schema = _myChSchemas[type];
    if (schema) {
        for (const sf of (schema.fields || [])) {
            const inp = byId('mychf-' + sf.key);
            if (inp) { body[sf.key] = inp.value; }
        }
    }

    let url = '/api/0/notification_channels';
    let method = 'POST';
    if (_myChEditName) {
        url += '/' + encodeURIComponent(_myChEditName);
        method = 'PUT';
    }

    try {
        const r = await fetch(url, {
            method,
            headers: {'Content-Type': 'application/json'},
            body: JSON.stringify(body),
        });
        if (!r.ok) {
            const err = await r.json().catch(() => ({}));
            fail(err.error || 'Error saving.');
            return;
        }
        closeMyChModal();
        window.location.reload();
    } catch (e) {
        fail('Network error: ' + e.message);
    }
}
|
||||
|
||||
/**
 * Ask for confirmation, then DELETE the named notification channel.
 * Reloads the page on success; reports failures with alert().
 */
async function deleteMyChannel(name) {
  const confirmed = confirm('Delete channel "' + name + '"?');
  if (!confirmed) return;

  const url = '/api/0/notification_channels/' + encodeURIComponent(name);
  try {
    const r = await fetch(url, { method: 'DELETE' });
    if (!r.ok) {
      // The error body may not be JSON; fall back to a generic message.
      const err = await r.json().catch(() => ({}));
      alert('Error: ' + (err.error || 'Could not delete.'));
      return;
    }
    window.location.reload();
  } catch (e) {
    alert('Network error: ' + e.message);
  }
}
|
||||
|
||||
// ---- Utilities ----
|
||||
/**
 * Show a transient status message in the element with the given id.
 *
 * The message is cleared after 3 seconds. A clear still pending from an
 * earlier call on the same element is cancelled first, so a newer message is
 * always shown for its full 3 seconds.
 *
 * @param {string} id    id of the status element; unknown ids are ignored
 * @param {string} msg   text to display
 * @param {string} color CSS color applied to the element
 */
function showStatus(id, msg, color) {
  const el = document.getElementById(id);
  if (!el) return;
  el.textContent = msg;
  el.style.color = color;
  // Cancel the previous call's timer so it cannot wipe this message early.
  if (el._statusTimer) clearTimeout(el._statusTimer);
  el._statusTimer = setTimeout(() => {
    el.textContent = '';
    el._statusTimer = null;
  }, 3000);
}
|
||||
|
||||
/**
 * Escape a value for safe insertion into HTML text or a double-quoted
 * attribute value.
 *
 * The value is converted with String() first. `&` is replaced first so the
 * entities produced by the later replacements are not escaped twice.
 *
 * @param {*} s value to escape
 * @returns {string} the escaped string
 */
function escHtml(s) {
  return String(s)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;');
}
|
||||
|
||||
document.addEventListener('DOMContentLoaded', _loadMyChSchemas);
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,529 @@
|
||||
"""UDP listener and datagram processing."""
|
||||
|
||||
import asyncio
|
||||
import socket
|
||||
import struct
|
||||
import time
|
||||
import zlib
|
||||
import logging
|
||||
|
||||
from platform import system as platform_system
|
||||
|
||||
from ..common.proto import stodict, oldmtodict
|
||||
from ..common.utils import dur
|
||||
from . import notify as notify_mod
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
eventlog = notify_mod.eventlog
|
||||
|
||||
# SO_TIMESTAMP: kernel attaches a struct timeval to each received datagram.
# Supported on Linux, FreeBSD, and macOS.  The constant is not exposed by
# Python's socket module on all platforms, so the numeric value from each
# platform's <sys/socket.h> is used instead.
platform = platform_system()
if platform == "Darwin":
    _SO_TIMESTAMP = 0x0400  # SO_TIMESTAMP on macOS (not in Python's socket module)
elif platform == "Linux":
    _SO_TIMESTAMP = 29  # Linux value (not in older Python versions)
elif platform == "FreeBSD":
    # FreeBSD uses the BSD value 0x0400, same as macOS.  The value 32 (0x0020)
    # is SO_BROADCAST there, so using it would silently enable broadcast
    # instead of receive timestamps.
    _SO_TIMESTAMP = 0x0400
else:
    logger.warning("SO_TIMESTAMP may not be supported on this platform (%s)", platform)
    _SO_TIMESTAMP = None
|
||||
|
||||
# struct timeval uses two native C longs: tv_sec and tv_usec
|
||||
_TIMEVAL = struct.Struct('@ll')
|
||||
|
||||
|
||||
def enable_kernel_timestamps(sock) -> bool:
    """Try to enable SO_TIMESTAMP on *sock*.

    Returns True if the setsockopt call succeeded, False otherwise
    (unsupported platform, older kernel, or insufficient permissions).
    """
    # No option number is known for this platform.  Passing None to
    # setsockopt would raise TypeError, which the OSError handler below
    # does not catch.
    if _SO_TIMESTAMP is None:
        return False
    try:
        sock.setsockopt(socket.SOL_SOCKET, _SO_TIMESTAMP, 1)
        return True
    except OSError:
        return False
|
||||
|
||||
|
||||
def _extract_kernel_ts(ancdata) -> float | None:
    """Find the SO_TIMESTAMP control message in recvmsg ancillary data.

    Returns the receive time as float seconds, taken from the first matching
    cmsg that is long enough to hold a timeval. Returns None if there is none.
    """
    wanted = (socket.SOL_SOCKET, _SO_TIMESTAMP)
    for level, kind, payload in ancdata:
        if (level, kind) != wanted:
            continue
        if len(payload) < _TIMEVAL.size:
            # Truncated control message; keep looking.
            continue
        seconds, micros = _TIMEVAL.unpack_from(payload)
        return seconds + micros * 1e-6
    return None
|
||||
|
||||
|
||||
class RecvmsgTransport:
    """Minimal transport for the add_reader/recvmsg path (SO_TIMESTAMP active).

    Provides sendto() and close() like asyncio's DatagramTransport, so callers
    do not need to know which receive path is in use.
    """

    def __init__(self, loop, sock):
        self._sock = sock
        self._loop = loop

    def sendto(self, data, addr):
        """Send *data* to *addr*; failures are logged at debug level only."""
        try:
            self._sock.sendto(data, addr)
        except Exception as e:
            logger.debug("sendto failed: %s", e)

    def close(self):
        """Unregister the reader and close the socket, ignoring errors in both."""
        steps = (
            lambda: self._loop.remove_reader(self._sock.fileno()),
            lambda: self._sock.close(),
        )
        for step in steps:
            # Best-effort teardown: a failing step must not stop the next one.
            try:
                step()
            except Exception:
                pass
|
||||
|
||||
|
||||
def make_recvmsg_reader(sock, handler, transport):
    """Build a zero-argument callback for loop.add_reader().

    Each call reads one datagram with recvmsg() so the ancillary data, and
    with it any kernel receive timestamp, is available. The datagram is
    parsed, and a non-empty message is passed on as
    handler(msg, addr, transport, kernel_ts). kernel_ts is None when no
    timestamp cmsg was received.
    """
    payload_max = 65536
    ancillary_max = 128  # room for one struct timeval control message

    def _read():
        try:
            received = sock.recvmsg(payload_max, ancillary_max)
        except BlockingIOError:
            # Nothing to read right now.
            return
        except OSError as e:
            logger.warning("recvmsg error: %s", e)
            return
        data, ancdata, _, addr = received
        try:
            kernel_ts = _extract_kernel_ts(ancdata)
            msg = parse_message(data)
            if not msg:
                return
            handler(msg, addr, transport, kernel_ts)
        except Exception:
            logger.exception("Error processing datagram from %s", addr)

    return _read
|
||||
|
||||
|
||||
class EchoServerProtocol(asyncio.DatagramProtocol):
    """Datagram protocol that parses each packet and passes it to a handler."""

    def __init__(self, config=None, handler=None):
        super().__init__()
        self.handler = handler
        self.config = config if config else {}

    def connection_made(self, transport):
        """Remember the transport so handlers can send replies through it."""
        self.transport = transport
        logger.info("UDP Server listening...")

    def datagram_received(self, data, addr):
        """Parse *data* and call the handler; errors are logged, not raised."""
        logger.debug("Received from %s", addr)
        try:
            parsed = parse_message(data)
            callback = self.handler
            if callback:
                # The transport is passed along so the handler can send
                # replies (ACKs/commands) back to the sender.
                callback(parsed, addr, self.transport)
        except Exception:
            logger.exception("Error while processing datagram from %s", addr)
|
||||
|
||||
|
||||
def parse_message(data: bytes):
    """Decode a raw datagram into a message dict.

    The current decoder (stodict) is tried first. If its result is empty, the
    old-format decoder (oldmtodict) is used instead, for older clients.
    """
    return stodict(data) or oldmtodict(data)
|
||||
|
||||
|
||||
def dicttos(ID, d):
    """Encode dict *d* as a reply packet tagged with *ID*.

    The payload is ``key=value`` pairs joined by ``;`` (floats are written
    with five decimals) and zlib-compressed at level 6. The packet is the
    bytes ``!<ID>:`` followed by the compressed payload.
    """
    def _pair(key, value):
        if isinstance(value, float):
            return "%s=%0.5f" % (key, value)
        return "%s=%s" % (key, value)

    payload = ";".join(_pair(key, d[key]) for key in d)
    header = ("!" + ID + ":").encode()
    return header + zlib.compress(payload.encode(), 6)
|
||||
|
||||
|
||||
DROPOVERDUE = 7 * 24 * 3600 # seconds before an overdue host becomes UNKNOWN
|
||||
|
||||
|
||||
def _set_connectivity_alert(host, afam, level_name):
    """Set or clear the ``connectivity.<afam>`` entry in host.alert_states.

    *level_name* is looked up on AlertLevel. A name that is not an AlertLevel
    attribute is treated as OK. For OK the entry is removed, so recovered
    hosts don't clutter the Alerts Dashboard. For any other level the entry
    is created if missing and then updated.
    """
    from .threshold import AlertState, AlertLevel

    key = f"connectivity.{afam}"
    resolved = getattr(AlertLevel, level_name, AlertLevel.OK)
    states = host.alert_states

    if resolved == AlertLevel.OK:
        states.pop(key, None)
        return

    if key not in states:
        states[key] = AlertState(key)
    states[key].update(resolved, level_name)
|
||||
|
||||
|
||||
def _make_timer_callbacks(uname, host, ctx):
    """Return (on_overdue, on_unknown) async callbacks for connection timer logic.

    Captured values are bound at call time so callbacks are safe to use in loops.

    Args:
        uname: host name used for event log entries and notifications.
        host: host object whose state is updated and broadcast.
        ctx: runtime context dict; "msg_to_websockets", "threshold_checker"
            and "config" are read from it (all optional).
    """
    msg_to_websockets = ctx.get("msg_to_websockets")
    threshold_checker = ctx.get("threshold_checker")
    cfg = ctx.get("config", {})

    async def on_unknown(connection):
        # Move the connection to UNKNOWN, stamped with its last heartbeat time.
        connection.newstate(connection.__class__.UNKNOWN, connection.lastbeat)
        # Keep connectivity alert active when host transitions to unknown
        if msg_to_websockets:
            msg_to_websockets("host", host.stateinfo())

    async def on_overdue(connection):
        # Only an UP connection can become overdue; ignore stale timers.
        if connection.getstate() != connection.__class__.UP:
            return
        now = time.time()
        connection.newstate(connection.__class__.OVERDUE, now, cfg.get("grace", 2))
        msg = f"{connection.afam} overdue"
        eventlog(uname, "CRITICAL", msg)
        if host.watched:
            # Fire-and-forget: the notification is sent in the background.
            asyncio.create_task(notify_mod.send_notification(
                uname,
                notify_mod.Notification(title=f"[CRITICAL] {uname}", body=msg, level="CRITICAL"),
            ))
        # Track in alert_states so the Alerts Dashboard shows this
        _set_connectivity_alert(host, connection.afam, "CRITICAL")
        if threshold_checker:
            # Report an infinite RTT to the threshold checker.
            threshold_checker.check_value(
                host_name=uname,
                metric_path="rtt",
                value=float("inf"),
                alert_states=host.alert_states,
            )
        if msg_to_websockets:
            msg_to_websockets("host", host.stateinfo())
        # Re-arm the timer: after DROPOVERDUE seconds the connection becomes UNKNOWN.
        connection.reset_overdue_timer(DROPOVERDUE, on_unknown)

    return on_overdue, on_unknown
|
||||
|
||||
|
||||
def restore_connection_timers(hbdclass, ctx):
    """Restore overdue timers for all loaded connections after a pickle restore.

    For UP connections, the remaining time until overdue is calculated from
    lastbeat so that clients that vanished during hbd's downtime are detected.
    For OVERDUE connections, the UNKNOWN drop timer is restored.

    Args:
        hbdclass: module providing the Host and Connection classes.
        ctx: runtime context dict; "config" is read here and the whole dict
            is passed on to _make_timer_callbacks.
    """
    now = time.time()
    cfg = ctx.get("config", {})
    grace = cfg.get("grace", 2)

    restored = 0
    for uname, host in list(hbdclass.Host.hosts.items()):
        interval = host.interval
        for afam, conn in list(host.connections.items()):
            state = conn.getstate()
            # DOWN connections get no timer.
            if state == hbdclass.Connection.DOWN:
                continue

            on_overdue, on_unknown = _make_timer_callbacks(uname, host, ctx)

            if state == hbdclass.Connection.UP and interval > 0:
                elapsed = now - conn.lastbeat
                # Give hosts one full (interval + grace) of extra time on startup
                # so hosts that were silent while hbd was down are not immediately
                # flagged as overdue before they have a chance to check in.
                startup_grace = interval + grace
                # Never less than one startup_grace, however long ago lastbeat was.
                remaining = max(startup_grace, 2 * startup_grace - elapsed)
                conn.reset_overdue_timer(remaining, on_overdue)
                logger.debug(
                    "Restored UP timer %s/%s: %.0fs remaining (elapsed %.0fs, startup grace %.0fs)",
                    uname, afam, remaining, elapsed, startup_grace,
                )
                restored += 1

            elif state == hbdclass.Connection.OVERDUE:
                # Time left of the DROPOVERDUE window, measured from statetime.
                elapsed_overdue = now - conn.statetime
                remaining = DROPOVERDUE - elapsed_overdue
                if remaining <= 1:
                    # Already past the drop window — mark UNKNOWN immediately
                    conn.newstate(hbdclass.Connection.UNKNOWN, conn.lastbeat)
                    logger.info(
                        "Marking %s/%s UNKNOWN (overdue %.1f days)",
                        uname, afam, elapsed_overdue / 86400,
                    )
                else:
                    conn.reset_overdue_timer(remaining, on_unknown)
                    logger.debug(
                        "Restored OVERDUE timer %s/%s: %.0fs remaining",
                        uname, afam, remaining,
                    )
                    restored += 1

    logger.info("Restored timers for %d connection(s)", restored)
|
||||
|
||||
|
||||
def handle_datagram(msg: dict, addr, transport, ctx: dict):
    """Handle a parsed datagram message.

    Looks up or creates the sending host, answers heartbeats ("HTB") with an
    ACK, stores plugin data ("PLG"), updates the connection state, logs and
    notifies on state changes, re-arms the overdue timer, checks the RTT
    threshold, sends queued commands and broadcasts the new host state.

    ctx is a dictionary with runtime dependencies:
    - config: dict of configuration
    - hbdclass: module providing Host/Connection classes
    - log: callable(loghost, message)
    - msg_to_websockets: callable(typ, data)
    - msg_journal: MessageJournal instance for logging all messages
    - threshold_checker: optional checker for plugin data and RTT values
    - recv_ts: optional receive timestamp; time.time() is used when absent
    - DEBUG, verbose

    Args:
        msg: parsed message dict; an empty or None message is ignored.
        addr: sender address, either an (ip, port) sequence or a bare ip.
        transport: object with sendto(data, addr), used for ACKs and commands.
        ctx: runtime context, see above.
    """
    if not msg:
        return
    now = ctx.get("recv_ts") or time.time()

    # Log message to journal
    msg_journal = ctx.get("msg_journal")
    if msg_journal:
        # Create async task to log message (non-blocking).
        # This uses the module-level asyncio import.  A function-local
        # "import asyncio" here would make asyncio a local name, and the
        # asyncio.create_task calls further down would raise
        # UnboundLocalError whenever no journal is configured.
        try:
            loop = asyncio.get_event_loop()
            loop.create_task(msg_journal.log_message(msg, addr, now))
        except Exception as e:
            logger.debug(f"Failed to log message to journal: {e}")

    cfg = ctx.get("config", {})
    hbdcls = ctx.get("hbdclass")
    msg_to_websockets = ctx.get("msg_to_websockets")
    DEBUG = ctx.get("DEBUG", 0)

    # normalize addr (ip, port)
    ip = addr[0] if isinstance(addr, (list, tuple)) else addr
    name = msg.get("name", "unknown")
    from ..common.utils import shortname
    from . import config as config_mod

    uname = shortname(name)

    if uname not in hbdcls.Host.hosts:
        host = hbdcls.Host(uname)
        # Use new config function to check dyndns
        dyndnshosts = config_mod.get_dyndnshosts(cfg)
        host.dyn = uname in dyndnshosts
        watchhosts = config_mod.get_watchhosts(cfg)
        host.watched = uname in watchhosts
        # Apply user-access settings from config
        access = config_mod.get_host_access(cfg, uname)
        host.apply_access(access["owner"], access["managers"], access["monitors"])
        logger.info("New host signed on: %s (dyn=%s, access=%s)", uname, host.dyn, access)
        newh = True
    else:
        host = hbdcls.Host.hosts[uname]
        newh = False

    cid = msg.get("id", 0)
    try:
        rtt = float(msg.get("rtt"))
    except (TypeError, ValueError):
        # Missing (None) or malformed rtt from the network: treat as unknown.
        rtt = None

    if msg.get("ID") == "HTB":
        host.doesack = msg.get("acks", -1)
        # send ACK back; ask client to resend plugin info when we have none yet
        rmsg = {"time": time.time()}
        if not host.plugin_data:
            rmsg["request_update"] = 1
        opkt = dicttos("ACK", rmsg)
        try:
            transport.sendto(opkt, addr)
        except Exception as e:
            if DEBUG > 0:
                print(("cannot send ack: %s" % e))

    elif msg.get("ID") == "PLG":
        # Handle plugin data message
        plugin_name = msg.get("plugin")
        if plugin_name:
            # Extract plugin fields, dropping protocol metadata fields
            plugin_data = {k: v for k, v in msg.items()
                           if k not in ("ID", "plugin", "id", "name")}
            # Store plugin data with timestamp
            host.add_plugin_data(plugin_name, plugin_data, timestamp=now)

            # If os_info reports an owner and none is configured server-side, apply it
            # NOTE(review): as written, a client-reported owner takes
            # precedence over the configured and default owner, which differs
            # from the comment above.  Confirm which is intended.
            if plugin_name == "os_info":
                config_owner = config_mod.get_host_access(cfg, uname).get("owner")
                default_owner = config_mod.get_default_owner(cfg)
                inferred_owner = plugin_data.get("owner", config_owner or default_owner)
                host.owner = inferred_owner
                logger.info(f"owner for {uname} is {host.owner}")
            if DEBUG > 1:
                print(f"Stored plugin data for {uname}: {plugin_name}")

            # Check thresholds if checker is available
            threshold_checker = ctx.get("threshold_checker")
            if threshold_checker:
                try:
                    state_changes = threshold_checker.check_plugin_data(
                        host_name=uname,
                        plugin_name=plugin_name,
                        data=plugin_data,
                        alert_states=host.alert_states,
                    )
                    if DEBUG > 1 and state_changes:
                        print(f"Threshold state changes for {uname}: {state_changes}")
                except Exception as e:
                    logger.error(f"Error checking thresholds for {uname}.{plugin_name}: {e}")

            # Notify websockets of plugin update
            if msg_to_websockets:
                try:
                    msg_to_websockets("plugin", {
                        "host": uname,
                        "plugin": plugin_name,
                        "data": plugin_data,
                        "timestamp": now
                    })
                except Exception:
                    pass

    try:
        conn, res = host.conndata(cid, ip, rtt, now)
    except Exception as e:
        if DEBUG > 0:
            print("conndata failed: %s" % e)
        return

    if res:
        eventlog(uname, "WARNING", res)
        if host.watched:
            asyncio.create_task(notify_mod.send_notification(
                uname,
                notify_mod.Notification(title=f"[WARNING] {uname}", body=res, level="WARNING"),
            ))

    try:
        interval = int(msg.get("interval", 0) or 0)
    except (TypeError, ValueError):
        # Malformed interval from the network: same as "not supplied".
        interval = 0
    shutdown = msg.get("shutdown", 0)
    service = msg.get("service", "unknown")
    message = msg.get("msg", None)
    boot = msg.get("boot", 0)

    if boot:
        eventlog(uname, "INFO", "booted")
        if host.watched:
            asyncio.create_task(notify_mod.send_notification(
                uname,
                notify_mod.Notification(title=f"[INFO] {uname}", body=f"{host.name} booted", level="INFO"),
            ))
    if message:
        eventlog(uname, "INFO", "msg: %s" % message, service=service)

    if conn.getstate() != hbdcls.Connection.UP:
        lasts = conn.state
        d = conn.newstate(hbdcls.Connection.UP, now)
        # Clear connectivity alert now that the host is back up
        _set_connectivity_alert(host, conn.afam, "OK")
        # Don't log/notify RECOVER for a brand-new host seen for the first time —
        # it was never down, it just hasn't been seen before.
        if not newh:
            if d == 0 or lasts == "unknown":
                m = "%s is up" % (conn.afam)
            elif d < 4:
                # Transient blip (likely client restart) — skip log and notification
                m = None
            else:
                m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d))
            if m:
                eventlog(uname, "RECOVER", m)
                if host.watched:
                    asyncio.create_task(notify_mod.send_notification(
                        uname,
                        notify_mod.Notification(title=f"[RECOVER] {uname}", body=m, level="RECOVER"),
                    ))

    if boot or newh:
        host.upcount = host.doesack
    else:
        host.upcount += 1

    if shutdown:
        m = "%s shutdown" % conn.afam
        eventlog(uname, "INFO", m)
        if host.watched:
            asyncio.create_task(notify_mod.send_notification(
                uname,
                notify_mod.Notification(title=f"[INFO] {uname}", body=m, level="INFO"),
            ))
        conn.newstate(hbdcls.Connection.DOWN, now)
        _set_connectivity_alert(host, conn.afam, "CRITICAL")

    if interval > 0:
        host.interval = interval

    # Timer-based reachability monitoring
    # Reset overdue timer on every heartbeat
    if interval > 0 and conn.getstate() != hbdcls.Connection.DOWN:
        grace = cfg.get("grace", 2)
        timeout_seconds = interval + grace
        on_overdue, _ = _make_timer_callbacks(uname, host, ctx)
        conn.reset_overdue_timer(timeout_seconds, on_overdue)

    # Check RTT thresholds using the threshold checker
    threshold_checker = ctx.get("threshold_checker")
    if threshold_checker and rtt and rtt > 0:
        # Check against configured thresholds (handles alerts, notifications, etc.)
        # Metric path for RTT is simply "rtt"
        threshold_checker.check_value(
            host_name=uname,
            metric_path="rtt",
            value=rtt,
            alert_states=host.alert_states
        )

    # send any commands we have queued
    while len(host.cmds):
        op, rmsg = host.cmds[0]
        # Always dequeue.  An entry that is neither CMD nor UPD used to stay
        # at the head of the queue, so this loop never ended.
        del host.cmds[0]
        if op == "CMD":
            eventlog(uname, "INFO", "command sent")
        elif op == "UPD":
            eventlog(uname, "INFO", "update initiated")
        opkt = dicttos(op, rmsg)
        try:
            transport.sendto(opkt, addr)
        except Exception as e:
            if DEBUG > 0:
                print(("cannot send cmd/update: %s" % e))

    if msg_to_websockets:
        try:
            msg_to_websockets("host", host.stateinfo())
        except Exception as e:
            if DEBUG > 0:
                print(("cannot send websocket message: %s" % e))
|
||||
@@ -0,0 +1,271 @@
|
||||
"""User management: loading, authentication, and session tracking.
|
||||
|
||||
Users are defined in the config file under the ``users`` key:
|
||||
|
||||
users:
|
||||
alice:
|
||||
full_name: Alice Smith
|
||||
avatar: /path/to/avatar.png # file path, URL, or base64 data URI
|
||||
password: pbkdf2:sha256:... # generated with: hbd passwd
|
||||
admin: true # optional server-level admin
|
||||
notification_channels: [pushover_standard]
|
||||
|
||||
Roles are assigned per-host:
|
||||
|
||||
hosts:
|
||||
webserver01:
|
||||
owner: alice
|
||||
managers: [bob]
|
||||
monitors: [carol]
|
||||
|
||||
If no users are defined the server runs in unauthenticated mode (backwards
|
||||
compatible). When users are defined every API call must carry a valid session
|
||||
token in an ``Authorization: Bearer <token>`` or ``X-Auth-Token`` header,
|
||||
obtained via ``POST /api/0/auth/login``.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import hmac
|
||||
import logging
|
||||
import secrets
|
||||
import time
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Session lifetime in seconds (24 hours).
|
||||
SESSION_TTL = 86400
|
||||
|
||||
# Global session store: token -> {"username": str, "expires": float, "created": float}
|
||||
_sessions: dict = {}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# User class
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class User:
|
||||
def __init__(
|
||||
self,
|
||||
username: str,
|
||||
full_name: str = "",
|
||||
avatar: str = "",
|
||||
password_hash: str = "",
|
||||
admin: bool = False,
|
||||
notification_channels: list | None = None,
|
||||
):
|
||||
self.username = username
|
||||
self.full_name = full_name
|
||||
self.avatar = avatar
|
||||
self.password_hash = password_hash
|
||||
self.admin = admin
|
||||
self.notification_channels: list = notification_channels or []
|
||||
|
||||
def check_password(self, password: str) -> bool:
|
||||
if not self.password_hash:
|
||||
return False
|
||||
return _verify_password(password, self.password_hash)
|
||||
|
||||
def avatar_is_local(self) -> bool:
|
||||
"""Return True when the avatar is a local filesystem path (starts with '/')."""
|
||||
return bool(self.avatar and self.avatar.startswith("/"))
|
||||
|
||||
def avatar_url(self) -> str:
|
||||
"""Return the URL to use as an <img src>.
|
||||
|
||||
Local file paths are served via the /api/0/users/{username}/avatar
|
||||
endpoint. External URLs and data URIs are returned as-is.
|
||||
"""
|
||||
if self.avatar_is_local():
|
||||
return f"/api/0/users/{self.username}/avatar"
|
||||
return self.avatar
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"username": self.username,
|
||||
"full_name": self.full_name,
|
||||
"avatar": self.avatar,
|
||||
"avatar_url": self.avatar_url(),
|
||||
"admin": self.admin,
|
||||
"notification_channels": self.notification_channels,
|
||||
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Password hashing (PBKDF2-HMAC-SHA256, stdlib only)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def hash_password(password: str) -> str:
    """Hash *password* with PBKDF2-HMAC-SHA256 and a fresh random salt.

    The result has the form ``pbkdf2:sha256:<iterations>:<salt>:<hex-digest>``
    and is what goes into the ``password`` value in the config file::

        python -c "from hbd.server.users import hash_password; print(hash_password('secret'))"

    Or via the CLI::

        hbd passwd
    """
    iterations = 260_000
    salt = secrets.token_hex(16)
    digest = hashlib.pbkdf2_hmac(
        "sha256",
        password.encode("utf-8"),
        salt.encode("utf-8"),  # the hex text itself is the salt
        iterations,
    ).hex()
    return ":".join(["pbkdf2", "sha256", str(iterations), salt, digest])
|
||||
|
||||
|
||||
def _verify_password(password: str, stored_hash: str) -> bool:
|
||||
"""Return True if *password* matches *stored_hash*."""
|
||||
try:
|
||||
parts = stored_hash.split(":")
|
||||
if len(parts) != 5 or parts[0] != "pbkdf2" or parts[1] != "sha256":
|
||||
return False
|
||||
_, _, iterations_str, salt, expected_hex = parts
|
||||
iterations = int(iterations_str)
|
||||
dk = hashlib.pbkdf2_hmac(
|
||||
"sha256", password.encode("utf-8"), salt.encode("utf-8"), iterations
|
||||
)
|
||||
return hmac.compare_digest(dk.hex(), expected_hex)
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Global user registry
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# username -> User
|
||||
users: dict = {}
|
||||
|
||||
|
||||
def load_users(config: dict) -> dict:
    """Rebuild the global user registry from the ``users`` section of *config*.

    Called once at startup and again on SIGHUP config reload.

    Entries whose value is not a mapping are skipped with a warning. If the
    ``users`` section itself is not a mapping, no config users are loaded.
    In both cases, users from the previous registry that have no password
    hash (OAuth-provisioned) and are not in the config are carried over.

    Returns the new ``users`` dict.
    """
    global users
    previous = dict(users)  # snapshot before rebuild
    users_cfg = config.get("users", {})
    section_valid = isinstance(users_cfg, dict)

    rebuilt: dict = {}
    if section_valid:
        for username, attrs in users_cfg.items():
            if not isinstance(attrs, dict):
                logger.warning("Skipping user %r: expected a mapping", username)
                continue
            rebuilt[username] = User(
                username=username,
                full_name=attrs.get("full_name", ""),
                avatar=attrs.get("avatar", ""),
                password_hash=attrs.get("password", ""),
                admin=bool(attrs.get("admin", False)),
                notification_channels=attrs.get("notification_channels", []),
            )

    users = rebuilt
    # Preserve OAuth-provisioned users (password_hash == "") that aren't in config.
    for username, existing_user in previous.items():
        if existing_user.password_hash or username in users:
            continue
        users[username] = existing_user

    if section_valid:
        logger.info("Loaded %d user(s) from config", len(users))
    return users
|
||||
|
||||
|
||||
def users_enabled() -> bool:
    """Return True when the user registry is non-empty (auth-required mode)."""
    return len(users) > 0
|
||||
|
||||
|
||||
def get_user(username: str) -> "User | None":
    """Return the registered User for *username*, or None if there is none."""
    found = users.get(username)
    return found
|
||||
|
||||
|
||||
def authenticate(username: str, password: str) -> "User | None":
    """Return the User when *username* exists and *password* matches, else None."""
    candidate = users.get(username)
    if not candidate:
        return None
    return candidate if candidate.check_password(password) else None
|
||||
|
||||
|
||||
def provision_oauth_user(username: str, full_name: str, avatar: str) -> "User":
    """Create or refresh the registry entry for an OAuth2-authenticated user.

    An unknown username gets a new User with no password hash. An existing
    user has full_name and avatar overwritten only by non-empty values; all
    other attributes are left alone.
    """
    existing = users.get(username)
    if existing is None:
        created = User(username=username, full_name=full_name, avatar=avatar)
        users[username] = created
        logger.info("Provisioned OAuth user %r", username)
        return created

    if full_name:
        existing.full_name = full_name
    if avatar:
        existing.avatar = avatar
    return existing
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Session management
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def create_session(username: str) -> str:
    """Register a new session for *username* and return its random token.

    Expired sessions are purged first. The session expires SESSION_TTL
    seconds from now.
    """
    _purge_expired_sessions()
    token = secrets.token_hex(32)
    expires_at = time.time() + SESSION_TTL
    created_at = time.time()
    _sessions[token] = dict(username=username, expires=expires_at, created=created_at)
    return token
|
||||
|
||||
|
||||
def get_session_user(token: str) -> "User | None":
    """Return the User for session *token*.

    Returns None for an empty token, an unknown token, or an expired session.
    An expired session is removed from the store.
    """
    record = _sessions.get(token) if token else None
    if not record:
        return None
    if not (record["expires"] < time.time()):
        return get_user(record["username"])
    # Expired: drop the stale entry.
    del _sessions[token]
    return None
|
||||
|
||||
|
||||
def delete_session(token: str) -> None:
    """Remove *token* from the session store (logout); unknown tokens are ignored."""
    if token in _sessions:
        del _sessions[token]
|
||||
|
||||
|
||||
def _purge_expired_sessions() -> None:
    """Delete every session whose expiry time is before the current time."""
    cutoff = time.time()
    stale = []
    # Collect first, delete afterwards.
    for token, record in list(_sessions.items()):
        if record["expires"] < cutoff:
            stale.append(token)
    for token in stale:
        del _sessions[token]
|
||||
|
||||
|
||||
def save_sessions() -> dict:
    """Purge expired sessions, then return a shallow copy of the store for pickling."""
    _purge_expired_sessions()
    snapshot = {}
    snapshot.update(_sessions)
    return snapshot
|
||||
|
||||
|
||||
def load_sessions(snapshot: dict) -> None:
    """Replace the session store with the unexpired entries of *snapshot*.

    An entry without an "expires" value is treated as expired.
    """
    global _sessions
    now = time.time()
    live = {}
    for token, record in snapshot.items():
        if record.get("expires", 0) > now:
            live[token] = record
    _sessions = live
    logger.debug("Restored %d session(s) from pickle", len(_sessions))
|
||||
@@ -0,0 +1,160 @@
|
||||
"""WebSocket handler and broadcast helpers for hbd.
|
||||
|
||||
WebSocket connections are served through the regular HTTP port via the
|
||||
/ws route registered in http.py (aiohttp WebSocketResponse upgrade).
|
||||
The separate standalone WebSocket server on ws_port is no longer used.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
from typing import Callable, Iterable, Optional
|
||||
from . import data
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Map of WebSocket → User object (or None when auth is disabled)
|
||||
_connections: dict = {}
|
||||
_loop: Optional[asyncio.AbstractEventLoop] = None
|
||||
_get_hosts: Optional[Callable[[], Iterable]] = None
|
||||
_verbose: bool = False
|
||||
|
||||
|
||||
def setup(
|
||||
loop: asyncio.AbstractEventLoop,
|
||||
get_hosts: Optional[Callable[[], Iterable]] = None,
|
||||
verbose: bool = False,
|
||||
):
|
||||
"""Register the running loop and initial-state callback.
|
||||
|
||||
Call this once from _run_async before starting the HTTP server.
|
||||
"""
|
||||
global _loop, _get_hosts, _verbose
|
||||
_loop = loop
|
||||
_get_hosts = get_hosts
|
||||
_verbose = verbose
|
||||
|
||||
|
||||
def _user_can_see_host(user, host_name: str) -> bool:
    """Return True if *user* may see updates for *host_name*.

    Rules, in order:

    * user management disabled -> everyone sees everything;
    * user management enabled but no authenticated user -> nothing is visible
      (fail closed: a connection without a valid session must not receive
      per-host data that authenticated non-managers are denied);
    * admins see every host;
    * otherwise the user must be a manager of the host; unknown hosts are
      not visible.
    """
    from . import hbdclass, users as users_mod

    if not users_mod.users_enabled():
        return True
    if user is None:
        # Auth is on and this connection has no session user.
        return False
    if user.admin:
        return True
    host = hbdclass.Host.hosts.get(host_name)
    if host is None:
        return False
    return host.is_manager(user.username)
|
||||
|
||||
|
||||
def _get_token(request) -> str:
|
||||
"""Extract session token from request (mirrors logic in http.py)."""
|
||||
auth = request.headers.get("Authorization", "")
|
||||
if auth.startswith("Bearer "):
|
||||
return auth[7:].strip()
|
||||
token = request.headers.get("X-Auth-Token", "")
|
||||
if token:
|
||||
return token
|
||||
return request.cookies.get("hbd_session", "")
|
||||
|
||||
|
||||
async def handler(request):
    """aiohttp WebSocket upgrade handler — register as GET /ws.

    After the upgrade the connection is registered in ``_connections``
    together with the session user (or None).  The client then receives the
    current host states and the stored messages, each filtered through
    ``_user_can_see_host``.  Afterwards the handler only drains incoming
    frames until the peer closes; the registration is removed on exit.
    """
    from aiohttp import WSMsgType, web
    from . import users as users_mod

    ws = web.WebSocketResponse()
    await ws.prepare(request)

    token = _get_token(request)
    user = None
    if token:
        user = users_mod.get_session_user(token)

    _connections[ws] = user
    remote = request.remote
    logger.info("WebSocket connected from %s", remote)

    try:
        # Initial host snapshot, restricted to hosts this user may see.
        if _get_hosts:
            try:
                for h in list(_get_hosts()):
                    host_name = h.get("raw_name") or h.get("name", "")
                    if not _user_can_see_host(user, host_name):
                        continue
                    await ws.send_str(json.dumps({"type": "host", "data": h}))
            except Exception as e:
                logger.error("Error sending initial hosts: %s", e)

        # Message history goes out newest-first and is tagged history=True so
        # the client appends the entries instead of prepending them.
        if data.msgs:
            try:
                for m in reversed(data.msgs):
                    host_name = m.get("host") if isinstance(m, dict) else None
                    if host_name and not _user_can_see_host(user, host_name):
                        continue
                    await ws.send_str(
                        json.dumps({"type": "message", "data": m, "history": True})
                    )
            except Exception as e:
                logger.error("Error sending initial messages: %s", e)

        # Keep the connection open; incoming frames are only logged.
        async for msg in ws:
            if msg.type == WSMsgType.TEXT:
                if _verbose:
                    logger.debug("ws recv from %s: %s", remote, msg.data)
            elif msg.type in (WSMsgType.ERROR, WSMsgType.CLOSE):
                break

    except Exception as e:
        logger.exception("WebSocket handler error from %s: %s", remote, e)
    finally:
        _connections.pop(ws, None)
        logger.info("WebSocket disconnected from %s", remote)

    return ws
|
||||
|
||||
|
||||
def broadcast(typ: str, payload) -> bool:
    """Thread-safe broadcast to all connected WebSocket clients.

    For ``host``/``plugin`` updates and for dict-shaped ``message`` payloads
    the host name is taken from the payload, and only clients for which
    ``_user_can_see_host`` returns True receive the frame.  Payloads that
    carry no host name (including non-dict payloads) go to every client.

    Can be called from any thread; the sends are scheduled on the event loop
    registered through ``setup``.

    Returns:
        True when the send was scheduled, False when no loop is registered
        or the loop is already closed (e.g. during shutdown).
    """
    if not _loop or _loop.is_closed():
        return False

    # Determine the host name for access-filtered message types.  Only dicts
    # can carry one; anything else must not crash on ``.get``.
    host_name: Optional[str] = None
    if isinstance(payload, dict):
        if typ in ("host", "plugin"):
            host_name = payload.get("raw_name") or payload.get("host") or payload.get("name")
        elif typ == "message":
            host_name = payload.get("host")

    jmsg = json.dumps({"type": typ, "data": payload})

    async def _send_all():
        dead = set()
        for ws, user in list(_connections.items()):
            try:
                if ws.closed:
                    dead.add(ws)
                    continue
                if host_name is not None and not _user_can_see_host(user, host_name):
                    continue
                await ws.send_str(jmsg)
            except Exception:
                # A failed send marks the connection as dead.
                dead.add(ws)
        for ws in dead:
            _connections.pop(ws, None)

    coro = _send_all()
    try:
        asyncio.run_coroutine_threadsafe(coro, _loop)
    except RuntimeError:
        # The loop was closed between the check above and the scheduling call.
        coro.close()
        return False
    return True
|
||||
|
||||
|
||||
def connection_count() -> int:
    """Return the number of WebSocket connections currently registered."""
    registered = len(_connections)
    return registered
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 5.3 KiB |
@@ -1,7 +0,0 @@
|
||||
<head>
|
||||
<meta http-equiv="content-type" content="text/html; charset=utf-8" />
|
||||
<link rel="stylesheet" href="/static/style.css" type="text/css" />
|
||||
<link rel="icon" href="/static/images/favicon.ico" sizes="32x32" />
|
||||
<title>{{ title }}</title>
|
||||
<script src="{{ extra_scripts }}"></script>
|
||||
</head>
|
||||
@@ -1,281 +0,0 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
{% include 'head.html' %}
|
||||
|
||||
<style>
|
||||
.content {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
}
|
||||
|
||||
.table {
|
||||
/* flex: 1; */
|
||||
flex-grow: none;
|
||||
}
|
||||
|
||||
.log {
|
||||
flex: 2;
|
||||
flex-grow: 1;
|
||||
|
||||
}
|
||||
|
||||
#ntable {
|
||||
border-collapse: collapse;
|
||||
font-size: 95%;
|
||||
/* width: 100%; */
|
||||
}
|
||||
|
||||
#ntable td,
|
||||
#ntable th {
|
||||
border: 1px solid #ddd;
|
||||
text-align: left;
|
||||
padding: 0px;
|
||||
}
|
||||
|
||||
#ntable tr:nth-child(even) {
|
||||
background-color: #f2f2f2;
|
||||
}
|
||||
|
||||
#ntable tr:hover {
|
||||
background-color: #ddd;
|
||||
}
|
||||
|
||||
#ntable th {
|
||||
padding-top: 12px;
|
||||
padding-bottom: 12px;
|
||||
background-color: #9d9d9d;
|
||||
color: white;
|
||||
}
|
||||
|
||||
#ntable
|
||||
th:not(.sorttable_sorted):not(.sorttable_sorted_reverse):not(.sorttable_nosort):after {
|
||||
content: " \2195";
|
||||
}
|
||||
|
||||
/* Modal for connection status messages */
|
||||
.connection-modal {
|
||||
display: none;
|
||||
position: fixed;
|
||||
z-index: 1000;
|
||||
left: 0;
|
||||
top: 0;
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
background-color: rgba(0, 0, 0, 0.4);
|
||||
}
|
||||
|
||||
.connection-modal.show {
|
||||
display: flex;
|
||||
justify-content: center;
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
.connection-modal-content {
|
||||
background-color: #f9f9f9;
|
||||
padding: 20px;
|
||||
border: 1px solid #888;
|
||||
border-radius: 5px;
|
||||
text-align: center;
|
||||
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
|
||||
min-width: 300px;
|
||||
}
|
||||
|
||||
.connection-modal-content p {
|
||||
margin: 10px 0;
|
||||
font-size: 16px;
|
||||
color: #333;
|
||||
}
|
||||
</style>
|
||||
<script type="text/javascript">
|
||||
var cnt = 0;
|
||||
var nTable = document;
|
||||
var name_idx = {};
|
||||
var c = 0;
|
||||
|
||||
// Rebuild name_idx (host name -> table row) from the rows currently in #ntable.
function setup() {
  name_idx = {};
  nTable = document.getElementById("ntable");
  // Row 0 is the header row, so indexing starts at 1.
  for (var i = 1; i < nTable.rows.length; i++) {
    var row = nTable.rows[i];
    // Local variable: a bare `name` would overwrite window.name.
    var hostName = row.cells[0].innerText;
    name_idx[hostName] = row;
  }
}
|
||||
|
||||
// Append a new 10-column row for host `data` to #ntablebody and index it by name.
// Column layout: name, ver, then (addr, state, latency, last state) for the
// first and for the second entry of data.connections.
function createRow(data) {
  var row = document.createElement("tr");
  var cells = [];
  for (var i = 0; i < 10; i++) {
    var td = document.createElement("td");
    row.appendChild(td);
    cells.push(td);
  }
  // Latency and last-state columns are right-aligned.
  [4, 5, 8, 9].forEach(function (idx) {
    cells[idx].style.textAlign = "right";
  });

  // Server-supplied text goes in via textContent so it is never parsed as HTML.
  if (data.dyn) {
    var bold = document.createElement("b");
    bold.textContent = data.name;
    cells[0].appendChild(bold);
  } else {
    cells[0].textContent = data.name;
  }
  cells[1].textContent = data.cver;

  // A host may arrive with no connections yet; only two fit into the table.
  var conns = data.connections || [];
  for (var c = 0; c < conns.length && c < 2; c++) {
    cells[2 + c * 4].textContent = conns[c].addr;
    cells[3 + c * 4].textContent = conns[c].state;
  }

  document.getElementById("ntablebody").appendChild(row);
  // Key by host name (the original used the <td> element itself as the key).
  name_idx[data.name] = row;
}
|
||||
|
||||
// Format a timestamp given in seconds as a de-DE locale date/time string.
function formatTS(ts) {
  // Date expects milliseconds.
  return new Date(ts * 1000).toLocaleString("de-DE");
}
|
||||
|
||||
// Refresh (or create) the table row for host `data` from its connection list.
// NOTE(review): connections are mapped to the IPv4/IPv6 column groups by array
// position, not by address family — confirm the server always sends IPv4 first.
function update_table(data) {
  if (!(data.name in name_idx)) {
    createRow(data);
    setup();
  }
  var row = name_idx[data.name];
  if (!row) {
    // The row could not be indexed under this name; nothing to update.
    return;
  }

  var conns = data.connections || [];
  // The table only has column groups for two connections.
  for (var i = 0; i < conns.length && i < 2; i++) {
    var conn = conns[i];
    var base = 2 + i * 4;
    var addr = conn.addr;
    var since = formatTS(conn.statetime);
    var state;
    var latency;

    if (conn.state == "up") {
      state = "up";
      // Show the most recent sample, i.e. the last element of rtts.
      var rtts = conn.rtts || [];
      var rtt = Number.parseFloat(rtts[rtts.length - 1]);
      latency = Number.isNaN(rtt) ? "?" : rtt.toFixed(2);
    } else if (conn.state == "unknown") {
      // Unknown connections are shown as completely blank.
      state = "";
      latency = "";
      addr = "";
      since = "";
    } else {
      state = "<b>" + conn.state + "</b>";
      latency = "-";
    }

    row.cells[base].innerHTML = addr;
    row.cells[base + 1].innerHTML = state;
    row.cells[base + 2].innerHTML = latency;
    row.cells[base + 3].innerHTML = since;
  }
}
|
||||
|
||||
// Open the heartbeat WebSocket, wire up its handlers and reconnect 3 s after
// every close.  A modal (#connectionModal) is shown while disconnected.
function WS_Connect() {
  if (!("WebSocket" in window)) {
    // The browser doesn't support WebSocket
    console.log("WebSocket NOT supported by your Browser!");
    return;
  }

  // N.B.: passing a subprotocol (e.g. "hdb") causes Chrome to fail with error 1006.
  var ws_hbd = new WebSocket("{{heartbeat_ws_url}}");

  ws_hbd.onopen = function () {
    console.log("ws connect {{heartbeat_ws_url}}");
    // Hide the modal if it is visible.
    var modal = document.getElementById("connectionModal");
    if (modal) {
      modal.classList.remove("show");
    }
    ws_hbd.send("heartbeat_web");
  };

  ws_hbd.onerror = function (event) {
    console.log(event);
  };

  ws_hbd.onmessage = function (event) {
    var state;
    try {
      state = JSON.parse(event.data);
    } catch (err) {
      // One malformed frame must not abort message handling.
      console.log("ws: ignoring malformed frame", err);
      return;
    }
    if (state.type == "host") {
      update_table(state.data);
    } else if (state.type == "message") {
      var msgs = document.getElementById("messages");
      msgs.insertAdjacentHTML("afterbegin", state.data + "<br>");
    }
    cnt++;
  };

  ws_hbd.onclose = function (event) {
    console.log("Connection is closed, reopening");
    // Show the modal while reconnecting.
    var modal = document.getElementById("connectionModal");
    if (modal) {
      modal.classList.add("show");
    }
    setTimeout(function () {
      WS_Connect();
    }, 3000);
  };
}
|
||||
WS_Connect();
|
||||
</script>
|
||||
<body>
|
||||
{% include 'menu.html' %}
|
||||
|
||||
<div id="content" class="content" style="overflow: hidden">
|
||||
<div id="table" class="table" style="overflow: hidden">
|
||||
<!-- <h2>{{title}}</h2> -->
|
||||
<table id="ntable" class="sortable">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Name</th>
|
||||
<th>Ver</th>
|
||||
<th>IPv4 Addr</th>
|
||||
<th>State</th>
|
||||
<th style="text-align: right">Latency</th>
|
||||
<th style="text-align: right">Last State</th>
|
||||
<th>IPv6 Addr</th>
|
||||
<th>State</th>
|
||||
<th style="text-align: right">Latency</th>
|
||||
<th style="text-align: right">Last State</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody id="ntablebody"></tbody>
|
||||
</table>
|
||||
</div>
|
||||
<div id="log" class="log" style="overflow: auto;">
|
||||
<h2>Log of Events</h2>
|
||||
<div id="messages">
|
||||
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{% include 'foot.html' %}
|
||||
|
||||
<!-- Connection status modal -->
|
||||
<div id="connectionModal" class="connection-modal">
|
||||
<div class="connection-modal-content">
|
||||
<p>⚠️ Connection is closed, reopening...</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
setup();
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
@@ -1,3 +0,0 @@
|
||||
<label for="drawer-toggle" id="drawer-toggle-label"></label>
|
||||
<header>{{ header }}</header>
|
||||
|
||||
-220
@@ -1,220 +0,0 @@
|
||||
"""UDP listener and datagram processing."""
|
||||
|
||||
import asyncio
|
||||
import zlib
|
||||
import logging
|
||||
|
||||
from .proto import stodict, oldmtodict
|
||||
from hbd.utils import dur
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class EchoServerProtocol(asyncio.DatagramProtocol):
    """Datagram protocol that parses each packet and passes it to a callback.

    *handler*, when given, is called as ``handler(msg, addr, transport)`` for
    every received datagram; the transport is passed along so the handler can
    send replies (ACKs/commands).
    """

    def __init__(self, config=None, handler=None):
        super().__init__()
        self.config = config if config else {}
        self.handler = handler

    def connection_made(self, transport):
        # Remember the transport so datagram_received can hand it to the handler.
        self.transport = transport
        logger.info("UDP Server listening...")

    def datagram_received(self, data, addr):
        logger.debug("Received from %s", addr)
        try:
            parsed = parse_message(data)
            if not self.handler:
                return
            self.handler(parsed, addr, self.transport)
        except Exception:
            # A bad packet or a failing handler must not break the listener.
            logger.exception("Error while processing datagram from %s", addr)
|
||||
|
||||
|
||||
def parse_message(data: bytes):
    """Decode a raw datagram into a message dict.

    ``stodict`` is tried first; when it yields an empty/falsy result the
    datagram is decoded with ``oldmtodict`` instead (compatibility with older
    clients).
    """
    decoded = stodict(data)
    return decoded if decoded else oldmtodict(data)
|
||||
|
||||
|
||||
def dicttos(ID, d, compress=False):
    """Serialise *d* as ``key=value`` pairs joined by ``;`` behind an ID prefix.

    Floats are written with five decimals.  Without compression the result is
    the ``str`` ``"<ID>:<pairs>"``; with compression it is ``bytes``:
    ``b"!<ID>:"`` followed by the zlib-compressed (level 6) pairs.
    """
    fields = [
        ("%s=%0.5f" if isinstance(value, float) else "%s=%s") % (key, value)
        for key, value in d.items()
    ]
    payload = ";".join(fields)
    if not compress:
        return ID + ":" + payload
    return ("!" + ID + ":").encode() + zlib.compress(payload.encode(), 6)
|
||||
|
||||
|
||||
def handle_datagram(msg: dict, addr, transport, ctx: dict):
    """Handle one parsed heartbeat datagram.

    Updates (or creates) the Host/Connection for the sender, emits log and
    push notifications for state changes, sends an ACK, flushes the host's
    queued commands and finally publishes the host state to the WebSocket
    layer.

    Args:
        msg: parsed message dict; an empty/None message is ignored.
        addr: sender address, either ``(ip, port, ...)`` or a bare ip.
        transport: datagram transport used for ACKs and commands.
        ctx: runtime dependencies:
            - config: dict of configuration
            - hbdclass: module providing Host/Connection classes
            - log: callable(loghost, message)
            - pushmsg: callable(message)
            - msg_to_websockets: callable(typ, data)
            - DEBUG, verbose
    """
    import time

    from hbd.utils import shortname

    if not msg:
        return

    now = time.time()
    cfg = ctx.get("config", {})
    hbdcls = ctx.get("hbdclass")
    log = ctx.get("log")
    pushmsg = ctx.get("pushmsg")
    msg_to_websockets = ctx.get("msg_to_websockets")
    DEBUG = ctx.get("DEBUG", 0)
    verbose = ctx.get("verbose", False)

    # normalize addr (ip, port)
    ip = addr[0] if isinstance(addr, (list, tuple)) else addr
    name = msg.get("name", "unknown")
    uname = shortname(name)
    # Push notifications are only sent for hosts on the watch list.
    watched = uname in cfg.get("watchhosts", [])

    newh = uname not in hbdcls.Host.hosts
    if newh:
        host = hbdcls.Host(uname)
        host.dyn = uname in cfg.get("dyndnshosts", [])
        if verbose:
            print(("XX: New host, num now %s" % (len(hbdcls.Host.hosts))))
    else:
        host = hbdcls.Host.hosts[uname]

    cid = msg.get("id", 0)
    try:
        rtt = float(msg.get("rtt", None))
    except (TypeError, ValueError):
        # Missing or unparsable rtt.
        rtt = None

    if msg.get("ID") == "HTB":
        host.doesack = msg.get("acks", -1)
        host.setcver(msg.get("ver", 0))

    try:
        conn, res = host.conndata(cid, ip, rtt, now)
    except Exception as e:
        if DEBUG > 0:
            print("conndata failed: %s" % e)
        return

    if res:
        if log:
            log(uname, res)
        if watched and pushmsg:
            pushmsg("%s %s" % (host.name, res))

    interval = int(msg.get("interval", 0) or 0)
    shutdown = msg.get("shutdown", 0)
    service = msg.get("service", "unknown")
    message = msg.get("msg", None)
    boot = msg.get("boot", 0)

    if boot:
        if log:
            log(uname, "booted")
        if watched and pushmsg:
            pushmsg("%s booted" % (host.name))

    if message:
        if log:
            log(uname, "msg: %s" % message, service=service)
        if watched and pushmsg:
            pushmsg(message)

    if conn.getstate() != hbdcls.Connection.UP:
        lasts = conn.state
        d = conn.newstate(hbdcls.Connection.UP, now)
        if log:
            log(uname, "%s back after being %s for %s" % (conn.afam, lasts, dur(d)))
        if watched and pushmsg:
            pushmsg("%s %s is back" % (uname, conn.afam))

    if boot or newh:
        host.upcount = host.doesack
    else:
        host.upcount += 1

    if shutdown:
        if log:
            log(uname, "%s shutdown" % conn.afam)
        if watched and pushmsg:
            pushmsg("%s %s shutdown" % (uname, conn.afam))
        conn.newstate(hbdcls.Connection.DOWN, now)

    if interval > 0:
        host.interval = interval

    # send ACK back
    if host.cver < 1:
        opkt = b"ACK"
    else:
        opkt = dicttos("ACK", {"time": time.time()}, host.cver > 1)
    if isinstance(opkt, str):
        # dicttos returns str when not compressing (cver == 1); sendto needs bytes.
        opkt = opkt.encode()
    try:
        transport.sendto(opkt, addr)
    except Exception as e:
        if DEBUG > 0:
            print(("cannot send ack: %s" % e))

    # send any commands we have queued
    while host.cmds:
        # Always consume the head entry, so an entry with an unexpected op
        # cannot keep the loop spinning forever.
        op, rmsg = host.cmds.pop(0)
        if op == "CMD":
            if log:
                log(uname, "command sent")
            if host.cver < 1:
                rmsg = rmsg["cmd"]
        elif op == "UPD":
            if log:
                log(uname, "update initiated")
            if host.cver < 1:
                if log:
                    log(uname, " ver 0 does not support UPD")
                continue

        if host.cver < 1:
            opkt = rmsg if isinstance(rmsg, (bytes, str)) else str(rmsg)
            if isinstance(opkt, str):
                opkt = opkt.encode()
        else:
            opkt = dicttos(op, rmsg, True)
        try:
            transport.sendto(opkt, addr)
        except Exception as e:
            if DEBUG > 0:
                print(("cannot send cmd/update: %s" % e))

    if msg_to_websockets:
        try:
            msg_to_websockets("host", host.stateinfo())
        except Exception:
            # Best effort: a failing broadcast must not break datagram handling,
            # but leave a trace for debugging.
            logger.debug("websocket broadcast failed for %s", uname, exc_info=True)
|
||||
@@ -1,143 +0,0 @@
|
||||
"""WebSocket server and broadcast helpers for hbd.
|
||||
|
||||
Provides an asyncio-based WebSocket server and a thread-safe broadcast
|
||||
function that other threads or synchronous code can call.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
from typing import Callable, Iterable, Optional
|
||||
|
||||
import websockets
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_connections = set()
|
||||
_loop: Optional[asyncio.AbstractEventLoop] = None
|
||||
_get_hosts: Optional[Callable[[], Iterable]] = None
|
||||
_get_msgs: Optional[Callable[[], Iterable]] = None
|
||||
_verbose = False
|
||||
|
||||
|
||||
async def _handler(websocket, path=None):
    """Serve one WebSocket client until it disconnects.

    The client is registered in ``_connections``, receives the current hosts
    and the last 100 messages, and is then kept open; incoming frames are
    ignored (only logged in verbose mode).
    """
    _connections.add(websocket)
    remote_address = websocket.remote_address
    if path is None:
        path = getattr(websocket, "path", None)
    if _verbose:
        logger.info("DBG ws_serve: %s: %s", remote_address, path)

    try:
        # send initial hosts
        if _get_hosts:
            for h in _get_hosts():
                await websocket.send(json.dumps({"type": "host", "data": h}))

        # send recent messages
        if _get_msgs:
            recent = list(_get_msgs())[-100:]
            for m in recent:
                await websocket.send(json.dumps({"type": "message", "data": m}))

        # Keep the connection open until the client disconnects.  No meaningful
        # input is expected besides the initial 'hello' some clients send.
        async for frame in websocket:
            if _verbose:
                logger.debug("received ws data: %s", frame)

    except (
        websockets.exceptions.ConnectionClosedOK,
        websockets.exceptions.ConnectionClosedError,
    ) as e:
        if _verbose:
            logger.info("ws closed: %r", e)
    except Exception as e:
        logger.exception("ws handler exception: %s", e)
    finally:
        _connections.discard(websocket)
        await websocket.wait_closed()
|
||||
|
||||
|
||||
async def start(
    host: str,
    ws_port: int,
    wss_port: Optional[int] = None,
    ssl_context=None,
    get_hosts: Optional[Callable] = None,
    get_msgs: Optional[Callable] = None,
    verbose: bool = False,
):
    """Start the WebSocket server(s) and then wait forever.

    Must be awaited inside the running asyncio event loop.  A plain server is
    always started on *ws_port*; a second, TLS-wrapped one is started on
    *wss_port* when both *wss_port* and *ssl_context* are given.
    """
    global _loop, _get_hosts, _get_msgs, _verbose
    _loop = asyncio.get_running_loop()
    _get_hosts, _get_msgs, _verbose = get_hosts, get_msgs, verbose

    level = logging.DEBUG if verbose else logging.INFO
    logging.getLogger("websockets.server").setLevel(level)

    # regular WebSocket first, secure WebSocket (optional) second
    pending = [websockets.serve(_handler, host, ws_port)]
    if wss_port and ssl_context:
        pending.append(websockets.serve(_handler, host, wss_port, ssl=ssl_context))

    # await starting of all servers
    for server in pending:
        await server

    if _verbose:
        logger.info(
            "WebSocket server(s) started on port %s (wss %s)", ws_port, wss_port
        )

    # block forever (until loop is stopped or cancelled)
    await asyncio.Future()
|
||||
|
||||
|
||||
def broadcast(typ: str, data) -> bool:
    """Thread-safe broadcast helper.

    Serialises ``{"type": typ, "data": data}`` once and schedules a send on
    the stored event loop for every open connection.  Connections that are
    not open, or for which scheduling fails, are dropped from the registry.
    Returns False when no loop has been stored yet, True otherwise.
    """
    if not _loop:
        return False

    jmsg = json.dumps({"type": typ, "data": data})
    stale = []
    for client in list(_connections):
        if client.state == websockets.protocol.State.OPEN:
            try:
                asyncio.run_coroutine_threadsafe(client.send(jmsg), _loop)
            except Exception:
                stale.append(client)
                logger.debug("ws.send exception: closed")
        else:
            stale.append(client)

    for client in stale:
        try:
            asyncio.run_coroutine_threadsafe(client.wait_closed(), _loop)
        except Exception:
            pass
        _connections.discard(client)
    return True
|
||||
|
||||
|
||||
def connection_count() -> int:
    """Return the number of WebSocket connections currently registered."""
    registered = len(_connections)
    return registered
|
||||
-380
@@ -1,380 +0,0 @@
|
||||
"""
|
||||
host and connection class shared between hbd and
|
||||
the website's heartbeat.py
|
||||
|
||||
"""
|
||||
|
||||
import time
|
||||
import json
|
||||
import copy
|
||||
import queue
|
||||
|
||||
num = 0
|
||||
|
||||
MAXRTTS = 10
|
||||
|
||||
DEBUG = 2
|
||||
|
||||
|
||||
def log(host, m):
    """Print a ``class log:`` line for *host* when the module DEBUG flag is set."""
    if not DEBUG:
        return
    print("class log: %s %s" % (host, m))
|
||||
|
||||
|
||||
class Connection:
    """One address-family connection (IPv4 or IPv6) of a monitored host.

    Tracks the current address, the most recent round-trip times, the time of
    the last heartbeat and the connection state together with the time that
    state was entered.
    """

    # map of addrs to names
    htab = {}

    UNKNOWN = "unknown"
    UP = "up"
    DOWN = "down"
    OVERDUE = "overdue"

    def __init__(self, host, cid, addr, afam):
        self.host = host
        self.cid = cid
        # Strip the IPv4-mapped-IPv6 prefix so the plain IPv4 address is stored.
        if addr[0:7] == "::ffff:":
            addr = addr[7:]
        self.addr = addr
        self.afam = afam
        self.rtts = [0]
        self.lastbeat = time.time()
        self.statetime = self.lastbeat
        self.deltastatetime = "computed"
        self.state = Connection.UNKNOWN

        if host:
            Connection.htab[addr] = self.host.name
            if self.host.isDynDns():
                log(self.host.name, "dns update %s" % self.addr)
                Host.dnsQ.put((self.host.name, self.addr))

    def registerDns(self):
        """Queue a DNS update for this connection's current address."""
        Host.dnsQ.put((self.host.name, self.addr))

    def clearstate(self):
        """Return a state dict with every display field set to an empty string."""
        return {
            "addr": "",
            "rtt": "",
            "lastbeat": "",
            "state": "",
            "statetime": "",
            "deltastatetime": "",
            "rttstate": "",
        }

    def statedict(self, Null=False):
        """Return the display fields of this connection as a dict of strings.

        With ``Null=True`` the all-empty dict from ``clearstate`` is returned.
        The same happens for a connection in state UNKNOWN whose last
        heartbeat is more than ten days old.
        """
        d = self.clearstate()
        if Null:
            return d

        now = time.time()
        d["addr"] = self.addr
        if self.rtts[-1]:
            d["rtt"] = "%0.1f" % self.rtts[-1]
        elif self.state == Connection.UNKNOWN:
            d["rtt"] = ""
        else:
            d["rtt"] = "?"
        d["lastbeat"] = self.lastbeat

        if self.state == Connection.OVERDUE:
            d["state"] = "<b>%s</b>" % self.state
        else:
            d["state"] = self.state

        if self.state == Connection.UP:
            d["rttstate"] = d["rtt"]
        elif self.state == Connection.OVERDUE:
            d["rttstate"] = ""
        else:
            d["rttstate"] = d["state"]

        d["statetime"] = time.strftime(
            "%Y-%m-%d %H:%M:%S", time.localtime(self.statetime)
        )
        delta = now - self.statetime

        if self.state == Connection.UNKNOWN:
            d["deltastatetime"] = ""
        elif delta > 86400:
            d["deltastatetime"] = "%0.1f days" % (delta / 86400.0)
        elif delta > 3600:
            # Computed by hand: the "%k" strftime directive used before is a
            # platform extension and not available everywhere.
            secs = int(delta)
            d["deltastatetime"] = "%2d:%02d hrs" % (secs // 3600, (secs % 3600) // 60)
        elif delta > 60:
            d["deltastatetime"] = time.strftime("%M:%S mins", time.gmtime(delta))
        else:
            d["deltastatetime"] = "%i secs" % (delta)

        if self.state == Connection.UNKNOWN and now - self.lastbeat > 86400 * 10:
            d = self.clearstate()

        return d

    def headerdict(self, afam):
        """Return the column titles matching the keys of ``statedict``."""
        return {
            "addr": "%s Addr" % afam,
            "rtt": "Latency",
            "lastbeat": "Last Contact",
            "state": "State",
            "statetime": "Last State",
            "rttstate": "Reach",
            "deltastatetime": "Last State",
        }

    def jsons(self):
        """Return the attributes of this connection as a JSON string.

        The ``host`` back-pointer is replaced by the host's name (None when
        there is no host), because a Host object is not JSON-serialisable.
        """
        state = dict(self.__dict__)
        state["host"] = self.host.name if self.host else None
        return json.dumps(state)

    def newstate(self, state, now, when=0):
        """Enter *state* and return the number of seconds spent in the previous one.

        The new state is recorded as entered at ``now - when``.
        """
        self.state = state
        entered = now - when
        elapsed = entered - self.statetime
        self.statetime = entered
        return elapsed

    def getstate(self):
        return self.state

    def newaddr(self, addr, rtt, now):
        """Record a heartbeat received from *addr* with round-trip time *rtt*.

        Returns a "changed from ... to ..." description when the address
        differs from the stored one, otherwise None.
        """
        self.lastbeat = now
        self.rtts.append(rtt)
        # Keep only the MAXRTTS most recent samples.
        if len(self.rtts) > MAXRTTS:
            del self.rtts[0]

        if self.addr == addr:
            return None

        r = "changed from %s to %s" % (self.addr, addr)
        # The old address may already be gone from the table; that is fine.
        Connection.htab.pop(self.addr, None)
        self.addr = addr
        Connection.htab[addr] = self.host.name
        if self.host.isDynDns():
            Host.dnsQ.put((self.host.name, self.addr))
        return r
|
||||
|
||||
|
||||
#
|
||||
class Host:
|
||||
# Table of Hosts
|
||||
hosts = {}
|
||||
dnsQ = queue.Queue()
|
||||
|
||||
def __init__(self, name):
|
||||
global num
|
||||
self.name = name
|
||||
if name:
|
||||
num += 1
|
||||
Host.hosts[name] = self
|
||||
self.num = num
|
||||
self.dyn = False
|
||||
self.watched = False
|
||||
self.upcount = 0
|
||||
self.interval = 0
|
||||
self.doesack = -1
|
||||
self.cmds = []
|
||||
self.cver = 0
|
||||
self.connections = {}
|
||||
self.hdwcounts = [[0, 0], [0, 0], [0, 0]]
|
||||
|
||||
def statedict(self):
|
||||
d = {}
|
||||
d["name"] = self.name
|
||||
if self.dyn:
|
||||
d["name"] += "*"
|
||||
if self.watched:
|
||||
d["name"] = "<b>%s</b>" % d["name"]
|
||||
d["dyn"] = str(self.dyn)
|
||||
d["ver"] = str(self.cver)
|
||||
d["num"] = self.num
|
||||
for c in ["IPv4", "IPv6"]:
|
||||
if c in self.connections:
|
||||
cs = self.connections[c].statedict()
|
||||
else:
|
||||
cs = ubConnection.statedict(True)
|
||||
for csv in cs:
|
||||
d["%s.%s" % (c, csv)] = cs[csv]
|
||||
|
||||
return d
|
||||
|
||||
def headerdict(self):
|
||||
d = {}
|
||||
d["name"] = "Name"
|
||||
d["dyn"] = "Dyn"
|
||||
d["ver"] = "Ver"
|
||||
d["num"] = "??"
|
||||
for c in ["IPv4", "IPv6"]:
|
||||
cs = ubConnection.headerdict(c)
|
||||
for csv in cs:
|
||||
d["%s.%s" % (c, csv)] = cs[csv]
|
||||
return d
|
||||
|
||||
def registerDns(self):
|
||||
for af in self.connections:
|
||||
self.connections[af].registerDns()
|
||||
|
||||
def stateinfo(self):
|
||||
ddict = {}
|
||||
for d in self.__dict__:
|
||||
if d == "connections":
|
||||
cl = []
|
||||
for c in self.connections:
|
||||
# dirty ugly hack: fix conn to host backpointer
|
||||
cld = copy.deepcopy(self.connections[c].__dict__)
|
||||
cld["host"] = cld["host"].name
|
||||
cl.append(cld)
|
||||
ddict[d] = cl
|
||||
else:
|
||||
ddict[d] = self.__dict__[d]
|
||||
return ddict
|
||||
|
||||
def jsons(self):
|
||||
return json.dumps(self.stateinfo())
|
||||
|
||||
def setcver(self, cver):
|
||||
self.cver = cver
|
||||
|
||||
def isDynDns(self):
|
||||
return self.dyn
|
||||
|
||||
def isIPv4(self, addr):
|
||||
if isinstance(addr, tuple):
|
||||
return addr[0].find(".") > 0
|
||||
else:
|
||||
return addr.find(".") > 0
|
||||
|
||||
def conndata(self, cid, addr, rtt, now):
|
||||
if addr[0:7] == "::ffff:":
|
||||
addr = addr[7:]
|
||||
if self.isIPv4(addr):
|
||||
afam = "IPv4"
|
||||
else:
|
||||
afam = "IPv6"
|
||||
|
||||
if afam not in self.connections:
|
||||
self.connections[afam] = Connection(self, cid, addr, afam)
|
||||
|
||||
conn = self.connections[afam]
|
||||
res = conn.newaddr(addr, rtt, now)
|
||||
return conn, res
|
||||
|
||||
# called when reloading class from pickle, add new fields here
def fixup(self):
    """Normalize stored connection addresses after an object is reloaded.

    Strips the "::ffff:" IPv4-mapped prefix from the address of each
    existing IPv4/IPv6 connection.
    """
    for c in ["IPv4", "IPv6"]:
        if c in self.connections:
            addr = self.connections[c].addr
            if addr[0:7] == "::ffff:":
                self.connections[c].addr = addr[7:]
|
||||
|
||||
# def dispstate(self):
|
||||
# if self.state in ["down", "overdue"]:
|
||||
# state = "<b>%s</b>" % self.state
|
||||
# elif self.state in ["up", "UP"]:
|
||||
# state = ""
|
||||
# for x in list(self.connections.keys()):
|
||||
# try:
|
||||
# state += " %5.1f" % (self.connections[x].rtts[-1])
|
||||
# except:
|
||||
# state += " %5s" % (self.connections[x].rtts[-1])
|
||||
# elif self.state in ["unknown", "UNKNOWN"]:
|
||||
# state = ""
|
||||
# else:
|
||||
# state = "%s" % self.state
|
||||
# return state
|
||||
|
||||
def dispstats(self):
    """Return three HTML <td> cells summarizing acknowledgement statistics.

    * doesack == -1: an "N/A" cell followed by two empty cells.
    * upcount == 0: the raw doesack value followed by two empty cells.
    * otherwise: one cell per entry of hdwcounts[0..2]. Each entry is a pair
      (a, u) and the cell shows
      100 - (doesack - a) * 100 / (upcount - u), rounded to an integer;
      a result of 0 is shown as an empty cell and a zero denominator as "-".
    """
    if self.doesack != -1:
        if self.upcount > 0:
            r = ""
            for v in range(3):
                a, u = self.hdwcounts[v]
                if (self.upcount - u) != 0:
                    vs = "%0.0f" % (
                        100.0 - (((self.doesack - a) * 100.0) / (self.upcount - u))
                    )
                    if vs == "0":
                        vs = ""
                else:
                    # Avoid dividing by zero for this entry.
                    vs = "-"
                r += '<td align="right">%s</td>' % vs
            return r
        else:
            return "<td>(%s)</td><td></td><td></td>" % (self.doesack)
    # The original emitted "<td></td<td></td>>", which is not valid HTML.
    return '<td align="right">N/A</td><td></td><td></td>'
|
||||
|
||||
# Columns of the detailed host table. An entry is either a field key or a
# (field key, HTML attribute string) pair applied to the generated cell.
hostfields_long = [
    "name",
    "IPv4.addr",
    "IPv4.state",
    ("IPv4.rtt", 'style="text-align: right;"'),
    ("IPv4.statetime", 'style="text-align: right;"'),
    "IPv6.addr",
    "IPv6.state",
    ("IPv6.rtt", 'style="text-align: right;"'),
    ("IPv6.statetime", 'style="text-align: right;"'),
    "ver",
]

# Columns of the compact host table (same entry format as above).
hostfields_short = [
    "name",
    ("IPv4.rttstate", 'style="text-align: right;"'),
    ("IPv4.deltastatetime", 'style="text-align: right;"'),
    ("IPv6.rttstate", 'style="text-align: right;"'),
    ("IPv6.deltastatetime", 'style="text-align: right;"'),
]
|
||||
|
||||
def gene(self, tag, v, attrib=None):
    """Wrap `v` in an HTML element.

    Args:
        tag: Element name, e.g. "td".
        v: Content placed between the tags; inserted as-is (not escaped).
        attrib: Optional raw attribute string added to the opening tag.

    Returns:
        The string "<tag attrib>v</tag>".
    """
    a = " %s" % attrib if attrib else ""
    return "<%s%s>%s</%s>" % (tag, a, v, tag)
|
||||
|
||||
def htmltable(self, tag, hd, short):
    """Render one table row (<tr>) from the dict `hd`.

    Args:
        tag: Cell element to use ("th" or "td" in the visible callers).
        hd: Mapping from field key to the value shown in that column.
        short: Truthy selects Host.hostfields_short, otherwise
            Host.hostfields_long.

    Returns:
        The row as an HTML string, cells joined by newlines.
    """
    hostfields = Host.hostfields_short if short else Host.hostfields_long
    h = []
    for f in hostfields:
        if isinstance(f, tuple):
            # (field key, attribute string) pair
            h.append(self.gene(tag, hd[f[0]], f[1]))
        else:
            h.append(self.gene(tag, hd[f]))
    return self.gene("tr", "\n".join(h))
|
||||
|
||||
def buildhosttable(self, short=False):
    """Build the HTML host table as a list of strings.

    The list holds the opening <table> tag, a header row, one row per entry
    of Host.hosts in sorted key order, and the closing tag.

    Args:
        short: Passed through to htmltable() to pick the column set.
    """
    if DEBUG > 1:
        print("DBG buildhosttable: start")
    res = []
    res.append('<table id="ntable" class="sortable">')
    res.append(ubHost.htmltable("th", ubHost.headerdict(), short))
    for h in sorted(Host.hosts.keys()):
        res.append(ubHost.htmltable("td", Host.hosts[h].statedict(), short))
    res.append("</table>")
    if DEBUG > 1:
        print("DBG buildhosttable: %s" % res)
    return res
|
||||
|
||||
def buildmsgtable(self, msgs):
    """Build the "Log of Events" HTML fragment as a list of strings.

    Shows the last max(40 - number of hosts, 3) entries of `msgs`, each
    followed by a <BR>.
    """
    res = []
    le = max(40 - len(Host.hosts), 3)
    res.append("<h4>Log of Events</h4>")
    # le is always >= 3, so a negative slice start is safe here.
    for m in msgs[-le:]:
        res.append("%s<BR>" % m)
    return res
|
||||
|
||||
|
||||
# create fake "unbound objects", remove in Python 3.0
# These placeholder instances are what headerdict(), htmltable() and
# buildhosttable() call their helper methods on.
ubHost = Host(None)
ubConnection = Connection(None, "", "", "")
|
||||
+52
-16
@@ -4,26 +4,58 @@ build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "hbd"
|
||||
version = "5.0.3"
|
||||
description = "Heartbeat daemon (hbd) — receive heartbeats and act on them"
|
||||
version = "5.3.7"
|
||||
description = "Heartbeat monitoring system — client (hbc) and server (hbd)"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
requires-python = ">=3.11"
|
||||
dependencies = [
|
||||
"PyYAML>=6.0",
|
||||
]
|
||||
license = "MIT"
|
||||
keywords = ["heartbeat", "monitoring", "dns", "websocket"]
|
||||
license-files = ["LICENSE.md"]
|
||||
keywords = ["heartbeat", "monitoring", "dns", "websocket", "system-monitoring"]
|
||||
authors = [
|
||||
{ name = "heartbeat contributors" }
|
||||
{ name = "Andreas Wrede" }
|
||||
]
|
||||
classifiers = [
|
||||
"Programming Language :: Python :: 3",
|
||||
"Programming Language :: Python :: 3.11",
|
||||
"Programming Language :: Python :: 3.12",
|
||||
"Programming Language :: Python :: 3.13",
|
||||
"Operating System :: POSIX :: Linux",
|
||||
"Operating System :: POSIX :: BSD",
|
||||
"Topic :: System :: Monitoring",
|
||||
"Topic :: System :: Networking :: Monitoring",
|
||||
]
|
||||
|
||||
dependencies = [
|
||||
"websockets>=13.2",
|
||||
"mattermostdriver>=7.3.0",
|
||||
"PyYAML>=6.0",
|
||||
"aiohttp>=3.8",
|
||||
"Jinja2>=3.1.0",
|
||||
"fastapi>=0.95.0",
|
||||
]
|
||||
[project.urls]
|
||||
Repository = "https://git.wrede.ca/andreas/heartbeat"
|
||||
|
||||
[project.optional-dependencies]
|
||||
# Client-only dependencies (hbc - system monitoring client)
|
||||
client = [
|
||||
"psutil>=5.9.0",
|
||||
]
|
||||
|
||||
# Server-only dependencies (hbd - heartbeat daemon/server)
|
||||
server = [
|
||||
"websockets>=13.2",
|
||||
"mattermostdriver>=7.3.0",
|
||||
"aiohttp>=3.11",
|
||||
"Jinja2>=3.1.6",
|
||||
"matrix-nio>=0.24",
|
||||
"ruamel.yaml>=0.18",
|
||||
]
|
||||
|
||||
# Minimal client — hbc_mini only, no external dependencies
|
||||
mini = []
|
||||
|
||||
# Install both client and server
|
||||
all = [
|
||||
"hbd[client,server]",
|
||||
]
|
||||
|
||||
# Development dependencies
|
||||
dev = [
|
||||
"pytest>=7.0",
|
||||
"pytest-cov>=4.0",
|
||||
@@ -35,15 +67,19 @@ dev = [
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
hbd = "hbd.cli:main"
|
||||
hbc = "hbd.hbc:main"
|
||||
hbd = "hbd.server.cli:main"
|
||||
hbc = "hbd.client.main:main"
|
||||
|
||||
[tool.setuptools]
|
||||
script-files = ["scripts/hb_install.sh", "scripts/hbc_mini.py"]
|
||||
|
||||
[tool.setuptools.packages.find]
|
||||
where = ["."]
|
||||
include = ["hbd*"]
|
||||
|
||||
[tool.setuptools.package-data]
|
||||
"hbd" = ["*.yaml", "static/*", "static/*/*", "templates/*"]
|
||||
"hbd.server" = ["*.yaml", "static/*", "static/*/*", "templates/*"]
|
||||
"hbd.client" = ["*.yaml"]
|
||||
|
||||
|
||||
[tool.black]
|
||||
|
||||
@@ -1,4 +0,0 @@
|
||||
key "rndc-key" {
|
||||
algorithm hmac-md5;
|
||||
secret "qlGa+AYKtyOgWNuozqECMw==";
|
||||
};
|
||||
@@ -0,0 +1,40 @@
|
||||
async def send_sms(hass, user, password, sender_did, call):
    """Send SMS message using multipart form-data like MMS.

    Args:
        hass: Not used by this function; kept for the caller's signature.
        user: API username, sent as "api_username".
        password: API password, sent as "api_password".
        sender_did: Sending number, sent as "did".
        call: Service-call object; call.data must provide "recipient" and
            "message".

    Returns:
        None. Success and failure are reported through the module logger.
    """
    _LOGGER = logging.getLogger(__name__)
    recipient = call.data.get("recipient")
    message = call.data.get("message")

    if not recipient or not message:
        _LOGGER.error("Recipient or message missing.")
        return

    # Build form data dictionary
    form_data = {
        'api_username': str(user),
        'api_password': str(password),
        'did': str(sender_did),
        'dst': str(recipient),
        'message': str(message),
        'method': 'sendSMS'
    }

    async with aiohttp.ClientSession() as session:
        with aiohttp.MultipartWriter("form-data") as mp:
            for key, value in form_data.items():
                part = mp.append(value)
                part.set_content_disposition('form-data', name=key)

            # Routine progress is not an error; the original logged this at
            # ERROR level, which pollutes the error log on every send.
            _LOGGER.debug("voipms_sms: sending SMS to %s", recipient)
            async with session.post(REST_ENDPOINT, data=mp) as response:
                response_text = await response.text()
                if response.status != 200:
                    _LOGGER.error("voipms_sms: Failed to send SMS. Status: %s, Response: %s", response.status, response_text)
                    return

                # A 200 reply is not guaranteed to carry the expected JSON
                # object; report that instead of raising from the handler.
                try:
                    response_json = json.loads(response_text)
                except ValueError:
                    _LOGGER.error("voipms_sms: SMS not sent: %s", response_text)
                    return

                if isinstance(response_json, dict) and response_json.get('status') == "success":
                    _LOGGER.info("voipms_sms: SMS sent successfully: %s", response_text)
                else:
                    _LOGGER.error("voipms_sms: SMS not sent: %s", response_text)
|
||||
|
||||
|
||||
|
||||
@@ -3,11 +3,15 @@
|
||||
set -e
|
||||
uv version --bump patch
|
||||
VER=$(uv version --short)
|
||||
sed -i "" "s/__version__ = \"[0-9.]*\"\(.*\)$/__version__ = \"$VER\"\1/" hbd/__init__.py
|
||||
sed -i".bak" "s/__version__ = \"[0-9.]*\"\(.*\)$/__version__ = \"$VER\"\1/" hbd/__init__.py
|
||||
sed -i".bak" "s/__version__ = \"[0-9.]*\"\(.*\)$/__version__ = \"$VER\"\1/" scripts/hbc_mini.py
|
||||
|
||||
# commit pyproject.toml
|
||||
git commit -m "version $VER" pyproject.toml hbd/__init__.py
|
||||
git commit -m "version $VER" pyproject.toml hbd/__init__.py scripts/hbc_mini.py
|
||||
git push
|
||||
# tag version
|
||||
git tag -a v$VER -m "Version $VER"
|
||||
git push --tags
|
||||
|
||||
rm hbd/__init__.py.bak
|
||||
rm scripts/hbc_mini.py.bak
|
||||
|
||||
@@ -0,0 +1,2 @@
|
||||
hbc_mini
|
||||
hbc_mini_dbg
|
||||
@@ -0,0 +1,21 @@
|
||||
CC ?= cc
|
||||
CFLAGS = -O2 -Wall -Wextra -std=c11
|
||||
LDFLAGS = -lz -lpthread -lm
|
||||
TARGET = hbc_mini
|
||||
SRC = hbc_mini.c
|
||||
|
||||
# FreeBSD/NetBSD keep zlib in base; no extra flags needed.
|
||||
# On some NetBSD installs pthreads may need -lpthread from pkgsrc.
|
||||
|
||||
.PHONY: all clean debug
|
||||
|
||||
all: $(TARGET)
|
||||
|
||||
$(TARGET): $(SRC)
|
||||
$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
|
||||
|
||||
debug: $(SRC)
|
||||
$(CC) -g -fsanitize=address,undefined -o $(TARGET)_dbg $< $(LDFLAGS)
|
||||
|
||||
clean:
|
||||
rm -f $(TARGET) $(TARGET)_dbg
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,390 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Demo script for HTTP API endpoints.
|
||||
Tests and demonstrates the plugin data and alert APIs.
|
||||
"""
|
||||
|
||||
import requests
|
||||
import json
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from time import sleep
|
||||
|
||||
BASE_URL = "http://localhost:50004"
|
||||
|
||||
def print_section(title):
    """Print a formatted section header."""
    rule = '=' * 70
    print(f"\n{rule}")
    print(f" {title}")
    print(rule)
|
||||
|
||||
def format_timestamp(timestamp):
    """Convert Unix timestamp to 'YYYY-MM-DD HH:MM:SS' in local time."""
    return datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')
|
||||
|
||||
def format_duration(seconds):
    """Format duration in human-readable format.

    Uses the two largest applicable units: "Ns", "Nm Ns", "Nh Nm" or
    "Nd Nh". Fractions are truncated.
    """
    if seconds < 60:
        return f"{int(seconds)}s"
    if seconds < 3600:
        minutes = int(seconds / 60)
        secs = int(seconds % 60)
        return f"{minutes}m {secs}s"
    if seconds < 86400:
        hours = int(seconds / 3600)
        minutes = int((seconds % 3600) / 60)
        return f"{hours}h {minutes}m"
    days = int(seconds / 86400)
    hours = int((seconds % 86400) / 3600)
    return f"{days}d {hours}h"
|
||||
|
||||
def test_hosts_api():
    """Test GET /api/0/hosts endpoint.

    Prints one summary entry per host and returns the decoded host list,
    or [] when the request fails.
    """
    print_section("1. List All Monitored Hosts")

    try:
        response = requests.get(f"{BASE_URL}/api/0/hosts", timeout=5)
        response.raise_for_status()
        hosts = response.json()

        print(f"Found {len(hosts)} hosts:\n")
        for host in hosts:
            name = host.get('name', 'unknown')
            # The original printed `ver` without ever assigning it, which
            # raised NameError on the first host.
            # NOTE(review): the key name 'ver' is an assumption - confirm
            # against the /api/0/hosts response schema.
            ver = host.get('ver', '?')
            dyn = host.get('dyn', False)
            conn_count = len(host.get('connections', []))

            print(f" • {name}")
            print(f" - Protocol: IPv{ver}")
            print(f" - Dynamic: {dyn}")
            print(f" - Connections: {conn_count}")

        return hosts

    except requests.RequestException as e:
        print(f"❌ Error: {e}")
        return []
|
||||
|
||||
def test_host_plugins_api(hostname):
    """Test GET /api/0/hosts/{hostname}/plugins endpoint.

    Prints a summary of each plugin (with at most three metrics) and
    returns the list of plugin names, or [] when the request fails.
    """
    print_section(f"2. Get All Plugins for Host: {hostname}")

    try:
        response = requests.get(f"{BASE_URL}/api/0/hosts/{hostname}/plugins", timeout=5)
        response.raise_for_status()
        data = response.json()

        plugins = data.get('plugins', {})
        print(f"Found {len(plugins)} plugins:\n")

        for plugin_name, plugin_data in plugins.items():
            timestamp = plugin_data.get('timestamp', 0)
            sample_count = plugin_data.get('sample_count', 0)
            metrics = plugin_data.get('data', {})

            print(f" 📦 {plugin_name}")
            print(f" Last update: {format_timestamp(timestamp)}")
            print(f" Samples: {sample_count}")
            print(f" Metrics: {len(metrics)}")

            # Show only first 3 metrics
            for metric, value in list(metrics.items())[:3]:
                if isinstance(value, float):
                    print(f" - {metric}: {value:.2f}")
                elif isinstance(value, dict):
                    print(f" - {metric}: [nested data, {len(value)} keys]")
                else:
                    print(f" - {metric}: {value}")

            if len(metrics) > 3:
                print(f" ... and {len(metrics) - 3} more")
            print()

        return list(plugins.keys())

    except requests.RequestException as e:
        print(f"❌ Error: {e}")
        return []
|
||||
|
||||
def test_plugin_detail_api(hostname, plugin_name, limit=5):
    """Test GET /api/0/hosts/{hostname}/plugins/{plugin_name} endpoint.

    Requests up to `limit` samples, prints each with its first five metrics
    (sorted by name) and returns the sample list, or [] when the request
    fails.
    """
    print_section(f"3. Get Detailed Data: {hostname}/{plugin_name}")

    try:
        url = f"{BASE_URL}/api/0/hosts/{hostname}/plugins/{plugin_name}"
        params = {'limit': limit}
        response = requests.get(url, params=params, timeout=5)
        response.raise_for_status()
        data = response.json()

        samples = data.get('samples', [])
        print(f"Retrieved {len(samples)} samples (limit={limit}):\n")

        for i, sample in enumerate(samples):
            timestamp = sample.get('timestamp', 0)
            metrics = sample.get('data', {})

            print(f" [{i+1}] {format_timestamp(timestamp)}")
            for metric, value in sorted(metrics.items())[:5]:  # Show first 5 metrics
                if isinstance(value, float):
                    print(f" {metric}: {value:.2f}")
                elif isinstance(value, dict):
                    print(f" {metric}: [nested: {len(value)} keys]")
                else:
                    print(f" {metric}: {value}")
            print()

        return samples

    except requests.RequestException as e:
        print(f"❌ Error: {e}")
        return []
|
||||
|
||||
def test_host_alerts_api(hostname):
    """Test GET /api/0/hosts/{hostname}/alerts endpoint.

    Prints the alert summary and every alert whose level is not 'OK'.
    Returns the decoded response, or {} when the request fails.
    """
    print_section(f"4. Get Alerts for Host: {hostname}")

    try:
        response = requests.get(f"{BASE_URL}/api/0/hosts/{hostname}/alerts", timeout=5)
        response.raise_for_status()
        data = response.json()

        alerts = data.get('alerts', [])
        summary = data.get('summary', {})

        print("Summary:")
        print(f" ✓ OK: {summary.get('ok', 0)}")
        print(f" ⚠️ Warning: {summary.get('warning', 0)}")
        print(f" 🔴 Critical: {summary.get('critical', 0)}")
        print(f" ❓ Unknown: {summary.get('unknown', 0)}")
        print()

        # Show non-OK alerts
        active_alerts = [a for a in alerts if a.get('level') != 'OK']
        if active_alerts:
            print(f"Active Alerts ({len(active_alerts)}):")
            for alert in active_alerts:
                metric = alert.get('metric_path', 'unknown')
                level = alert.get('level', 'UNKNOWN')
                value = alert.get('last_value', 0)
                since = alert.get('since', 0)
                duration = datetime.now().timestamp() - since

                # Every level other than WARNING gets the critical icon.
                icon = '⚠️' if level == 'WARNING' else '🔴'
                print(f" {icon} {metric}")
                print(f" Level: {level}")
                print(f" Value: {value:.2f}" if isinstance(value, float) else f" Value: {value}")
                print(f" Duration: {format_duration(duration)}")
                print()
        else:
            print("✓ No active alerts - all systems normal!")

        return data

    except requests.RequestException as e:
        print(f"❌ Error: {e}")
        return {}
|
||||
|
||||
def test_all_alerts_api():
    """Test GET /api/0/alerts endpoint.

    Prints the global alert summary and the details of each returned alert.
    Returns the decoded response, or {} when the request fails.
    """
    print_section("5. Get All Active Alerts Across All Hosts")

    try:
        response = requests.get(f"{BASE_URL}/api/0/alerts", timeout=5)
        response.raise_for_status()
        data = response.json()

        alerts = data.get('alerts', [])
        summary = data.get('summary', {})
        host_count = data.get('host_count', 0)

        print(f"Monitoring {host_count} hosts")
        print(f"Active Alerts: {summary.get('total', 0)}")
        print(f" 🔴 Critical: {summary.get('critical', 0)}")
        print(f" ⚠️ Warning: {summary.get('warning', 0)}")
        print()

        if alerts:
            print("Alert Details:")
            for alert in alerts:
                hostname = alert.get('hostname', 'unknown')
                metric = alert.get('metric_path', 'unknown')
                level = alert.get('level', 'UNKNOWN')
                value = alert.get('last_value', 0)
                since = alert.get('since', 0)
                duration = datetime.now().timestamp() - since
                notification_count = alert.get('notification_count', 0)

                # Every level other than WARNING gets the critical icon.
                icon = '⚠️' if level == 'WARNING' else '🔴'
                print(f" {icon} {hostname} / {metric}")
                print(f" Level: {level}")
                print(f" Value: {value:.2f}" if isinstance(value, float) else f" Value: {value}")
                print(f" Duration: {format_duration(duration)}")
                print(f" Notifications: {notification_count}")
                print()
        else:
            print("✅ All systems normal - no active alerts!")

        return data

    except requests.RequestException as e:
        print(f"❌ Error: {e}")
        return {}
|
||||
|
||||
def test_messages_api():
    """Test GET /api/0/messages endpoint.

    Prints the last five messages and returns the full decoded list, or []
    when the request fails.
    """
    print_section("6. Get Recent Messages")

    try:
        response = requests.get(f"{BASE_URL}/api/0/messages", timeout=5)
        response.raise_for_status()
        messages = response.json()

        print(f"Last {len(messages)} messages:\n")
        for msg in messages[-5:]:  # Show last 5
            timestamp = msg.get('time', 0)
            host = msg.get('host', 'unknown')
            text = msg.get('msg', '')

            print(f" [{format_timestamp(timestamp)}] {host}: {text}")

        return messages

    except requests.RequestException as e:
        print(f"❌ Error: {e}")
        return []
|
||||
|
||||
def test_error_handling():
    """Test API error handling.

    Requests a non-existent host and a non-existent plugin (on the first
    listed host) and reports whether each returns HTTP 404.
    """
    print_section("7. Error Handling Tests")

    # Test non-existent host
    print("Testing non-existent host...")
    try:
        response = requests.get(f"{BASE_URL}/api/0/hosts/nonexistenthost/plugins", timeout=5)
        if response.status_code == 404:
            error_data = response.json()
            print(f" ✓ Correctly returned 404: {error_data.get('error', 'No error message')}")
        else:
            print(f" ⚠️ Unexpected status code: {response.status_code}")
    except Exception as e:
        print(f" ❌ Error: {e}")

    # Test non-existent plugin
    print("\nTesting non-existent plugin...")
    try:
        # Get first host
        hosts = requests.get(f"{BASE_URL}/api/0/hosts", timeout=5).json()
        if hosts:
            hostname = hosts[0]['name']
            response = requests.get(
                f"{BASE_URL}/api/0/hosts/{hostname}/plugins/nonexistentplugin",
                timeout=5
            )
            if response.status_code == 404:
                error_data = response.json()
                print(f" ✓ Correctly returned 404: {error_data.get('error', 'No error message')}")
            else:
                print(f" ⚠️ Unexpected status code: {response.status_code}")
    except Exception as e:
        print(f" ❌ Error: {e}")
|
||||
|
||||
def demo_monitoring_loop():
    """Demonstrate continuous monitoring.

    Polls /api/0/alerts five times, three seconds apart, printing one
    status line per poll. Stops early on Ctrl+C or on any error.
    """
    print_section("8. Continuous Monitoring Demo (5 iterations)")

    print("Monitoring alerts every 3 seconds (Ctrl+C to stop)...\n")

    try:
        for i in range(5):
            response = requests.get(f"{BASE_URL}/api/0/alerts", timeout=5)
            response.raise_for_status()
            data = response.json()

            summary = data.get('summary', {})
            critical = summary.get('critical', 0)
            warning = summary.get('warning', 0)

            timestamp = datetime.now().strftime('%H:%M:%S')
            if critical > 0:
                status = "🔴 CRITICAL"
            elif warning > 0:
                status = "⚠️ WARNING"
            else:
                status = "✅ OK"

            print(f"[{timestamp}] {status} - Critical: {critical}, Warning: {warning}")

            if i < 4:  # Don't sleep after last iteration
                sleep(3)

    except KeyboardInterrupt:
        print("\n\nMonitoring stopped by user")
    except Exception as e:
        print(f"\n❌ Error: {e}")
|
||||
|
||||
def main():
    """Run all API tests.

    Exits with status 1 when the API cannot be reached; returns early when
    no hosts are reported.
    """
    # Build the banner from a fixed inner width so the box edges line up.
    width = 62
    print()
    print("╔" + "═" * width + "╗")
    print("║" + "Heartbeat Daemon HTTP API Demo & Test Suite".center(width) + "║")
    print("╚" + "═" * width + "╝")
    print()

    print(f"Testing API at: {BASE_URL}")
    print("Ensure the heartbeat daemon is running!")

    # Test basic connectivity
    try:
        response = requests.get(f"{BASE_URL}/api/0/hosts", timeout=2)
        response.raise_for_status()
        print("✅ API is reachable\n")
    except Exception as e:
        print(f"❌ Cannot connect to API: {e}")
        print("\nPlease ensure:")
        print(" 1. Heartbeat daemon is running")
        print(" 2. HTTP server is enabled in configuration")
        print(f" 3. Server is listening on port {BASE_URL.split(':')[-1]}")
        sys.exit(1)

    # Run test suite
    hosts = test_hosts_api()

    if not hosts:
        print("\n⚠️ No hosts found. Ensure clients are sending heartbeats.")
        return

    # Pick first host for detailed testing
    hostname = hosts[0].get('name', '')

    if hostname:
        plugins = test_host_plugins_api(hostname)

        if plugins:
            # Test detailed plugin data
            test_plugin_detail_api(hostname, plugins[0], limit=3)

        # Test alert endpoints
        test_host_alerts_api(hostname)

    # Test global endpoints
    test_all_alerts_api()
    test_messages_api()

    # Test error handling
    test_error_handling()

    # Continuous monitoring demo
    demo_monitoring_loop()

    print_section("Test Suite Complete")
    # Derive the URLs from BASE_URL so they stay right if it is changed.
    print()
    print("Next Steps:")
    print(f"• View the web UI at {BASE_URL}/live")
    print(f"• Check plugin metrics at {BASE_URL}/plugins")
    print(f"• Monitor alerts at {BASE_URL}/alerts")
    print("• Read API documentation: docs/HTTP_API.md")
    print()
|
||||
|
||||
# Run the demo when executed as a script; Ctrl+C ends it with exit status 0.
if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        print("\n\nDemo interrupted by user")
        sys.exit(0)
|
||||
@@ -0,0 +1,320 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Demonstration of the threshold alerting system.
|
||||
|
||||
This script shows how thresholds work by simulating plugin data
|
||||
with values that cross various threshold boundaries.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from hbd.threshold import ThresholdChecker, AlertLevel
|
||||
|
||||
|
||||
def demo_basic_thresholds():
    """Demonstrate basic threshold checking.

    Feeds a sequence of CPU values through a ThresholdChecker configured
    with warning/critical limits and prints the resulting state, state
    changes and notifications.
    """
    print("=" * 70)
    print("DEMO 1: Basic Threshold Checking")
    print("=" * 70)

    config = {
        "thresholds": {
            "cpu_monitor": {
                "cpu_percent": {
                    "warning": 80.0,
                    "critical": 90.0,
                    "operator": ">",
                    "hysteresis": 0.1,
                }
            }
        }
    }

    notifications = []

    def notifier(msg):
        # Record the notification and echo it so the demo output shows it.
        notifications.append(msg)
        print(f" 📧 NOTIFICATION: {msg}")

    checker = ThresholdChecker(config, notification_callback=notifier)
    alert_states = {}

    # Simulate CPU values over time
    test_values = [
        (50.0, "Normal operation"),
        (85.0, "Crosses WARNING threshold"),
        (87.0, "Still in WARNING"),
        (95.0, "Escalates to CRITICAL"),
        (92.0, "Still CRITICAL (in hysteresis)"),
        (85.0, "Still CRITICAL (above recovery threshold of 81)"),
        (79.0, "Recovers to OK"),
        (50.0, "Back to normal"),
    ]

    print("\nSimulating CPU usage over time:")
    print("-" * 70)

    for value, description in test_values:
        print(f"\n📊 CPU: {value}% - {description}")

        plugin_data = {"cpu_percent": value}
        state_changes = checker.check_plugin_data(
            host_name="testhost",
            plugin_name="cpu_monitor",
            data=plugin_data,
            alert_states=alert_states,
        )

        current_state = alert_states.get("cpu_monitor.cpu_percent")
        if current_state:
            print(f" Current state: {current_state.level.name}")

        if state_changes:
            for _metric, old_level, new_level, _val in state_changes:
                print(f" ⚠️ State change: {old_level.name} → {new_level.name}")

    print(f"\n📈 Summary: {len(notifications)} notifications sent")
    print("=" * 70)
|
||||
|
||||
|
||||
def demo_multiple_metrics():
    """Demonstrate monitoring multiple metrics.

    Runs several scenarios of CPU and memory values through one
    ThresholdChecker and prints the alert summary and the active alerts
    after each scenario.
    """
    print("\n\n" + "=" * 70)
    print("DEMO 2: Multiple Metrics and Alert Summary")
    print("=" * 70)

    config = {
        "thresholds": {
            "cpu_monitor": {
                "cpu_percent": {"warning": 80.0, "critical": 90.0},
                "load_1min": {"warning": 4.0, "critical": 8.0},
            },
            "memory_monitor": {
                "percent": {"warning": 85.0, "critical": 95.0},
                "available_mb": {
                    "warning": 1000,
                    "critical": 500,
                    "operator": "<",
                },
            },
        }
    }

    notifications = []
    checker = ThresholdChecker(config, notification_callback=lambda m: notifications.append(m))
    alert_states = {}

    # Simulate problematic system state
    print("\nSimulating a system under load:")
    print("-" * 70)

    scenarios = [
        {
            "name": "Initial state - all OK",
            "cpu_monitor": {"cpu_percent": 50.0, "load_1min": 2.0},
            "memory_monitor": {"percent": 60.0, "available_mb": 2000},
        },
        {
            "name": "CPU spikes to WARNING",
            "cpu_monitor": {"cpu_percent": 85.0, "load_1min": 2.0},
            "memory_monitor": {"percent": 60.0, "available_mb": 2000},
        },
        {
            "name": "Memory also reaches WARNING",
            "cpu_monitor": {"cpu_percent": 85.0, "load_1min": 2.0},
            "memory_monitor": {"percent": 88.0, "available_mb": 800},
        },
        {
            "name": "CPU escalates to CRITICAL",
            "cpu_monitor": {"cpu_percent": 95.0, "load_1min": 5.0},
            "memory_monitor": {"percent": 88.0, "available_mb": 800},
        },
        {
            "name": "System recovering",
            "cpu_monitor": {"cpu_percent": 70.0, "load_1min": 2.0},
            "memory_monitor": {"percent": 65.0, "available_mb": 1500},
        },
    ]

    for scenario in scenarios:
        print(f"\n📍 {scenario['name']}")

        # Check CPU metrics first, then memory metrics
        for plugin in ("cpu_monitor", "memory_monitor"):
            checker.check_plugin_data(
                "testhost",
                plugin,
                scenario[plugin],
                alert_states
            )

        # Show alert summary
        summary = checker.get_alert_summary(alert_states)
        print(f" Alerts: OK={summary['ok']}, WARNING={summary['warning']}, CRITICAL={summary['critical']}")

        # Show active alerts
        active = checker.get_active_alerts(alert_states)
        if active:
            print(" Active alerts:")
            for alert in active:
                print(f" - {alert.metric_path}: {alert.level.name} (value={alert.last_value})")

    print(f"\n📈 Total notifications sent: {len(notifications)}")
    print("=" * 70)
|
||||
|
||||
|
||||
def demo_hysteresis():
    """Demonstrate hysteresis effect.

    Feeds CPU values that hover around the critical limit through a
    ThresholdChecker and prints, for each value, the current state and
    whether the check reported a state change.
    """
    print("\n\n" + "=" * 70)
    print("DEMO 3: Hysteresis Prevents Flapping")
    print("=" * 70)

    config = {
        "thresholds": {
            "cpu_monitor": {
                "cpu_percent": {
                    "warning": 80.0,
                    "critical": 90.0,
                    "hysteresis": 0.1,  # 10% hysteresis
                }
            }
        }
    }

    notifications = []
    checker = ThresholdChecker(config, notification_callback=lambda m: notifications.append(m))
    alert_states = {}

    print("\nCritical threshold: 90%")
    print("Hysteresis: 10%")
    print("Recovery threshold: 81% (90 - 10% of 90)")
    print("\nSimulating CPU fluctuating near CRITICAL threshold:")
    print("-" * 70)

    # Simulate fluctuating values
    test_values = [
        (75.0, "Normal"),
        (92.0, "Crosses CRITICAL"),
        (88.0, "Drops but still above 81% (stays CRITICAL)"),
        (86.0, "Still above 81% (stays CRITICAL)"),
        (83.0, "Still above 81% (stays CRITICAL)"),
        (80.0, "Below 81% - recovers to OK"),
        (88.0, "Rises again but below 90% (stays OK)"),
        (91.0, "Crosses CRITICAL again"),
    ]

    for value, description in test_values:
        print(f"\n📊 CPU: {value:5.1f}% - {description}")

        plugin_data = {"cpu_percent": value}
        state_changes = checker.check_plugin_data(
            "testhost",
            "cpu_monitor",
            plugin_data,
            alert_states,
        )

        # Guard against a missing entry, as demo_basic_thresholds() does;
        # dict.get() returns None when the checker recorded no state.
        current_state = alert_states.get("cpu_monitor.cpu_percent")
        if current_state:
            print(f" State: {current_state.level.name}")

        if state_changes:
            print(" 📧 Notification sent (state changed)")
        else:
            print(" ✓ No notification (state unchanged - hysteresis working)")

    print(f"\n📈 Notifications sent: {len(notifications)} (without hysteresis would be ≥6)")
    print("=" * 70)
|
||||
|
||||
|
||||
def demo_inverse_threshold():
    """Demonstrate inverse thresholds (less than).

    Uses the "<" operator so alerts are raised when available memory drops
    below the configured limits, and prints the state after each sample.
    """
    print("\n\n" + "=" * 70)
    print("DEMO 4: Inverse Thresholds (Alert When Low)")
    print("=" * 70)

    config = {
        "thresholds": {
            "memory_monitor": {
                "available_mb": {
                    "warning": 1000,  # Warn when < 1000 MB
                    "critical": 500,  # Critical when < 500 MB
                    "operator": "<",
                    "hysteresis": 0.1,
                }
            }
        }
    }

    notifications = []
    checker = ThresholdChecker(config, notification_callback=lambda m: notifications.append(m))
    alert_states = {}

    print("\nMonitoring available memory (alert when LOW):")
    print("WARNING when < 1000 MB, CRITICAL when < 500 MB")
    print("-" * 70)

    test_values = [
        (2000, "Plenty of memory"),
        (800, "Drops below 1000 MB - WARNING"),
        (450, "Drops below 500 MB - CRITICAL"),
        (520, "Rises but still in hysteresis zone - stays CRITICAL"),
        (600, "Enough recovery - back to WARNING"),
        (1200, "Fully recovered - OK"),
    ]

    for value, description in test_values:
        print(f"\n💾 Available: {value} MB - {description}")

        plugin_data = {"available_mb": value}
        state_changes = checker.check_plugin_data(
            "testhost",
            "memory_monitor",
            plugin_data,
            alert_states,
        )

        # Guard against a missing entry, as demo_basic_thresholds() does;
        # dict.get() returns None when the checker recorded no state.
        current_state = alert_states.get("memory_monitor.available_mb")
        if current_state:
            print(f" State: {current_state.level.name}")

        if state_changes:
            for _metric, old_level, new_level, _val in state_changes:
                print(f" 📧 {old_level.name} → {new_level.name}")

    print(f"\n📈 Notifications sent: {len(notifications)}")
    print("=" * 70)
|
||||
|
||||
|
||||
# Run the four demos in order when executed as a script.
if __name__ == "__main__":
    print("\n")
    print("╔" + "═" * 68 + "╗")
    print("║" + " " * 15 + "THRESHOLD ALERTING DEMONSTRATION" + " " * 21 + "║")
    print("╚" + "═" * 68 + "╝")

    demo_basic_thresholds()
    demo_multiple_metrics()
    demo_hysteresis()
    demo_inverse_threshold()

    print("\n\n" + "=" * 70)
    print("DEMONSTRATION COMPLETE")
    print("=" * 70)
    print("\nKey takeaways:")
    print(" • Thresholds detect when metrics exceed configured limits")
    print(" • Notifications sent only on state changes, not every check")
    print(" • Hysteresis prevents alert flapping")
    print(" • Supports both 'greater than' and 'less than' thresholds")
    print(" • Multiple metrics can be monitored simultaneously")
    print("\nFor full documentation, see docs/THRESHOLD_ALERTING.md")
    print("=" * 70)
    print()
|
||||
Executable
+115
@@ -0,0 +1,115 @@
|
||||
#!/bin/sh

# Helper script to install the heartbeat tools.
#
# Usage: hb_install.sh [client|server|mini] [HA]
#
#   $1  What to install. Defaults to "client" (the heartbeat client, hbc).
#       "server" installs the server (hbd). "mini" downloads the single-file
#       hbc_mini script instead of installing the package.
#   $2  The literal word "HA". It is appended by the Home Assistant
#       re-dispatch below to mark a run inside the homeassistant container.
#
# The package is installed into a python virtual environment named hbd under
# ~/venvs (or /config/venvs on Home Assistant). The hbd or hbc command is then
# symlinked into ~/bin or ~/.local/bin (or /config/bin on Home Assistant).
# If the virtual environment already exists, it is reused. Existing symlinks
# are removed before new ones are created.

set -e

what=${1:-client}
on_ha=0
where=""
venv=""
if [ "$2" = "HA" ]; then
  on_ha=1
fi

if [ -d /homeassistant ]; then # if running from HA command line
  echo "HA, running \"docker exec homeassistant /config/bin/hb_install.sh $*\""
  # Pass the already-defaulted $what explicitly: forwarding a bare, empty $@
  # would make "HA" the first argument of the inner run, which would then be
  # taken as the thing to install.
  # The command is tested directly because, under `set -e`, a separate
  # `rc=$?` check after a failing command is never reached.
  if ! docker exec homeassistant /config/bin/hb_install.sh "$what" HA; then
    echo "Failed to install heartbeat in HA, please check the logs for more details"
    exit 1
  fi
  exit 0
fi
|
||||
|
||||
# Decide where executables ($where) and virtual environments ($venv) live.
# In sh, `A || B && C` groups left to right as `(A || B) && C`; the braces
# only make that existing grouping explicit.
if { [ "$on_ha" -eq 1 ] || [ -r /.dockerenv ]; } && [ -d /config/bin ]; then
  # Installing under docker on Home Assistant OS, using /config/bin for
  # executables and /config/venvs for virtual environments
  echo "Home Assistant OS detected, installing under docker"
  where="/config/bin"
  venv="/config/venvs"
else
  # Take the first candidate that exists on disk AND is listed in PATH.
  # The `case` match compares the path literally; a grep pattern would treat
  # characters such as the "." in ".local" as regex wildcards.
  where="notset"
  for candidate in "$HOME/bin" "$HOME/.local/bin"; do
    [ -d "$candidate" ] || continue
    case ":$PATH:" in
      *":$candidate:"*)
        where=$candidate
        break
        ;;
    esac
  done
  if [ "$where" = "notset" ]; then
    echo "No suitable bin directory found in PATH, please add either $HOME/.local/bin or $HOME/bin to your PATH"
    exit 1
  fi
  # "mini" is a single downloaded script and needs no virtual environment.
  if [ "$what" = "mini" ]; then
    venv=""
  else
    venv="$HOME/venvs"
  fi
fi
echo "Installing $what to $where"
if [ -n "$venv" ]; then
  echo "Using virtual environment at $venv/hbd"
fi
|
||||
|
||||
# Create the virtual environment $venv/hbd on first use. An existing one is
# left untouched and reused.
if [ -n "$venv" ] && [ ! -d "$venv/hbd" ]; then
  arg=""
  # Probe with POSIX redirections only. `&>` is a bash extension: a strict
  # POSIX sh such as dash reads `cmd &> file` as "run cmd in the background,
  # then truncate file", so the probe would always report success.
  if python3 -c "import pip" >/dev/null 2>&1; then
    have_pip="Installed"
  else
    have_pip="Not Installed"
    # some systems do not have pip installed by default, so we need to fetch
    # get-pip.py and install pip
    echo "pip is not installed, fetching get-pip.py and installing pip"
    arg="--without-pip"
  fi
  mkdir -p "$venv"
  if python3 -c "import venv" >/dev/null 2>&1; then
    # ${arg:+...} adds the option only when it is non-empty.
    python3 -m venv "$venv/hbd" --system-site-packages ${arg:+"$arg"}
  else
    if [ "$have_pip" = "Not Installed" ]; then
      echo "python has no venv, and no pip to install virtualenv, cannot continue"
      exit 1
    fi
    echo "python venv module not found, installing virtualenv"
    python3 -m pip install --user virtualenv
    python3 -m virtualenv "$venv/hbd" --system-site-packages ${arg:+"$arg"}
  fi
  . "$venv/hbd/bin/activate"
  if [ -n "$arg" ]; then
    # Download to a private temp file instead of ./get-pip.py, and check the
    # result explicitly: a failing left-hand side of `a && b` is exempt from
    # `set -e`, so a failed download would otherwise go unnoticed.
    get_pip=$(mktemp)
    if ! curl -fsSL https://bootstrap.pypa.io/get-pip.py -o "$get_pip" ||
      ! python3 "$get_pip"; then
      rm -f "$get_pip"
      # Remove the half-built environment so the next run starts over rather
      # than reusing a venv that has no pip.
      rm -rf -- "${venv:?}/hbd"
      echo "Failed to bootstrap pip into $venv/hbd" >&2
      exit 1
    fi
    rm -f "$get_pip"
  fi
  deactivate
fi
|
||||
|
||||
# Activate the virtual environment (when one is in use) so that python3 and
# pip below resolve inside it.
if [ -n "$venv" ]; then
  . "$venv/hbd/bin/activate"
fi
if [ "$what" = "mini" ]; then
  # Download to a temporary name and move it into place only on success.
  # -f makes curl fail on an HTTP error instead of saving the error page,
  # which would otherwise be marked executable as hbc_mini.
  mini_tmp="$where/hbc_mini.$$"
  if ! curl -fsS -o "$mini_tmp" https://git.wrede.ca/andreas/heartbeat/raw/branch/master/scripts/hbc_mini.py; then
    rm -f "$mini_tmp"
    echo "Failed to download hbc_mini" >&2
    exit 1
  fi
  chmod +x "$mini_tmp"
  mv -f "$mini_tmp" "$where/hbc_mini"
else
  # The requirement is quoted: unquoted, hbd[...] is a glob pattern and would
  # be replaced by any matching file name in the current directory.
  python3 -m pip install --upgrade --index-url https://git.wrede.ca/api/packages/andreas/pypi/simple/ --extra-index-url https://pypi.org/simple "hbd[$what]"
fi
|
||||
|
||||
# link_tool NAME DIR
# Symlink the executable NAME found in PATH to DIR/NAME, replacing any
# existing entry. Returns 1, leaving DIR untouched, when NAME is not in PATH.
# The target is resolved BEFORE anything is removed: with an unresolved
# target, `ln -sf "" DIR/NAME` style calls degrade to a one-argument `ln`
# that drops a stray link into the current directory.
link_tool() {
  tool_path=$(command -v "$1") || {
    echo "Cannot find $1 in PATH, not linking it to $2" >&2
    return 1
  }
  # NAME already resolves to DIR/NAME itself: replacing it would delete the
  # file and leave a symlink pointing at itself.
  if [ "$tool_path" = "$2/$1" ]; then
    return 0
  fi
  rm -f "$2/$1"
  ln -sf "$tool_path" "$2/$1"
}

if [ -n "$venv" ]; then
  echo "linking executables to $where"
  if [ "$what" = "server" ]; then
    link_tool hbd "$where"
  elif [ "$what" = "client" ]; then
    link_tool hbc "$where"
  fi
  # Best effort: the tools themselves are installed at this point, so a
  # missing hb_install.sh only produces the warning printed by link_tool.
  link_tool hb_install.sh "$where" || true
fi
echo "Installation complete. To upgrade, run the following:"
echo " $where/hb_install.sh $what"
echo "To install on another machine, run the following obtain the install script and run it:"
echo "from https://git.wrede.ca/andreas/heartbeat/raw/branch/master/scripts/hb_install.sh"
echo "and then run sh hb_install.sh [mini|client]"
|
||||
Executable
+1203
File diff suppressed because it is too large
Load Diff
@@ -1,15 +0,0 @@
|
||||
#!/bin/sh

# Install hbd/hbc with pip from the heartbeat git repository into the
# ~/venvs/hbd virtual environment and create symlinks for hbd and hbc in ~/bin.

set -e
if [ ! -d ~/venvs/hbd ]; then
  mkdir -p ~/venvs
  python3 -m venv ~/venvs/hbd --system-site-packages
fi
. ~/venvs/hbd/bin/activate
pip install 'git+ssh://git@git.wrede.ca/andreas/heartbeat.git'

# ~/bin may not exist yet; without it the links below cannot be created.
mkdir -p ~/bin
for tool in hbd hbc; do
  # Resolve the target before touching ~/bin: with an empty target, ln would
  # be called with a single argument and create a stray link in the current
  # directory instead.
  tool_path=$(command -v "$tool") || {
    echo "Cannot find $tool after installation" >&2
    exit 1
  }
  rm -f ~/bin/"$tool"
  ln -sf "$tool_path" ~/bin/"$tool"
done
|
||||
Executable
+4
@@ -0,0 +1,4 @@
|
||||
#!/bin/sh

# Prints one fixed WARNING status line listing 12 apps that require an update.
# The OK variant is kept commented out below.
#echo "OK - all is well"
printf '%s\n' "WARNING - 12 apps require update: calendar->6.2.2 ,call_summary_bot->3.3.0 ,collectives->4.2.0 ,contacts->8.3.7 ,forum->0.36.0 ,mail->5.7.6 ,news->28.1.0 ,notes->4.13.1 ,notify_push->1.3.1 ,ownershiptransfer->1.4.0 ,richdocuments->9.0.5 ,spreed->22.0.10"
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user