Compare commits
322 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 313bbd37ac | |||
| f7320644f3 | |||
| 76e11b92f2 | |||
| d39c0da5fe | |||
| 832b9d04d8 | |||
| 44d5f15a67 | |||
| 37b8e35a26 | |||
| fa317a3b78 | |||
| 8729fe7038 | |||
| f4231dd5f3 | |||
| c47576637f | |||
| 2b9523ec28 | |||
| 610ad0af30 | |||
| 69b5b410ed | |||
| 8b2b0fd9d0 | |||
| 756b2323be | |||
| 6e7156b42d | |||
| 928035df50 | |||
| 0f90be659e | |||
| 4160e34a96 | |||
| 6430d2ddf3 | |||
| 4b87a90e76 | |||
| 450814daca | |||
| e7786ac5da | |||
| fed71d97d6 | |||
| ba96da9622 | |||
| 7f17ddc2ff | |||
| 7750c5a303 | |||
| e58530df7d | |||
| fe7143759c | |||
| 236b40cfe4 | |||
| 4e5bafd26c | |||
| 817ae064af | |||
| a00282913b | |||
| d699a29fa9 | |||
| 4ce7eacfdd | |||
| 1cefc2676e | |||
| 668a135e53 | |||
| 59e256a042 | |||
| 708508157f | |||
| f67fa9baff | |||
| 588eb2a792 | |||
| b907343e36 | |||
| e50a3996ae | |||
| e1056a0365 | |||
| 1dbe0f8e64 | |||
| 12e8812070 | |||
| 9b5d8ac9b1 | |||
| 500d256d76 | |||
| a7a45bf8c3 | |||
| 3e9b052f71 | |||
| 7444262985 | |||
| 3401cc0dbb | |||
| ab0132a38d | |||
| 9e389736f8 | |||
| b64a2a9313 | |||
| a52744a448 | |||
| 5e2b04b811 | |||
| 8e07b09d7e | |||
| 653e018e4f | |||
| c7326da7d9 | |||
| 0426a75d8c | |||
| 539f25d877 | |||
| 3e3099fc6d | |||
| c9f15a3f1c | |||
| 6e396ad760 | |||
| 2800de0b4a | |||
| 15f7e6a64d | |||
| 9768d13b88 | |||
| 8640d731aa | |||
| de81751e59 | |||
| 60c692cefc | |||
| 9a0baf3c78 | |||
| 55bdb9593a | |||
| 2009626fb4 | |||
| 18769afd37 | |||
| 31db5cf35e | |||
| 326f53f23d | |||
| 4f9bc8c868 | |||
| 259b4a3594 | |||
| 8646f68957 | |||
| a4a6c1e3d9 | |||
| 0e8250362e | |||
| 2f5da9fc5e | |||
| 87aeec5999 | |||
| f24500a6b5 | |||
| a7bb183222 | |||
| 8207cd7b5f | |||
| 11f1eefa8c | |||
| 62f496e9f8 | |||
| aef9e7769b | |||
| 58c2b9d996 | |||
| 2e8bcb630d | |||
| 338711181b | |||
| 43487f17e7 | |||
| 40205bf5c7 | |||
| b95f1a5bb7 | |||
| 12f7eb722b | |||
| 217bba1b76 | |||
| 967e05ed74 | |||
| c20245b0ab | |||
| b9db0c552e | |||
| 05045bafa2 | |||
| 39f1b5de30 | |||
| b06de6fdd3 | |||
| 940d0af35e | |||
| d6d31aa2e3 | |||
| 76edfe7577 | |||
| d190029728 | |||
| b8307e7a9d | |||
| a2fdf091f5 | |||
| 1914e6f28e | |||
| 82cbce9615 | |||
| dbb779b013 | |||
| ca908ee967 | |||
| 73c697b6c5 | |||
| 3e2357380b | |||
| cc4a103bae | |||
| 53fb10fdf5 | |||
| 2df2ad18c9 | |||
| b81a0d2a6c | |||
| 1a19088cfe | |||
| 172f6e950f | |||
| 4349ae217a | |||
| b3aa7b585f | |||
| 88a3c09b51 | |||
| 0504402a8a | |||
| ca58c18802 | |||
| 1ddc4b8132 | |||
| 5e1720ed32 | |||
| 77f127fe60 | |||
| 54fbd8d73d | |||
| 7ab17e26e2 | |||
| 28f5fa951c | |||
| 37f1c58969 | |||
| f006077a71 | |||
| d9fc8d632f | |||
| f640574e4f | |||
| 9a19424279 | |||
| ca8ba84e65 | |||
| f3d08d1c9e | |||
| 1e4263b793 | |||
| e931acb9f5 | |||
| 018409e71d | |||
| 1824f637b4 | |||
| a534c06b26 | |||
| d7b5c97a4e | |||
| ae447ac4a6 | |||
| d44ce3d124 | |||
| b1985d0eb2 | |||
| de778f680f | |||
| d7b368c7c6 | |||
| e790663f9f | |||
| 475319e248 | |||
| ca5ef384a8 | |||
| c93dbdc0f4 | |||
| 3a546a1e5c | |||
| 74c89d098c | |||
| 3301dbfe34 | |||
| d00d903e7d | |||
| babb5d61aa | |||
| 11d1c718b3 | |||
| a99b6b54c7 | |||
| 8da3d550eb | |||
| a76d0fc840 | |||
| 94cbb31c48 | |||
| ae60844a8a | |||
| 49fa310361 | |||
| 28e2180f7b | |||
| ce0590f015 | |||
| f50acca509 | |||
| 72fc82b91f | |||
| 46f8c32c0b | |||
| 691f62aa69 | |||
| cffc9805f9 | |||
| 917d6a401b | |||
| 2bd3a9beb6 | |||
| 5523c60866 | |||
| ab37ac7194 | |||
| f811a19d80 | |||
| 6239825f43 | |||
| b56245bb23 | |||
| 331c4e804d | |||
| 9fd945a481 | |||
| 26df08eeff | |||
| 5819dd6b25 | |||
| 6fb67f8615 | |||
| e70ae6f176 | |||
| a77f6d380c | |||
| 6aae2a1dab | |||
| 85ee0e1040 | |||
| c4f09e9ced | |||
| 64710fd4cd | |||
| 1f5e7465a3 | |||
| b290b21e23 | |||
| 65c4267847 | |||
| 462a445235 | |||
| 368e178f93 | |||
| 6905bf266a | |||
| b6dcce4f35 | |||
| e6436fc236 | |||
| c5ce41762e | |||
| 26ca0c095f | |||
| 1eecd67594 | |||
| caf3c2c0ac | |||
| 9af4006097 | |||
| ddf7067d13 | |||
| 505353a8a8 | |||
| 0402d33c71 | |||
| 7d8ca5d8db | |||
| 56037a036d | |||
| 65ceb31d8d | |||
| 1c9b6c1ca9 | |||
| d7e6b478e1 | |||
| 535dbda47d | |||
| c9567dddae | |||
| b5963badd6 | |||
| a76a39b4a0 | |||
| 94e1597978 | |||
| c9c2ed772f | |||
| aeb78dcb8e | |||
| 77b337e4dd | |||
| 293461f3f6 | |||
| c70a4807dc | |||
| 1a470e7cfa | |||
| 990c658e65 | |||
| b78d6ac0fe | |||
| afd5060f59 | |||
| f61f7aebc2 | |||
| 5c382d2b8d | |||
| 35bba451f5 | |||
| 80edfba0c0 | |||
| 6bc8de192e | |||
| 2d8166d04a | |||
| ab33d81b30 | |||
| 2c0328f36d | |||
| fb8e27825d | |||
| 1366c69cdc | |||
| d0c8c186f4 | |||
| 19f7c8312e | |||
| 24b0e362fb | |||
| 3a030548c0 | |||
| 094cb7ed9d | |||
| 0199ca4693 | |||
| 75344ebbbd | |||
| 7f049a4e26 | |||
| 6559f5462c | |||
| 6556d35f97 | |||
| dec96a0da6 | |||
| 8d3de01117 | |||
| 5bedf026b1 | |||
| daf5277507 | |||
| ee3b72878f | |||
| 6217f7a124 | |||
| 2468386f24 | |||
| 2015195112 | |||
| 3426185383 | |||
| 9eedbafe97 | |||
| a5f31c5cb5 | |||
| 2f72cf0118 | |||
| c56e77c2c1 | |||
| e9aa7a6f8b | |||
| a75a8a4087 | |||
| ba27d2e300 | |||
| 381e37efce | |||
| 97dfc08f4d | |||
| d281ac5a70 | |||
| 812bbf8555 | |||
| e6b7a1aa27 | |||
| 90f47ad018 | |||
| cc458e8972 | |||
| 79bf00abfd | |||
| d77277857f | |||
| 3232239a85 | |||
| 014781de5e | |||
| 68b1c65384 | |||
| e8bb553349 | |||
| e4ecb8723f | |||
| 5edbaacf81 | |||
| 8421f472f2 | |||
| 51f9bdc2b5 | |||
| 02bc42fbf0 | |||
| 832a8b0bda | |||
| 57c4b86430 | |||
| 43fad7beed | |||
| 8dd002d159 | |||
| 2373b55d8b | |||
| 81530636ec | |||
| 190199b36d | |||
| 73aa89f8f4 | |||
| 941f3ea4b0 | |||
| c5770006f7 | |||
| 84c1aef51f | |||
| 460d2be9e9 | |||
| 090d341244 | |||
| 079e84f729 | |||
| dd23d9d163 | |||
| ad7178ebcb | |||
| 0543266c92 | |||
| 7e2038ecac | |||
| 75e41eafc4 | |||
| 73b9d05357 | |||
| 9d81f96f31 | |||
| d2e1c7a629 | |||
| 83d5ead471 | |||
| d339133981 | |||
| 7be129ad40 | |||
| 179048e565 | |||
| 8fe64ae8c5 | |||
| b6574872cc | |||
| 5e6dfc75ad | |||
| 087a264e97 | |||
| d9ca0b74e2 | |||
| 999740bc99 | |||
| 4c53b7cec9 | |||
| 535b839bfc | |||
| e3dd461d04 | |||
| e55a81568f | |||
| 83fbba433e | |||
| a494b162cd | |||
| 83b7139643 | |||
| 5dca9369dd |
@@ -0,0 +1,20 @@
|
|||||||
|
{
|
||||||
|
"permissions": {
|
||||||
|
"allow": [
|
||||||
|
"Edit(*)",
|
||||||
|
"Bash(pytest *)",
|
||||||
|
"Bash(python *)",
|
||||||
|
"Bash(python3 *)",
|
||||||
|
"Bash(.venv/bin/pytest *)",
|
||||||
|
"Bash(npm *)",
|
||||||
|
"Bash(git *)",
|
||||||
|
"Bash(ls *)",
|
||||||
|
"Bash(cat *)",
|
||||||
|
"Bash(grep *)",
|
||||||
|
"Bash(find *)",
|
||||||
|
"Bash(mkdir *)",
|
||||||
|
"Bash(touch *)",
|
||||||
|
"Bash(uv *)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,63 @@
|
|||||||
|
name: Release
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
tags:
|
||||||
|
- 'v*'
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
release:
|
||||||
|
runs-on: FreeBSD
|
||||||
|
steps:
|
||||||
|
- name: Checkout code
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
with:
|
||||||
|
fetch-depth: 0
|
||||||
|
|
||||||
|
- name: Set up Python
|
||||||
|
run: |
|
||||||
|
python3 --version
|
||||||
|
python3 -m ensurepip --upgrade
|
||||||
|
|
||||||
|
- name: Install build tools
|
||||||
|
run: |
|
||||||
|
python3 -m venv .venv
|
||||||
|
.venv/bin/pip install --upgrade pip
|
||||||
|
.venv/bin/pip install build twine
|
||||||
|
|
||||||
|
- name: Build package
|
||||||
|
run: .venv/bin/python -m build
|
||||||
|
|
||||||
|
- name: Extract version from tag
|
||||||
|
id: get_version
|
||||||
|
run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_OUTPUT
|
||||||
|
- name: Generate changelog
|
||||||
|
id: changelog
|
||||||
|
run: |
|
||||||
|
PREV_TAG=$(git tag --sort=-version:refname | grep -m 1 -v "^${GITHUB_REF#refs/tags/}$")
|
||||||
|
if [ -n "$PREV_TAG" ]; then
|
||||||
|
CHANGELOG=$(git log --pretty=format:"- %s" "${PREV_TAG}..HEAD")
|
||||||
|
else
|
||||||
|
CHANGELOG="Initial release"
|
||||||
|
fi
|
||||||
|
# Write multiline to output
|
||||||
|
{
|
||||||
|
echo "CHANGELOG<<EOF"
|
||||||
|
echo "$CHANGELOG"
|
||||||
|
echo "EOF"
|
||||||
|
} >> $GITHUB_OUTPUT
|
||||||
|
|
||||||
|
- name: Upload to Gitea PyPI registry
|
||||||
|
env:
|
||||||
|
TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
|
||||||
|
TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
|
||||||
|
run: |
|
||||||
|
.venv/bin/python3 -m twine upload --repository-url https://git.wrede.ca/api/packages/andreas/pypi dist/*
|
||||||
|
|
||||||
|
- name: Create release
|
||||||
|
uses: actions/gitea-release-action@v1
|
||||||
|
with:
|
||||||
|
files: |
|
||||||
|
dist/*.whl
|
||||||
|
dist/*.tar.gz
|
||||||
|
title: "Release ${{ steps.get_version.outputs.VERSION }}"
|
||||||
|
body: "${{ steps.changelog.outputs.CHANGELOG }}"
|
||||||
@@ -7,4 +7,11 @@ __pycache__/
|
|||||||
.venv/
|
.venv/
|
||||||
test/
|
test/
|
||||||
build/
|
build/
|
||||||
|
dist/
|
||||||
*.egg-info/
|
*.egg-info/
|
||||||
|
ssl/
|
||||||
|
uv.lock
|
||||||
|
.hb.yaml
|
||||||
|
.superpowers/
|
||||||
|
rndc-key
|
||||||
|
docs/superpowers/
|
||||||
|
|||||||
@@ -1,27 +0,0 @@
|
|||||||
#name: "w02"
|
|
||||||
hb_port: 50003
|
|
||||||
hbd_host: ''
|
|
||||||
#logfile: "/home/andreas/public_html/messages/andreas"
|
|
||||||
logfile: "/Users/andreas/public_html/messages/andreas"
|
|
||||||
logfmt: "msg"
|
|
||||||
grace: 40
|
|
||||||
interval: 10
|
|
||||||
watchhosts:
|
|
||||||
# "localhost":
|
|
||||||
# "haschloss" :
|
|
||||||
# "cotgate":
|
|
||||||
# "wentworth":
|
|
||||||
"y":
|
|
||||||
notify: +4915123456789
|
|
||||||
src: "signal"
|
|
||||||
"winter":
|
|
||||||
notify: +14168226179
|
|
||||||
src: "signal"
|
|
||||||
dyndnshosts: {"haschloss", "wayback", "wertvoll", "weekend", "cotgate", "rvgate", "draper", "eris"}
|
|
||||||
drophosts: {"unknown", "wookie15", "wort"}
|
|
||||||
nsupdate_bin: "/usr/local/bin/nsupdate"
|
|
||||||
pushover_token: "ac7NLX2rPjXFareeDgLpXNoDf4iFmf"
|
|
||||||
pushover_user: "uDhH33UjQQDYtNzJb1ThRiWb9ingGK"
|
|
||||||
pushsrv: "pushover"
|
|
||||||
|
|
||||||
dyndomains: {"wrede.org"}
|
|
||||||
Vendored
+7
-6
@@ -4,12 +4,13 @@
|
|||||||
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
||||||
"version": "0.2.0",
|
"version": "0.2.0",
|
||||||
"configurations": [
|
"configurations": [
|
||||||
|
|
||||||
{
|
{
|
||||||
"name": "Python: Run hbd (module)",
|
"name": "Python: Run hbd (module)",
|
||||||
"type": "debugpy",
|
"type": "debugpy",
|
||||||
"request": "launch",
|
"request": "launch",
|
||||||
"module": "hbd.cli",
|
"module": "hbd.server.cli",
|
||||||
"args": ["-c", ".hb.yaml", "-f", "-v", "-x", "-x", "-x"],
|
"args": ["-c", "~/.hb.yaml", "-f", "-v"],
|
||||||
"cwd": "${workspaceFolder}",
|
"cwd": "${workspaceFolder}",
|
||||||
"env": {
|
"env": {
|
||||||
"PYTHONPATH": "${workspaceFolder}"
|
"PYTHONPATH": "${workspaceFolder}"
|
||||||
@@ -28,14 +29,14 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "Python: Run hbd with debugpy (listen)",
|
"name": "Python: Run hbc (module)",
|
||||||
"type": "debugpy",
|
"type": "debugpy",
|
||||||
"request": "launch",
|
"request": "launch",
|
||||||
"module": "debugpy",
|
"module": "hbd.client.main",
|
||||||
"args": ["--listen", "5678", "--wait-for-client", "-m", "hbd.cli", "-c", ".hb.yaml", "-f", "-v"],
|
"args": ["-c", "~/.hbc.yaml", "-v", "winter"],
|
||||||
|
"cwd": "${workspaceFolder}",
|
||||||
"env": { "PYTHONPATH": "${workspaceFolder}" },
|
"env": { "PYTHONPATH": "${workspaceFolder}" },
|
||||||
"console": "integratedTerminal",
|
"console": "integratedTerminal",
|
||||||
"justMyCode": false
|
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|||||||
Vendored
+4
-1
@@ -2,5 +2,8 @@
|
|||||||
"python.pythonPath": "/usr/bin/python3",
|
"python.pythonPath": "/usr/bin/python3",
|
||||||
"python.linting.enabled": true,
|
"python.linting.enabled": true,
|
||||||
"python.formatting.provider": "black",
|
"python.formatting.provider": "black",
|
||||||
"python.linting.flake8Enabled": true
|
"python.linting.flake8Enabled": true,
|
||||||
|
"chat.tools.terminal.autoApprove": {
|
||||||
|
"mv": true
|
||||||
|
}
|
||||||
}
|
}
|
||||||
@@ -0,0 +1,4 @@
|
|||||||
|
1. Don't assume. Don't hide confusion. Surface tradeoffs.
|
||||||
|
2. Minimum code that solves the problem. Nothing speculative.
|
||||||
|
3. Touch only what you must. Clean up only your own mess.
|
||||||
|
4. Define success criteria. Loop until verified.
|
||||||
@@ -0,0 +1,210 @@
|
|||||||
|
# Heartbeat
|
||||||
|
|
||||||
|
Heartbeat is a lightweight host monitoring system built around a simple idea: each machine you want to monitor runs a small client (`hbc`) that sends a UDP "heartbeat" packet to a central server (`hbd`) on a regular interval. If a heartbeat stops arriving, you get notified. Alongside reachability, clients can ship system metrics — CPU, memory, disk, network — and the server will alert you when any of those cross a threshold.
|
||||||
|
|
||||||
|
## How it works
|
||||||
|
|
||||||
|
```
|
||||||
|
[ monitored host ] [ your server ]
|
||||||
|
┌─────────────┐ UDP 50003 ┌────────────────────────┐
|
||||||
|
│ hbc │ ────────────> │ hbd │
|
||||||
|
│ │ │ host state tracking │
|
||||||
|
│ plugins: │ <──────────── │ threshold alerting │
|
||||||
|
│ cpu, mem, │ ACK / CMD │ notifications │
|
||||||
|
│ disk, ... │ │ web dashboard + API │
|
||||||
|
└─────────────┘ └────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
- **hbd** — the server daemon. Tracks which hosts are alive, evaluates metric thresholds, fires notifications, serves the web dashboard and REST API.
|
||||||
|
- **hbc** — the client. Sends heartbeats and plugin data over UDP. Runs on any Linux/BSD/macOS host.
|
||||||
|
- **hbc_mini** — a zero-dependency single-file alternative (`hbc_mini.py` or `hbc_mini.c`) for hosts where you can't install Python packages.
|
||||||
|
|
||||||
|
Notifications can go to Pushover, email, Mattermost, Matrix, Signal, or VoIP.ms SMS. The dashboard shows host connectivity, RTT graphs, active alerts, and per-host plugin metrics in real time via WebSocket.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Getting started
|
||||||
|
|
||||||
|
This tutorial sets up a server on one machine and a client on a second machine. You'll end up with a working dashboard and your first host being monitored.
|
||||||
|
|
||||||
|
### 1. Install the server
|
||||||
|
|
||||||
|
On the machine that will run `hbd`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git clone https://git.wrede.ca/andreas/heartbeat.git
|
||||||
|
cd heartbeat
|
||||||
|
python3 -m venv .venv
|
||||||
|
source .venv/bin/activate
|
||||||
|
pip install .
|
||||||
|
```
|
||||||
|
|
||||||
|
Verify the install:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
hbd --help
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Create a server config
|
||||||
|
|
||||||
|
Create `~/.hb.yaml`:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
hb_port: 50003 # UDP port — clients send heartbeats here
|
||||||
|
hbd_port: 50004 # HTTP port — web dashboard and API
|
||||||
|
ws_port: 50005 # WebSocket port — live dashboard updates
|
||||||
|
|
||||||
|
interval: 20 # Expected heartbeat interval (seconds)
|
||||||
|
grace: 2 # Seconds of slack before a host is considered overdue
|
||||||
|
|
||||||
|
pickfile: ~/.hb.pick
|
||||||
|
pidfile: ~/.hb.pid
|
||||||
|
logfile: ~/.hb.log
|
||||||
|
```
|
||||||
|
|
||||||
|
That's enough to get started. No hosts, no users, no notifications needed yet — the server will accept any client that connects.
|
||||||
|
|
||||||
|
### 3. Start the server
|
||||||
|
|
||||||
|
```bash
|
||||||
|
hbd serve -c ~/.hb.yaml -f -v
|
||||||
|
```
|
||||||
|
|
||||||
|
`-f` keeps it in the foreground and `-v` enables verbose output, so you can watch the log. You should see:
|
||||||
|
|
||||||
|
```
|
||||||
|
Heartbeat daemon starting on UDP :50003, HTTP :50004, WS :50005
|
||||||
|
```
|
||||||
|
|
||||||
|
Open `http://your-server:50004/live` in a browser. The dashboard is empty for now.
|
||||||
|
|
||||||
|
### 4. Install the client on a host to monitor
|
||||||
|
|
||||||
|
On the machine you want to monitor (must be able to reach the server on UDP 50003):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install hbd # or: copy scripts/hbc_mini.py if you can't install packages
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Quick start — no config file
|
||||||
|
|
||||||
|
```bash
|
||||||
|
hbc your-server.example.com
|
||||||
|
```
|
||||||
|
|
||||||
|
Within a few seconds the server log will show the host checking in, and it will appear on the dashboard.
|
||||||
|
|
||||||
|
#### With a config file
|
||||||
|
|
||||||
|
Create `~/.hbc.yaml` on the client host:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
hb_port: 50003
|
||||||
|
interval: 10 # Send a heartbeat every 10 seconds
|
||||||
|
|
||||||
|
plugins:
|
||||||
|
cpu_monitor:
|
||||||
|
interval: 60
|
||||||
|
memory_monitor:
|
||||||
|
interval: 60
|
||||||
|
disk_monitor:
|
||||||
|
interval: 60
|
||||||
|
```
|
||||||
|
|
||||||
|
Then start the client:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
hbc -c ~/.hbc.yaml your-server.example.com
|
||||||
|
```
|
||||||
|
|
||||||
|
Send a boot message at startup so the server logs when the host came up:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
hbc -b -c ~/.hbc.yaml your-server.example.com
|
||||||
|
```
|
||||||
|
|
||||||
|
Run as a daemon (logs go to syslog):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
hbc -d -b -c ~/.hbc.yaml your-server.example.com
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5. View the dashboard
|
||||||
|
|
||||||
|
Open `http://your-server:50004/live`. You'll see the monitored host, its last heartbeat time, and RTT. Click the host name to see plugin metrics.
|
||||||
|
|
||||||
|
Navigate to `/plugins/<hostname>` for CPU, memory, and disk graphs.
|
||||||
|
|
||||||
|
### 6. Add a notification channel (optional)
|
||||||
|
|
||||||
|
Edit `~/.hb.yaml` on the server:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
notification_channels:
|
||||||
|
pushover_ops:
|
||||||
|
type: pushover
|
||||||
|
token: YOUR_APP_TOKEN
|
||||||
|
user: YOUR_USER_KEY
|
||||||
|
|
||||||
|
users:
|
||||||
|
alice:
|
||||||
|
password: pbkdf2:sha256:... # generate: hbd passwd alice
|
||||||
|
admin: true
|
||||||
|
notification_channels: [pushover_ops]
|
||||||
|
|
||||||
|
default_owner: alice
|
||||||
|
```
|
||||||
|
|
||||||
|
Generate the password hash:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
hbd passwd alice
|
||||||
|
```
|
||||||
|
|
||||||
|
Paste the output into the config, then reload:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
hbd reload
|
||||||
|
```
|
||||||
|
|
||||||
|
Test the channel:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
hbd notify
|
||||||
|
```
|
||||||
|
|
||||||
|
### 7. Set a threshold alert (optional)
|
||||||
|
|
||||||
|
Add to `~/.hb.yaml`:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
thresholds:
|
||||||
|
cpu_monitor:
|
||||||
|
cpu_percent:
|
||||||
|
warning: 80.0
|
||||||
|
critical: 90.0
|
||||||
|
disk_monitor:
|
||||||
|
partitions:
|
||||||
|
/:
|
||||||
|
percent:
|
||||||
|
warning: 80.0
|
||||||
|
critical: 90.0
|
||||||
|
```
|
||||||
|
|
||||||
|
Reload the configuration with `hbd reload`. The server will now alert when a monitored host crosses these thresholds.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## What's next
|
||||||
|
|
||||||
|
| Topic | Where to look |
|
||||||
|
|---|---|
|
||||||
|
| Full server config reference | [README — Server](https://git.wrede.ca/andreas/heartbeat/src/branch/master/README.md#server-hbd) |
|
||||||
|
| Client options and all plugins | [README — Client](https://git.wrede.ca/andreas/heartbeat/src/branch/master/README.md#client-hbc) |
|
||||||
|
| Threshold alerting details | [THRESHOLD_ALERTING.md](https://git.wrede.ca/andreas/heartbeat/src/branch/master/docs/THRESHOLD_ALERTING.md) |
|
||||||
|
| Notification channels | [NOTIFICATIONS.md](https://git.wrede.ca/andreas/heartbeat/src/branch/master/docs/NOTIFICATIONS.md) |
|
||||||
|
| User accounts and roles | [USERS.md](https://git.wrede.ca/andreas/heartbeat/src/branch/master/docs/USERS.md) |
|
||||||
|
| Writing a custom plugin | [PLUGIN_DEVELOPMENT.md](https://git.wrede.ca/andreas/heartbeat/src/branch/master/docs/PLUGIN_DEVELOPMENT.md) |
|
||||||
|
| Nagios check integration | [NAGIOS_INTEGRATION.md](https://git.wrede.ca/andreas/heartbeat/src/branch/master/docs/NAGIOS_INTEGRATION.md) |
|
||||||
|
| REST API | [HTTP_API.md](https://git.wrede.ca/andreas/heartbeat/src/branch/master/docs/HTTP_API.md) |
|
||||||
|
| Zero-dependency client | [README — hbc_mini](https://git.wrede.ca/andreas/heartbeat/src/branch/master/README.md#hbc_mini--zero-dependency-client) |
|
||||||
+21
@@ -0,0 +1,21 @@
|
|||||||
|
# MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2002 - 2026 Andreas Wrede
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
||||||
@@ -1,174 +1,755 @@
|
|||||||
|
# Heartbeat Daemon (hbd)
|
||||||
|
|
||||||
|
A lightweight UDP-based host monitoring system. Monitored hosts run a client (`hbc`) that sends periodic heartbeat packets and system metrics to a central server (`hbd`). The server tracks host reachability, evaluates metric thresholds, sends notifications, and serves a web dashboard.
|
||||||
# Heartbeat Daemon (hbd) ✅
|
|
||||||
|
|
||||||
A lightweight daemon that listens for UDP heartbeat messages and acts on them: keeps host state, optionally updates DNS records via `nsupdate`, forwards messages to WebSocket clients, and sends notifications (email, Pushover, Mattermost, Signal). It is a refactor of a previously monolithic script into a modular Python package (`hbd`).
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 📌 Features
|
## Architecture
|
||||||
|
|
||||||
- Receive and parse heartbeat datagrams (text or zlib-compressed) ✅
|
```
|
||||||
- Maintain host state and detect up/down transitions ✅
|
[ host running hbc ] [ server running hbd ]
|
||||||
- Queue DNS updates via `nsupdate` and run them in a background thread ✅
|
┌────────────────────┐ ┌────────────────────────────┐
|
||||||
- WebSocket API for live updates (hosts & messages) ✅
|
│ heartbeat client │ UDP 50003 │ heartbeat daemon │
|
||||||
- Notification pipeline (email, Pushover, Mattermost, Signal) ✅
|
│ │ ──────────> │ │
|
||||||
- Modular codebase suitable for unit testing and CI ✅
|
│ plugins: │ HTB / PLG │ host state tracking │
|
||||||
|
│ - cpu_monitor │ │ threshold evaluation │
|
||||||
|
│ - memory_monitor │ <────────── │ DNS updates (nsupdate) │
|
||||||
|
│ - disk_monitor │ ACK/CMD/UPD │ notifications │
|
||||||
|
│ - nagios_runner │ │ web dashboard + REST API │
|
||||||
|
│ - ... │ │ WebSocket live updates │
|
||||||
|
└────────────────────┘ └────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
**Package:** `hbd` v5.3.4
|
||||||
|
**Python:** 3.11+
|
||||||
|
|
||||||
|
### Subpackages
|
||||||
|
|
||||||
|
| Package | Purpose |
|
||||||
|
|---|---|
|
||||||
|
| `hbd.common` | Protocol encoding/decoding, shared utilities |
|
||||||
|
| `hbd.server` | The `hbd` daemon |
|
||||||
|
| `hbd.client` | The `hbc` client |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## ⚙️ Quickstart
|
## Installation
|
||||||
|
|
||||||
Prerequisites:
|
Dependencies are declared in `pyproject.toml`. Install into a virtualenv:
|
||||||
- Python 3.10+ (project uses language features from recent Python)
|
|
||||||
- `nsupdate` (for DNS updates) if using dynamic DNS
|
|
||||||
|
|
||||||
Install dependencies (recommended into a venv):
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python3 -m venv .venv
|
# Server + client
|
||||||
source .venv/bin/activate
|
pip install .
|
||||||
python -m pip install --upgrade pip
|
|
||||||
python -m pip install -r requirements.txt
|
# Using the install script
|
||||||
# for development/testing tools
|
scripts/hb_install.sh
|
||||||
python -m pip install -r requirements-dev.txt
|
|
||||||
```
|
```
|
||||||
|
|
||||||
Run the daemon (example):
|
**Entry points:**
|
||||||
|
- `hbd` — server (`hbd.server.cli:main`)
|
||||||
|
- `hbc` — client (`hbd.client.main:main`)
|
||||||
|
|
||||||
```bash
|
**Runtime dependencies:**
|
||||||
# run with default config lookup (~/.hb.yaml)
|
|
||||||
PYTHONPATH=. hbd -c .hb.yaml -f -v
|
|
||||||
```
|
|
||||||
|
|
||||||
You can also run it directly via the package entrypoint after installation:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python -m hbd.cli -c /path/to/config.yaml
|
|
||||||
```
|
|
||||||
|
|
||||||
## 🐞 Debugging in VS Code
|
|
||||||
|
|
||||||
This repository includes a ready-to-use `.vscode/launch.json` with configurations to run or attach the VS Code debugger to `hbd`.
|
|
||||||
|
|
||||||
- Ensure the **Python** extension is installed and select the project `.venv` as the interpreter (bottom-left of VS Code).
|
|
||||||
- Use **F5** and pick one of these configurations from the Run view:
|
|
||||||
- **Python: Run hbd (module)** — runs `hbd.cli` as a module and sets `PYTHONPATH` to the workspace root (recommended).
|
|
||||||
- **Python: Run hbd with debugpy (listen)** — launches `debugpy` and `hbd` together; useful when you want the process to listen for a debugger.
|
|
||||||
- **Python: Attach (localhost:5678)** — attach the debugger to a running process started with `debugpy`.
|
|
||||||
|
|
||||||
To start `hbd` manually and wait for the debugger to attach, run:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
PYTHONPATH=. python -m debugpy --listen 5678 --wait-for-client -m hbd.cli -c .hb.yaml -f -v
|
|
||||||
```
|
|
||||||
|
|
||||||
Set breakpoints in modules such as `hbd/udp.py`, `hbd/dns.py`, or `hbd/server.py`, and use the **Attach** configuration to connect. Use `justMyCode: false` if you need to step into third-party code.
|
|
||||||
|
|
||||||
|
| Component | Packages |
|
||||||
|
|---|---|
|
||||||
|
| Both | PyYAML ≥6.0 |
|
||||||
|
| Client | psutil ≥5.9.0 |
|
||||||
|
| Server | aiohttp ≥3.11, websockets ≥13.2, Jinja2 ≥3.1.6, ruamel.yaml ≥0.18, mattermostdriver ≥7.3.0, matrix-nio ≥0.24 |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 🛠 Configuration
|
## Server (`hbd`)
|
||||||
|
|
||||||
`hbd` reads YAML configuration (optional). If `PyYAML` is not installed, built-in defaults are used. Example configuration keys (see `hbd/config.py`):
|
### Starting the server
|
||||||
|
|
||||||
- `hb_port`: UDP port to listen for heartbeats (default: 50003)
|
```bash
|
||||||
- `hbd_port`: internal control port (default: 50004)
|
# Foreground, verbose, with config file
|
||||||
- `hbd_host`: bind address for HTTP/WSS
|
hbd serve -c /etc/hb.yaml -f -v
|
||||||
- `pickfile`: path for persisted state
|
|
||||||
- `logfile`: path to log file
|
|
||||||
- `logfmt`: `text` or `msg`
|
|
||||||
- `pushsrv`: push service (`pushover`|`mattermost`|`all`)
|
|
||||||
- `interval` / `grace`: heartbeat timing configuration
|
|
||||||
- `dyndomains`: list of dyndomains to update via `nsupdate`
|
|
||||||
- `nsupdate_bin`: path to nsupdate binary
|
|
||||||
|
|
||||||
Example `.hb.yaml` (minimal):
|
# As a module
|
||||||
|
python -m hbd.server.cli serve -c /etc/hb.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
### CLI subcommands
|
||||||
|
|
||||||
|
| Command | Description |
|
||||||
|
|---|---|
|
||||||
|
| `hbd serve` | Start the daemon (default) |
|
||||||
|
| `hbd passwd <username>` | Generate a password hash for config |
|
||||||
|
| `hbd notify` | Test notification channels |
|
||||||
|
| `hbd stop` | Stop a running daemon |
|
||||||
|
| `hbd reload` | Reload config (send SIGHUP) |
|
||||||
|
| `hbd restart` | Restart daemon |
|
||||||
|
|
||||||
|
### Configuration (`~/.hb.yaml`)
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
hbd_host: 0.0.0.0
|
# Network
|
||||||
hbd_port: 50004
|
hb_port: 50003 # UDP port for heartbeat messages
|
||||||
|
hbd_port: 50004 # HTTP API / web UI port
|
||||||
|
hbd_host: "" # Bind address (empty = all interfaces)
|
||||||
|
ws_port: 50005 # WebSocket port (plain)
|
||||||
|
wss_port: ~ # WebSocket port (TLS; requires cert_path/wss_pem/wss_key)
|
||||||
|
|
||||||
|
# Timing
|
||||||
|
interval: 20 # Expected heartbeat interval (seconds)
|
||||||
|
grace: 2 # Extra seconds before declaring a host overdue
|
||||||
|
|
||||||
|
# Persistence
|
||||||
|
pickfile: ~/.hb.pick # Host state persistence
|
||||||
|
pidfile: ~/.hb.pid
|
||||||
|
logfile: ~/.hb.log
|
||||||
|
|
||||||
|
# Message journal
|
||||||
|
journal_enabled: true
|
||||||
|
journal_dir: /var/log/heartbeat
|
||||||
|
journal_file: messages.journal
|
||||||
|
journal_max_size: 104857600 # 100 MB
|
||||||
|
journal_max_backups: 10
|
||||||
|
|
||||||
|
# DNS
|
||||||
|
nsupdate_bin: /usr/bin/nsupdate
|
||||||
dyndomains:
|
dyndomains:
|
||||||
- example.com
|
- example.com
|
||||||
nsupdate_bin: /usr/bin/nsupdate
|
|
||||||
pushsrv: pushover
|
# Threshold alert re-notification interval (seconds)
|
||||||
|
threshold_renotify_interval: 3600
|
||||||
|
|
||||||
|
# Notification channels
|
||||||
|
notification_channels:
|
||||||
|
pushover_ops:
|
||||||
|
type: pushover
|
||||||
|
token: YOUR_APP_TOKEN
|
||||||
|
user: YOUR_USER_KEY
|
||||||
|
email_ops:
|
||||||
|
type: email
|
||||||
|
smtp_server: smtp.example.com
|
||||||
|
port: 587
|
||||||
|
user: alerts@example.com
|
||||||
|
password: secret
|
||||||
|
recipients: [ops@example.com]
|
||||||
|
|
||||||
|
# Users
|
||||||
|
users:
|
||||||
|
alice:
|
||||||
|
full_name: Alice Smith
|
||||||
|
password: pbkdf2:sha256:... # generate with: hbd passwd alice
|
||||||
|
admin: true
|
||||||
|
notification_channels: [pushover_ops]
|
||||||
|
bob:
|
||||||
|
password: pbkdf2:sha256:...
|
||||||
|
notification_channels: [email_ops]
|
||||||
|
|
||||||
|
default_owner: alice
|
||||||
|
|
||||||
|
# Hosts
|
||||||
|
hosts:
|
||||||
|
webserver01:
|
||||||
|
dyndns: true # Update DNS when address changes
|
||||||
|
owner: alice
|
||||||
|
managers: [bob]
|
||||||
|
monitors: []
|
||||||
|
database01:
|
||||||
|
watch: false # Suppress all notifications for this host
|
||||||
```
|
```
|
||||||
|
|
||||||
> Tip: `config.DEFAULTS` in `hbd/config.py` contains the canonical defaults and accepted configuration keys.
|
Send SIGHUP (or run `hbd reload`) to reload the configuration without restarting. Changes to ports, certificates, the `pickfile` path, and the journal path require a full restart.
|
||||||
|
|
||||||
|
### Persistence
|
||||||
|
|
||||||
|
Host state (reachability, plugin data, alert states) is saved to `pickfile` every 5 minutes and on clean shutdown. The server loads this state on startup.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 🔧 Architecture & Modules
|
## Client (`hbc`)
|
||||||
|
|
||||||
- `hbd.proto` — serialization/deserialization of heartbeat messages (supports compressed payloads)
|
### Usage
|
||||||
- `hbd.udp` — UDP parsing and `handle_datagram` implementation (main state machine)
|
|
||||||
- `hbd.dns` — `create_nsupdate_payload`, `nsupdate`, and a background DNS thread (`start_dns_thread`)
|
```bash
|
||||||
- `hbd.notify` — email and push notification helpers
|
# Basic — send heartbeats to a server
|
||||||
- `hbd.ws` — WebSocket server and thread-safe broadcast helpers
|
hbc your-server.example.com
|
||||||
- `hbd.http` — HTTP handler factory for the status UI/API
|
|
||||||
- `hbd.utils` — small utility helpers (`shortname`, `dur`, `initlog`)
|
# Multiple servers
|
||||||
- `hbd.cli` — CLI entrypoint and argument parsing
|
hbc server1.example.com server2.example.com
|
||||||
- `hbd.server` — async orchestration to run UDP/HTTP/WSS components
|
|
||||||
|
# With config file, running as a daemon
|
||||||
This modular layout makes the code easier to test and maintain.
|
hbc -d -c /etc/hbc.yaml your-server.example.com
|
||||||
|
|
||||||
---
|
# Send a boot message, then heartbeat normally
|
||||||
|
hbc -b your-server.example.com
|
||||||
## 🧪 Testing & Dev
|
|
||||||
|
# One-off message
|
||||||
Tests are written with `unittest` and can also be run with `pytest` if you prefer. To run them locally without installing anything beyond the dev requirements:
|
hbc -m "maintenance starting" your-server.example.com
|
||||||
|
|
||||||
|
# Force IPv4 or IPv6 only
|
||||||
|
hbc -4 your-server.example.com
|
||||||
|
hbc -6 your-server.example.com
|
||||||
|
```
|
||||||
|
|
||||||
|
### Options
|
||||||
|
|
||||||
|
| Flag | Description |
|
||||||
|
|---|---|
|
||||||
|
| `-b`, `--boot` | Send a boot message at startup |
|
||||||
|
| `-c`, `--config FILE` | Config file path (default: `~/.hbc.yaml`) |
|
||||||
|
| `-d`, `--daemon` | Daemonize (logs go to syslog) |
|
||||||
|
| `-m`, `--message TEXT` | Send a one-off message and exit |
|
||||||
|
| `-n`, `--name NAME` | Override reported hostname |
|
||||||
|
| `-v`, `--verbose` | Verbose output |
|
||||||
|
| `-x`, `--debug` | Debug level (repeatable) |
|
||||||
|
| `-4` / `-6` | Restrict to IPv4 or IPv6 |
|
||||||
|
|
||||||
|
### Configuration (`~/.hbc.yaml`)
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
hb_port: 50003 # Server UDP port
|
||||||
|
interval: 10 # Heartbeat interval (seconds)
|
||||||
|
owner: alice # Optional: claim ownership of this host
|
||||||
|
|
||||||
|
plugins:
|
||||||
|
cpu_monitor:
|
||||||
|
interval: 300 # Override collection interval
|
||||||
|
per_core: true # Report per-core CPU usage
|
||||||
|
memory_monitor:
|
||||||
|
interval: 300
|
||||||
|
disk_monitor:
|
||||||
|
interval: 300
|
||||||
|
network_monitor:
|
||||||
|
interval: 300
|
||||||
|
ping_monitor:
|
||||||
|
interval: 60
|
||||||
|
hosts: [8.8.8.8, 192.168.1.1]
|
||||||
|
nagios_runner:
|
||||||
|
interval: 300
|
||||||
|
commands:
|
||||||
|
- name: check_load
|
||||||
|
command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
|
||||||
|
- name: check_disk_root
|
||||||
|
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
|
||||||
|
zfs_monitor:
|
||||||
|
interval: 300
|
||||||
|
```
|
||||||
|
|
||||||
|
### Connection behaviour
|
||||||
|
|
||||||
|
- The client sends heartbeats over UDP to each server address resolved from the hostname (IPv4 and IPv6).
|
||||||
|
- At startup, an IPv6 connection that fails to open is dropped after 3 consecutive failures; IPv4 connections are retried indefinitely.
|
||||||
|
- In daemon mode (`-d`), all log output goes to syslog (`LOG_DAEMON` facility).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## UDP Protocol
|
||||||
|
|
||||||
|
All messages are zlib-compressed key=value pairs with an ID prefix.
|
||||||
|
|
||||||
|
```
|
||||||
|
!<ID>: <zlib-compressed payload>
|
||||||
|
```
|
||||||
|
|
||||||
|
Payload format: `key=value;key=value;...`
|
||||||
|
|
||||||
|
| Message | Direction | Purpose |
|
||||||
|
|---|---|---|
|
||||||
|
| `HTB` | client → server | Heartbeat (name, timestamp, RTT, acks, interval) |
|
||||||
|
| `PLG` | client → server | Plugin data (plugin name + metrics) |
|
||||||
|
| `ACK` | server → client | Acknowledgment |
|
||||||
|
| `CMD` | server → client | Execute a shell command on the client |
|
||||||
|
| `UPD` | server → client | Trigger self-update via `hb_install.sh` |
|
||||||
|
|
||||||
|
Value encoding:
|
||||||
|
- Floats: 5 decimal places
|
||||||
|
- Lists/dicts: JSON prefixed with `@`
|
||||||
|
- Booleans: `1` / `0`
|
||||||
|
|
||||||
|
RTT is measured using kernel SO_TIMESTAMP when available (Linux, macOS, FreeBSD), falling back to application-layer timing.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Plugin System
|
||||||
|
|
||||||
|
Plugins run on the client and collect system metrics that are sent to the server as `PLG` messages.
|
||||||
|
|
||||||
|
### Plugin types
|
||||||
|
|
||||||
|
| Type | `interval` | When collected |
|
||||||
|
|---|---|---|
|
||||||
|
| `InfoPlugin` | 0 | Once at startup; re-collected on server request |
|
||||||
|
| `MonitorPlugin` | 30 (default) | Periodically on the configured interval |
|
||||||
|
|
||||||
|
### Built-in plugins
|
||||||
|
|
||||||
|
| Plugin | Type | Data collected |
|
||||||
|
|---|---|---|
|
||||||
|
| `os_info` | Info | OS, kernel, distro, architecture, Python version, hbc version |
|
||||||
|
| `cpu_monitor` | Monitor | cpu_percent, per-core usage, load averages, process count, frequency |
|
||||||
|
| `memory_monitor` | Monitor | RAM and swap usage (ZFS ARC-aware) |
|
||||||
|
| `disk_monitor` | Monitor | Per-partition usage, disk I/O stats |
|
||||||
|
| `network_monitor` | Monitor | Per-interface byte/packet counts, connection count |
|
||||||
|
| `ping_monitor` | Monitor | RTT, packet loss, jitter per configured host |
|
||||||
|
| `filesystem_info` | Info | Mounted filesystems (excludes pseudo filesystems) |
|
||||||
|
| `nagios_runner` | Monitor | Output of configured Nagios-compatible check commands |
|
||||||
|
| `zfs_monitor` | Monitor | ZFS pool health, capacity, fragmentation, dedup ratio, I/O |
|
||||||
|
|
||||||
|
### Custom plugins
|
||||||
|
|
||||||
|
Create a `.py` file in `hbd/client/plugins/`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from hbd.client.plugin import MonitorPlugin
|
||||||
|
|
||||||
|
class MyPlugin(MonitorPlugin):
|
||||||
|
name = "my_plugin"
|
||||||
|
interval = 60
|
||||||
|
|
||||||
|
async def collect(self):
|
||||||
|
return {"my_metric": 42}
|
||||||
|
```
|
||||||
|
|
||||||
|
`initialize()` is called once at load time; return `False` to disable the plugin (e.g., if a required binary is missing).
|
||||||
|
|
||||||
|
### Nagios integration
|
||||||
|
|
||||||
|
The `nagios_runner` plugin executes any Nagios-compatible check binary:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
plugins:
|
||||||
|
nagios_runner:
|
||||||
|
commands:
|
||||||
|
- name: check_http
|
||||||
|
command: /usr/lib/nagios/plugins/check_http -H example.com
|
||||||
|
```
|
||||||
|
|
||||||
|
- Commands are validated (absolute paths, executable) at startup.
|
||||||
|
- Exit codes map to OK / WARNING / CRITICAL / UNKNOWN.
|
||||||
|
- Performance data fields are extracted and stored individually.
|
||||||
|
- The `nagios` threshold operator maps exit codes directly to alert levels (see Threshold Alerting).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Threshold Alerting
|
||||||
|
|
||||||
|
The server evaluates plugin metrics against configurable thresholds and fires notifications on state changes.
|
||||||
|
|
||||||
|
### Configuration
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
thresholds:
|
||||||
|
cpu_monitor:
|
||||||
|
cpu_percent:
|
||||||
|
warning: 80.0
|
||||||
|
critical: 90.0
|
||||||
|
operator: ">" # >, >=, <, <=, ==, != (default: >)
|
||||||
|
hysteresis: 0.1 # 10%: recover at 81 when critical=90
|
||||||
|
count: 1 # Require N consecutive breaches before alerting
|
||||||
|
display: "CPU {cpu_percent}% (threshold: {op_symbol}{threshold_value})"
|
||||||
|
|
||||||
|
memory_monitor:
|
||||||
|
percent:
|
||||||
|
warning: 85.0
|
||||||
|
critical: 95.0
|
||||||
|
|
||||||
|
disk_monitor:
|
||||||
|
partitions:
|
||||||
|
/:
|
||||||
|
percent:
|
||||||
|
warning: 80.0
|
||||||
|
critical: 90.0
|
||||||
|
free_gb:
|
||||||
|
warning: 10.0
|
||||||
|
critical: 5.0
|
||||||
|
operator: "<"
|
||||||
|
|
||||||
|
nagios_runner:
|
||||||
|
status_code:
|
||||||
|
operator: "nagios" # 0=OK 1=WARNING 2=CRITICAL 3=UNKNOWN
|
||||||
|
display: "{check_name}: {output}"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Per-host threshold profiles
|
||||||
|
|
||||||
|
Named profiles let different hosts use different thresholds. A single name or a list is accepted; lists are applied left-to-right.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
threshold_configs:
|
||||||
|
default:
|
||||||
|
thresholds:
|
||||||
|
cpu_monitor:
|
||||||
|
cpu_percent: {warning: 80, critical: 90}
|
||||||
|
|
||||||
|
tight_cpu:
|
||||||
|
thresholds:
|
||||||
|
cpu_monitor:
|
||||||
|
cpu_percent: {warning: 60, critical: 75}
|
||||||
|
|
||||||
|
hosts:
|
||||||
|
web-01:
|
||||||
|
threshold_config: default
|
||||||
|
db-01:
|
||||||
|
threshold_config: [default, tight_cpu]
|
||||||
|
```
|
||||||
|
|
||||||
|
### Alert states
|
||||||
|
|
||||||
|
| State | Meaning |
|
||||||
|
|---|---|
|
||||||
|
| OK | Metric within normal range |
|
||||||
|
| WARNING | Metric crossed warning threshold |
|
||||||
|
| CRITICAL | Metric crossed critical threshold |
|
||||||
|
| UNKNOWN | Cannot determine (e.g. Nagios exit code 3) |
|
||||||
|
|
||||||
|
Notifications are sent on state transitions (OK → WARNING, WARNING → CRITICAL, CRITICAL → OK). De-escalations (CRITICAL → WARNING) do not trigger a notification. Ongoing alerts generate a re-notification every `threshold_renotify_interval` seconds (default: 3600). Alerts can be acknowledged via the web UI or API to suppress re-notifications.
|
||||||
|
|
||||||
|
### RTT thresholds
|
||||||
|
|
||||||
|
The server measures heartbeat round-trip time and supports RTT thresholds using the same format:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
thresholds:
|
||||||
|
rtt:
|
||||||
|
webserver01:
|
||||||
|
warning: 100.0 # ms
|
||||||
|
critical: 500.0
|
||||||
|
```
|
||||||
|
|
||||||
|
### Generic threshold matching
|
||||||
|
|
||||||
|
When a metric has no exact threshold entry, the server strips leading segments and retries. This allows one entry to cover all Nagios checks:
|
||||||
|
|
||||||
|
```
|
||||||
|
nagios_runner.check_disk_root_status_code → no match
|
||||||
|
nagios_runner.disk_root_status_code → no match
|
||||||
|
nagios_runner.root_status_code → no match
|
||||||
|
nagios_runner.status_code → matched ✓
|
||||||
|
```
|
||||||
|
|
||||||
|
The stripped prefix (`check_disk_root`) is available as `{check_name}` in the `display` template.
|
||||||
|
|
||||||
|
### Display template variables
|
||||||
|
|
||||||
|
| Variable | Description |
|
||||||
|
|---|---|
|
||||||
|
| `{value}` | Current metric value |
|
||||||
|
| `{threshold_value}` | Threshold that was crossed |
|
||||||
|
| `{op_symbol}` | Comparison operator |
|
||||||
|
| `{check_name}` | Prefix stripped by generic matching |
|
||||||
|
| `{metric_name}` | Full field name |
|
||||||
|
| `{output}` | Nagios check output text |
|
||||||
|
| `{status}` | Nagios status name (OK/WARNING/CRITICAL/UNKNOWN) |
|
||||||
|
| any plugin field | Any field present in the plugin's data |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Notification Channels
|
||||||
|
|
||||||
|
Notifications are dispatched to the host's owner, managers, and monitors. Each user specifies which channels to use.
|
||||||
|
|
||||||
|
### Supported channel types
|
||||||
|
|
||||||
|
| Type | Required fields |
|
||||||
|
|---|---|
|
||||||
|
| `pushover` | `token`, `user` |
|
||||||
|
| `email` | `smtp_server`, `recipients`, `sender`, `user`, `password`, `port` |
|
||||||
|
| `mattermost` | `webhook_url`, `channel` |
|
||||||
|
| `matrix` | `homeserver`, `user`, `password`, `room_id` |
|
||||||
|
| `signal` | `phone_number`, `recipient` |
|
||||||
|
| `sms_voipms` | `api_key`, `recipient` |
|
||||||
|
|
||||||
|
Each channel can set a `min_level` (`WARNING` or `CRITICAL`) to filter low-severity alerts.
|
||||||
|
|
||||||
|
Recovery notifications are only sent to channels that received the original alert.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Web Dashboard & HTTP API
|
||||||
|
|
||||||
|
The server exposes a web UI and REST API on `hbd_port` (default 50004).
|
||||||
|
|
||||||
|
### Web pages
|
||||||
|
|
||||||
|
| Path | Description |
|
||||||
|
|---|---|
|
||||||
|
| `/login` | Login form (shown automatically when auth is configured) |
|
||||||
|
| `/live` | Real-time host connectivity, RTT, and message stream |
|
||||||
|
| `/plugins/<host>` | Per-host plugin metrics |
|
||||||
|
| `/alerts` | Active alerts with severity filtering |
|
||||||
|
| `/settings` | Server config, users, notification channels, thresholds |
|
||||||
|
|
||||||
|
Live views use WebSocket connections for real-time updates.
|
||||||
|
|
||||||
|
Non-admin users see only hosts where they have a role (monitor, manager, or owner). Admins see all hosts.
|
||||||
|
|
||||||
|
### REST API
|
||||||
|
|
||||||
|
All endpoints are under `/api/0/`. When authentication is configured, include a session token:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Log in, get a token
|
||||||
|
TOKEN=$(curl -s -X POST http://localhost:50004/api/0/auth/login \
|
||||||
|
-H 'Content-Type: application/json' \
|
||||||
|
-d '{"username":"alice","password":"secret"}' | jq -r .token)
|
||||||
|
|
||||||
|
# Use the token
|
||||||
|
curl -H "Authorization: Bearer $TOKEN" http://localhost:50004/api/0/hosts
|
||||||
|
```
|
||||||
|
|
||||||
|
| Method | Endpoint | Description |
|
||||||
|
|---|---|---|
|
||||||
|
| GET | `/api/0/hosts` | All visible hosts |
|
||||||
|
| GET | `/api/0/alerts` | All active alerts |
|
||||||
|
| GET | `/api/0/alert_summary` | Count of ok/warning/critical |
|
||||||
|
| GET | `/api/0/messages` | Last 30 messages |
|
||||||
|
| GET | `/api/0/hosts/{host}/plugins` | All plugin data for host |
|
||||||
|
| GET | `/api/0/hosts/{host}/plugins/{plugin}?limit=N` | Plugin samples |
|
||||||
|
| GET | `/api/0/hosts/{host}/alerts` | Alert states for host |
|
||||||
|
| GET | `/api/0/hosts/{host}/access` | Access roles |
|
||||||
|
| PUT | `/api/0/hosts/{host}/access` | Update access roles |
|
||||||
|
| GET | `/api/0/hosts/{host}/info` | Host info (hbc version, thresholds) |
|
||||||
|
| POST | `/api/0/alerts/acknowledge` | Acknowledge alert |
|
||||||
|
| GET | `/api/0/users` | All users (admin only) |
|
||||||
|
| GET | `/api/0/users/me` | Current user profile |
|
||||||
|
| PUT | `/api/0/users/me` | Update own profile |
|
||||||
|
| POST | `/api/0/auth/login` | Create session |
|
||||||
|
| POST | `/api/0/auth/logout` | Destroy session |
|
||||||
|
| GET | `/api/0/config` | Server config (secrets redacted) |
|
||||||
|
| POST | `/api/0/config` | Update config |
|
||||||
|
| GET | `/api/0/config/backups` | List config backups |
|
||||||
|
| POST | `/api/0/config/rollback` | Roll back to previous config |
|
||||||
|
| GET | `/api/0/notification_channels` | List channels |
|
||||||
|
| POST | `/api/0/notification_channels` | Create channel |
|
||||||
|
| PUT | `/api/0/notification_channels/{name}` | Update channel |
|
||||||
|
| DELETE | `/api/0/notification_channels/{name}` | Delete channel |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## User Management & Authentication
|
||||||
|
|
||||||
|
When no `users:` block is in config, the server runs unauthenticated — all existing behaviour is preserved.
|
||||||
|
|
||||||
|
### Roles
|
||||||
|
|
||||||
|
| Role | Capabilities |
|
||||||
|
|---|---|
|
||||||
|
| monitor | View status, plugin data, alerts |
|
||||||
|
| manager | monitor + queue commands, trigger DNS, queue upgrades |
|
||||||
|
| owner | manager + drop host, transfer ownership, update access |
|
||||||
|
| admin | Owner-level on all hosts + access to server config and users |
|
||||||
|
|
||||||
|
### Setup
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
users:
|
||||||
|
alice:
|
||||||
|
full_name: Alice Smith
|
||||||
|
password: pbkdf2:sha256:... # hbd passwd alice
|
||||||
|
admin: true
|
||||||
|
notification_channels: [pushover_ops]
|
||||||
|
|
||||||
|
default_owner: alice # Owns any host with no explicit owner
|
||||||
|
|
||||||
|
hosts:
|
||||||
|
webserver01:
|
||||||
|
owner: alice
|
||||||
|
managers: [bob]
|
||||||
|
monitors: [carol]
|
||||||
|
```
|
||||||
|
|
||||||
|
Password hashing uses PBKDF2-HMAC-SHA256 (260,000 iterations). Sessions expire after 24 hours.
|
||||||
|
|
||||||
|
OAuth2 login (Gitea) is supported:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
oauth:
|
||||||
|
gitea:
|
||||||
|
url: https://git.example.com
|
||||||
|
client_id: xxx
|
||||||
|
client_secret: yyy
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Dynamic DNS
|
||||||
|
|
||||||
|
When `dyndns: true` is set on a host and `dyndomains` is configured, the server updates DNS via `nsupdate` whenever the host's source address changes.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
nsupdate_bin: /usr/bin/nsupdate
|
||||||
|
dyndomains:
|
||||||
|
- example.com
|
||||||
|
|
||||||
|
hosts:
|
||||||
|
webserver01:
|
||||||
|
dyndns: true
|
||||||
|
```
|
||||||
|
|
||||||
|
DNS updates run asynchronously in a background worker.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Message Journal
|
||||||
|
|
||||||
|
All received messages are logged in JSONL format with automatic size-based rotation.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
journal_enabled: true
|
||||||
|
journal_dir: /var/log/heartbeat
|
||||||
|
journal_file: messages.journal
|
||||||
|
journal_max_size: 104857600 # 100 MB
|
||||||
|
journal_max_backups: 10
|
||||||
|
```
|
||||||
|
|
||||||
|
Example entry:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{"timestamp":1774701296.123,"datetime":"2026-03-28T12:34:56","source_ip":"192.168.1.100","source_port":50003,"message":{"ID":"HTB","name":"webserver01","interval":10}}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## `hbc_mini` — Zero-dependency client
|
||||||
|
|
||||||
|
`scripts/hbc_mini.py` is a single-file client requiring only Python 3.8+ and no external packages. Copy it to any host and run directly.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 hbc_mini.py your-server.example.com
|
||||||
|
python3 hbc_mini.py -d your-server.example.com # daemon mode
|
||||||
|
python3 hbc_mini.py -b your-server.example.com # send boot message
|
||||||
|
```
|
||||||
|
|
||||||
|
Config: `~/.hbc.json` (JSON format, same keys as `~/.hbc.yaml`).
|
||||||
|
|
||||||
|
**Available plugins:**
|
||||||
|
|
||||||
|
| Plugin | Platform |
|
||||||
|
|---|---|
|
||||||
|
| `os_info` | All |
|
||||||
|
| `ping_monitor` | All |
|
||||||
|
| `nagios_runner` | All (not Windows) |
|
||||||
|
| `cpu_monitor` | Linux (`/proc/stat`; no per-core, no frequency) |
|
||||||
|
| `memory_monitor` | Linux (`/proc/meminfo`) |
|
||||||
|
| `disk_monitor` | Linux, macOS, BSD (`df -P`) |
|
||||||
|
| `network_monitor` | Linux (`/proc/net/dev`) |
|
||||||
|
|
||||||
|
Compared with the full `hbc`, the following are not available: YAML config, `filesystem_info`, `zfs_monitor`, and IPv6 early-fail protection.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## `hbc_mini.c` — C client
|
||||||
|
|
||||||
|
`scripts/c/hbc_mini.c` is a single-file C port of `hbc_mini.py`. It has no runtime dependencies beyond libc, zlib, pthreads, and libm, and runs on Linux, FreeBSD, NetBSD, and DragonFly BSD.
|
||||||
|
|
||||||
|
### Build
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cc -O2 -o hbc_mini scripts/c/hbc_mini.c -lz -lpthread -lm
|
||||||
|
```
|
||||||
|
|
||||||
|
### Usage
|
||||||
|
|
||||||
|
The CLI is identical to `hbc_mini.py`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./hbc_mini your-server.example.com
|
||||||
|
./hbc_mini -d your-server.example.com # daemon mode (logs to syslog)
|
||||||
|
./hbc_mini -b your-server.example.com # send boot message
|
||||||
|
./hbc_mini -m "note" your-server.example.com # send one-shot message
|
||||||
|
./hbc_mini -4 your-server.example.com # IPv4 only
|
||||||
|
./hbc_mini -6 your-server.example.com # IPv6 only
|
||||||
|
```
|
||||||
|
|
||||||
|
Config: `~/.hbc.json` (JSON, same keys as the Python version).
|
||||||
|
|
||||||
|
### Architecture
|
||||||
|
|
||||||
|
The C client uses two threads:
|
||||||
|
|
||||||
|
- **Main thread** — heartbeat sender loop + `select()`-based receive loop (1 s timeout). Sends `HTB` at the configured interval, receives `ACK`/`CMD` messages, and re-sends `os_info` on server request.
|
||||||
|
- **Monitor thread** — all periodic plugins in a single thread with a 1-second sleep loop. Each plugin has its own next-run timestamp tracked independently.
|
||||||
|
|
||||||
|
SIGHUP causes the process to restart itself via `execv()`. SIGTERM/SIGINT trigger a clean shutdown (sends a shutdown heartbeat if `-b` was used).
|
||||||
|
|
||||||
|
### Available plugins
|
||||||
|
|
||||||
|
| Plugin | Platform | Data source |
|
||||||
|
|---|---|---|
|
||||||
|
| `os_info` | Linux, FreeBSD, NetBSD, DragonFly | `uname(2)`, `/etc/os-release`, `kern.osrelease` sysctl |
|
||||||
|
| `cpu_monitor` | Linux | `/proc/stat` |
|
||||||
|
| `cpu_monitor` | FreeBSD, DragonFly, NetBSD | `kern.cp_time` sysctl |
|
||||||
|
| `memory_monitor` | Linux | `/proc/meminfo` (ZFS ARC-aware) |
|
||||||
|
| `memory_monitor` | FreeBSD, DragonFly | `vm.stats.vm.*` sysctl |
|
||||||
|
| `memory_monitor` | NetBSD | `VM_UVMEXP` sysctl |
|
||||||
|
| `disk_monitor` | All | `df -P` subprocess |
|
||||||
|
| `network_monitor` | Linux | `/proc/net/dev` |
|
||||||
|
| `network_monitor` | FreeBSD, NetBSD, DragonFly | `getifaddrs()` + `AF_LINK` |
|
||||||
|
| `ping_monitor` | All | `ping` subprocess |
|
||||||
|
| `nagios_runner` | All | `popen()` subprocess |
|
||||||
|
|
||||||
|
`cpu_monitor` reports: `cpu_percent`, `cpu_user`, `cpu_system`, `cpu_idle`, `cpu_iowait` (Linux only), load averages, `cpu_core_count`, `uptime_seconds`.
|
||||||
|
|
||||||
|
`memory_monitor` reports: `memory_total`, `memory_used`, `memory_available`, `memory_free`, `memory_percent`, and swap fields when swap is present.
|
||||||
|
|
||||||
|
`network_monitor` reports per-interface cumulative `bytes_recv`/`bytes_sent` and interval deltas. The loopback interface (`lo`) is skipped by default; this is configurable:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"plugins": {
|
||||||
|
"network_monitor": {
|
||||||
|
"skip_interfaces": ["lo", "docker0"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
`disk_monitor` reports per-mount `total`, `used`, `free`, `percent`. An optional mount filter restricts reporting to specific paths:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"plugins": {
|
||||||
|
"disk_monitor": {
|
||||||
|
"mounts": ["/", "/data"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Differences from `hbc_mini.py`
|
||||||
|
|
||||||
|
- No `filesystem_info` or `zfs_monitor` plugins
|
||||||
|
- `UPD` (self-update) messages are logged but not acted on
|
||||||
|
- No IPv6 early-fail protection
|
||||||
|
- Config is JSON only (`~/.hbc.json`), no YAML
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Development
|
||||||
|
|
||||||
|
### Running tests
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# with project root on PYTHONPATH
|
|
||||||
PYTHONPATH=. python -m unittest discover -v
|
PYTHONPATH=. python -m unittest discover -v
|
||||||
# or with pytest if installed
|
# or
|
||||||
pytest -q
|
pytest -q
|
||||||
```
|
```
|
||||||
|
|
||||||
Developer tooling included:
|
### Linting and type checking
|
||||||
- `pyproject.toml` — project metadata and dependencies
|
|
||||||
- `requirements-dev.txt` — dev/test dependencies
|
|
||||||
- `tox.ini` — convenience wrappers for running tests, lint, and mypy
|
|
||||||
|
|
||||||
To run linters and type checks locally:
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# after installing dev deps
|
|
||||||
tox -e lint
|
tox -e lint
|
||||||
tox -e mypy
|
tox -e mypy
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
### Debugging in VS Code
|
||||||
|
|
||||||
## 🚀 Running in production
|
A `.vscode/launch.json` is included with configurations for running and attaching the debugger. Select the project `.venv` as the Python interpreter, then use F5.
|
||||||
|
|
||||||
- Use your system service manager (systemd, launchd, etc.) to run `hbd` in the background.
|
To start the server under debugpy and wait for a debugger to attach:
|
||||||
- Ensure `nsupdate` and necessary credentials are available for dynamic DNS updates.
|
|
||||||
- Configure TLS for WSS if you enable secure websockets.
|
|
||||||
|
|
||||||
> Note: A small example for obtaining DNS-verified certs (certbot with RFC 2136) was included in earlier revisions of this project — see the commit history if you need it.
|
```bash
|
||||||
|
PYTHONPATH=. python -m debugpy --listen 5678 --wait-for-client -m hbd.server.cli serve -c .hb.yaml -f -v
|
||||||
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 🤝 Contributing
|
## License
|
||||||
|
|
||||||
Contributions welcome! Please:
|
|
||||||
1. Open an issue to discuss larger changes.
|
|
||||||
2. Create a topic branch and a clear PR.
|
|
||||||
3. Add tests for new features and run linters.
|
|
||||||
4. Keep changes focused and documented.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📜 License
|
|
||||||
|
|
||||||
This repository is licensed under the MIT license. See `LICENSE` for details.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
If you'd like, I can also:
|
|
||||||
- add a **GitHub Actions** workflow that runs tests and lint on push/PR 🔁
|
|
||||||
- add a `CONTRIBUTING.md` template for PRs and code style 💬
|
|
||||||
|
|
||||||
Which one should I do next? ✨
|
|
||||||
|
|
||||||
|
MIT. See `LICENSE` for details.
|
||||||
|
|||||||
Vendored
BIN
Binary file not shown.
Vendored
BIN
Binary file not shown.
@@ -0,0 +1,291 @@
|
|||||||
|
# Configuration Reload
|
||||||
|
|
||||||
|
The heartbeat daemon (hbd) supports runtime configuration reloading without requiring a full restart. This allows you to update certain configuration settings while the service continues running.
|
||||||
|
|
||||||
|
## How to Reload Configuration
|
||||||
|
|
||||||
|
Send a SIGHUP signal to the running hbd process:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Find the process ID
|
||||||
|
ps aux | grep hbd
|
||||||
|
|
||||||
|
# Or use pidof/pgrep
|
||||||
|
pidof hbd
|
||||||
|
pgrep -f hbd
|
||||||
|
|
||||||
|
# Send SIGHUP signal
|
||||||
|
kill -HUP <pid>
|
||||||
|
|
||||||
|
# Or if using systemd
|
||||||
|
systemctl reload heartbeat
|
||||||
|
```
|
||||||
|
|
||||||
|
## What Can Be Reloaded
|
||||||
|
|
||||||
|
The following configuration sections can be reloaded without restarting:
|
||||||
|
|
||||||
|
### ✅ Fully Reloadable
|
||||||
|
|
||||||
|
- **Notification Channels** (`notification_channels`)
|
||||||
|
- Add, remove, or modify notification channel definitions
|
||||||
|
- Update tokens, API keys, SMTP credentials
|
||||||
|
- Change recipient lists
|
||||||
|
|
||||||
|
- **Threshold Configurations** (`threshold_configs`)
|
||||||
|
- Modify warning and critical thresholds
|
||||||
|
- Add or remove threshold rules
|
||||||
|
- Change operators and hysteresis values
|
||||||
|
- Update display formats
|
||||||
|
|
||||||
|
- **Host Configuration** (`hosts`)
|
||||||
|
- Change watch status
|
||||||
|
- Update notification channel assignments
|
||||||
|
- Modify threshold config assignments
|
||||||
|
- Change dyndns status
|
||||||
|
|
||||||
|
- **Host Lists**
|
||||||
|
- `watchhosts` - hosts to monitor
|
||||||
|
- `dyndnshosts` - hosts with dynamic DNS
|
||||||
|
- `drophosts` - hosts to ignore
|
||||||
|
|
||||||
|
- **Runtime Settings**
|
||||||
|
- `grace` - grace period multiplier
|
||||||
|
- `interval` - expected heartbeat interval
|
||||||
|
- `threshold_renotify_interval` - re-notification interval
|
||||||
|
- `debug` - debug level
|
||||||
|
- `verbose` - verbose output
|
||||||
|
|
||||||
|
- **DNS Settings**
|
||||||
|
- `dyndomains` - dynamic DNS domains
|
||||||
|
- `nsupdate_bin` - nsupdate binary path
|
||||||
|
- `rndc_key` - RNDC key path
|
||||||
|
|
||||||
|
### ⚠️ Requires Restart
|
||||||
|
|
||||||
|
The following settings **cannot** be reloaded and require a service restart:
|
||||||
|
|
||||||
|
- **Network Ports**
|
||||||
|
- `hb_port` - UDP heartbeat port
|
||||||
|
- `hbd_port` - HTTP API port
|
||||||
|
- `ws_port` - WebSocket port
|
||||||
|
- `wss_port` - Secure WebSocket port
|
||||||
|
|
||||||
|
- **SSL/TLS Settings**
|
||||||
|
- `cert_path` - SSL certificate path
|
||||||
|
- `wss_pem` - SSL certificate file
|
||||||
|
- `wss_key` - SSL key file
|
||||||
|
|
||||||
|
- **Persistence**
|
||||||
|
- `pickfile` - Pickle file path
|
||||||
|
|
||||||
|
- **Logging**
|
||||||
|
- `logfile` - Log file path
|
||||||
|
|
||||||
|
- **Journal Settings**
|
||||||
|
- `journal_enabled` - Enable/disable journaling
|
||||||
|
- `journal_dir` - Journal directory
|
||||||
|
- `journal_file` - Journal filename
|
||||||
|
- `journal_max_size` - Maximum journal size
|
||||||
|
- `journal_max_backups` - Number of backup files
|
||||||
|
|
||||||
|
## Reload Process
|
||||||
|
|
||||||
|
When a SIGHUP signal is received:
|
||||||
|
|
||||||
|
1. **Configuration File Loading**
|
||||||
|
- The config file is re-read from disk
|
||||||
|
- YAML parsing is performed
|
||||||
|
- Validation checks are run
|
||||||
|
|
||||||
|
2. **Component Updates**
|
||||||
|
- Notification system is updated with new channel definitions
|
||||||
|
- Threshold checker reloads all threshold configurations
|
||||||
|
- Alert states are preserved to maintain hysteresis
|
||||||
|
|
||||||
|
3. **Error Handling**
|
||||||
|
- If reload fails, the previous configuration is kept
|
||||||
|
- Error messages are logged
|
||||||
|
- Service continues running with old configuration
|
||||||
|
|
||||||
|
4. **Logging**
|
||||||
|
- Reload start and completion are logged
|
||||||
|
- Each component reports its reload status
|
||||||
|
- Total number of thresholds is reported
|
||||||
|
|
||||||
|
## Example Reload Session
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Terminal 1: Watch the logs
|
||||||
|
tail -f /var/log/heartbeat.log
|
||||||
|
|
||||||
|
# Terminal 2: Edit configuration
|
||||||
|
vim /path/to/.hb.yaml
|
||||||
|
|
||||||
|
# Make changes to notification channels or thresholds
|
||||||
|
# Save the file
|
||||||
|
|
||||||
|
# Terminal 3: Trigger reload
|
||||||
|
kill -HUP $(pgrep -f hbd)
|
||||||
|
|
||||||
|
# Terminal 1: See reload messages
|
||||||
|
2026-04-01 12:34:56 INFO: Received SIGHUP, initiating config reload...
|
||||||
|
2026-04-01 12:34:56 INFO: ============================================================
|
||||||
|
2026-04-01 12:34:56 INFO: Starting configuration reload...
|
||||||
|
2026-04-01 12:34:56 INFO: ============================================================
|
||||||
|
2026-04-01 12:34:56 INFO: Configuration reloaded from /path/to/.hb.yaml
|
||||||
|
2026-04-01 12:34:56 INFO: Notification configuration reloaded
|
||||||
|
2026-04-01 12:34:56 INFO: Reloading threshold configuration...
|
||||||
|
2026-04-01 12:34:56 INFO: Threshold configuration reloaded: 42 total thresholds
|
||||||
|
2026-04-01 12:34:56 INFO: ============================================================
|
||||||
|
2026-04-01 12:34:56 INFO: Configuration reload completed successfully
|
||||||
|
2026-04-01 12:34:56 INFO: ============================================================
|
||||||
|
```
|
||||||
|
|
||||||
|
## Common Use Cases
|
||||||
|
|
||||||
|
### 1. Update Notification Credentials
|
||||||
|
|
||||||
|
If you need to rotate API keys or update SMTP passwords:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
notification_channels:
|
||||||
|
pushover_standard:
|
||||||
|
type: pushover
|
||||||
|
token: new-token-here # Updated
|
||||||
|
user: new-user-key-here # Updated
|
||||||
|
```
|
||||||
|
|
||||||
|
Just edit the config file and send SIGHUP - no restart needed.
|
||||||
|
|
||||||
|
### 2. Adjust Threshold Values
|
||||||
|
|
||||||
|
Fine-tune alerting thresholds based on observed behavior:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
threshold_configs:
|
||||||
|
default:
|
||||||
|
thresholds:
|
||||||
|
cpu_monitor:
|
||||||
|
cpu_percent:
|
||||||
|
warning: 85.0 # Increased from 80.0
|
||||||
|
critical: 95.0 # Increased from 90.0
|
||||||
|
```
|
||||||
|
|
||||||
|
Send SIGHUP to apply the new thresholds immediately.
|
||||||
|
|
||||||
|
### 3. Add New Notification Channels
|
||||||
|
|
||||||
|
Add a new notification destination:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
notification_channels:
|
||||||
|
email_oncall:
|
||||||
|
type: email
|
||||||
|
recipients: [oncall@example.com]
|
||||||
|
sender: alerts@example.com
|
||||||
|
smtp_server: smtp.example.com
|
||||||
|
|
||||||
|
hosts:
|
||||||
|
critical_server:
|
||||||
|
threshold_config: default
|
||||||
|
watch: true
|
||||||
|
notification_channels: [pushover_standard, email_oncall] # Added
|
||||||
|
```
|
||||||
|
|
||||||
|
The new channel becomes active immediately after SIGHUP.
|
||||||
|
|
||||||
|
### 4. Update Watch List
|
||||||
|
|
||||||
|
Start or stop monitoring hosts without restart:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
hosts:
|
||||||
|
new_server:
|
||||||
|
threshold_config: default
|
||||||
|
watch: true # Start watching
|
||||||
|
notification_channels: [pushover_standard]
|
||||||
|
```
|
||||||
|
|
||||||
|
## Best Practices
|
||||||
|
|
||||||
|
1. **Test Configuration Before Reload**
|
||||||
|
- Validate YAML syntax before sending SIGHUP
|
||||||
|
- Check for typos in channel names
|
||||||
|
- Verify threshold values are reasonable
|
||||||
|
|
||||||
|
2. **Monitor Reload Logs**
|
||||||
|
- Always check logs after reload to confirm success
|
||||||
|
- Look for error messages if reload fails
|
||||||
|
- Verify expected number of thresholds loaded
|
||||||
|
|
||||||
|
3. **Backup Before Changes**
|
||||||
|
- Keep a backup of working configuration
|
||||||
|
- Use version control (git) for config files
|
||||||
|
- Document why changes were made
|
||||||
|
|
||||||
|
4. **Gradual Rollout**
|
||||||
|
- Test changes on development server first
|
||||||
|
- Apply to one production server at a time
|
||||||
|
- Verify behavior before applying everywhere
|
||||||
|
|
||||||
|
5. **Plan for Restart-Required Changes**
|
||||||
|
- Schedule downtime for port or SSL changes
|
||||||
|
- Use blue-green deployment if possible
|
||||||
|
- Keep service downtime minimal
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Reload Doesn't Apply Changes
|
||||||
|
|
||||||
|
**Check:**
|
||||||
|
- Is the config file path correct?
|
||||||
|
- Did you save the file after editing?
|
||||||
|
- Are there YAML syntax errors?
|
||||||
|
- Check the logs for error messages
|
||||||
|
|
||||||
|
**Solution:**
|
||||||
|
```bash
|
||||||
|
# Validate YAML syntax
|
||||||
|
python -c "import yaml; yaml.safe_load(open('.hb.yaml'))"
|
||||||
|
|
||||||
|
# Check file modification time
|
||||||
|
ls -l .hb.yaml
|
||||||
|
|
||||||
|
# View logs
|
||||||
|
journalctl -u heartbeat -f
|
||||||
|
```
|
||||||
|
|
||||||
|
### Partial Configuration Applied
|
||||||
|
|
||||||
|
**Cause:** Some sections reloaded, others didn't.
|
||||||
|
|
||||||
|
**Solution:** Check logs to see which components failed. Common issues:
|
||||||
|
- Invalid channel type
|
||||||
|
- Missing required threshold fields
|
||||||
|
- Invalid host references
|
||||||
|
|
||||||
|
### Service Becomes Unresponsive
|
||||||
|
|
||||||
|
**Cause:** Malformed configuration caused an exception.
|
||||||
|
|
||||||
|
**Solution:**
|
||||||
|
1. Revert to backup configuration
|
||||||
|
2. Send SIGHUP again to reload the good config
|
||||||
|
3. If service is completely stuck, restart it
|
||||||
|
|
||||||
|
## Implementation Details
|
||||||
|
|
||||||
|
The reload mechanism uses:
|
||||||
|
|
||||||
|
- **Signal Handling**: SIGHUP triggers reload event
|
||||||
|
- **Async-Safe Reloading**: Configuration is loaded asynchronously
|
||||||
|
- **Component Coordination**: All affected components are updated atomically
|
||||||
|
- **State Preservation**: Alert states and hysteresis information are maintained
|
||||||
|
- **Error Recovery**: Failed reloads don't affect running configuration
|
||||||
|
|
||||||
|
## See Also
|
||||||
|
|
||||||
|
- [NOTIFICATIONS.md](NOTIFICATIONS.md) - Notification channel configuration
|
||||||
|
- [THRESHOLD_ALERTING.md](THRESHOLD_ALERTING.md) - Threshold configuration details
|
||||||
|
- Configuration examples in `hbd/config_*.yaml`
|
||||||
@@ -0,0 +1,66 @@
|
|||||||
|
# Dark Mode
|
||||||
|
|
||||||
|
Every page in the Heartbeat web UI supports light mode, dark mode, and automatic (follows the OS/browser setting). Each user picks their preference independently; it is stored in the browser and takes effect immediately without a page reload.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Choosing a theme
|
||||||
|
|
||||||
|
Open your profile page (`/profile`) and scroll to the **Appearance** section. Click one of the three buttons:
|
||||||
|
|
||||||
|
| Button | Behaviour |
|
||||||
|
|--------|-----------|
|
||||||
|
| **Auto** | Follows the OS or browser dark-mode preference. Updates live if the system setting changes. |
|
||||||
|
| **Light** | Always light, regardless of system setting. |
|
||||||
|
| **Dark** | Always dark, regardless of system setting. |
|
||||||
|
|
||||||
|
The preference is stored in `localStorage` under the key `hbd_theme` and applies to the current browser only. Clearing browser storage resets it to **Auto**.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Implementation notes
|
||||||
|
|
||||||
|
### No flash of unstyled content
|
||||||
|
|
||||||
|
A small synchronous `<script>` runs at the very top of `<head>`, before any CSS is parsed, and sets `data-theme="dark"` on `<html>` when the stored preference (or the system setting in auto mode) calls for dark. Because it runs before paint, there is no visible flicker on page load.
|
||||||
|
|
||||||
|
### CSS custom properties
|
||||||
|
|
||||||
|
All colours are expressed as CSS custom properties defined in `head.html`:
|
||||||
|
|
||||||
|
```
|
||||||
|
:root — light-mode values (default)
|
||||||
|
html[data-theme="dark"] — dark-mode overrides
|
||||||
|
```
|
||||||
|
|
||||||
|
Key variables:
|
||||||
|
|
||||||
|
| Variable | Purpose |
|
||||||
|
|----------|---------|
|
||||||
|
| `--bg` | Page background |
|
||||||
|
| `--surface` | Card / panel background |
|
||||||
|
| `--surface-2` / `--surface-3` | Slightly lighter/darker surfaces (table rows, hover states) |
|
||||||
|
| `--text` / `--text-sec` / `--text-muted` | Primary, secondary, muted text |
|
||||||
|
| `--border` / `--border-2`…`4` | Border shades from prominent to faint |
|
||||||
|
| `--link` | Hyperlink and interactive-element colour |
|
||||||
|
| `--nav-bg` | Navigation bar background |
|
||||||
|
| `--input-bg` / `--input-border` | Form control colours |
|
||||||
|
| `--shadow` / `--shadow-sm` | Box-shadow alphas |
|
||||||
|
|
||||||
|
A single global rule in `head.html` themes all `<input>`, `<select>`, and `<textarea>` elements across every page at once:
|
||||||
|
|
||||||
|
```css
|
||||||
|
html[data-theme="dark"] input:not([type=checkbox]):not([type=radio]),
|
||||||
|
html[data-theme="dark"] select,
|
||||||
|
html[data-theme="dark"] textarea { … }
|
||||||
|
```
|
||||||
|
|
||||||
|
Each page template adds its own `html[data-theme="dark"]` block for page-specific elements (cards, tables, badges, etc.).
|
||||||
|
|
||||||
|
### Auto-mode live updates
|
||||||
|
|
||||||
|
A `matchMedia` change listener in `head.html` updates `data-theme` whenever the OS preference changes, so users in **Auto** mode see the theme switch without reloading.
|
||||||
|
|
||||||
|
### Semantic colours are unchanged
|
||||||
|
|
||||||
|
Alert colours (red for critical, orange for warning, green for ok) and status indicators are intentionally left as fixed values — they are semantic signals, not surface colours, and look correct on both light and dark backgrounds.
|
||||||
@@ -0,0 +1,738 @@
|
|||||||
|
# HTTP API and Web UI Documentation
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
The Heartbeat Daemon provides a comprehensive HTTP API and web-based UI for monitoring plugin data and alert states. The API follows RESTful conventions and returns JSON responses.
|
||||||
|
|
||||||
|
## Base URL
|
||||||
|
|
||||||
|
All API endpoints are relative to the server base URL:
|
||||||
|
```
|
||||||
|
http://your-server:50004
|
||||||
|
```
|
||||||
|
|
||||||
|
Default port is `50004` (configurable via `hbd_port` in configuration).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Authentication
|
||||||
|
|
||||||
|
When [user accounts are configured](USERS.md), every request must be authenticated.
|
||||||
|
|
||||||
|
- **Browser requests** to HTML pages are redirected to `/login` automatically. JavaScript `fetch()` calls on the dashboards send the session cookie automatically — no JS changes are needed.
|
||||||
|
- **API / programmatic requests** must include the token in an `Authorization: Bearer <token>` header or an `X-Auth-Token` header.
|
||||||
|
|
||||||
|
Unauthenticated API requests receive `401 Unauthorized`. When no users are configured the server runs in unauthenticated mode and all endpoints are open.
|
||||||
|
|
||||||
|
### Login
|
||||||
|
|
||||||
|
```bash
|
||||||
|
TOKEN=$(curl -s -X POST http://localhost:50004/api/0/auth/login \
|
||||||
|
-H 'Content-Type: application/json' \
|
||||||
|
-d '{"username":"alice","password":"secret"}' | jq -r .token)
|
||||||
|
|
||||||
|
curl -H "Authorization: Bearer $TOKEN" http://localhost:50004/api/0/hosts
|
||||||
|
```
|
||||||
|
|
||||||
|
See [User Management](USERS.md) for full authentication documentation.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## API Endpoints
|
||||||
|
|
||||||
|
### Authentication
|
||||||
|
|
||||||
|
| Method | Path | Description | Auth required |
|
||||||
|
|--------|------|-------------|---------------|
|
||||||
|
| `POST` | `/api/0/auth/login` | Obtain session token | No |
|
||||||
|
| `POST` | `/api/0/auth/logout` | Invalidate session | Token |
|
||||||
|
|
||||||
|
### Users
|
||||||
|
|
||||||
|
| Method | Path | Description | Role |
|
||||||
|
|--------|------|-------------|------|
|
||||||
|
| `GET` | `/api/0/users` | List all users | Admin |
|
||||||
|
| `GET` | `/api/0/users/me` | Own profile | Authenticated |
|
||||||
|
| `PUT` | `/api/0/users/me` | Update own profile | Authenticated |
|
||||||
|
|
||||||
|
### Notification Channels
|
||||||
|
|
||||||
|
| Method | Path | Description | Role |
|
||||||
|
|--------|------|-------------|------|
|
||||||
|
| `GET` | `/api/0/notification_channel_types` | Channel type schemas | Authenticated |
|
||||||
|
| `GET` | `/api/0/notification_channels` | List visible channels | Authenticated |
|
||||||
|
| `POST` | `/api/0/notification_channels` | Create a channel | Authenticated |
|
||||||
|
| `PUT` | `/api/0/notification_channels/{name}` | Update a channel | Owner or Admin |
|
||||||
|
| `DELETE` | `/api/0/notification_channels/{name}` | Delete a channel | Owner or Admin |
|
||||||
|
|
||||||
|
### Host Management
|
||||||
|
|
||||||
|
#### GET /api/0/hosts
|
||||||
|
Get list of all monitored hosts with their state information. When auth is enabled, only hosts the caller has at least **monitor** access to are returned.
|
||||||
|
|
||||||
|
**Response:**
|
||||||
|
```json
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"name": "webserver01",
|
||||||
|
"dyn": false,
|
||||||
|
"owner": "alice",
|
||||||
|
"managers": ["bob"],
|
||||||
|
"monitors": ["carol"],
|
||||||
|
"connections": [...]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
#### GET /api/0/messages
|
||||||
|
Get recent heartbeat messages (last 30).
|
||||||
|
|
||||||
|
**Response:**
|
||||||
|
```json
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"time": 1711234567.123,
|
||||||
|
"host": "webserver01",
|
||||||
|
"msg": "heartbeat received"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Plugin Data Endpoints
|
||||||
|
|
||||||
|
#### GET /api/0/hosts/{hostname}/plugins
|
||||||
|
Get all plugin data for a specific host.
|
||||||
|
|
||||||
|
**Parameters:**
|
||||||
|
- `hostname` (path): Name of the host
|
||||||
|
|
||||||
|
**Response:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"hostname": "webserver01",
|
||||||
|
"plugins": {
|
||||||
|
"cpu_monitor": {
|
||||||
|
"timestamp": 1711234567.123,
|
||||||
|
"data": {
|
||||||
|
"cpu_percent": 45.2,
|
||||||
|
"load_1min": 2.5,
|
||||||
|
"load_5min": 2.1,
|
||||||
|
"load_15min": 1.8
|
||||||
|
},
|
||||||
|
"sample_count": 100
|
||||||
|
},
|
||||||
|
"memory_monitor": {
|
||||||
|
"timestamp": 1711234568.456,
|
||||||
|
"data": {
|
||||||
|
"percent": 65.4,
|
||||||
|
"available_mb": 4096,
|
||||||
|
"total_mb": 16384
|
||||||
|
},
|
||||||
|
"sample_count": 100
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```bash
|
||||||
|
curl http://localhost:50004/api/0/hosts/webserver01/plugins
|
||||||
|
```
|
||||||
|
|
||||||
|
#### GET /api/0/hosts/{hostname}/plugins/{plugin_name}
|
||||||
|
Get detailed historical data for a specific plugin.
|
||||||
|
|
||||||
|
**Parameters:**
|
||||||
|
- `hostname` (path): Name of the host
|
||||||
|
- `plugin_name` (path): Name of the plugin
|
||||||
|
- `limit` (query, optional): Number of recent samples to return (default: 10)
|
||||||
|
|
||||||
|
**Response:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"hostname": "webserver01",
|
||||||
|
"plugin": "cpu_monitor",
|
||||||
|
"samples": [
|
||||||
|
{
|
||||||
|
"timestamp": 1711234567.123,
|
||||||
|
"data": {
|
||||||
|
"cpu_percent": 45.2,
|
||||||
|
"load_1min": 2.5
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"timestamp": 1711234267.123,
|
||||||
|
"data": {
|
||||||
|
"cpu_percent": 42.1,
|
||||||
|
"load_1min": 2.3
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"sample_count": 2
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Examples:**
|
||||||
|
```bash
|
||||||
|
# Get last 1 sample (most recent)
|
||||||
|
curl 'http://localhost:50004/api/0/hosts/webserver01/plugins/cpu_monitor?limit=1'
|
||||||
|
|
||||||
|
# Get last 50 samples
|
||||||
|
curl 'http://localhost:50004/api/0/hosts/webserver01/plugins/memory_monitor?limit=50'
|
||||||
|
|
||||||
|
# Get disk monitor data
|
||||||
|
curl http://localhost:50004/api/0/hosts/database01/plugins/disk_monitor
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Host Access
|
||||||
|
|
||||||
|
#### GET /api/0/hosts/{hostname}/access
|
||||||
|
Get owner/managers/monitors for a host. Requires **monitor** role or higher.
|
||||||
|
|
||||||
|
**Response:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"owner": "alice",
|
||||||
|
"managers": ["bob"],
|
||||||
|
"monitors": ["carol"]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### PUT /api/0/hosts/{hostname}/access
|
||||||
|
Update owner/managers/monitors. Requires **owner** role or admin.
|
||||||
|
|
||||||
|
**Request body** (all fields optional):
|
||||||
|
```json
|
||||||
|
{ "owner": "bob", "managers": ["carol"], "monitors": [] }
|
||||||
|
```
|
||||||
|
|
||||||
|
Changes take effect immediately but are not written back to the config file. Update the config file and send `SIGHUP` to make them permanent.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Notification Channel Endpoints
|
||||||
|
|
||||||
|
Channels are visible to all users by default. Channels marked `private: true` are only visible to their owner. Admins see all channels.
|
||||||
|
|
||||||
|
#### GET /api/0/notification_channel_types
|
||||||
|
Return the schema for every supported notifier type. Used by the web UI to dynamically render the channel creation form.
|
||||||
|
|
||||||
|
**Response:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"pushover": {
|
||||||
|
"label": "Pushover",
|
||||||
|
"fields": [
|
||||||
|
{"key": "token", "label": "App token", "type": "secret", "required": true},
|
||||||
|
{"key": "user", "label": "User key", "type": "secret", "required": true},
|
||||||
|
{"key": "sound", "label": "Sound", "type": "text", "required": false}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"email": { "label": "E-mail", "fields": [ ... ] },
|
||||||
|
...
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### GET /api/0/notification_channels
|
||||||
|
List channels visible to the current user (public channels + own private channels). Admins receive all channels.
|
||||||
|
|
||||||
|
**Response:**
|
||||||
|
```json
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"name": "pushover_ops",
|
||||||
|
"type": "pushover",
|
||||||
|
"type_label": "Pushover",
|
||||||
|
"owner": null,
|
||||||
|
"private": false,
|
||||||
|
"min_level": "WARNING",
|
||||||
|
"fields": [
|
||||||
|
{"key": "token", "label": "App token", "value": "•••", "sensitive": true},
|
||||||
|
{"key": "user", "label": "User key", "value": "•••", "sensitive": true}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
Sensitive fields (`type: "secret"`) are always returned as `"•••"`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### POST /api/0/notification_channels
|
||||||
|
Create a new channel. The creating user becomes the channel's `owner`.
|
||||||
|
|
||||||
|
**Request body:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"name": "my_pushover",
|
||||||
|
"type": "pushover",
|
||||||
|
"token": "app-token",
|
||||||
|
"user": "user-key",
|
||||||
|
"min_level": "WARNING",
|
||||||
|
"private": true
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Response:** `{"ok": true, "name": "my_pushover"}`
|
||||||
|
|
||||||
|
**Status codes:** `200 OK`, `400` (missing required field or unknown type), `409` (name already exists)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### PUT /api/0/notification_channels/{name}
|
||||||
|
Update an existing channel. Only the channel owner or an admin may update it.
|
||||||
|
|
||||||
|
Secret fields sent as `"•••"` are preserved from the existing config (same pattern as OAuth secrets in the admin config editor).
|
||||||
|
|
||||||
|
**Request body:** same shape as POST, `name` ignored (taken from URL).
|
||||||
|
|
||||||
|
**Response:** `{"ok": true}`
|
||||||
|
|
||||||
|
**Status codes:** `200 OK`, `403 Forbidden`, `404 Not Found`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### DELETE /api/0/notification_channels/{name}
|
||||||
|
Delete a channel. Only the channel owner or an admin may delete it.
|
||||||
|
|
||||||
|
**Response:** `{"ok": true}`
|
||||||
|
|
||||||
|
**Status codes:** `200 OK`, `403 Forbidden`, `404 Not Found`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Alert Endpoints
|
||||||
|
|
||||||
|
#### GET /api/0/hosts/{hostname}/alerts
|
||||||
|
Get alert states for a specific host.
|
||||||
|
|
||||||
|
**Parameters:**
|
||||||
|
- `hostname` (path): Name of the host
|
||||||
|
|
||||||
|
**Response:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"hostname": "webserver01",
|
||||||
|
"alerts": [
|
||||||
|
{
|
||||||
|
"metric_path": "cpu_monitor.cpu_percent",
|
||||||
|
"level": "WARNING",
|
||||||
|
"since": 1711234000.0,
|
||||||
|
"last_value": 85.5,
|
||||||
|
"last_check": 1711234567.123,
|
||||||
|
"notification_count": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"metric_path": "disk_monitor./.percent",
|
||||||
|
"level": "OK",
|
||||||
|
"since": 1711230000.0,
|
||||||
|
"last_value": 65.0,
|
||||||
|
"last_check": 1711234567.123,
|
||||||
|
"notification_count": 0
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"summary": {
|
||||||
|
"ok": 15,
|
||||||
|
"warning": 1,
|
||||||
|
"critical": 0,
|
||||||
|
"unknown": 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```bash
|
||||||
|
curl http://localhost:50004/api/0/hosts/webserver01/alerts
|
||||||
|
```
|
||||||
|
|
||||||
|
#### GET /api/0/alerts
|
||||||
|
Get all active alerts across all monitored hosts.
|
||||||
|
|
||||||
|
**Response:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"alerts": [
|
||||||
|
{
|
||||||
|
"hostname": "webserver01",
|
||||||
|
"metric_path": "cpu_monitor.cpu_percent",
|
||||||
|
"level": "CRITICAL",
|
||||||
|
"since": 1711234000.0,
|
||||||
|
"last_value": 95.5,
|
||||||
|
"last_check": 1711234567.123,
|
||||||
|
"notification_count": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"hostname": "database01",
|
||||||
|
"metric_path": "memory_monitor.percent",
|
||||||
|
"level": "WARNING",
|
||||||
|
"since": 1711233000.0,
|
||||||
|
"last_value": 88.2,
|
||||||
|
"last_check": 1711234567.123,
|
||||||
|
"notification_count": 1
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"summary": {
|
||||||
|
"critical": 1,
|
||||||
|
"warning": 1,
|
||||||
|
"unknown": 0,
|
||||||
|
"total": 2
|
||||||
|
},
|
||||||
|
"host_count": 5
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```bash
|
||||||
|
curl http://localhost:50004/api/0/alerts | jq .
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Web UI Pages
|
||||||
|
|
||||||
|
### Login
|
||||||
|
**URL:** `/login`
|
||||||
|
|
||||||
|
Shown automatically when a browser request is made without a valid session (when users are configured). After successful login the browser is redirected to the originally requested page.
|
||||||
|
|
||||||
|
### Logout
|
||||||
|
**URL:** `/logout`
|
||||||
|
|
||||||
|
Clears the session cookie and redirects to `/login`.
|
||||||
|
|
||||||
|
### Live Dashboard
|
||||||
|
**URL:** `/live`
|
||||||
|
|
||||||
|
Real-time dashboard showing:
|
||||||
|
- Host connection states
|
||||||
|
- IPv4/IPv6 connectivity
|
||||||
|
- Latency metrics
|
||||||
|
- Recent messages
|
||||||
|
|
||||||
|
**Features:**
|
||||||
|
- WebSocket-powered live updates
|
||||||
|
- Sortable columns
|
||||||
|
- Color-coded status indicators
|
||||||
|
|
||||||
|
### Plugin Metrics
|
||||||
|
**URL:** `/plugins`
|
||||||
|
|
||||||
|
Interactive visualization of plugin metrics:
|
||||||
|
- Select host and plugin from dropdown
|
||||||
|
- View current metric values
|
||||||
|
- Automatic refresh every 30 seconds
|
||||||
|
- Support for nested metrics (e.g., per-partition disk stats)
|
||||||
|
|
||||||
|
**Features:**
|
||||||
|
- Card-based metric display
|
||||||
|
- Unit formatting (%, MB, GB)
|
||||||
|
- Nested object visualization
|
||||||
|
- Auto-refresh
|
||||||
|
|
||||||
|
**Examples of available data:**
|
||||||
|
- CPU usage, load average, frequency
|
||||||
|
- Memory usage, available memory, swap
|
||||||
|
- Disk usage per partition, I/O statistics
|
||||||
|
- Network interface statistics, connection counts
|
||||||
|
- Custom plugin data
|
||||||
|
|
||||||
|
### Alerts Dashboard
|
||||||
|
**URL:** `/alerts`
|
||||||
|
|
||||||
|
Comprehensive alert monitoring:
|
||||||
|
- Summary cards (Critical, Warning, Total Hosts)
|
||||||
|
- Filter by severity (All, Critical, Warning)
|
||||||
|
- Alert details with duration
|
||||||
|
- Auto-refresh every 15 seconds
|
||||||
|
|
||||||
|
**Features:**
|
||||||
|
- Color-coded alert levels
|
||||||
|
- Duration tracking
|
||||||
|
- Filterable list
|
||||||
|
- Real-time updates
|
||||||
|
- Summary statistics
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Integration Examples
|
||||||
|
|
||||||
|
### Monitoring Script
|
||||||
|
|
||||||
|
```bash
|
||||||
|
#!/bin/bash
|
||||||
|
# Check for critical alerts and send notification
|
||||||
|
|
||||||
|
# Log in first (when auth is configured)
|
||||||
|
TOKEN=$(curl -s -X POST http://localhost:50004/api/0/auth/login \
|
||||||
|
-H 'Content-Type: application/json' \
|
||||||
|
-d '{"username":"monitor","password":"secret"}' | jq -r .token)
|
||||||
|
AUTH=(-H "Authorization: Bearer $TOKEN")
|
||||||
|
|
||||||
|
RESPONSE=$(curl -s "${AUTH[@]}" http://localhost:50004/api/0/alerts)
|
||||||
|
CRITICAL_COUNT=$(echo "$RESPONSE" | jq '.summary.critical')
|
||||||
|
|
||||||
|
if [ "$CRITICAL_COUNT" -gt 0 ]; then
|
||||||
|
echo "CRITICAL: $CRITICAL_COUNT critical alerts detected!"
|
||||||
|
echo "$RESPONSE" | jq '.alerts[] | select(.level=="CRITICAL")'
|
||||||
|
# Send notification
|
||||||
|
# mail -s "Critical Alerts" admin@example.com < alert_details.txt
|
||||||
|
fi
|
||||||
|
```
|
||||||
|
|
||||||
|
### Python Client
|
||||||
|
|
||||||
|
```python
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
|
||||||
|
BASE = 'http://localhost:50004'
|
||||||
|
|
||||||
|
# Log in (skip if auth not configured)
|
||||||
|
resp = requests.post(f'{BASE}/api/0/auth/login',
|
||||||
|
json={"username": "alice", "password": "secret"})
|
||||||
|
token = resp.json().get("token")
|
||||||
|
headers = {"Authorization": f"Bearer {token}"} if token else {}
|
||||||
|
|
||||||
|
# Get all plugin data for a host
|
||||||
|
response = requests.get(f'{BASE}/api/0/hosts/webserver01/plugins', headers=headers)
|
||||||
|
data = response.json()
|
||||||
|
|
||||||
|
print(f"Host: {data['hostname']}")
|
||||||
|
print(f"Plugins: {', '.join(data['plugins'].keys())}")
|
||||||
|
|
||||||
|
for plugin, info in data['plugins'].items():
|
||||||
|
print(f"\n{plugin}:")
|
||||||
|
for metric, value in info['data'].items():
|
||||||
|
print(f" {metric}: {value}")
|
||||||
|
|
||||||
|
# Check for alerts
|
||||||
|
response = requests.get(f'{BASE}/api/0/alerts', headers=headers)
|
||||||
|
alerts = response.json()
|
||||||
|
|
||||||
|
if alerts['summary']['critical'] > 0:
|
||||||
|
print(f"\n⚠️ {alerts['summary']['critical']} CRITICAL ALERTS!")
|
||||||
|
for alert in alerts['alerts']:
|
||||||
|
if alert['level'] == 'CRITICAL':
|
||||||
|
print(f" - {alert['hostname']}: {alert['metric_path']} = {alert['last_value']}")
|
||||||
|
```
|
||||||
|
|
||||||
|
### Grafana Integration
|
||||||
|
|
||||||
|
The API endpoints can be used with Grafana's JSON datasource plugin:
|
||||||
|
|
||||||
|
1. Install the SimpleJSON datasource plugin
|
||||||
|
2. Configure datasource URL: `http://your-server:50004`
|
||||||
|
3. Create queries:
|
||||||
|
- Metrics: `/api/0/hosts/webserver01/plugins/cpu_monitor?limit=100`
|
||||||
|
- Alerts: `/api/0/alerts`
|
||||||
|
|
||||||
|
### Prometheus Integration
|
||||||
|
|
||||||
|
Export metrics in Prometheus format (future enhancement):
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Example prometheus exporter
|
||||||
|
from prometheus_client import Gauge, generate_latest
|
||||||
|
import requests
|
||||||
|
|
||||||
|
cpu_usage = Gauge('heartbeat_cpu_percent', 'CPU usage percentage', ['hostname'])
|
||||||
|
memory_usage = Gauge('heartbeat_memory_percent', 'Memory usage percentage', ['hostname'])
|
||||||
|
|
||||||
|
def collect_metrics():
|
||||||
|
hosts = requests.get('http://localhost:50004/api/0/hosts').json()
|
||||||
|
for host in hosts:
|
||||||
|
hostname = host['name']
|
||||||
|
plugins = requests.get(f'http://localhost:50004/api/0/hosts/{hostname}/plugins').json()
|
||||||
|
|
||||||
|
if 'cpu_monitor' in plugins['plugins']:
|
||||||
|
cpu_data = plugins['plugins']['cpu_monitor']['data']
|
||||||
|
cpu_usage.labels(hostname=hostname).set(cpu_data.get('cpu_percent', 0))
|
||||||
|
|
||||||
|
if 'memory_monitor' in plugins['plugins']:
|
||||||
|
mem_data = plugins['plugins']['memory_monitor']['data']
|
||||||
|
memory_usage.labels(hostname=hostname).set(mem_data.get('percent', 0))
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Response Formats
|
||||||
|
|
||||||
|
### Success Response
|
||||||
|
All successful API calls return HTTP 200 with JSON body:
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"field": "value",
|
||||||
|
...
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Error Response
|
||||||
|
API errors return appropriate HTTP status codes with JSON:
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"error": "Host 'unknown-host' not found"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Common Status Codes:**
|
||||||
|
- `200 OK` - Success
|
||||||
|
- `400 Bad Request` - Invalid parameters
|
||||||
|
- `401 Unauthorized` - Missing or invalid session token
|
||||||
|
- `403 Forbidden` - Authenticated but insufficient role
|
||||||
|
- `404 Not Found` - Resource not found
|
||||||
|
- `500 Internal Server Error` - Server error
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## WebSocket API
|
||||||
|
|
||||||
|
For real-time updates, connect to the WebSocket endpoint:
|
||||||
|
|
||||||
|
**URL:** `ws://your-server:50005/hbd` (or `wss://your-server:50006/hbd` for secure connections, using the configured `wss_port`)
|
||||||
|
|
||||||
|
**Messages:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"type": "host",
|
||||||
|
"data": {
|
||||||
|
"name": "webserver01",
|
||||||
|
"state": "UP"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"type": "plugin",
|
||||||
|
"data": {
|
||||||
|
"host": "webserver01",
|
||||||
|
"plugin": "cpu_monitor",
|
||||||
|
"data": {...},
|
||||||
|
"timestamp": 1711234567.123
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
### Enable HTTP Server
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# In your hbd configuration file
|
||||||
|
hbd_host: "" # Listen on all interfaces
|
||||||
|
hbd_port: 50004 # HTTP port
|
||||||
|
ws_port: 50005 # WebSocket port (optional)
|
||||||
|
# wss_port: 50006 # Secure WebSocket (requires SSL)
|
||||||
|
```
|
||||||
|
|
||||||
|
### SSL/TLS Configuration
|
||||||
|
|
||||||
|
For secure WebSocket connections:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
wss_port: 50006
|
||||||
|
cert_path: /etc/heartbeat/certs/
|
||||||
|
wss_pem: server.pem
|
||||||
|
wss_key: server.key
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Rate Limiting
|
||||||
|
|
||||||
|
The API currently does not implement rate limiting. For production use, consider:
|
||||||
|
|
||||||
|
- Placing behind a reverse proxy (nginx, Apache)
|
||||||
|
- Using API gateway for rate limiting
|
||||||
|
- Implementing caching for frequently accessed endpoints
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## CORS Support
|
||||||
|
|
||||||
|
By default, CORS is not enabled. To enable for web applications:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# In http.py, add CORS middleware
|
||||||
|
import aiohttp_cors
|
||||||
|
|
||||||
|
app = web.Application()
|
||||||
|
cors = aiohttp_cors.setup(app)
|
||||||
|
|
||||||
|
# Configure CORS for all routes
|
||||||
|
for route in list(app.router.routes()):
|
||||||
|
cors.add(route, {
|
||||||
|
"*": aiohttp_cors.ResourceOptions(
|
||||||
|
allow_credentials=True,
|
||||||
|
expose_headers="*",
|
||||||
|
allow_headers="*",
|
||||||
|
)
|
||||||
|
})
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Performance Considerations
|
||||||
|
|
||||||
|
### Caching
|
||||||
|
- Plugin data is cached in memory (last 100 samples per plugin)
|
||||||
|
- No database queries required
|
||||||
|
- Responses are fast (<10ms typical)
|
||||||
|
|
||||||
|
### Scalability
|
||||||
|
- Each host stores its own data independently
|
||||||
|
- Memory usage: ~1KB per host + ~1KB per plugin sample
|
||||||
|
- For 100 hosts with 5 plugins: ~50MB memory
|
||||||
|
|
||||||
|
### Best Practices
|
||||||
|
1. Use `limit` parameter to control response size
|
||||||
|
2. Cache responses on client side when appropriate
|
||||||
|
3. Use WebSocket for real-time updates instead of polling
|
||||||
|
4. Consider pagination for large deployments (future enhancement)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### API Returns 401
|
||||||
|
- Auth is configured — include `Authorization: Bearer <token>` header
|
||||||
|
- Token may have expired (24 h TTL) — log in again
|
||||||
|
|
||||||
|
### API Returns 403
|
||||||
|
- Authenticated user lacks the required role for this host/action
|
||||||
|
- Check host's `owner`, `managers`, `monitors` config
|
||||||
|
|
||||||
|
### API Returns 404
|
||||||
|
- Verify hostname in URL matches actual host name
|
||||||
|
- Check host is sending heartbeats: `curl http://localhost:50004/api/0/hosts`
|
||||||
|
|
||||||
|
### No Plugin Data
|
||||||
|
- Verify client is configured with plugins
|
||||||
|
- Check client logs for plugin errors
|
||||||
|
- Ensure plugins are sending data (check journal logs)
|
||||||
|
|
||||||
|
### Empty Alerts
|
||||||
|
- Verify thresholds are configured
|
||||||
|
- Check host is in `watchhosts` list
|
||||||
|
- Ensure plugins are collecting metrics
|
||||||
|
- Review server logs for threshold checker errors
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## See Also
|
||||||
|
|
||||||
|
- [User Management](USERS.md)
|
||||||
|
- [Plugin Development Guide](PLUGIN_DEVELOPMENT.md)
|
||||||
|
- [Threshold Alerting Documentation](THRESHOLD_ALERTING.md)
|
||||||
|
- [Message Journal Documentation](MESSAGE_JOURNAL.md)
|
||||||
|
- Configuration examples: `hbd/config_example.yaml`
|
||||||
@@ -0,0 +1,413 @@
|
|||||||
|
# Message Journal
|
||||||
|
|
||||||
|
The message journal provides persistent logging of all received heartbeat messages with automatic size-based log rotation.
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
The journal logs every message received by the heartbeat daemon (hbd) in JSON format, making it easy to:
|
||||||
|
- Audit message history
|
||||||
|
- Debug connection issues
|
||||||
|
- Analyze traffic patterns
|
||||||
|
- Replay messages for testing
|
||||||
|
- Create historical reports
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
- **JSON Format**: Each message is logged as a single JSON line for easy parsing
|
||||||
|
- **Size-Based Rotation**: Automatically rotates logs when size threshold is reached
|
||||||
|
- **Automatic Cleanup**: Keeps only a configurable number of backup files
|
||||||
|
- **Thread-Safe**: Safe for concurrent access from multiple async tasks
|
||||||
|
- **Configurable**: All settings controllable via configuration file
|
||||||
|
- **Performance**: Non-blocking async operation with minimal overhead
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
Add these settings to your hbd configuration file (e.g., `.hb.yaml`):
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# Message journal configuration
|
||||||
|
journal_enabled: true # Enable/disable journaling
|
||||||
|
journal_dir: /var/log/heartbeat # Directory for journal files
|
||||||
|
journal_file: messages.journal # Base filename
|
||||||
|
journal_max_size: 104857600 # Max size in bytes (100MB default)
|
||||||
|
journal_max_backups: 10 # Number of backup files to keep
|
||||||
|
```
|
||||||
|
|
||||||
|
### Configuration Options
|
||||||
|
|
||||||
|
| Option | Default | Description |
|
||||||
|
|--------|---------|-------------|
|
||||||
|
| `journal_enabled` | `true` | Enable or disable message journaling |
|
||||||
|
| `journal_dir` | `/var/log/heartbeat` | Directory where journal files are stored |
|
||||||
|
| `journal_file` | `messages.journal` | Base filename for the journal |
|
||||||
|
| `journal_max_size` | `104857600` (100MB) | Maximum file size before rotation |
|
||||||
|
| `journal_max_backups` | `10` | Number of rotated backup files to keep |
|
||||||
|
|
||||||
|
## File Format
|
||||||
|
|
||||||
|
Messages are logged in JSONL (JSON Lines) format - one JSON object per line:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{"timestamp":1774701296.123,"datetime":"2026-03-28T12:34:56","source_ip":"192.168.1.100","source_port":50003,"message":{"ID":"HTB","name":"webserver1","interval":30}}
|
||||||
|
{"timestamp":1774701326.456,"datetime":"2026-03-28T12:35:26","source_ip":"192.168.1.101","source_port":50003,"message":{"ID":"PLG","plugin":"cpu_monitor","cpu_percent":45.2,"load_1min":1.5}}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Entry Structure
|
||||||
|
|
||||||
|
Each journal entry contains:
|
||||||
|
|
||||||
|
| Field | Type | Description |
|
||||||
|
|-------|------|-------------|
|
||||||
|
| `timestamp` | float | Unix timestamp (seconds since epoch) |
|
||||||
|
| `datetime` | string | ISO 8601 formatted datetime |
|
||||||
|
| `source_ip` | string | Source IP address |
|
||||||
|
| `source_port` | integer | Source UDP port |
|
||||||
|
| `message` | object | Complete parsed message dictionary |
|
||||||
|
|
||||||
|
## Log Rotation
|
||||||
|
|
||||||
|
### How Rotation Works
|
||||||
|
|
||||||
|
1. Journal writes messages to the current file
|
||||||
|
2. When file size exceeds `journal_max_size`, rotation is triggered
|
||||||
|
3. Current file is renamed with timestamp: `messages.journal.YYYYMMDD-HHMMSS`
|
||||||
|
4. New empty file is created as the current journal
|
||||||
|
5. Old backup files exceeding `journal_max_backups` are deleted
|
||||||
|
|
||||||
|
### Example File Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
/var/log/heartbeat/
|
||||||
|
├── messages.journal # Current active journal
|
||||||
|
├── messages.journal.20260328-120000 # Rotated backup
|
||||||
|
├── messages.journal.20260328-140000 # Rotated backup
|
||||||
|
└── messages.journal.20260328-160000 # Rotated backup (newest)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Rotation Behavior
|
||||||
|
|
||||||
|
- Rotation is triggered when the next message would exceed the size limit
|
||||||
|
- Rotation is automatic and requires no manual intervention
|
||||||
|
- Old backups are deleted in FIFO order (oldest first)
|
||||||
|
- Rotation is thread-safe and won't lose messages
|
||||||
|
|
||||||
|
## Usage Examples
|
||||||
|
|
||||||
|
### Reading Journal Files
|
||||||
|
|
||||||
|
#### Using Python
|
||||||
|
|
||||||
|
```python
|
||||||
|
import json
|
||||||
|
|
||||||
|
# Read all entries from current journal
|
||||||
|
with open('/var/log/heartbeat/messages.journal', 'r') as f:
|
||||||
|
for line in f:
|
||||||
|
entry = json.loads(line)
|
||||||
|
print(f"{entry['datetime']} - {entry['source_ip']} - {entry['message']['ID']}")
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Using jq (command line)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# View all messages
|
||||||
|
cat /var/log/heartbeat/messages.journal | jq .
|
||||||
|
|
||||||
|
# Filter by message type
|
||||||
|
cat /var/log/heartbeat/messages.journal | jq 'select(.message.ID == "HTB")'
|
||||||
|
|
||||||
|
# Filter by hostname
|
||||||
|
cat /var/log/heartbeat/messages.journal | jq 'select(.message.name == "webserver1")'
|
||||||
|
|
||||||
|
# Count messages by type
|
||||||
|
cat /var/log/heartbeat/messages.journal | jq -r '.message.ID' | sort | uniq -c
|
||||||
|
|
||||||
|
# Extract timestamps and source IPs
|
||||||
|
cat /var/log/heartbeat/messages.journal | jq -r '[.datetime, .source_ip, .message.ID] | @tsv'
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Using shell tools
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Count total messages
|
||||||
|
wc -l /var/log/heartbeat/messages.journal
|
||||||
|
|
||||||
|
# View recent messages
|
||||||
|
tail -n 100 /var/log/heartbeat/messages.journal | jq .
|
||||||
|
|
||||||
|
# Search for specific host
|
||||||
|
grep -F '"name":"webserver1"' /var/log/heartbeat/messages.journal
|
||||||
|
|
||||||
|
# Check journal file size
|
||||||
|
du -h /var/log/heartbeat/messages.journal
|
||||||
|
```
|
||||||
|
|
||||||
|
### Analyzing Historical Data
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Combine all journal files (current + backups)
|
||||||
|
cat /var/log/heartbeat/messages.journal* | jq . > all_messages.json
|
||||||
|
|
||||||
|
# Count messages per host
|
||||||
|
cat /var/log/heartbeat/messages.journal* | jq -r '.message.name // "unknown"' | sort | uniq -c
|
||||||
|
|
||||||
|
# Find all plugin messages
|
||||||
|
cat /var/log/heartbeat/messages.journal* | jq 'select(.message.ID == "PLG")'
|
||||||
|
|
||||||
|
# Extract CPU metrics from plugin messages
|
||||||
|
cat /var/log/heartbeat/messages.journal* | \
|
||||||
|
jq 'select(.message.plugin == "cpu_monitor") | {time: .datetime, host: .message.name, cpu: .message.cpu_percent}'
|
||||||
|
```
|
||||||
|
|
||||||
|
## Integration with Log Management
|
||||||
|
|
||||||
|
### Logrotate
|
||||||
|
|
||||||
|
While the journal has built-in rotation, you can also use logrotate for additional management:
|
||||||
|
|
||||||
|
```
|
||||||
|
/var/log/heartbeat/messages.journal.* {
|
||||||
|
daily
|
||||||
|
rotate 30
|
||||||
|
compress
|
||||||
|
delaycompress
|
||||||
|
missingok
|
||||||
|
notifempty
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Elasticsearch/OpenSearch
|
||||||
|
|
||||||
|
Import journal data into Elasticsearch for advanced analysis:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from elasticsearch import Elasticsearch
|
||||||
|
import json
|
||||||
|
|
||||||
|
es = Elasticsearch(['localhost:9200'])
|
||||||
|
|
||||||
|
with open('/var/log/heartbeat/messages.journal', 'r') as f:
|
||||||
|
for line in f:
|
||||||
|
entry = json.loads(line)
|
||||||
|
es.index(index='heartbeat-messages', body=entry)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Splunk
|
||||||
|
|
||||||
|
Create a Splunk input for the journal:
|
||||||
|
|
||||||
|
```ini
|
||||||
|
[monitor:///var/log/heartbeat/messages.journal*]
|
||||||
|
sourcetype = heartbeat_json
|
||||||
|
index = heartbeat
|
||||||
|
```
|
||||||
|
|
||||||
|
## Performance Considerations
|
||||||
|
|
||||||
|
### Overhead
|
||||||
|
|
||||||
|
- Journal writing is async and non-blocking
|
||||||
|
- Typical overhead: < 1ms per message
|
||||||
|
- Minimal impact on heartbeat processing
|
||||||
|
|
||||||
|
### Disk Usage
|
||||||
|
|
||||||
|
Calculate expected disk usage:
|
||||||
|
|
||||||
|
```
|
||||||
|
Messages per day = (86400 seconds / interval) * number_of_hosts
|
||||||
|
Average message size ≈ 200-500 bytes
|
||||||
|
Daily disk usage = Messages per day * Average message size
|
||||||
|
|
||||||
|
Example:
|
||||||
|
- 100 hosts
|
||||||
|
- 30 second interval
|
||||||
|
- 2880 messages/day per host
|
||||||
|
- 288,000 messages/day total
|
||||||
|
- ~60-140 MB/day
|
||||||
|
```
|
||||||
|
|
||||||
|
### Recommendations
|
||||||
|
|
||||||
|
- **Small deployments** (< 50 hosts): Default settings work well
|
||||||
|
- **Medium deployments** (50-500 hosts): Increase `journal_max_size` to 500MB, `journal_max_backups` to 20
|
||||||
|
- **Large deployments** (> 500 hosts): Consider 1GB+ journal files, 30+ backups, or external log aggregation
|
||||||
|
|
||||||
|
## Monitoring
|
||||||
|
|
||||||
|
### Check Journal Status
|
||||||
|
|
||||||
|
The journal exposes statistics that can be queried:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from hbd.journal import get_journal
|
||||||
|
|
||||||
|
journal = get_journal()
|
||||||
|
stats = journal.get_stats()
|
||||||
|
print(f"Current size: {stats['current_size']:,} bytes")
|
||||||
|
print(f"Rotation threshold: {stats['rotation_threshold']}")
|
||||||
|
```
|
||||||
|
|
||||||
|
### Log Messages
|
||||||
|
|
||||||
|
Journal operations are logged at appropriate levels:
|
||||||
|
|
||||||
|
- `INFO`: Initialization, rotation events, cleanup
|
||||||
|
- `DEBUG`: Individual message logging
|
||||||
|
- `WARNING`: Non-critical issues
|
||||||
|
- `ERROR`: Critical failures
|
||||||
|
|
||||||
|
Check hbd logs for journal-related messages:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
grep journal /var/log/heartbeat.log
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Journal Files Not Created
|
||||||
|
|
||||||
|
**Problem**: No journal files appear in the configured directory.
|
||||||
|
|
||||||
|
**Solutions**:
|
||||||
|
- Check `journal_enabled: true` in configuration
|
||||||
|
- Verify directory exists and hbd has write permissions
|
||||||
|
- Check hbd logs for initialization errors
|
||||||
|
- Verify disk space is available
|
||||||
|
|
||||||
|
### Rotation Not Working
|
||||||
|
|
||||||
|
**Problem**: Journal file grows beyond `journal_max_size`.
|
||||||
|
|
||||||
|
**Solutions**:
|
||||||
|
- Check that `journal_max_size` is properly configured
|
||||||
|
- Verify hbd has permission to rename/create files
|
||||||
|
- Check for filesystem issues
|
||||||
|
- Review hbd logs for rotation errors
|
||||||
|
|
||||||
|
### Missing Messages
|
||||||
|
|
||||||
|
**Problem**: Some messages don't appear in journal.
|
||||||
|
|
||||||
|
**Solutions**:
|
||||||
|
- Verify `journal_enabled: true`
|
||||||
|
- Check for write errors in hbd logs
|
||||||
|
- Verify sufficient disk space
|
||||||
|
- Check if filesystem is read-only
|
||||||
|
|
||||||
|
### Performance Issues
|
||||||
|
|
||||||
|
**Problem**: Journal causing slow message processing.
|
||||||
|
|
||||||
|
**Solutions**:
|
||||||
|
- Use faster storage (SSD) for journal directory
|
||||||
|
- Increase `journal_max_size` to reduce rotation frequency
|
||||||
|
- Disable journal if not needed: `journal_enabled: false`
|
||||||
|
- Consider async syslog forwarding instead
|
||||||
|
|
||||||
|
## Security Considerations
|
||||||
|
|
||||||
|
### File Permissions
|
||||||
|
|
||||||
|
Ensure proper permissions on journal files:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Journal directory
|
||||||
|
chmod 750 /var/log/heartbeat
|
||||||
|
chown hbd:hbd /var/log/heartbeat
|
||||||
|
|
||||||
|
# Journal files
|
||||||
|
chmod 640 /var/log/heartbeat/messages.journal*
|
||||||
|
```
|
||||||
|
|
||||||
|
### Sensitive Data
|
||||||
|
|
||||||
|
Journal files may contain:
|
||||||
|
- Hostnames and IP addresses
|
||||||
|
- System metrics
|
||||||
|
- Custom message content
|
||||||
|
|
||||||
|
**Recommendations**:
|
||||||
|
- Restrict read access to authorized users only
|
||||||
|
- Consider encryption for archived journals
|
||||||
|
- Implement log retention policies
|
||||||
|
- Sanitize data if sharing for debugging
|
||||||
|
|
||||||
|
## API Reference
|
||||||
|
|
||||||
|
### MessageJournal Class
|
||||||
|
|
||||||
|
```python
|
||||||
|
class MessageJournal:
|
||||||
|
def __init__(self, config: Dict[str, Any])
|
||||||
|
async def initialize(self) -> bool
|
||||||
|
async def log_message(self, msg: Dict, addr: tuple, timestamp: float)
|
||||||
|
async def close(self)
|
||||||
|
def get_stats(self) -> Dict[str, Any]
|
||||||
|
```
|
||||||
|
|
||||||
|
### Module Functions
|
||||||
|
|
||||||
|
```python
|
||||||
|
def get_journal(config: Dict = None) -> MessageJournal
|
||||||
|
async def log_message(msg: Dict, addr: tuple, timestamp: float = None)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Example: Custom Message Processing
|
||||||
|
|
||||||
|
Process journal messages in real-time:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
async def tail_journal(journal_path):
|
||||||
|
"""Follow journal file and process new messages."""
|
||||||
|
path = Path(journal_path)
|
||||||
|
|
||||||
|
with open(path, 'r') as f:
|
||||||
|
# Jump to end
|
||||||
|
f.seek(0, 2)
|
||||||
|
|
||||||
|
while True:
|
||||||
|
line = f.readline()
|
||||||
|
if line:
|
||||||
|
entry = json.loads(line)
|
||||||
|
await process_message(entry)
|
||||||
|
else:
|
||||||
|
await asyncio.sleep(0.1)
|
||||||
|
|
||||||
|
async def process_message(entry):
|
||||||
|
"""Process a journal entry."""
|
||||||
|
msg = entry['message']
|
||||||
|
|
||||||
|
# Alert on boot messages
|
||||||
|
if msg.get('boot'):
|
||||||
|
print(f"ALERT: {msg['name']} rebooted at {entry['datetime']}")
|
||||||
|
|
||||||
|
# Track CPU usage
|
||||||
|
if msg.get('ID') == 'PLG' and msg.get('plugin') == 'cpu_monitor':
|
||||||
|
cpu = msg.get('cpu_percent', 0)
|
||||||
|
if cpu > 90:
|
||||||
|
print(f"WARNING: {entry['source_ip']} CPU usage: {cpu}%")
|
||||||
|
```
|
||||||
|
|
||||||
|
## Future Enhancements
|
||||||
|
|
||||||
|
Potential improvements for future versions:
|
||||||
|
|
||||||
|
- Compression of rotated logs (gzip)
|
||||||
|
- Time-based rotation in addition to size-based
|
||||||
|
- Filtering to exclude certain message types
|
||||||
|
- Structured logging output formats (CEF, GELF)
|
||||||
|
- Remote syslog forwarding
|
||||||
|
- Message deduplication
|
||||||
|
- Journal file encryption
|
||||||
|
- Signed journal entries
|
||||||
|
|
||||||
|
## See Also
|
||||||
|
|
||||||
|
- [Configuration Guide](../hbd/config.py) - Full configuration options
|
||||||
|
- [UDP Protocol](../hbd/udp.py) - Message handling
|
||||||
|
- [Server Architecture](../hbd/server.py) - Server initialization
|
||||||
@@ -0,0 +1,326 @@
|
|||||||
|
# Nagios Plugin Integration Guide
|
||||||
|
|
||||||
|
The Heartbeat monitoring system now supports running existing Nagios-compatible monitoring plugins through the `nagios_runner` plugin. This allows you to leverage the thousands of existing Nagios plugins without modification.
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
### 1. Install Nagios Plugins
|
||||||
|
|
||||||
|
**Debian/Ubuntu:**
|
||||||
|
```bash
|
||||||
|
sudo apt-get install nagios-plugins
|
||||||
|
```
|
||||||
|
|
||||||
|
**RHEL/CentOS/Fedora:**
|
||||||
|
```bash
|
||||||
|
sudo yum install nagios-plugins-all
|
||||||
|
# or
|
||||||
|
sudo dnf install nagios-plugins-all
|
||||||
|
```
|
||||||
|
|
||||||
|
**Arch Linux:**
|
||||||
|
```bash
|
||||||
|
sudo pacman -S monitoring-plugins
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Configure Heartbeat
|
||||||
|
|
||||||
|
Add the `nagios_runner` section to your `~/.hb.yaml` config:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
nagios_runner:
|
||||||
|
interval: 60 # Run plugins every 60 seconds
|
||||||
|
timeout: 30 # Command timeout in seconds
|
||||||
|
commands:
|
||||||
|
- name: check_disk_root
|
||||||
|
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
|
||||||
|
|
||||||
|
- name: check_load
|
||||||
|
command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
|
||||||
|
|
||||||
|
- name: check_procs
|
||||||
|
command: /usr/lib/nagios/plugins/check_procs -w 250 -c 400
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Start Heartbeat Client
|
||||||
|
|
||||||
|
```bash
|
||||||
|
hbc -v localhost
|
||||||
|
```
|
||||||
|
|
||||||
|
The client will now execute the configured Nagios plugins and send their results to the server.
|
||||||
|
|
||||||
|
## How It Works
|
||||||
|
|
||||||
|
### Nagios Plugin Standard
|
||||||
|
|
||||||
|
Nagios plugins follow a simple interface:
|
||||||
|
|
||||||
|
1. **Exit Codes:**
|
||||||
|
- `0` = OK
|
||||||
|
- `1` = WARNING
|
||||||
|
- `2` = CRITICAL
|
||||||
|
- `3` = UNKNOWN
|
||||||
|
|
||||||
|
2. **Output Format:**
|
||||||
|
```
|
||||||
|
STATUS - Message | performance_data
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Performance Data Format:**
|
||||||
|
```
|
||||||
|
'label'=value[UOM];[warn];[crit];[min];[max]
|
||||||
|
```
|
||||||
|
|
||||||
|
### Example Plugin Output
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
|
||||||
|
DISK OK - free space: / 156 GB (78%); | /=44GB;127;142;0;159
|
||||||
|
```
|
||||||
|
|
||||||
|
This output includes:
|
||||||
|
- **Status:** `DISK OK`
|
||||||
|
- **Message:** `free space: / 156 GB (78%)`
|
||||||
|
- **Performance Data:** `/=44GB;127;142;0;159`
|
||||||
|
- Current value: 44GB
|
||||||
|
- Warning threshold: 127GB
|
||||||
|
- Critical threshold: 142GB
|
||||||
|
- Min: 0GB
|
||||||
|
- Max: 159GB
|
||||||
|
|
||||||
|
### Data Collected
|
||||||
|
|
||||||
|
The `nagios_runner` plugin collects:
|
||||||
|
|
||||||
|
**For each configured command:**
|
||||||
|
- `{name}_status` - Status string (OK, WARNING, CRITICAL, UNKNOWN)
|
||||||
|
- `{name}_status_code` - Numeric exit code (0-3)
|
||||||
|
- `{name}_output` - Status message
|
||||||
|
- `{name}_{metric}` - Each performance metric value
|
||||||
|
- `{name}_{metric}_uom` - Unit of measurement (if present)
|
||||||
|
- `{name}_{metric}_warn` - Warning threshold (if present)
|
||||||
|
- `{name}_{metric}_crit` - Critical threshold (if present)
|
||||||
|
- `{name}_{metric}_min` - Minimum value (if present)
|
||||||
|
- `{name}_{metric}_max` - Maximum value (if present)
|
||||||
|
|
||||||
|
## Configuration Options
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
nagios_runner:
|
||||||
|
# Collection interval in seconds (default: 60)
|
||||||
|
interval: 60
|
||||||
|
|
||||||
|
# Command execution timeout in seconds (default: 30)
|
||||||
|
timeout: 30
|
||||||
|
|
||||||
|
# Execute commands via shell (default: true)
|
||||||
|
# Set to false for direct execution (more secure but less flexible)
|
||||||
|
shell: true
|
||||||
|
|
||||||
|
# List of Nagios plugins to run
|
||||||
|
commands:
|
||||||
|
- name: unique_name # Required: unique identifier
|
||||||
|
command: /path/to/plugin [args] # Required: full command to execute
|
||||||
|
```
|
||||||
|
|
||||||
|
## Common Nagios Plugins
|
||||||
|
|
||||||
|
### System Resources
|
||||||
|
|
||||||
|
**Disk Space:**
|
||||||
|
```yaml
|
||||||
|
- name: check_disk_root
|
||||||
|
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
|
||||||
|
```
|
||||||
|
|
||||||
|
**Load Average:**
|
||||||
|
```yaml
|
||||||
|
- name: check_load
|
||||||
|
command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
|
||||||
|
```
|
||||||
|
|
||||||
|
**Swap Usage:**
|
||||||
|
```yaml
|
||||||
|
- name: check_swap
|
||||||
|
command: /usr/lib/nagios/plugins/check_swap -w 20% -c 10%
|
||||||
|
```
|
||||||
|
|
||||||
|
**Process Count:**
|
||||||
|
```yaml
|
||||||
|
- name: check_procs
|
||||||
|
command: /usr/lib/nagios/plugins/check_procs -w 250 -c 400
|
||||||
|
```
|
||||||
|
|
||||||
|
**Users Logged In:**
|
||||||
|
```yaml
|
||||||
|
- name: check_users
|
||||||
|
command: /usr/lib/nagios/plugins/check_users -w 5 -c 10
|
||||||
|
```
|
||||||
|
|
||||||
|
### Network Services
|
||||||
|
|
||||||
|
**SSH:**
|
||||||
|
```yaml
|
||||||
|
- name: check_ssh
|
||||||
|
command: /usr/lib/nagios/plugins/check_ssh localhost
|
||||||
|
```
|
||||||
|
|
||||||
|
**HTTP:**
|
||||||
|
```yaml
|
||||||
|
- name: check_http_local
|
||||||
|
command: /usr/lib/nagios/plugins/check_http -H localhost
|
||||||
|
|
||||||
|
- name: check_http_ssl
|
||||||
|
command: /usr/lib/nagios/plugins/check_http -H example.com --ssl
|
||||||
|
```
|
||||||
|
|
||||||
|
**DNS:**
|
||||||
|
```yaml
|
||||||
|
- name: check_dns
|
||||||
|
command: /usr/lib/nagios/plugins/check_dns -H google.com
|
||||||
|
```
|
||||||
|
|
||||||
|
**Ping:**
|
||||||
|
```yaml
|
||||||
|
- name: check_ping_gateway
|
||||||
|
command: /usr/lib/nagios/plugins/check_ping -H 192.168.1.1 -w 100,20% -c 500,60%
|
||||||
|
```
|
||||||
|
|
||||||
|
### Databases
|
||||||
|
|
||||||
|
**MySQL:**
|
||||||
|
```yaml
|
||||||
|
- name: check_mysql
|
||||||
|
command: /usr/lib/nagios/plugins/check_mysql -H localhost -u user -p password
|
||||||
|
```
|
||||||
|
|
||||||
|
**PostgreSQL:**
|
||||||
|
```yaml
|
||||||
|
- name: check_pgsql
|
||||||
|
command: /usr/lib/nagios/plugins/check_pgsql -H localhost -d database
|
||||||
|
```
|
||||||
|
|
||||||
|
## Writing Custom Nagios Plugins
|
||||||
|
|
||||||
|
You can write your own Nagios-compatible plugins in any language. Here's a simple example:
|
||||||
|
|
||||||
|
**Bash:**
|
||||||
|
```bash
|
||||||
|
#!/bin/bash
|
||||||
|
# /usr/local/bin/check_example.sh
|
||||||
|
|
||||||
|
# Get the value to check
|
||||||
|
value=$(some_command)
|
||||||
|
|
||||||
|
# Define thresholds
|
||||||
|
warn=80
|
||||||
|
crit=90
|
||||||
|
|
||||||
|
# Check and output result
|
||||||
|
if [ $value -ge $crit ]; then
|
||||||
|
echo "CRITICAL - Value is $value | value=${value};${warn};${crit};0;100"
|
||||||
|
exit 2
|
||||||
|
elif [ $value -ge $warn ]; then
|
||||||
|
echo "WARNING - Value is $value | value=${value};${warn};${crit};0;100"
|
||||||
|
exit 1
|
||||||
|
else
|
||||||
|
echo "OK - Value is $value | value=${value};${warn};${crit};0;100"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
```
|
||||||
|
|
||||||
|
**Python:**
|
||||||
|
```python
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# /usr/local/bin/check_example.py
|
||||||
|
|
||||||
|
import sys
|
||||||
|
|
||||||
|
def check_something():
|
||||||
|
value = get_value() # Your check logic here
|
||||||
|
warn = 80
|
||||||
|
crit = 90
|
||||||
|
|
||||||
|
perfdata = f"value={value};{warn};{crit};0;100"
|
||||||
|
|
||||||
|
if value >= crit:
|
||||||
|
print(f"CRITICAL - Value is {value} | {perfdata}")
|
||||||
|
sys.exit(2)
|
||||||
|
elif value >= warn:
|
||||||
|
print(f"WARNING - Value is {value} | {perfdata}")
|
||||||
|
sys.exit(1)
|
||||||
|
else:
|
||||||
|
print(f"OK - Value is {value} | {perfdata}")
|
||||||
|
sys.exit(0)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
check_something()
|
||||||
|
```
|
||||||
|
|
||||||
|
Then configure in Heartbeat:
|
||||||
|
```yaml
|
||||||
|
nagios_runner:
|
||||||
|
commands:
|
||||||
|
- name: my_custom_check
|
||||||
|
command: /usr/local/bin/check_example.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Plugin not found
|
||||||
|
```
|
||||||
|
Error: Command not found
|
||||||
|
```
|
||||||
|
**Solution:** Use the full path to the plugin. Common locations:
|
||||||
|
- `/usr/lib/nagios/plugins/`
|
||||||
|
- `/usr/lib64/nagios/plugins/`
|
||||||
|
- `/usr/local/nagios/libexec/`
|
||||||
|
|
||||||
|
### Permission denied
|
||||||
|
```
|
||||||
|
Error: Permission denied
|
||||||
|
```
|
||||||
|
**Solution:** Ensure the plugin is executable:
|
||||||
|
```bash
|
||||||
|
chmod +x /path/to/plugin
|
||||||
|
```
|
||||||
|
|
||||||
|
### Timeout errors
|
||||||
|
```
|
||||||
|
Command timed out after 30s
|
||||||
|
```
|
||||||
|
**Solution:** Increase the timeout in config:
|
||||||
|
```yaml
|
||||||
|
nagios_runner:
|
||||||
|
timeout: 60 # Increase timeout
|
||||||
|
```
|
||||||
|
|
||||||
|
### No performance data
|
||||||
|
If performance data is not being parsed:
|
||||||
|
1. Check plugin output includes `|` separator
|
||||||
|
2. Verify performance data format: `'label'=value[UOM];...`
|
||||||
|
3. Enable debug logging: `hbc -v -x localhost`
|
||||||
|
|
||||||
|
## Benefits
|
||||||
|
|
||||||
|
1. **Massive Plugin Library:** Thousands of existing Nagios plugins available
|
||||||
|
2. **No Rewriting:** Use plugins as-is without modification
|
||||||
|
3. **Community Support:** Well-documented and maintained plugins
|
||||||
|
4. **Flexibility:** Mix Nagios plugins with native Heartbeat plugins
|
||||||
|
5. **Standard Interface:** Consistent exit codes and output format
|
||||||
|
6. **Performance Data:** Automatic extraction of metrics
|
||||||
|
|
||||||
|
## Resources
|
||||||
|
|
||||||
|
- [Nagios Plugin Development Guidelines](https://nagios-plugins.org/doc/guidelines.html)
|
||||||
|
- [Monitoring Plugins Project](https://www.monitoring-plugins.org/)
|
||||||
|
- [Nagios Exchange](https://exchange.nagios.org/) - Plugin repository
|
||||||
|
- [Check_MK Local Checks](https://docs.checkmk.com/latest/en/localchecks.html) - Compatible format
|
||||||
|
|
||||||
|
## Next Steps
|
||||||
|
|
||||||
|
- Configure threshold alerts based on Nagios plugin status codes
|
||||||
|
- View plugin data in the Heartbeat web UI
|
||||||
|
- Create custom plugins for your specific monitoring needs
|
||||||
|
- Integrate with existing Nagios/Icinga configurations
|
||||||
@@ -0,0 +1,325 @@
|
|||||||
|
# Notification System
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Notifications are dispatched to the **owner and managers** of a host, each via their own configured notification channels. Channel definitions are global; users reference them by name. No users configured → no notifications sent.
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
Alert event (udp.py / threshold.py)
|
||||||
|
└─ notify.send_notification(host_name, Notification)
|
||||||
|
├─ look up host.owner + host.managers
|
||||||
|
├─ for each user → user.notification_channels
|
||||||
|
└─ for each channel → _dispatch_to_channel (filtered by min_level)
|
||||||
|
```
|
||||||
|
|
||||||
|
Every notification carries:
|
||||||
|
- **title** — `[LEVEL] hostname` (e.g. `[CRITICAL] webserver01`)
|
||||||
|
- **body** — detail message (metric value, threshold, duration)
|
||||||
|
- **url** — link to the plugin metrics page (`{base_url}/plugins#{hostname}`)
|
||||||
|
- **level** — `RECOVER | WARNING | CRITICAL | INFO`
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
### Base URL
|
||||||
|
|
||||||
|
Set `base_url` so notification links point to your hbd instance:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
base_url: https://hbd.example.com
|
||||||
|
```
|
||||||
|
|
||||||
|
### Channel definitions
|
||||||
|
|
||||||
|
Channels are defined under `notification_channels`. Each entry specifies a delivery type and its credentials. Three optional fields control ownership, visibility, and alert-level filtering:
|
||||||
|
|
||||||
|
| Field | Default | Description |
|
||||||
|
|---|---|---|
|
||||||
|
| `owner` | *(absent)* | Username who created/owns this channel. Absent = admin-created. |
|
||||||
|
| `private` | `false` | When `true`, only the owner can see and select this channel. |
|
||||||
|
| `min_level` | `WARNING` | Minimum alert level this channel receives. |
|
||||||
|
|
||||||
|
**Admin-created channels** (set in the config file or via the admin settings UI) are public by default — all users can select them:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
notification_channels:
|
||||||
|
|
||||||
|
pushover_ops:
|
||||||
|
type: pushover
|
||||||
|
token: your-app-token
|
||||||
|
user: your-user-key
|
||||||
|
min_level: WARNING
|
||||||
|
|
||||||
|
email_ops:
|
||||||
|
type: email
|
||||||
|
recipients: [ops@example.com]
|
||||||
|
sender: hbd@example.com
|
||||||
|
smtp_server: smtp.example.com
|
||||||
|
smtp_port: 587
|
||||||
|
smtp_user: hbd@example.com
|
||||||
|
smtp_password: secret
|
||||||
|
min_level: WARNING
|
||||||
|
|
||||||
|
matrix_oncall:
|
||||||
|
type: matrix
|
||||||
|
homeserver: https://matrix.example.org
|
||||||
|
access_token: syt_xxx
|
||||||
|
room_id: "!abc:matrix.example.org"
|
||||||
|
min_level: CRITICAL
|
||||||
|
|
||||||
|
sms_oncall:
|
||||||
|
type: sms_voipms
|
||||||
|
api_user: me@example.com
|
||||||
|
api_password: secret
|
||||||
|
did: "5551234567"
|
||||||
|
dst: "5559876543"
|
||||||
|
min_level: CRITICAL
|
||||||
|
|
||||||
|
signal_ops:
|
||||||
|
type: signal
|
||||||
|
cli_path: /usr/local/bin/signal-cli
|
||||||
|
user: "+12025551234"
|
||||||
|
recipient: "+12025559999"
|
||||||
|
|
||||||
|
mattermost_devops:
|
||||||
|
type: mattermost
|
||||||
|
host: mattermost.example.com
|
||||||
|
token: webhook-token
|
||||||
|
channel: devops-alerts
|
||||||
|
username: heartbeat-bot
|
||||||
|
```
|
||||||
|
|
||||||
|
**User-created channels** are written by authenticated users through the API or their profile page. They carry an `owner` field and optionally `private: true`:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
notification_channels:
|
||||||
|
|
||||||
|
alice_personal:
|
||||||
|
type: pushover
|
||||||
|
token: personal-token
|
||||||
|
user: personal-key
|
||||||
|
owner: alice # created by alice
|
||||||
|
private: true # only alice can see this channel
|
||||||
|
```
|
||||||
|
|
||||||
|
### Channel visibility
|
||||||
|
|
||||||
|
| Channel | Who can see / select it |
|
||||||
|
|---|---|
|
||||||
|
| No `private` field (or `private: false`) | All users |
|
||||||
|
| `private: true` | Only the `owner` |
|
||||||
|
| Any channel | Admins always see everything |
|
||||||
|
|
||||||
|
### Users with notification channels
|
||||||
|
|
||||||
|
Each user lists which channels they receive notifications on. Users can manage their own selection from the profile page:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
users:
|
||||||
|
alice:
|
||||||
|
full_name: Alice Smith
|
||||||
|
password: pbkdf2:sha256:...
|
||||||
|
admin: true
|
||||||
|
notification_channels: [pushover_ops, email_ops]
|
||||||
|
|
||||||
|
bob:
|
||||||
|
full_name: Bob Jones
|
||||||
|
password: pbkdf2:sha256:...
|
||||||
|
notification_channels: [sms_oncall, matrix_oncall]
|
||||||
|
```
|
||||||
|
|
||||||
|
### Host access — owner and managers
|
||||||
|
|
||||||
|
Notifications for a host go to its owner and all managers:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
hosts:
|
||||||
|
webserver01:
|
||||||
|
owner: alice # receives all notifications for this host
|
||||||
|
managers: [bob] # also receives notifications
|
||||||
|
threshold_config: default
|
||||||
|
watch: true # bold in dashboard (cosmetic only)
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
dbserver01:
|
||||||
|
owner: alice
|
||||||
|
managers: [bob]
|
||||||
|
threshold_config: database
|
||||||
|
dyndns: false
|
||||||
|
```
|
||||||
|
|
||||||
|
`watch: true` only affects display (bold name in the live dashboard). Notifications are now controlled entirely by owner/managers.
|
||||||
|
|
||||||
|
## Channel Types
|
||||||
|
|
||||||
|
### `min_level` filtering
|
||||||
|
|
||||||
|
Every channel accepts an optional `min_level` field:
|
||||||
|
|
||||||
|
| Value | Channels receive |
|
||||||
|
|---|---|
|
||||||
|
| `WARNING` (default) | WARNING, CRITICAL, RECOVER |
|
||||||
|
| `CRITICAL` | CRITICAL only (and RECOVER) |
|
||||||
|
|
||||||
|
`RECOVER` is always passed through — you don't want to miss a recovery.
|
||||||
|
|
||||||
|
### pushover
|
||||||
|
|
||||||
|
Sends push notifications via [Pushover](https://pushover.net). Includes title, body, and a clickable URL.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
type: pushover
|
||||||
|
token: your-app-token # Required: Pushover application token
|
||||||
|
user: your-user-key # Required: Recipient's user key
|
||||||
|
min_level: WARNING
|
||||||
|
```
|
||||||
|
|
||||||
|
### email
|
||||||
|
|
||||||
|
Sends via SMTP. The subject is the notification title; the body is the message followed by the URL on the final line.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
type: email
|
||||||
|
recipients: [ops@example.com, oncall@example.com]
|
||||||
|
sender: hbd@example.com
|
||||||
|
smtp_server: smtp.example.com
|
||||||
|
smtp_port: 587 # 587 = STARTTLS (default), 465 = SSL
|
||||||
|
smtp_user: hbd@example.com
|
||||||
|
smtp_password: secret
|
||||||
|
min_level: WARNING
|
||||||
|
```
|
||||||
|
|
||||||
|
### matrix
|
||||||
|
|
||||||
|
Sends a formatted HTML message to a Matrix room via [matrix-nio](https://github.com/poljar/matrix-nio).
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
type: matrix
|
||||||
|
homeserver: https://matrix.example.org
|
||||||
|
access_token: syt_xxx # Bot account access token
|
||||||
|
room_id: "!abc:matrix.example.org"
|
||||||
|
min_level: WARNING
|
||||||
|
```
|
||||||
|
|
||||||
|
**Setup:**
|
||||||
|
1. Create a bot Matrix account
|
||||||
|
2. Obtain its access token (Element → Settings → Help & About → Access Token)
|
||||||
|
3. Invite the bot to the target room and note the room ID
|
||||||
|
|
||||||
|
### sms_voipms
|
||||||
|
|
||||||
|
Sends SMS via the [voip.ms REST API](https://voip.ms/api/v1/rest.php). Message is truncated to 160 characters.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
type: sms_voipms
|
||||||
|
api_user: me@example.com # voip.ms account email
|
||||||
|
api_password: secret # voip.ms API password
|
||||||
|
did: "5551234567" # Your voip.ms DID (sending number)
|
||||||
|
dst: "5559876543" # Destination number
|
||||||
|
min_level: CRITICAL
|
||||||
|
```
|
||||||
|
|
||||||
|
### signal
|
||||||
|
|
||||||
|
Sends via [signal-cli](https://github.com/AsamK/signal-cli).
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
type: signal
|
||||||
|
cli_path: /usr/local/bin/signal-cli
|
||||||
|
user: +12025551234 # Your registered Signal number
|
||||||
|
recipient: +12025559999 # Recipient number
|
||||||
|
min_level: WARNING
|
||||||
|
```
|
||||||
|
|
||||||
|
**Setup:**
|
||||||
|
```bash
|
||||||
|
signal-cli -u +12025551234 register
|
||||||
|
signal-cli -u +12025551234 verify CODE
|
||||||
|
```
|
||||||
|
|
||||||
|
### mattermost
|
||||||
|
|
||||||
|
Sends via Mattermost incoming webhook. Message is formatted as Markdown.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
type: mattermost
|
||||||
|
host: mattermost.example.com
|
||||||
|
token: your-webhook-token
|
||||||
|
channel: devops-alerts
|
||||||
|
username: heartbeat-bot # Optional: display name
|
||||||
|
icon: https://…/icon.png # Optional: bot icon URL
|
||||||
|
min_level: WARNING
|
||||||
|
```
|
||||||
|
|
||||||
|
## Notification events
|
||||||
|
|
||||||
|
| Source | Level | Title example | Body example |
|
||||||
|
|---|---|---|---|
|
||||||
|
| Host overdue | CRITICAL | `[CRITICAL] webserver01` | `IPv4 overdue` |
|
||||||
|
| Host recover | RECOVER | `[RECOVER] webserver01` | `IPv4 back after being overdue for 5:23` |
|
||||||
|
| Host boot | INFO | `[INFO] webserver01` | `webserver01 booted` |
|
||||||
|
| Host shutdown | INFO | `[INFO] webserver01` | `IPv4 shutdown` |
|
||||||
|
| Threshold breach | WARNING/CRITICAL | `[CRITICAL] webserver01` | `cpu_percent = 95.2 (threshold: > 90.0)` |
|
||||||
|
| Threshold reminder | CRITICAL | `[REMINDER/CRITICAL] webserver01` | `REMINDER (CRITICAL): … ongoing for 3600s` |
|
||||||
|
| Connection issue | WARNING | `[WARNING] webserver01` | `new address detected …` |
|
||||||
|
|
||||||
|
Reminder notifications (re-notify) are sent only for CRITICAL level alerts.
|
||||||
|
|
||||||
|
## API reference
|
||||||
|
|
||||||
|
### `send_notification(host_name, notif) -> dict`
|
||||||
|
|
||||||
|
Main entry point. Dispatches the notification to the host's owner and managers.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from hbd.server.notify import send_notification, Notification
|
||||||
|
|
||||||
|
send_notification(
|
||||||
|
"webserver01",
|
||||||
|
Notification(
|
||||||
|
title="[CRITICAL] webserver01",
|
||||||
|
body="cpu_percent = 95.2 (threshold: > 90.0)",
|
||||||
|
level="CRITICAL",
|
||||||
|
url="https://hbd.example.com/plugins#webserver01",
|
||||||
|
),
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
Returns `{channel_name: bool}` for each channel dispatched.
|
||||||
|
|
||||||
|
### `setup(cfg, loop=None)`
|
||||||
|
|
||||||
|
Called once at startup from `main.py`. Pass the running asyncio event loop so Matrix sends work correctly.
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
**No notifications sent:**
|
||||||
|
- Check that users are configured (`users:` section in yaml)
|
||||||
|
- Check that the host has an `owner` or `managers` set
|
||||||
|
- Check that users have `notification_channels` listed
|
||||||
|
- Check that the channel names listed in each user's `notification_channels` match the channel keys defined under the `notification_channels:` section
|
||||||
|
- If a user can't select a channel, check whether it is `private: true` and owned by someone else
|
||||||
|
|
||||||
|
**min_level filtering too aggressive:**
|
||||||
|
- Default is `WARNING` — both WARNING and CRITICAL are sent
|
||||||
|
- If a channel is set to `min_level: CRITICAL` but you also expected warnings, change it to `min_level: WARNING`
|
||||||
|
|
||||||
|
**Matrix sends time out:**
|
||||||
|
- Verify the access token is valid and the bot is in the room
|
||||||
|
- `matrix-nio` must be installed: `pip install matrix-nio`
|
||||||
|
|
||||||
|
**voip.ms SMS fails:**
|
||||||
|
- Enable the API in your voip.ms account (Account → API)
|
||||||
|
- Verify the DID is SMS-capable in your voip.ms account
|
||||||
|
|
||||||
|
**Signal not found:**
|
||||||
|
- Specify the full path to the `signal-cli` executable in `cli_path`
|
||||||
|
- Run `signal-cli -u +NUMBER receive` to sync the trust store
|
||||||
|
|
||||||
|
**Email authentication failed:**
|
||||||
|
- Use app-specific passwords for Gmail/Fastmail
|
||||||
|
- Verify port: 587 for STARTTLS, 465 for SSL
|
||||||
|
|
||||||
|
**Pushover `400` errors:**
|
||||||
|
- Double-check `token` (app) and `user` (user key) — they are different values
|
||||||
@@ -0,0 +1,567 @@
|
|||||||
|
# Plugin Development Guide
|
||||||
|
|
||||||
|
This guide explains how to create custom plugins for the Heartbeat monitoring system.
|
||||||
|
|
||||||
|
## Table of Contents
|
||||||
|
|
||||||
|
- [Plugin Architecture](#plugin-architecture)
|
||||||
|
- [Plugin Types](#plugin-types)
|
||||||
|
- [Creating a Plugin](#creating-a-plugin)
|
||||||
|
- [Plugin Lifecycle](#plugin-lifecycle)
|
||||||
|
- [Server-initiated InfoPlugin refresh](#server-initiated-infoplugin-refresh)
|
||||||
|
- [Configuration](#configuration)
|
||||||
|
- [Best Practices](#best-practices)
|
||||||
|
- [Examples](#examples)
|
||||||
|
- [Testing](#testing)
|
||||||
|
|
||||||
|
## Plugin Architecture
|
||||||
|
|
||||||
|
Heartbeat's plugin system is designed to be simple yet powerful. Plugins are Python classes that inherit from one of the base plugin types and implement a few key methods.
|
||||||
|
|
||||||
|
### Key Concepts
|
||||||
|
|
||||||
|
- **Plugin Registry**: Central registry that manages all loaded plugins
|
||||||
|
- **Plugin Loader**: Automatically discovers and loads plugins from the `hbd/plugins/` directory
|
||||||
|
- **Plugin Types**: InfoPlugin (static data) and MonitorPlugin (periodic metrics)
|
||||||
|
- **Async/Await**: The plugin lifecycle methods (`initialize()`, `collect()`, `cleanup()`) are async for non-blocking operation
|
||||||
|
|
||||||
|
## Plugin Types
|
||||||
|
|
||||||
|
### InfoPlugin
|
||||||
|
|
||||||
|
InfoPlugins collect static information that doesn't change frequently (OS version, hardware specs, etc.).
|
||||||
|
|
||||||
|
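- **Runs once** at startup (interval = 0); it may be re-run when the server requests a refresh (see [Server-initiated InfoPlugin refresh](#server-initiated-infoplugin-refresh))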
|
||||||
|
- **Cached** - data is collected once and reused
|
||||||
|
- **Lightweight** - no periodic overhead
|
||||||
|
|
||||||
|
**Use InfoPlugin for:**
|
||||||
|
- Operating system details
|
||||||
|
- Hardware information
|
||||||
|
- Software versions
|
||||||
|
- Configuration data
|
||||||
|
- Static inventory
|
||||||
|
|
||||||
|
### MonitorPlugin
|
||||||
|
|
||||||
|
MonitorPlugins collect metrics that change over time (CPU usage, memory, network traffic).
|
||||||
|
|
||||||
|
- **Runs periodically** based on configured interval
|
||||||
|
- **Scheduled** - collected at regular intervals
|
||||||
|
- **Dynamic** - captures changing system state
|
||||||
|
|
||||||
|
**Use MonitorPlugin for:**
|
||||||
|
- Resource usage (CPU, memory, disk, network)
|
||||||
|
- Performance metrics
|
||||||
|
- Counters and gauges
|
||||||
|
- Time-series data
|
||||||
|
|
||||||
|
## Creating a Plugin
|
||||||
|
|
||||||
|
### Step 1: Choose Plugin Type
|
||||||
|
|
||||||
|
Decide whether your plugin collects static information (InfoPlugin) or dynamic metrics (MonitorPlugin).
|
||||||
|
|
||||||
|
### Step 2: Create Plugin File
|
||||||
|
|
||||||
|
Create a new Python file in the `hbd/plugins/` directory:
|
||||||
|
|
||||||
|
```python
|
||||||
|
"""
|
||||||
|
My awesome plugin for Heartbeat.
|
||||||
|
|
||||||
|
Brief description of what this plugin does.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import Dict, Any, Optional
|
||||||
|
|
||||||
|
# Import psutil or other dependencies if needed
|
||||||
|
try:
|
||||||
|
import psutil
|
||||||
|
except ImportError:
|
||||||
|
psutil = None
|
||||||
|
|
||||||
|
from hbd.plugin import MonitorPlugin # or InfoPlugin
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class MyAwesomePlugin(MonitorPlugin): # or InfoPlugin
|
||||||
|
"""
|
||||||
|
One-line description of the plugin.
|
||||||
|
|
||||||
|
Collects:
|
||||||
|
- List of metrics/data collected
|
||||||
|
- Another metric
|
||||||
|
|
||||||
|
Configuration:
|
||||||
|
interval: Collection interval in seconds (default: 60)
|
||||||
|
option1: Description of option1 (default: value)
|
||||||
|
option2: Description of option2 (default: value)
|
||||||
|
"""
|
||||||
|
|
||||||
|
name = "my_awesome_plugin" # Unique plugin name
|
||||||
|
interval = 60 # Seconds between collections for a MonitorPlugin; use 0 for an InfoPlugin
|
||||||
|
|
||||||
|
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
||||||
|
"""Initialize the plugin with optional configuration."""
|
||||||
|
super().__init__(config)
|
||||||
|
|
||||||
|
# Extract configuration options
|
||||||
|
self.option1 = self.config.get('option1', 'default_value')
|
||||||
|
self.option2 = self.config.get('option2', True)
|
||||||
|
|
||||||
|
# Check dependencies
|
||||||
|
if psutil is None:
|
||||||
|
raise ImportError("psutil is required for my_awesome_plugin")
|
||||||
|
|
||||||
|
async def initialize(self):
|
||||||
|
"""
|
||||||
|
Initialize the plugin.
|
||||||
|
|
||||||
|
This is called once when the plugin is loaded.
|
||||||
|
Use this to verify dependencies, establish connections, etc.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if initialization successful, False otherwise
|
||||||
|
"""
|
||||||
|
logger.info(f"My awesome plugin initialized (option1: {self.option1})")
|
||||||
|
return True
|
||||||
|
|
||||||
|
async def collect(self) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Collect data.
|
||||||
|
|
||||||
|
This is called periodically (MonitorPlugin) or once (InfoPlugin).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dictionary of collected data (will be sent to server)
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
data = await self._collect_metrics()
|
||||||
|
logger.debug(f"Collected {len(data)} metrics")
|
||||||
|
return data
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error collecting data: {e}")
|
||||||
|
return {"error": str(e)}
|
||||||
|
|
||||||
|
async def _collect_metrics(self) -> Dict[str, Any]:
|
||||||
|
"""Internal method to collect actual metrics."""
|
||||||
|
metrics = {}
|
||||||
|
|
||||||
|
# Collect your data here
|
||||||
|
metrics['metric1'] = self._get_metric1()
|
||||||
|
metrics['metric2'] = self._get_metric2()
|
||||||
|
|
||||||
|
return metrics
|
||||||
|
|
||||||
|
def _get_metric1(self):
|
||||||
|
"""Helper method for metric collection."""
|
||||||
|
# Implementation here
|
||||||
|
return 42
|
||||||
|
|
||||||
|
def _get_metric2(self):
|
||||||
|
"""Helper method for metric collection."""
|
||||||
|
# Implementation here
|
||||||
|
return "hello"
|
||||||
|
|
||||||
|
async def cleanup(self):
|
||||||
|
"""
|
||||||
|
Cleanup resources.
|
||||||
|
|
||||||
|
This is called when the plugin is unloaded or the client shuts down.
|
||||||
|
Use this to close connections, release resources, etc.
|
||||||
|
"""
|
||||||
|
logger.info("My awesome plugin cleanup")
|
||||||
|
|
||||||
|
|
||||||
|
# Export the plugin class for automatic discovery
|
||||||
|
plugin = MyAwesomePlugin
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 3: Test Your Plugin
|
||||||
|
|
||||||
|
Create a test script to verify your plugin works:
|
||||||
|
|
||||||
|
```python
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
import asyncio
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Add this script's directory to the import path
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent))
|
||||||
|
|
||||||
|
from hbd.plugins.my_awesome_plugin import MyAwesomePlugin
|
||||||
|
|
||||||
|
async def test():
|
||||||
|
# Create plugin instance
|
||||||
|
plugin = MyAwesomePlugin({'option1': 'test_value'})
|
||||||
|
|
||||||
|
# Initialize
|
||||||
|
if not await plugin.initialize():
|
||||||
|
print("Failed to initialize")
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Collect data
|
||||||
|
data = await plugin.collect()
|
||||||
|
print(f"Collected data: {data}")
|
||||||
|
|
||||||
|
# Cleanup
|
||||||
|
await plugin.cleanup()
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
success = asyncio.run(test())
|
||||||
|
sys.exit(0 if success else 1)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Plugin Lifecycle
|
||||||
|
|
||||||
|
Understanding the plugin lifecycle helps you implement plugins correctly:
|
||||||
|
|
||||||
|
```
|
||||||
|
1. Plugin Discovery
|
||||||
|
└─> Loader scans hbd/plugins/ directory
|
||||||
|
└─> Finds Python files (except those starting with _)
|
||||||
|
└─> Imports modules
|
||||||
|
|
||||||
|
2. Plugin Instantiation
|
||||||
|
└─> Creates instance with configuration
|
||||||
|
└─> __init__() is called
|
||||||
|
|
||||||
|
3. Plugin Initialization
|
||||||
|
└─> initialize() is called
|
||||||
|
└─> Plugin verifies dependencies, establishes connections
|
||||||
|
└─> Returns True/False for success/failure
|
||||||
|
|
||||||
|
4. Plugin Registration
|
||||||
|
└─> If initialization succeeds, plugin is registered
|
||||||
|
└─> Plugin becomes active
|
||||||
|
|
||||||
|
5. Data Collection
|
||||||
|
└─> For InfoPlugin: collect() called once after initialization
|
||||||
|
└─> For MonitorPlugin: collect() called periodically based on interval
|
||||||
|
└─> Data is sent to server via PLG message
|
||||||
|
|
||||||
|
6. Plugin Shutdown
|
||||||
|
└─> cleanup() is called
|
||||||
|
└─> Plugin releases resources, closes connections
|
||||||
|
```
|
||||||
|
|
||||||
|
## Server-initiated InfoPlugin refresh
|
||||||
|
|
||||||
|
When a heartbeat packet arrives from a host the server has no plugin data for (e.g. after a server restart), the server sets `request_update = 1` in the ACK reply. The client detects this flag and immediately re-runs all InfoPlugins — clearing their cached results first — then resends the data as PLG messages.
|
||||||
|
|
||||||
|
This means InfoPlugin data will always reach the server as soon as possible without requiring a client restart. No action is needed from plugin authors: the framework handles cache invalidation and re-collection automatically.
|
||||||
|
|
||||||
|
The lifecycle for this case looks like:
|
||||||
|
|
||||||
|
```
|
||||||
|
Server restarts, host reconnects
|
||||||
|
└─> hbd receives HTB with no existing plugin_data for host
|
||||||
|
└─> hbd sets request_update=1 in ACK
|
||||||
|
|
||||||
|
Client receives ACK
|
||||||
|
└─> Detects request_update flag
|
||||||
|
└─> Clears _cache on every registered InfoPlugin
|
||||||
|
└─> Calls collect() on each InfoPlugin
|
||||||
|
└─> Sends fresh PLG messages to server
|
||||||
|
```
|
||||||
|
|
||||||
|
If you write an `InfoPlugin` with side effects in `_collect_info()` (opening connections, writing files, etc.), be aware it may be called more than once per client session when this mechanism triggers.
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
### Plugin-Specific Configuration
|
||||||
|
|
||||||
|
Plugins receive configuration through the `config` parameter in `__init__`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
||||||
|
super().__init__(config)
|
||||||
|
|
||||||
|
# Access configuration with defaults
|
||||||
|
self.interval = self.config.get('interval', 60)
|
||||||
|
self.threshold = self.config.get('threshold', 80)
|
||||||
|
self.enabled_features = self.config.get('features', ['feature1', 'feature2'])
|
||||||
|
```
|
||||||
|
|
||||||
|
### Client Configuration File
|
||||||
|
|
||||||
|
Users configure plugins in the client configuration YAML:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
plugins:
|
||||||
|
my_awesome_plugin:
|
||||||
|
enabled: true
|
||||||
|
interval: 120
|
||||||
|
option1: custom_value
|
||||||
|
option2: false
|
||||||
|
```
|
||||||
|
|
||||||
|
## Best Practices
|
||||||
|
|
||||||
|
### 1. Error Handling
|
||||||
|
|
||||||
|
Always handle errors gracefully:
|
||||||
|
|
||||||
|
```python
|
||||||
|
async def collect(self) -> Dict[str, Any]:
|
||||||
|
try:
|
||||||
|
return await self._collect_metrics()
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error collecting metrics: {e}")
|
||||||
|
return {"error": str(e)}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Logging
|
||||||
|
|
||||||
|
Use appropriate log levels:
|
||||||
|
|
||||||
|
```python
|
||||||
|
logger.debug("Detailed information for debugging")
|
||||||
|
logger.info("Normal operation messages")
|
||||||
|
logger.warning("Warning messages for unusual but handled situations")
|
||||||
|
logger.error("Error messages for failures")
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Dependencies
|
||||||
|
|
||||||
|
Check for optional dependencies:
|
||||||
|
|
||||||
|
```python
|
||||||
|
try:
|
||||||
|
import some_optional_library
|
||||||
|
except ImportError:
|
||||||
|
some_optional_library = None
|
||||||
|
|
||||||
|
# Later in __init__:
|
||||||
|
if some_optional_library is None:
|
||||||
|
raise ImportError("some_optional_library is required")
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Performance
|
||||||
|
|
||||||
|
- Keep collection methods fast (< 1 second)
|
||||||
|
- Use async/await for I/O operations
|
||||||
|
- Cache expensive computations
|
||||||
|
- Don't block the event loop
|
||||||
|
|
||||||
|
### 5. Data Structure
|
||||||
|
|
||||||
|
Return clean, structured data:
|
||||||
|
|
||||||
|
```python
|
||||||
|
{
|
||||||
|
'metric_name': value,
|
||||||
|
'nested_data': {
|
||||||
|
'sub_metric': value
|
||||||
|
},
|
||||||
|
'list_data': [item1, item2],
|
||||||
|
'timestamp': time.time() # Optional timestamp
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 6. Documentation
|
||||||
|
|
||||||
|
Document your plugin thoroughly:
|
||||||
|
|
||||||
|
- Class docstring with description and configuration
|
||||||
|
- Method docstrings explaining purpose and return values
|
||||||
|
- Inline comments for complex logic
|
||||||
|
|
||||||
|
## Examples
|
||||||
|
|
||||||
|
### Example 1: Simple InfoPlugin
|
||||||
|
|
||||||
|
```python
|
||||||
|
from hbd.plugin import InfoPlugin
|
||||||
|
import platform
|
||||||
|
|
||||||
|
class SimpleInfoPlugin(InfoPlugin):
|
||||||
|
"""Collect basic system information."""
|
||||||
|
|
||||||
|
name = "simple_info"
|
||||||
|
interval = 0 # InfoPlugin
|
||||||
|
|
||||||
|
async def initialize(self):
|
||||||
|
return True
|
||||||
|
|
||||||
|
async def collect(self) -> Dict[str, Any]:
|
||||||
|
return {
|
||||||
|
'hostname': platform.node(),
|
||||||
|
'system': platform.system(),
|
||||||
|
'python_version': platform.python_version()
|
||||||
|
}
|
||||||
|
|
||||||
|
async def cleanup(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
plugin = SimpleInfoPlugin
|
||||||
|
```
|
||||||
|
|
||||||
|
### Example 2: MonitorPlugin with State
|
||||||
|
|
||||||
|
```python
|
||||||
|
from hbd.plugin import MonitorPlugin
|
||||||
|
import time
|
||||||
|
|
||||||
|
class CounterPlugin(MonitorPlugin):
|
||||||
|
"""Track a counter over time."""
|
||||||
|
|
||||||
|
name = "counter"
|
||||||
|
interval = 30
|
||||||
|
|
||||||
|
def __init__(self, config=None):
|
||||||
|
super().__init__(config)
|
||||||
|
self._counter = 0
|
||||||
|
self._start_time = time.time()
|
||||||
|
|
||||||
|
async def initialize(self):
|
||||||
|
return True
|
||||||
|
|
||||||
|
async def collect(self) -> Dict[str, Any]:
|
||||||
|
self._counter += 1
|
||||||
|
uptime = time.time() - self._start_time
|
||||||
|
|
||||||
|
return {
|
||||||
|
'count': self._counter,
|
||||||
|
'uptime': uptime,
|
||||||
|
'rate': self._counter / uptime
|
||||||
|
}
|
||||||
|
|
||||||
|
async def cleanup(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
plugin = CounterPlugin
|
||||||
|
```
|
||||||
|
|
||||||
|
### Example 3: Plugin with External Command
|
||||||
|
|
||||||
|
```python
|
||||||
|
from hbd.plugin import MonitorPlugin
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
class CommandPlugin(MonitorPlugin):
|
||||||
|
"""Execute external command and capture output."""
|
||||||
|
|
||||||
|
name = "command_executor"
|
||||||
|
interval = 60
|
||||||
|
|
||||||
|
def __init__(self, config=None):
|
||||||
|
super().__init__(config)
|
||||||
|
self.command = self.config.get('command', 'echo "no command"')
|
||||||
|
|
||||||
|
async def initialize(self):
|
||||||
|
return True
|
||||||
|
|
||||||
|
async def collect(self) -> Dict[str, Any]:
|
||||||
|
try:
|
||||||
|
process = await asyncio.create_subprocess_shell(
|
||||||
|
self.command,
|
||||||
|
stdout=asyncio.subprocess.PIPE,
|
||||||
|
stderr=asyncio.subprocess.PIPE
|
||||||
|
)
|
||||||
|
stdout, stderr = await asyncio.wait_for(
|
||||||
|
process.communicate(),
|
||||||
|
timeout=30
|
||||||
|
)
|
||||||
|
|
||||||
|
return {
|
||||||
|
'exit_code': process.returncode,
|
||||||
|
'stdout': stdout.decode('utf-8'),
|
||||||
|
'stderr': stderr.decode('utf-8')
|
||||||
|
}
|
||||||
|
except Exception as e:
|
||||||
|
return {'error': str(e)}
|
||||||
|
|
||||||
|
async def cleanup(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
plugin = CommandPlugin
|
||||||
|
```
|
||||||
|
|
||||||
|
## Testing
|
||||||
|
|
||||||
|
### Unit Testing
|
||||||
|
|
||||||
|
Create unit tests for your plugins:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import unittest
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
class TestMyPlugin(unittest.TestCase):
|
||||||
|
def setUp(self):
|
||||||
|
self.plugin = MyAwesomePlugin({'option1': 'test'})
|
||||||
|
|
||||||
|
def test_initialization(self):
|
||||||
|
result = asyncio.run(self.plugin.initialize())
|
||||||
|
self.assertTrue(result)
|
||||||
|
|
||||||
|
def test_collection(self):
|
||||||
|
asyncio.run(self.plugin.initialize())
|
||||||
|
data = asyncio.run(self.plugin.collect())
|
||||||
|
|
||||||
|
self.assertIsInstance(data, dict)
|
||||||
|
self.assertIn('metric1', data)
|
||||||
|
self.assertGreater(data['metric1'], 0)
|
||||||
|
|
||||||
|
def tearDown(self):
|
||||||
|
asyncio.run(self.plugin.cleanup())
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
unittest.main()
|
||||||
|
```
|
||||||
|
|
||||||
|
### Integration Testing
|
||||||
|
|
||||||
|
Test your plugin with the actual client:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Create test configuration
|
||||||
|
cat > test_config.yaml <<EOF
|
||||||
|
server: localhost
|
||||||
|
plugins:
|
||||||
|
my_awesome_plugin:
|
||||||
|
enabled: true
|
||||||
|
interval: 10
|
||||||
|
option1: test_value
|
||||||
|
EOF
|
||||||
|
|
||||||
|
# Run client in test mode
|
||||||
|
python -m hbd.hbc -c test_config.yaml --verbose
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### My plugin isn't loading
|
||||||
|
|
||||||
|
1. Check that the filename doesn't start with an underscore
|
||||||
|
2. Verify plugin class inherits from InfoPlugin or MonitorPlugin
|
||||||
|
3. Check `initialize()` returns True
|
||||||
|
4. Look for import errors in logs
|
||||||
|
|
||||||
|
### Plugin loads but doesn't collect data
|
||||||
|
|
||||||
|
1. Check `interval` is set correctly (0 for InfoPlugin, > 0 for MonitorPlugin)
|
||||||
|
2. Verify `collect()` returns a dictionary
|
||||||
|
3. Check for exceptions in `collect()` method
|
||||||
|
4. Enable DEBUG logging to see detailed errors
|
||||||
|
|
||||||
|
### Data isn't appearing on server
|
||||||
|
|
||||||
|
1. Verify client is connected to server
|
||||||
|
2. Check server logs for PLG message handling
|
||||||
|
3. Verify returned data is JSON-serializable
|
||||||
|
4. Check for large data sizes (may exceed UDP packet size)
|
||||||
|
|
||||||
|
## Further Reading
|
||||||
|
|
||||||
|
- [Plugin Framework Source](../hbd/plugin.py) - Core plugin implementation
|
||||||
|
- [Built-in Plugins](../hbd/plugins/) - Examples of working plugins
|
||||||
|
- [Nagios Integration](NAGIOS_INTEGRATION.md) - Running external plugins
|
||||||
|
- [Configuration Guide](../hbd/config_example.yaml) - Full configuration reference
|
||||||
File diff suppressed because it is too large
Load Diff
+286
@@ -0,0 +1,286 @@
|
|||||||
|
# User Management
|
||||||
|
|
||||||
|
Heartbeat supports optional user accounts with role-based access control per host. When no users are configured the server runs in **unauthenticated mode** — all existing behaviour is unchanged.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Users are defined in the server config file. Each host can have an **owner**, zero or more **managers**, and zero or more **monitors**. A **default owner** catches any host that does not name an explicit owner.
|
||||||
|
|
||||||
|
### Roles
|
||||||
|
|
||||||
|
| Role | Inherits | Permissions |
|
||||||
|
|------|----------|-------------|
|
||||||
|
| **monitor** | — | View host status, plugin data, alerts; acknowledge alerts they were notified for |
|
||||||
|
| **manager** | monitor | + Queue commands (`/c`), trigger DNS re-registration (`/n`), queue upgrades (`/u`); add/remove monitors |
|
||||||
|
| **owner** | manager | + Drop host (`/d`); add/remove managers; transfer ownership; update host access |
|
||||||
|
| **admin** *(flag)* | owner on all hosts | Full access to every host and the user list |
|
||||||
|
|
||||||
|
`admin` is a flag on the user, not a per-host role. An admin user has owner-level access on every host without being listed as owner/manager/monitor.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
### Defining users
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
users:
|
||||||
|
andreas:
|
||||||
|
full_name: Andreas Wrede
|
||||||
|
avatar: /path/to/avatar.png # file path, URL, or base64 data URI (optional)
|
||||||
|
password: pbkdf2:sha256:... # generated with: hbd passwd andreas
|
||||||
|
admin: true # optional — grants server-wide owner access
|
||||||
|
|
||||||
|
bob:
|
||||||
|
full_name: Bob Smith
|
||||||
|
password: pbkdf2:sha256:...
|
||||||
|
notification_channels: [pushover_standard] # channels bob has selected
|
||||||
|
|
||||||
|
carol:
|
||||||
|
full_name: Carol Jones
|
||||||
|
password: pbkdf2:sha256:...
|
||||||
|
|
||||||
|
default_owner: andreas # owns hosts with no explicit owner
|
||||||
|
# falls back to the first admin user if omitted
|
||||||
|
```
|
||||||
|
|
||||||
|
### Client-declared host ownership
|
||||||
|
|
||||||
|
A host can declare its own owner directly in the hbc or hbc_mini client configuration. This is useful for hosts that are not listed in the server config, or during initial setup before a server-side config entry has been created.
|
||||||
|
|
||||||
|
**`~/.hbc.yaml`** (hbc):
|
||||||
|
```yaml
|
||||||
|
owner: andreas
|
||||||
|
```
|
||||||
|
|
||||||
|
**`~/.hbc.json`** (hbc_mini):
|
||||||
|
```json
|
||||||
|
{ "owner": "andreas" }
|
||||||
|
```
|
||||||
|
|
||||||
|
When set, the value is included in the `os_info` plugin data sent to the server. The server applies it as `host.owner` the first time `os_info` arrives, provided no owner has been configured server-side for that host. Server-configured ownership always takes precedence.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Assigning roles to hosts
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
hosts:
|
||||||
|
webserver01:
|
||||||
|
owner: andreas
|
||||||
|
managers: [bob]
|
||||||
|
monitors: [carol]
|
||||||
|
threshold_config: default
|
||||||
|
watch: true
|
||||||
|
notification_channels: [pushover_standard]
|
||||||
|
|
||||||
|
unattended-host: # no owner → owned by default_owner
|
||||||
|
threshold_config: default
|
||||||
|
watch: true
|
||||||
|
```
|
||||||
|
|
||||||
|
### Generating a password hash
|
||||||
|
|
||||||
|
```bash
|
||||||
|
hbd passwd andreas
|
||||||
|
```
|
||||||
|
|
||||||
|
Enter and confirm the password when prompted. Paste the printed hash into the config file under the user's `password` key.
|
||||||
|
|
||||||
|
You can also generate a hash non-interactively from Python:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from hbd.server.users import hash_password
|
||||||
|
print(hash_password("mysecret"))
|
||||||
|
```
|
||||||
|
|
||||||
|
Passwords are stored as PBKDF2-HMAC-SHA256 hashes (260 000 iterations). No third-party libraries are required — only Python's standard `hashlib`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Authentication
|
||||||
|
|
||||||
|
When at least one user is defined, every request must be authenticated. Unauthenticated requests to HTML pages are redirected to `/login`; unauthenticated API requests receive `401 Unauthorized`.
|
||||||
|
|
||||||
|
### Browser login
|
||||||
|
|
||||||
|
Navigate to any page — you will be redirected to `/login` automatically. After submitting valid credentials the server sets an `hbd_session` cookie (HttpOnly, SameSite=Lax, 24 h lifetime). All subsequent requests, including JavaScript `fetch()` calls on the dashboards, carry the cookie automatically.
|
||||||
|
|
||||||
|
To log out, visit `/logout`.
|
||||||
|
|
||||||
|
### API / programmatic login
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Log in and capture the token
|
||||||
|
TOKEN=$(curl -s -X POST http://localhost:50004/api/0/auth/login \
|
||||||
|
-H 'Content-Type: application/json' \
|
||||||
|
-d '{"username":"andreas","password":"mysecret"}' | jq -r .token)
|
||||||
|
|
||||||
|
# Use the token in subsequent requests
|
||||||
|
curl -H "Authorization: Bearer $TOKEN" http://localhost:50004/api/0/hosts
|
||||||
|
```
|
||||||
|
|
||||||
|
The token is identical to the session cookie value — both mechanisms work simultaneously.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Log out
|
||||||
|
curl -s -X POST http://localhost:50004/api/0/auth/logout \
|
||||||
|
-H "Authorization: Bearer $TOKEN"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## API Endpoints
|
||||||
|
|
||||||
|
### Authentication
|
||||||
|
|
||||||
|
#### POST /api/0/auth/login
|
||||||
|
Obtain a session token.
|
||||||
|
|
||||||
|
**Request body:**
|
||||||
|
```json
|
||||||
|
{ "username": "andreas", "password": "mysecret" }
|
||||||
|
```
|
||||||
|
|
||||||
|
**Response:**
|
||||||
|
```json
|
||||||
|
{ "token": "<opaque-hex-token>", "username": "andreas" }
|
||||||
|
```
|
||||||
|
Also sets the `hbd_session` cookie for browser clients.
|
||||||
|
|
||||||
|
**Status codes:** `200 OK`, `401 Unauthorized`, `404` (auth not configured)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### POST /api/0/auth/logout
|
||||||
|
Invalidate the current session.
|
||||||
|
|
||||||
|
**Headers:** `Authorization: Bearer <token>` or cookie
|
||||||
|
|
||||||
|
**Response:** `{ "success": true }`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Users
|
||||||
|
|
||||||
|
#### GET /api/0/users
|
||||||
|
List all users. **Admin only.**
|
||||||
|
|
||||||
|
**Response:**
|
||||||
|
```json
|
||||||
|
[
|
||||||
|
{ "username": "andreas", "full_name": "Andreas Wrede", "avatar": "", "admin": true, "notification_channels": [] },
|
||||||
|
{ "username": "bob", "full_name": "Bob Smith", "avatar": "", "admin": false, "notification_channels": ["pushover_standard"] }
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### GET /api/0/users/me
|
||||||
|
Return the currently authenticated user's profile.
|
||||||
|
|
||||||
|
**Response:**
|
||||||
|
```json
|
||||||
|
{ "username": "carol", "full_name": "Carol Jones", "avatar": "", "admin": false, "notification_channels": [] }
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### PUT /api/0/users/me
|
||||||
|
Update the current user's profile. All fields are optional — send only what you want to change.
|
||||||
|
|
||||||
|
**Update display name and avatar:**
|
||||||
|
```json
|
||||||
|
{ "full_name": "Carol Jones", "avatar": "/avatars/carol.png" }
|
||||||
|
```
|
||||||
|
|
||||||
|
**Change notification channel selection:**
|
||||||
|
```json
|
||||||
|
{ "notification_channels": ["pushover_ops", "email_ops"] }
|
||||||
|
```
|
||||||
|
Only channels visible to the user (public + own private) are accepted; others are silently dropped.
|
||||||
|
|
||||||
|
**Change password:**
|
||||||
|
```json
|
||||||
|
{ "password": { "current": "oldpass", "new": "newpass" } }
|
||||||
|
```
|
||||||
|
Requires the correct current password. New password is hashed before storage.
|
||||||
|
|
||||||
|
**Response:** `{"ok": true}`
|
||||||
|
|
||||||
|
**Status codes:** `200 OK`, `400` (missing/invalid field), `401` (unauthenticated), `403` (wrong current password)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Host Access
|
||||||
|
|
||||||
|
#### GET /api/0/hosts/{hostname}/access
|
||||||
|
Return owner/managers/monitors for a host. Requires at least **monitor** role.
|
||||||
|
|
||||||
|
**Response:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"owner": "andreas",
|
||||||
|
"managers": ["bob"],
|
||||||
|
"monitors": ["carol"]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### PUT /api/0/hosts/{hostname}/access
|
||||||
|
Update owner/managers/monitors. Requires **owner** role or admin.
|
||||||
|
|
||||||
|
**Request body** (all fields optional):
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"owner": "bob",
|
||||||
|
"managers": ["carol"],
|
||||||
|
"monitors": []
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Changes take effect immediately, but only in memory. They are not written back to the config file, so a reload (`SIGHUP`) re-applies the values from the config file. To make changes permanent, update the config file.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Host visibility
|
||||||
|
|
||||||
|
When users are configured, `GET /api/0/hosts` only returns hosts the authenticated user has at least monitor access to. Admins see all hosts.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Config reload
|
||||||
|
|
||||||
|
On `SIGHUP`, the server reloads the config file, reloads the user registry, and re-applies `owner`/`managers`/`monitors` from the config to all known hosts. Existing sessions remain valid after a reload.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## No-auth mode
|
||||||
|
|
||||||
|
If `users:` is absent or empty, the server starts in **unauthenticated mode**:
|
||||||
|
|
||||||
|
- No login required — all pages and API endpoints are accessible without credentials.
|
||||||
|
- All permission checks pass unconditionally.
|
||||||
|
- `/login`, `/logout`, and the auth/user API endpoints return `404`.
|
||||||
|
|
||||||
|
This preserves full backwards compatibility with existing deployments.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Security notes
|
||||||
|
|
||||||
|
- Session tokens are 64-character cryptographically random hex strings (`secrets.token_hex(32)`).
|
||||||
|
- Sessions expire after 24 hours (configurable via `users_mod.SESSION_TTL`).
|
||||||
|
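- Cookies are `HttpOnly` and `SameSite=Lax` — they are not accessible to JavaScript and are not sent on cross-site subrequests (images, frames, `fetch()`, form POSTs). With `Lax` they are still sent on top-level cross-site navigations that use a safe method such as `GET`.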
|
||||||
|
- The HTTP API does not yet enforce TLS. For production use, place hbd behind a TLS-terminating reverse proxy (nginx, Caddy, etc.) or enable WSS.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## See Also
|
||||||
|
|
||||||
|
- [HTTP API Documentation](HTTP_API.md)
|
||||||
|
- [Notifications](NOTIFICATIONS.md)
|
||||||
|
- Configuration example: `hbd/config_example.yaml`
|
||||||
+14
-8
@@ -1,11 +1,17 @@
|
|||||||
"""hbd package - scaffolding for heartbeat daemon
|
"""hbd package - heartbeat monitoring system
|
||||||
|
|
||||||
This package contains the refactored modules for the original monolithic
|
This package contains both the heartbeat client (hbc) and server (hbd) components,
|
||||||
`hbd` script. The initial implementation contains small scaffolds so you can
|
organized into separate subpackages:
|
||||||
start moving functionality into the package.
|
|
||||||
|
- hbd.client: Client component with system monitoring plugins
|
||||||
|
- hbd.server: Server/daemon component with web UI and notifications
|
||||||
|
- hbd.common: Shared utilities and protocol definitions
|
||||||
|
|
||||||
|
Install options:
|
||||||
|
- pip install hbd[client] # Client only
|
||||||
|
- pip install hbd[server] # Server only
|
||||||
|
- pip install hbd[all] # Both client and server
|
||||||
"""
|
"""
|
||||||
|
|
||||||
__all__ = ["main", "__version__"]
|
__all__ = ["__version__"]
|
||||||
__version__ = "5.0"
|
__version__ = "5.3.8"
|
||||||
|
|
||||||
from .cli import main
|
|
||||||
|
|||||||
-45
@@ -1,45 +0,0 @@
|
|||||||
"""Command line interface for hbd package."""
|
|
||||||
import argparse
|
|
||||||
|
|
||||||
from .config import load_config
|
|
||||||
from .server import run as run_server
|
|
||||||
|
|
||||||
PUSHSRVS = ["all", "pushover", "mattermost"]
|
|
||||||
|
|
||||||
|
|
||||||
def build_parser():
    """Create the argument parser for the ``hbd`` command line.

    Returns:
        An ``argparse.ArgumentParser`` with the config, foreground, verbose,
        push-service and debug options registered.
    """
    parser = argparse.ArgumentParser(
        prog="hbd",
        description="HeartBeatDaemon - Wait for heartbeat messages and act on them (or their absence)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # (flags, keyword arguments) for every option, registered in this order.
    option_table = (
        (("-c", "--config"), {"dest": "configfile", "help": "Config file path (YAML)"}),
        (("-f", "--foreground"), {"action": "store_true", "help": "Run in foreground"}),
        (("-v", "--verbose"), {"action": "store_true", "help": "Verbose output"}),
        (("-p", "--pushsrv"), {"dest": "pushsrv", "choices": PUSHSRVS, "help": "Push service to use"}),
        (("-x", "--debug"), {"action": "count", "default": 0, "help": "Increase debug level"}),
    )
    for flags, options in option_table:
        parser.add_argument(*flags, **options)

    return parser
|
|
||||||
|
|
||||||
|
|
||||||
def main(argv=None):
    """Parse the command line, load the configuration and start the server.

    Args:
        argv: Argument list to parse; ``None`` lets argparse use ``sys.argv``.
    """
    args = build_parser().parse_args(argv)
    config = load_config(args.configfile)

    # Command-line switches override whatever the config file says.
    for switch in ("foreground", "verbose"):
        if getattr(args, switch):
            config[switch] = True
    if args.pushsrv:
        config["pushsrv"] = args.pushsrv
    if args.debug:
        # Each -x adds one level on top of the configured debug level.
        config["debug"] = config.setdefault("debug", 0) + args.debug

    run_server(config)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
"""HeartBeat Client (hbc) - System monitoring client."""
|
||||||
|
|
||||||
|
from hbd import __version__
|
||||||
@@ -0,0 +1,61 @@
|
|||||||
|
"""Configuration loader and defaults for hbc (HeartBeat Client)."""
|
||||||
|
|
||||||
|
import copy
import logging
import os

logger = logging.getLogger(__name__)

try:
    import yaml
except ImportError:
    # PyYAML is optional: without it only the built-in defaults are used.
    yaml = None

CLIENT_DEFAULTS = {
    # Network settings
    "hb_port": 50003,       # Port where hbd servers listen
    "interval": 10,         # Heartbeat interval in seconds

    # Host identity
    "owner": None,          # Optional username to set as this host's owner on the server

    # Runtime flags
    "foreground": False,
    "verbose": False,
    "debug": 0,

    # Plugin configuration
    "plugins": {},          # Per-plugin configuration
    "thresholds": {},       # Threshold configuration for monitoring
}


def load_config(path=None):
    """Load configuration from a YAML file and merge it over the client defaults.

    Top-level keys from the file replace the corresponding default; keys that
    are not in ``CLIENT_DEFAULTS`` are kept as well so plugin configuration
    and future extensions pass through untouched.

    The defaults are returned unchanged when the file does not exist, when
    PyYAML is not installed, when the file is empty, or when its top level is
    not a mapping.

    Args:
        path: Path to YAML config file (default: ~/.hbc.yaml)

    Returns:
        Dictionary with configuration. It is an independent deep copy, so
        mutating it (including the nested ``plugins`` / ``thresholds`` dicts)
        never alters ``CLIENT_DEFAULTS``.
    """
    # Deep copy: a shallow copy would share the nested default dicts between
    # every caller and the module-level defaults.
    cfg = copy.deepcopy(CLIENT_DEFAULTS)
    if not path:
        # default path (~/.hbc.yaml)
        path = os.path.join(os.path.expanduser("~"), ".hbc.yaml")

    if not os.path.exists(path):
        return cfg

    if yaml is None:
        # yaml not installed: do not attempt to parse; user must ensure defaults
        logger.warning("PyYAML not available - cannot load config from %s, using defaults", path)
        return cfg

    logger.info("Loading configuration from %s", path)
    with open(path, encoding="utf-8") as fh:
        data = yaml.safe_load(fh)

    if data is None:
        # An empty (or comment-only) file parses to None, not to {}.
        return cfg
    if not isinstance(data, dict):
        logger.warning(
            "Ignoring config %s: top level must be a mapping, got %s",
            path, type(data).__name__,
        )
        return cfg

    cfg.update(data)
    return cfg
|
||||||
@@ -0,0 +1,801 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
HeartBeat Client (hbc) - Async version with plugin support.
|
||||||
|
|
||||||
|
Sends heartbeat messages to HeartBeat Daemon (hbd) servers and collects
|
||||||
|
system information via plugins.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import signal
|
||||||
|
import socket
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from logging.handlers import SysLogHandler
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, List, Optional
|
||||||
|
|
||||||
|
# Import protocol and config
|
||||||
|
from .config import load_config
|
||||||
|
from ..common.proto import dicttos, stodict
|
||||||
|
from .. import __version__
|
||||||
|
|
||||||
|
# Import plugin system
|
||||||
|
from .plugin import PluginRegistry, PluginLoader, InfoPlugin, MonitorPlugin
|
||||||
|
|
||||||
|
# Constants
|
||||||
|
PORT = 50003
|
||||||
|
INTERVAL = 10
|
||||||
|
MAXRECV = 32767
|
||||||
|
|
||||||
|
# Global state
|
||||||
|
running = True
|
||||||
|
dorestart = False
|
||||||
|
shutdown_event: Optional[asyncio.Event] = None
|
||||||
|
active_tasks: List[asyncio.Task] = []
|
||||||
|
|
||||||
|
|
||||||
|
class AsyncConnection:
    """Async UDP connection to one heartbeat server address.

    Owns the asyncio datagram transport for a single (address, port) pair and
    keeps the per-connection bookkeeping: send/ACK counters, the timestamps of
    the last send and last ACK, and a short history of round-trip times.
    """

    def __init__(self, conn_id: int, addr: str, port: int, af: int, name: str):
        """Record the connection parameters and initialise all counters.

        Args:
            conn_id: Value sent as the ``id`` field of every message.
            addr: Address datagrams are sent to.
            port: UDP port datagrams are sent to.
            af: Address family passed to ``create_datagram_endpoint``.
            name: Host name; its short form is sent as the ``name`` field.
        """
        self.conn_id = conn_id
        self.addr = addr
        self.port = port
        self.af = af
        self.name = name

        self.ackcount = 0
        self.lastack = 0.0
        self.send_count = 0
        self.lastsend = 0.0
        self.rtts = [0.0]  # seeded so rtts[-1] is valid before the first ACK

        self.transport: Optional[asyncio.DatagramTransport] = None
        self.protocol: Optional[asyncio.DatagramProtocol] = None
        self._dead = False
        self._ever_opened = False
        self._open_fail_count = 0  # consecutive failures before first success
        self.request_info_event: asyncio.Event = asyncio.Event()

        self.logger = logging.getLogger(f"hbc.conn.{addr}")

    async def open(self) -> bool:
        """Open the UDP connection.

        Returns:
            True if successful, False otherwise
        """
        try:
            # get_running_loop() is the documented way to obtain the loop from
            # inside a coroutine; get_event_loop() has policy-dependent
            # behaviour and is discouraged for this use.
            loop = asyncio.get_running_loop()

            # Create datagram endpoint
            self.transport, self.protocol = await loop.create_datagram_endpoint(
                lambda: HeartbeatProtocol(self),
                family=self.af,
            )
            self._ever_opened = True
            self.logger.debug(f"Opened connection to {self.addr}:{self.port}")
            return True
        except Exception as e:
            self.logger.error(f"Failed to open connection: {e}")
            return False

    def close(self):
        """Close the connection and forget the transport/protocol pair."""
        if self.transport:
            self.transport.close()
            self.transport = None
            self.protocol = None

    async def sendto(self, msg: dict, msg_id: str = "HTB"):
        """Send a message to the server.

        The ``name``, ``id`` and ``time`` fields are written into ``msg``
        itself before it is encoded, so the caller's dictionary is modified.
        Nothing is sent when the connection is marked dead or when the
        transport cannot be opened.

        Args:
            msg: Message dictionary
            msg_id: Message ID (HTB, PLG, etc.)
        """
        if self._dead:
            return

        # Lazily (re)open: close() drops the transport after a protocol error.
        if not self.transport:
            await self.open()

        if not self.transport:
            self.logger.error("Cannot send - no transport")
            return

        # Add standard fields
        msg["name"] = shortname(self.name)
        msg["id"] = self.conn_id
        msg["time"] = time.time()

        # Encode message
        data = dicttos(msg_id, msg)

        # Send
        self.transport.sendto(data, (self.addr, self.port))
        self.send_count += 1
        self.lastsend = time.time()

        self.logger.debug(f"Sent {msg_id} message ({len(data)} bytes)")

    def handle_ack(self, msg: dict, now: float):
        """Handle ACK message from server.

        RTT is calculated as ``now - lastsend``. ``lastsend`` is updated by
        every ``sendto()`` call regardless of message ID, so the value is
        measured against the most recent datagram sent on this connection.

        Args:
            msg: Decoded ACK message; a truthy ``request_update`` field asks
                for the info plugins to be re-sent.
            now: Time the ACK was received, in seconds.
        """
        self.lastack = now

        # Calculate RTT: time ACK received minus time of last send
        rtt = (now - self.lastsend) * 1000.0  # Convert to ms

        # Keep only the 10 most recent samples.
        self.rtts.append(rtt)
        if len(self.rtts) > 10:
            self.rtts.pop(0)

        self.ackcount += 1
        self.logger.debug(f"ACK received, RTT: {rtt:.1f}ms")
        if msg.get("request_update"):
            self.logger.info("server requested plugin info refresh")
            self.request_info_event.set()
|
||||||
|
|
||||||
|
|
||||||
|
class HeartbeatProtocol(asyncio.DatagramProtocol):
    """Protocol handler for incoming UDP messages."""

    def __init__(self, connection: "AsyncConnection"):
        """Bind the protocol to the connection whose datagrams it handles.

        Args:
            connection: Connection that receives ACKs and is closed on error.
        """
        self.connection = connection
        self.logger = logging.getLogger("hbc.protocol")
        # The event loop only keeps weak references to tasks, so a handler
        # task that nobody else references can be garbage-collected before it
        # finishes. Hold a strong reference until each task is done.
        self._background_tasks = set()

    def _spawn(self, coro):
        """Schedule *coro* as a task and keep it referenced until it completes."""
        task = asyncio.create_task(coro)
        self._background_tasks.add(task)
        task.add_done_callback(self._background_tasks.discard)
        return task

    def datagram_received(self, data: bytes, addr):
        """Decode an incoming datagram and dispatch it on its ``ID`` field.

        ``ACK`` is handled synchronously on the connection; ``CMD`` and
        ``UPD`` are handed to background tasks. Any exception is logged and
        swallowed so one bad datagram cannot break the protocol.
        """
        try:
            msg = stodict(data)
            if not msg:
                self.logger.warning(f"Failed to parse message from {addr}")
                return

            now = time.time()
            msg_id = msg.get("ID")

            if msg_id == "ACK":
                self.connection.handle_ack(msg, now)
            elif msg_id == "CMD":
                # Command from server
                self._spawn(handle_command(self.connection, msg))
            elif msg_id == "UPD":
                # Update from server
                self._spawn(handle_update(self.connection, msg))
            else:
                self.logger.warning(f"Unknown message type: {msg_id}")

        except Exception as e:
            self.logger.error(f"Error processing datagram: {e}", exc_info=True)

    def error_received(self, exc):
        """Handle protocol errors — close transport so the heartbeat sender retries."""
        self.logger.warning(f"Protocol error on {self.connection.addr}: {exc} — will retry")
        self.connection.close()
|
||||||
|
|
||||||
|
|
||||||
|
async def handle_command(conn: "AsyncConnection", msg: dict):
    """Execute a command received from server and send back its result.

    The command runs in the event loop's default executor so that a slow
    command (up to the 30 s timeout) does not block the loop and delay
    heartbeats on every connection.

    Args:
        conn: Connection the response is sent on.
        msg: Decoded CMD message; the shell command is read from ``cmd``.
            Nothing happens when it is missing or empty.
    """
    import functools
    import subprocess

    cmd = msg.get("cmd", "")
    if not cmd:
        return

    logger = logging.getLogger("hbc.command")
    logger.info(f"Executing command: {cmd}")

    # SECURITY NOTE(review): ``cmd`` comes straight from a received datagram
    # and is passed to the shell unchanged. Nothing in this function
    # authenticates the sender — confirm that the transport or the server
    # side guarantees only trusted peers can issue CMD messages.
    run_command = functools.partial(
        subprocess.check_output,
        cmd, shell=True, stderr=subprocess.STDOUT, timeout=30,
    )

    try:
        raw = await asyncio.get_running_loop().run_in_executor(None, run_command)
        # Tolerant decoding: undecodable bytes must not turn a successful
        # command into an "Error" response.
        result = raw.decode(errors="replace")
        status = "OK"
    except subprocess.CalledProcessError as e:
        result = str(e)
        status = "CalledProcessError"
    except subprocess.TimeoutExpired:
        result = "Command timed out"
        status = "Timeout"
    except Exception as e:
        result = str(e)
        status = "Error"

    # Send response
    response = {
        "service": "command",
        "msg": f"{status} {result}",
    }
    await conn.sendto(response)
|
||||||
|
|
||||||
|
|
||||||
|
async def handle_update(conn: "AsyncConnection", _msg: dict):  # pyright: ignore[reportUnusedParameter]
    """Handle self-update by running hb_install.sh.

    The installer is looked up on PATH first and then next to the running
    script. It is invoked as ``hb_install.sh client`` with a 120 s limit.
    Every outcome is reported to the server as an ``update`` service message;
    on success a restart is requested and the client is stopped.

    Args:
        conn: Connection the status message is sent on.
        _msg: Decoded UPD message (unused).
    """
    import shutil

    global dorestart

    logger = logging.getLogger("hbc.update")

    async def report_failure(error: str) -> None:
        """Log *error* and send it to the server as the update result."""
        logger.error(error)
        await conn.sendto({"service": "update", "msg": error})

    installer = shutil.which("hb_install.sh")
    if installer is None:
        candidate = Path(sys.argv[0]).parent / "hb_install.sh"
        if candidate.exists():
            installer = str(candidate)

    if installer is None:
        await report_failure("hb_install.sh not found in PATH or alongside hbc")
        return

    logger.info(f"Running installer: {installer}")
    try:
        proc = await asyncio.create_subprocess_exec(
            installer, "client",
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.STDOUT,
        )
    except Exception as e:
        await report_failure(f"Installer failed: {e}")
        return

    try:
        out, _ = await asyncio.wait_for(proc.communicate(), timeout=120)
    except asyncio.TimeoutError:
        # wait_for() only cancels communicate(); the child keeps running
        # unless it is killed explicitly. Kill and reap it so a timed-out
        # installer cannot overlap with a later update attempt.
        try:
            proc.kill()
        except ProcessLookupError:
            pass  # already exited between the timeout and the kill
        await proc.wait()
        await report_failure("Installer timed out")
        return
    except Exception as e:
        await report_failure(f"Installer failed: {e}")
        return

    if proc.returncode != 0:
        # Tolerant decoding: installer output is arbitrary bytes.
        output = out.decode(errors="replace").strip()
        await report_failure(f"Installer exited {proc.returncode}: {output}")
        return

    logger.info("Update successful, restart required")
    await conn.sendto({"service": "update", "msg": "OK"})

    # Trigger restart
    dorestart = True
    stop()
|
||||||
|
|
||||||
|
|
||||||
|
async def heartbeat_sender(conn: AsyncConnection, interval: int):
    """Send an HTB message on *conn* every *interval* seconds until shutdown.

    When the transport is not open the loop first tries to open it. A failed
    open is retried after one interval, with one exception: an IPv6
    connection that has never been opened is marked dead and abandoned after
    three consecutive failures, so a network without IPv6 does not keep a
    useless sender alive. IPv4 connections are retried indefinitely.

    Args:
        conn: Connection to send on
        interval: Heartbeat interval in seconds
    """
    log = logging.getLogger("hbc.heartbeat")
    ipv6_early_fail_limit = 3

    while running and not conn._dead:
        # Make sure there is a transport before trying to send.
        if not conn.transport:
            if await conn.open():
                conn._open_fail_count = 0
            else:
                conn._open_fail_count += 1
                give_up = (
                    conn.af == socket.AF_INET6
                    and not conn._ever_opened
                    and conn._open_fail_count >= ipv6_early_fail_limit
                )
                if give_up:
                    # Never came up within the first few attempts: IPv6 is
                    # most likely unavailable on this network.
                    log.warning(
                        f"IPv6 connection to {conn.addr} unreachable after "
                        f"{conn._open_fail_count} attempts, disabling"
                    )
                    conn._dead = True
                    break

                # Wait one interval (or until shutdown) and try again.
                try:
                    if not shutdown_event:
                        await asyncio.sleep(interval)
                    else:
                        await asyncio.wait_for(shutdown_event.wait(), timeout=interval)
                        break
                except asyncio.TimeoutError:
                    pass
                continue

        try:
            await conn.sendto(
                {
                    "acks": conn.ackcount,
                    "rtt": conn.rtts[-1],
                    "interval": interval
                },
                "HTB",
            )
        except asyncio.CancelledError:
            log.debug("Heartbeat sender cancelled")
            raise
        except Exception as exc:
            log.error(f"Error sending heartbeat: {exc}", exc_info=True)

        # Sleep until the next heartbeat is due; a set shutdown event ends the loop.
        try:
            if not shutdown_event:
                await asyncio.sleep(interval)
            else:
                await asyncio.wait_for(shutdown_event.wait(), timeout=interval)
                break
        except asyncio.TimeoutError:
            continue  # interval elapsed without shutdown
        except asyncio.CancelledError:
            log.debug("Heartbeat sender cancelled during sleep")
            raise
|
||||||
|
|
||||||
|
|
||||||
|
async def _info_plugin_refresh_loop(conn: AsyncConnection, info_plugins: List):
    """Re-send InfoPlugin data each time the server asks for a refresh.

    Blocks on ``conn.request_info_event``; when it fires, every plugin's
    cached result is dropped, the plugin is collected again and non-empty
    data is sent as a PLG message.

    Args:
        conn: Connection whose refresh event is watched and data is sent on
        info_plugins: Plugins to re-collect on each request
    """
    log = logging.getLogger("hbc.plugins")

    while running:
        await conn.request_info_event.wait()
        if not running:
            return
        conn.request_info_event.clear()
        log.info("refreshing InfoPlugins on server request")

        for info_plugin in info_plugins:
            # Drop the cached result so collect() gathers the data again.
            info_plugin._cache = None
            try:
                payload = await info_plugin.collect()
                if not payload:
                    continue
                await conn.sendto({"plugin": info_plugin.name, **payload}, "PLG")
                log.info(f"Resent {info_plugin.name} data")
            except Exception as exc:
                log.error(f"Error re-collecting {info_plugin.name}: {exc}", exc_info=True)
|
||||||
|
|
||||||
|
|
||||||
|
async def plugin_collector(conn: AsyncConnection, registry: PluginRegistry):
    """Send InfoPlugin data once, then run the periodic MonitorPlugin collectors.

    MonitorPlugins are grouped by their ``interval`` and each group gets its
    own collector task. A refresh watcher task for the InfoPlugins is always
    started as well. The coroutine then waits for all of those tasks.

    Args:
        conn: Connection to send on
        registry: Plugin registry
    """
    log = logging.getLogger("hbc.plugins")

    # One-off collection of the InfoPlugins at startup.
    info_plugins = registry.get_by_type(InfoPlugin)
    for info_plugin in info_plugins:
        try:
            payload = await info_plugin.collect()
            if payload:
                # PLG message: plugin name followed by the collected fields
                await conn.sendto({"plugin": info_plugin.name, **payload}, "PLG")
                log.info(f"Sent {info_plugin.name} data")
        except Exception as exc:
            log.error(f"Error collecting {info_plugin.name}: {exc}", exc_info=True)

    # Bucket the MonitorPlugins by collection interval.
    buckets = {}
    for monitor in registry.get_by_type(MonitorPlugin):
        buckets.setdefault(monitor.interval, []).append(monitor)

    # The info-refresh watcher always runs, plus one task per interval bucket.
    tasks = [asyncio.create_task(_info_plugin_refresh_loop(conn, info_plugins))]
    tasks += [
        asyncio.create_task(plugin_collector_interval(conn, group, seconds))
        for seconds, group in buckets.items()
    ]

    try:
        await asyncio.gather(*tasks, return_exceptions=True)
    except asyncio.CancelledError:
        log.debug("Plugin collector cancelled, cancelling sub-tasks")
        for pending in tasks:
            if not pending.done():
                pending.cancel()
        raise
|
||||||
|
|
||||||
|
|
||||||
|
async def plugin_collector_interval(
    conn: AsyncConnection,
    plugins: List,
    interval: int
):
    """Collect a group of plugins every *interval* seconds until shutdown.

    Each round collects every plugin in turn and sends non-empty results as
    PLG messages; a failing plugin is logged and does not stop the others.

    Args:
        conn: Connection to send on
        plugins: List of plugins to collect
        interval: Collection interval in seconds
    """
    log = logging.getLogger(f"hbc.plugins.{interval}s")

    while running:
        for monitor in plugins:
            try:
                sample = await monitor.collect()
                if sample:
                    # Message dict is built directly: plugin name + fields.
                    await conn.sendto({"plugin": monitor.name, **sample}, "PLG")
                    log.debug(f"Sent {monitor.name} data")
            except asyncio.CancelledError:
                log.debug("Plugin collector cancelled")
                raise
            except Exception as exc:
                log.error(
                    f"Error collecting {monitor.name}: {exc}",
                    exc_info=True
                )

        # Sleep until the next round; a set shutdown event ends the loop.
        try:
            if not shutdown_event:
                await asyncio.sleep(interval)
            else:
                await asyncio.wait_for(
                    shutdown_event.wait(),
                    timeout=interval
                )
                break
        except asyncio.TimeoutError:
            continue  # interval elapsed without shutdown
        except asyncio.CancelledError:
            log.debug("Plugin collector cancelled during sleep")
            raise
|
||||||
|
|
||||||
|
|
||||||
|
def shortname(name: str) -> str:
    """Return the part of *name* before its first dot (the whole name if none)."""
    head, _, _ = name.partition(".")
    return head
|
||||||
|
|
||||||
|
|
||||||
|
def stop():
    """Request shutdown: clear the run flag, wake sleepers, cancel tasks."""
    global running
    running = False

    # Tasks sleeping on the shutdown event wake up immediately.
    if shutdown_event:
        shutdown_event.set()

    # Cancel whatever is still running.
    for pending in (task for task in active_tasks if not task.done()):
        pending.cancel()
|
||||||
|
|
||||||
|
|
||||||
|
async def cleanup(connections: List[AsyncConnection]):
    """Send a shutdown notice (when enabled) and close every connection.

    The notice goes out on the first connection that has an open transport,
    falling back to the first connection in the list when none is open.

    Args:
        connections: All connections created by the client
    """
    log = logging.getLogger("hbc.cleanup")
    log.info("Cleaning up connections")

    # Prefer a connection that is already open; otherwise use the first one.
    target = None
    for candidate in connections:
        if candidate.transport:
            target = candidate
            break
    else:
        if connections:
            target = connections[0]

    if target and send_shutdown:
        try:
            await target.sendto({"shutdown": 1, "acks": target.ackcount})
        except Exception as exc:
            log.error(f"Error sending shutdown: {exc}")

    for connection in connections:
        connection.close()

    # Give messages time to send
    await asyncio.sleep(0.5)
|
||||||
|
|
||||||
|
|
||||||
|
async def async_main(args, config):
    """Run the heartbeat client until it is stopped.

    Steps, in order: install signal handlers, resolve the target hosts and
    create one connection per resolved address (retrying with a doubling delay
    capped at 60 s while nothing resolves), optionally send a boot and/or
    service message, load plugins, start one heartbeat sender per connection
    plus a plugin collector, wait for those tasks, then clean up.

    Args:
        args: Parsed command-line namespace; ``hosts``, ``name``, ``boot``,
            ``message``, ``daemon`` and the optional ``ipv4_only`` /
            ``ipv6_only`` flags are read. ``args.boot`` is cleared once the
            boot message has been sent.
        config: Configuration mapping; ``hb_port`` and ``interval`` are read
            here and the whole mapping is passed to the plugin loader.

    Returns:
        0 after a normal run (including message-only mode), 1 when the client
        was stopped before any connection could be created.
    """
    global running, shutdown_event, active_tasks, send_shutdown

    # Create shutdown event
    shutdown_event = asyncio.Event()
    active_tasks = []

    logger = logging.getLogger("hbc.main")

    # Install the signal handlers before anything that can wait: the retry
    # loop below re-checks `running` and sleeps on shutdown_event, which only
    # stop() changes, so stop() must already be reachable at that point.
    loop = asyncio.get_running_loop()
    for sig in (signal.SIGTERM, signal.SIGINT):
        loop.add_signal_handler(sig, stop)

    def _sighup():
        # SIGHUP: stop like SIGTERM, but flag a restart.
        global dorestart
        dorestart = True
        stop()

    loop.add_signal_handler(signal.SIGHUP, _sighup)

    # Setup
    iam = socket.gethostname()
    if args.name:
        iam = args.name

    hb_hosts = args.hosts
    hb_port = config.get("hb_port", PORT)
    interval = config.get("interval", INTERVAL)

    logger.info(f"hbc {__version__} on {iam} -> {hb_hosts} port={hb_port}, interval={interval}s")

    # 0 lets getaddrinfo return both address families.
    af_filter = (socket.AF_INET if getattr(args, "ipv4_only", False)
                 else socket.AF_INET6 if getattr(args, "ipv6_only", False)
                 else 0)

    # Create connections
    connections = []
    conn_id = 1
    _retry_delay = 5

    while running and not connections:
        for host in hb_hosts:
            try:
                addrs = socket.getaddrinfo(host, hb_port, af_filter, 0, socket.SOL_UDP)
            except socket.gaierror as e:
                logger.warning(f"Cannot resolve {host}: {e} — retrying in {_retry_delay}s")
                continue
            for addr_info in addrs:
                af = addr_info[0]
                addr = addr_info[4][0]
                conn = AsyncConnection(conn_id, addr, hb_port, af, iam)
                if not await conn.open():
                    logger.warning(f"Initial open to {addr} failed, heartbeat sender will retry")
                # The connection is kept even when the first open failed.
                connections.append(conn)
                conn_id += 1
        if not connections:
            try:
                if shutdown_event:
                    await asyncio.wait_for(shutdown_event.wait(), timeout=_retry_delay)
                else:
                    await asyncio.sleep(_retry_delay)
            except asyncio.TimeoutError:
                pass
            _retry_delay = min(_retry_delay * 2, 60)

    if not connections:
        # The loop can only end without connections when `running` was cleared.
        return 1

    logger.info(f"Created {len(connections)} connections")

    # Send boot/message if requested
    send_shutdown = False
    if args.boot or args.message:
        boot_msg = {}
        if args.boot:
            boot_msg["boot"] = 1
            args.boot = False  # Clear boot flag so we don't send it again in main loop
            # NOTE(review): nesting reconstructed from a flattened listing;
            # confirm send_shutdown is meant to be enabled only for --boot.
            send_shutdown = True
        if args.message:
            boot_msg["service"] = "service"
            boot_msg["msg"] = args.message

        boot_msg["acks"] = 0
        target = next((c for c in connections if c.transport), connections[0])
        await target.sendto(boot_msg)

    if args.message and not args.daemon:
        # Message-only mode
        await cleanup(connections)
        return 0

    # Load plugins
    registry = PluginRegistry()
    loader = PluginLoader(registry)

    plugin_dir = Path(__file__).parent / "plugins"
    if plugin_dir.exists():
        count = await loader.load_from_directory(plugin_dir, config)
        logger.info(f"Loaded {count} plugins")
    else:
        logger.warning(f"Plugin directory not found: {plugin_dir}")

    # Start async tasks
    # Heartbeat senders (one per connection)
    for conn in connections:
        task = asyncio.create_task(heartbeat_sender(conn, interval))
        active_tasks.append(task)

    # Plugin collector (uses all connections, but we'll use first one)
    if connections and registry.get_enabled():
        task = asyncio.create_task(plugin_collector(connections[0], registry))
        active_tasks.append(task)

    # Wait for stop or tasks to complete
    try:
        await asyncio.gather(*active_tasks, return_exceptions=True)
    except asyncio.CancelledError:
        logger.info("Tasks cancelled")

    # Cleanup
    logger.info("Shutting down...")
    await cleanup(connections)
    await loader.unload_all()

    return 0
|
||||||
|
|
||||||
|
|
||||||
|
def daemonize(
    working_dir="/",
    stdin="/dev/zero",
    stdout="/dev/null",
    stderr="/dev/null"
):
    """Detach the current process using the UNIX double-fork sequence.

    The calling process forks and the parent exits; the child changes to
    ``working_dir``, starts a new session, clears the umask and forks again,
    with the intermediate parent exiting too. The surviving process then
    points its standard streams at the given paths.

    Args:
        working_dir: Directory to change into after the first fork.
        stdin: Path opened for reading and duplicated onto standard input.
        stdout: Path opened in append mode and duplicated onto standard output.
        stderr: Path opened in append mode and duplicated onto standard error.

    If either fork fails, an error is written to stderr and the process exits
    with status 1.
    """
    try:
        if os.fork() > 0:
            # Original process: exit immediately.
            os._exit(0)
    except OSError as e:
        sys.stderr.write(f"fork #1 failed: {e}\n")
        os._exit(1)

    os.chdir(working_dir)
    os.setsid()
    os.umask(0)

    try:
        if os.fork() > 0:
            # Intermediate process: exit immediately.
            os._exit(0)
    except OSError as e:
        sys.stderr.write(f"fork #2 failed: {e}\n")
        sys.exit(1)

    # Push out anything still buffered before the descriptors are replaced.
    sys.stdout.flush()
    sys.stderr.flush()

    # The file objects are only needed long enough to duplicate their
    # descriptors; close them deterministically afterwards.
    with open(stdin, "r") as si, open(stdout, "a+") as so, open(stderr, "a+") as se:
        os.dup2(si.fileno(), sys.stdin.fileno())
        os.dup2(so.fileno(), sys.stdout.fileno())
        os.dup2(se.fileno(), sys.stderr.fileno())
|
||||||
|
|
||||||
|
|
||||||
|
def _reconfigure_logging_for_daemon(log_level: int) -> None:
|
||||||
|
"""Replace StreamHandlers (now writing to /dev/null) with a SysLogHandler."""
|
||||||
|
root = logging.getLogger()
|
||||||
|
for handler in root.handlers[:]:
|
||||||
|
root.removeHandler(handler)
|
||||||
|
handler.close()
|
||||||
|
|
||||||
|
use_udp_fallback = not os.path.exists("/dev/log")
|
||||||
|
|
||||||
|
if use_udp_fallback:
|
||||||
|
syslog_handler = SysLogHandler(
|
||||||
|
address=("localhost", 514),
|
||||||
|
facility=SysLogHandler.LOG_DAEMON,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
syslog_handler = SysLogHandler(
|
||||||
|
address="/dev/log",
|
||||||
|
facility=SysLogHandler.LOG_DAEMON,
|
||||||
|
)
|
||||||
|
|
||||||
|
syslog_handler.setFormatter(
|
||||||
|
logging.Formatter("hbc[%(process)d]: %(name)s %(levelname)s: %(message)s")
|
||||||
|
)
|
||||||
|
root.addHandler(syslog_handler)
|
||||||
|
root.setLevel(log_level)
|
||||||
|
|
||||||
|
if use_udp_fallback:
|
||||||
|
logging.warning("/dev/log not found, using syslog UDP localhost:514")
|
||||||
|
|
||||||
|
|
||||||
|
def build_parser():
    """Create the command-line parser for ``hbc``.

    Returns:
        An ``argparse.ArgumentParser`` accepting -b/--boot, -c/--config,
        -m/--message, -n/--name, -d/--daemon, -v/--verbose, the repeatable
        -x/--debug counter, the mutually exclusive -4 / -6 switches and one or
        more positional ``hosts``.
    """
    parser = argparse.ArgumentParser(
        prog="hbc",
        description="HeartBeatClient - send heartbeat messages to HeartBeatDaemon",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    add = parser.add_argument

    add("-b", "--boot", action="store_true", help="Send a boot message")
    add("-c", "--config", dest="configfile", help="Config file path (YAML)")
    add("-m", "--message", dest="message", help="Send a message")
    add("-n", "--name", dest="name", help="Name to use in heartbeat message")
    add("-d", "--daemon", action="store_true", help="Run in daemon mode")
    add("-v", "--verbose", action="store_true", help="Verbose output")
    add("-x", "--debug", action="count", default=0, help="Increase debug level")

    # -4 and -6 cannot be combined.
    family = parser.add_mutually_exclusive_group()
    family.add_argument("-4", dest="ipv4_only", action="store_true", help="Use IPv4 only")
    family.add_argument("-6", dest="ipv6_only", action="store_true", help="Use IPv6 only")

    add("hosts", nargs="+", help="Heartbeat daemon hosts to send to")
    return parser
|
||||||
|
|
||||||
|
|
||||||
|
def main(argv=None):
    """Command-line entry point: parse arguments, run the client, then exit.

    Logging is configured from -v / -x, the configuration file is loaded, the
    process is daemonized when requested (with logging switched to syslog) and
    ``async_main`` is run to completion. If a restart was requested while it
    ran, the program re-executes itself; otherwise it exits with the code
    returned by ``async_main`` (0 on Ctrl-C, 1 on an unhandled error).

    Args:
        argv: Argument list to parse; ``None`` lets argparse use ``sys.argv``.
    """
    global running, dorestart

    # Resolve the program path before anything changes the working directory:
    # daemonize() does chdir("/"), after which a relative sys.argv[0] would no
    # longer point at this program when restarting.
    restart_path = os.path.abspath(sys.argv[0])

    parser = build_parser()
    args = parser.parse_args(argv)

    # Setup logging
    log_level = logging.WARNING
    if args.verbose:
        log_level = logging.INFO
    if args.debug:
        # Any -x wins over -v.
        log_level = logging.DEBUG

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s %(name)s %(levelname)s: %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S"
    )

    # Load config
    config = load_config(args.configfile)

    # Daemonize if requested
    if args.daemon:
        logging.info("Daemonizing...")
        daemonize()
        _reconfigure_logging_for_daemon(log_level)
        logging.info(f"hbc starting, sending heartbeat to {', '.join(args.hosts)}")

    # Run async main
    try:
        exit_code = asyncio.run(async_main(args, config))
    except KeyboardInterrupt:
        logging.info("Interrupted by user")
        exit_code = 0
    except Exception as e:
        logging.error(f"Fatal error: {e}", exc_info=True)
        exit_code = 1

    # Handle restart
    if dorestart:
        logging.info("Restarting...")
        os.execv(restart_path, [restart_path] + sys.argv[1:])

    sys.exit(exit_code)
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: run main() only when this file is executed directly.
if __name__ == "__main__":
    main()
|
||||||
@@ -0,0 +1,425 @@
|
|||||||
|
"""Plugin system for extending Heartbeat data collection and monitoring.
|
||||||
|
|
||||||
|
This module provides the base classes and infrastructure for the plugin system
|
||||||
|
that enables extending hbc (client) data collection and hbd (server) processing.
|
||||||
|
|
||||||
|
Plugin Types:
|
||||||
|
- InfoPlugin: Collects static or rarely-changing information (OS, hardware)
|
||||||
|
- MonitorPlugin: Collects periodic monitoring data (CPU, memory, disk usage)
|
||||||
|
|
||||||
|
Plugins run on the client (hbc) to gather data, which is then sent to the server
|
||||||
|
(hbd) for storage, threshold checking, and display.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import importlib.util
|
||||||
|
import inspect
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, List, Optional, Type
|
||||||
|
|
||||||
|
|
||||||
|
class Plugin(ABC):
    """Abstract base shared by every plugin.

    Class attributes (subclasses override as needed):
        name: Unique plugin identifier (e.g., "os_info", "cpu_monitor").
        version: Plugin version string.
        description: Human-readable description.
        interval: Collection interval in seconds; 0 means collect once.
        enabled: Whether the plugin is active.

    Instance attributes:
        config: The plugin's configuration mapping (empty when none is given).
        logger: Logger named ``plugin.<name>``.
        skip_reason: Optional text a plugin sets before returning False from
            initialize(); the loader then logs at INFO instead of WARNING.
    """

    name: str = ""
    version: str = "1.0.0"
    description: str = ""
    interval: int = 0
    enabled: bool = True

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Store the configuration and prepare per-plugin state.

        Args:
            config: Plugin-specific settings; ``None`` or an empty mapping is
                stored as an empty dict.
        """
        self.config = config if config else {}
        self.logger = logging.getLogger(f"plugin.{self.name}")
        self._initialized = False
        self.skip_reason: Optional[str] = None

    @abstractmethod
    async def initialize(self) -> bool:
        """Prepare the plugin for use (check dependencies, load resources).

        Returns:
            True when the plugin is ready, False when it should not be used.
        """

    @abstractmethod
    async def collect(self) -> Dict[str, Any]:
        """Gather the plugin's data.

        Returns:
            Mapping of metric name to value (scalars, lists or dicts), or an
            empty dict on error.
        """

    async def cleanup(self) -> None:
        """Release resources before the plugin is unloaded.

        The base implementation does nothing; override when needed.
        """
        return None

    def validate_data(self, data: Dict[str, Any]) -> bool:
        """Check collected data before it is sent on.

        The base check only requires ``data`` to be a dict; override for
        stricter validation.

        Args:
            data: Value returned by collect().

        Returns:
            True if the data is acceptable, False otherwise.
        """
        return isinstance(data, dict)
|
||||||
|
|
||||||
|
|
||||||
|
class InfoPlugin(Plugin):
    """Plugin base for static or rarely-changing information.

    Typical data: OS name and version, hardware specifications, network
    interface MAC addresses.

    The first collect() call stores its result; later calls return that stored
    result until invalidate_cache() is called. Subclasses implement
    _collect_info() rather than collect().
    """

    interval: int = 0  # Collect once at startup

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        super().__init__(config)
        # Result of the last _collect_info() call; None until first collected.
        self._cached_data: Optional[Dict[str, Any]] = None

    async def get_cached_data(self) -> Optional[Dict[str, Any]]:
        """Return the stored data without triggering a collection.

        Returns:
            The cached dict, or None if nothing has been collected yet.
        """
        return self._cached_data

    async def collect(self) -> Dict[str, Any]:
        """Return the cached information, collecting it first if necessary."""
        cached = self._cached_data
        if cached is not None:
            return cached
        self._cached_data = await self._collect_info()
        return self._cached_data

    @abstractmethod
    async def _collect_info(self) -> Dict[str, Any]:
        """Perform the actual collection; implemented by each InfoPlugin."""

    def invalidate_cache(self) -> None:
        """Drop the stored data so the next collect() call collects again."""
        self._cached_data = None
|
||||||
|
|
||||||
|
|
||||||
|
class MonitorPlugin(Plugin):
    """Plugin base for periodically collected metrics.

    Typical data: CPU usage, memory consumption, disk I/O statistics, network
    traffic.

    Each non-empty reading is stamped with a ``_timestamp`` key and remembered
    as the last reading. Subclasses implement _collect_metrics() rather than
    collect().
    """

    interval: int = 30  # Default: collect every 30 seconds

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        super().__init__(config)
        # Most recent non-empty reading; None until one has been collected.
        self._last_reading: Optional[Dict[str, Any]] = None

    def get_last_reading(self) -> Optional[Dict[str, Any]]:
        """Return the most recent stored reading.

        Returns:
            The last reading (including its ``_timestamp``), or None if nothing
            has been stored yet.
        """
        return self._last_reading

    async def collect(self) -> Dict[str, Any]:
        """Collect metrics, timestamp them and remember them as last reading."""
        # NOTE(review): nesting reconstructed from a flattened listing; confirm
        # that an empty result is meant to leave the last reading untouched.
        reading = await self._collect_metrics()
        if not reading:
            return reading

        import time
        reading['_timestamp'] = time.time()
        self._last_reading = reading
        return reading

    @abstractmethod
    async def _collect_metrics(self) -> Dict[str, Any]:
        """Perform the actual collection; implemented by each MonitorPlugin."""
|
||||||
|
|
||||||
|
|
||||||
|
class PluginRegistry:
    """Name-keyed collection of plugin instances.

    Plugins are stored under their ``name`` and can be looked up by name or
    filtered by enabled state, type or collection interval.
    """

    def __init__(self):
        self._plugins: Dict[str, Plugin] = {}
        self.logger = logging.getLogger("plugin.registry")

    def register(self, plugin: Plugin) -> bool:
        """Add *plugin* under its name.

        Args:
            plugin: Plugin instance to add.

        Returns:
            True when added, False when the name is already taken.
        """
        key = plugin.name
        if key in self._plugins:
            self.logger.error(f"Plugin '{plugin.name}' already registered")
            return False

        self._plugins[key] = plugin
        self.logger.info(f"Registered plugin: {plugin.name} v{plugin.version}")
        return True

    def unregister(self, name: str) -> bool:
        """Remove the plugin stored under *name*.

        Args:
            name: Plugin name to remove.

        Returns:
            True when a plugin was removed, False when the name is unknown.
        """
        if name not in self._plugins:
            return False
        self._plugins.pop(name)
        self.logger.info(f"Unregistered plugin: {name}")
        return True

    def get(self, name: str) -> Optional[Plugin]:
        """Return the plugin stored under *name*, or None if there is none."""
        return self._plugins.get(name)

    def get_all(self) -> List[Plugin]:
        """Return a new list with every registered plugin."""
        return [*self._plugins.values()]

    def get_enabled(self) -> List[Plugin]:
        """Return the plugins whose ``enabled`` attribute is truthy."""
        return [plugin for plugin in self._plugins.values() if plugin.enabled]

    def get_by_type(self, plugin_type: Type[Plugin]) -> List[Plugin]:
        """Return the plugins that are instances of *plugin_type*.

        Args:
            plugin_type: Class to match (e.g. InfoPlugin or MonitorPlugin).
        """
        return [plugin for plugin in self._plugins.values()
                if isinstance(plugin, plugin_type)]

    def get_by_interval(self, interval: int) -> List[Plugin]:
        """Return the plugins whose ``interval`` equals *interval*.

        Args:
            interval: Interval in seconds (0 for one-time collection).
        """
        return [plugin for plugin in self._plugins.values()
                if plugin.interval == interval]
|
||||||
|
|
||||||
|
|
||||||
|
class PluginLoader:
    """Discover plugin modules on disk, instantiate their plugins, register them.

    Each ``*.py`` file in a plugin directory (names starting with ``_`` are
    ignored) is imported as ``plugins.<stem>``. Every concrete Plugin subclass
    found in the module is instantiated with its configuration, initialized
    and added to the registry.
    """

    def __init__(self, registry: "PluginRegistry"):
        self.registry = registry
        self.logger = logging.getLogger("plugin.loader")
        # Modules imported by this loader, keyed by their sys.modules name, so
        # unload_all() removes exactly what was added.
        self._loaded_modules: Dict[str, Any] = {}

    async def load_from_directory(
        self,
        directory: Path,
        config: Optional[Dict[str, Any]] = None
    ) -> int:
        """Load every plugin found in *directory*.

        A failure in one file, or in one plugin class, is logged and does not
        stop the remaining files or classes from being processed.

        Args:
            directory: Path to the plugin directory.
            config: Configuration mapping. Per-plugin settings are looked up
                under ``config["plugins"][<plugin name>]`` first and under
                ``config[<plugin name>]`` second; a top-level ``owner`` value
                is copied into each plugin's settings unless already present.

        Returns:
            Number of plugins that were initialized and registered.
        """
        if not directory.exists() or not directory.is_dir():
            self.logger.warning(f"Plugin directory not found: {directory}")
            return 0

        loaded_count = 0
        raw_config = config or {}
        # "plugins" may be present but empty (None) in the parsed config.
        plugins_subconfig = raw_config.get("plugins") or {}

        # Sorted so that plugins load in the same order on every run.
        for plugin_file in sorted(directory.glob("*.py")):
            if plugin_file.name.startswith("_"):
                continue  # Skip __init__.py and private modules

            self.logger.debug(f"Processing plugin file: {plugin_file.name}")

            try:
                module = self._import_module(plugin_file)
                if module is None:
                    continue

                for plugin_class in self._plugin_classes(module):
                    plugin = await self._create_plugin(
                        plugin_class, raw_config, plugins_subconfig
                    )
                    if plugin is None:
                        continue
                    if self.registry.register(plugin):
                        loaded_count += 1
                        self.logger.info(
                            f"Loaded plugin: {plugin.name} v{plugin.version} "
                            f"(interval: {plugin.interval}s)"
                        )
            except Exception as e:
                self.logger.error(
                    f"Error loading plugin from {plugin_file}: {e}",
                    exc_info=True
                )

        return loaded_count

    def _import_module(self, plugin_file: Path) -> Optional[Any]:
        """Import *plugin_file* as ``plugins.<stem>``; None if no spec can be built."""
        module_name = f"plugins.{plugin_file.stem}"
        spec = importlib.util.spec_from_file_location(module_name, plugin_file)
        if not spec or not spec.loader:
            self.logger.warning(f"Could not create spec for {plugin_file}")
            return None

        module = importlib.util.module_from_spec(spec)
        previous = sys.modules.get(module_name)
        sys.modules[module_name] = module
        try:
            spec.loader.exec_module(module)
        except BaseException:
            # Do not leave a half-initialised module registered: restore
            # whatever was there before, or remove the entry.
            if previous is not None:
                sys.modules[module_name] = previous
            else:
                sys.modules.pop(module_name, None)
            raise

        self._loaded_modules[module_name] = module
        self.logger.debug(f"Loaded module: {module_name}")
        return module

    def _plugin_classes(self, module: Any) -> "List[Type[Plugin]]":
        """Return the concrete Plugin subclasses visible in *module*, once each."""
        found = []
        # Identity-based so module-level aliases of one class count once.
        seen = set()
        for name, obj in inspect.getmembers(module, inspect.isclass):
            if obj in (Plugin, InfoPlugin, MonitorPlugin):
                self.logger.debug(f"Skipping base class: {name}")
                continue
            if not issubclass(obj, Plugin):
                self.logger.debug(f"Skipping non-Plugin class: {name}")
                continue
            if id(obj) in seen:
                self.logger.debug(f"Skipping duplicate reference to: {obj.__name__}")
                continue
            seen.add(id(obj))
            if inspect.isabstract(obj):
                # Intermediate base classes cannot be instantiated.
                self.logger.debug(f"Skipping abstract plugin class: {name}")
                continue

            self.logger.debug(f"Found plugin class: {name}")
            found.append(obj)
        return found

    async def _create_plugin(
        self,
        plugin_class: Any,
        raw_config: Dict[str, Any],
        plugins_subconfig: Dict[str, Any]
    ) -> "Optional[Plugin]":
        """Instantiate and initialize one plugin; None if it must be skipped."""
        plugin_name = plugin_class.name
        # Check the plugins subdict first, then top-level keys
        # (e.g. nagios_runner: ... at root of config).
        plugin_instance_config = dict(
            plugins_subconfig.get(plugin_name) or raw_config.get(plugin_name) or {}
        )
        # Propagate top-level owner so plugins can report it.
        if "owner" in raw_config and "owner" not in plugin_instance_config:
            plugin_instance_config["owner"] = raw_config["owner"]

        try:
            # The constructor is inside the try as well: a plugin that raises
            # while being built must not abort the rest of its file.
            plugin = plugin_class(config=plugin_instance_config)
            initialized = await plugin.initialize()
        except Exception as e:
            self.logger.error(
                f"Error initializing plugin {plugin_name}: {e}",
                exc_info=True
            )
            return None

        if not initialized:
            if plugin.skip_reason:
                self.logger.info(
                    f"Plugin {plugin.name} skipped: {plugin.skip_reason}"
                )
            else:
                self.logger.warning(
                    f"Plugin {plugin.name} failed initialization, skipping"
                )
            return None
        return plugin

    async def unload_all(self) -> None:
        """Clean up and unregister every plugin, then drop the loaded modules.

        A failure in one plugin's cleanup() is logged; the plugin is still
        unregistered and the remaining plugins are still processed.
        """
        for plugin in self.registry.get_all():
            try:
                await plugin.cleanup()
            except Exception as e:
                self.logger.error(
                    f"Error cleaning up plugin {plugin.name}: {e}",
                    exc_info=True
                )
            self.registry.unregister(plugin.name)

        # Remove loaded modules
        for module_name in self._loaded_modules:
            sys.modules.pop(module_name, None)
        self._loaded_modules.clear()
|
||||||
@@ -0,0 +1,136 @@
|
|||||||
|
"""CPU Monitoring Plugin for Heartbeat.
|
||||||
|
|
||||||
|
Collects CPU usage statistics including overall CPU percentage, per-core usage,
|
||||||
|
load average, and process counts.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import Any, Dict, Optional
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Import from parent package
|
||||||
|
from hbd.client.plugin import MonitorPlugin
|
||||||
|
|
||||||
|
|
||||||
|
class CPUMonitorPlugin(MonitorPlugin):
    """Monitor CPU usage and load.

    Collects:
    - Overall CPU usage percentage
    - Per-core CPU usage (if enabled in config)
    - Load average (1min, 5min, 15min)
    - Process count
    - CPU frequency (if available)
    - CPU time percentages and uptime

    Configuration keys:
        per_core: Include per-core usage (default: False).
        interval: Collection interval in seconds (default: 300).
    """

    name = "cpu_monitor"
    version = "1.0.0"
    description = "CPU usage and load monitoring"
    interval = 300  # MonitorPlugin: collect every 5 minutes by default

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        super().__init__(config)
        # Set by initialize() once the psutil import has succeeded.
        self.psutil = None
        self.per_core = config.get("per_core", False) if config else False
        self.interval = config.get("interval", 300) if config else 300

    async def initialize(self) -> bool:
        """Import psutil and keep a reference to it.

        Returns:
            True if psutil could be imported, False otherwise.
        """
        self.logger.info(f"Initializing {self.name} plugin")

        try:
            import psutil
        except ImportError:
            self.logger.error(
                "psutil module not available. Install with: pip install psutil"
            )
            return False

        self.psutil = psutil
        self.logger.info(f"{self.name} initialized successfully")
        return True

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Collect CPU metrics.

        Each optional metric group is collected independently; a failure in
        one group is logged and the remaining groups are still collected.

        Returns:
            Dictionary with the CPU metrics, or an empty dict when psutil is
            unavailable or collection fails as a whole.
        """
        if not self.psutil:
            return {}

        import asyncio
        import time

        try:
            data = {}

            # cpu_percent(interval=1) measures over a one-second window and
            # blocks for that long; run it in the default executor so the
            # event loop keeps servicing other tasks in the meantime.
            loop = asyncio.get_running_loop()
            data["cpu_percent"] = await loop.run_in_executor(
                None, lambda: self.psutil.cpu_percent(interval=1)
            )

            # Per-core CPU usage (if enabled)
            if self.per_core:
                per_core_percents = self.psutil.cpu_percent(interval=0, percpu=True)
                data["cpu_per_core"] = per_core_percents
                data["cpu_core_count"] = len(per_core_percents)
            else:
                # Just report core count
                data["cpu_core_count"] = self.psutil.cpu_count()

            # Load average (Unix-like systems only)
            try:
                load_avg = self.psutil.getloadavg()
                data["load_1min"] = round(load_avg[0], 2)
                data["load_5min"] = round(load_avg[1], 2)
                data["load_15min"] = round(load_avg[2], 2)
            except (AttributeError, OSError):
                # Not available on every platform.
                pass

            # Process count
            try:
                data["process_count"] = len(self.psutil.pids())
            except Exception as e:
                self.logger.warning(f"Could not get process count: {e}")

            # CPU frequency (if available)
            try:
                freq = self.psutil.cpu_freq()
                if freq:
                    data["cpu_freq_current"] = round(freq.current, 2)
                    data["cpu_freq_min"] = round(freq.min, 2)
                    data["cpu_freq_max"] = round(freq.max, 2)
            except (AttributeError, OSError, RuntimeError, SystemError) as e:
                # Not available on all systems, or may fail on FreeBSD with sysctl issues
                self.logger.debug(f"CPU frequency not available: {e}")

            # CPU times (user, system, idle, etc.)
            try:
                cpu_times = self.psutil.cpu_times_percent(interval=0)
                data["cpu_user"] = round(cpu_times.user, 1)
                data["cpu_system"] = round(cpu_times.system, 1)
                data["cpu_idle"] = round(cpu_times.idle, 1)
                if hasattr(cpu_times, "iowait"):
                    data["cpu_iowait"] = round(cpu_times.iowait, 1)
            except Exception as e:
                self.logger.debug(f"Could not get CPU times: {e}")

            # Uptime in seconds
            try:
                data["uptime_seconds"] = int(time.time() - self.psutil.boot_time())
            except Exception as e:
                self.logger.debug(f"Could not get uptime: {e}")

            self.logger.debug(
                f"Collected CPU metrics: {data.get('cpu_percent', 'N/A')}% usage"
            )
            return data

        except Exception as e:
            self.logger.error(f"Error collecting CPU metrics: {e}", exc_info=True)
            return {}
|
||||||
@@ -0,0 +1,199 @@
|
|||||||
|
"""
|
||||||
|
Disk monitoring plugin for Heartbeat.
|
||||||
|
|
||||||
|
Collects disk usage and I/O statistics using psutil.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import Dict, Any, Optional, List
|
||||||
|
|
||||||
|
try:
|
||||||
|
import psutil
|
||||||
|
except ImportError:
|
||||||
|
psutil = None
|
||||||
|
|
||||||
|
from hbd.client.plugin import MonitorPlugin
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class DiskMonitorPlugin(MonitorPlugin):
    """
    Periodic collector for disk space and disk I/O figures, backed by psutil.

    Each collection reports:
    - Usage of every selected partition (device, filesystem type, total,
      used, free, percent)
    - Per-disk I/O counters (read/write counts and bytes, plus the timing
      fields psutil exposes on the current platform)
    - The change in the byte/count counters since the previous collection

    Configuration:
        interval: Collection interval in seconds (default: 300)
        partitions: List of mount points to monitor (default: all)
        include_io: Include disk I/O statistics (default: True)
        exclude_types: List of filesystem types to exclude
            (default: tmpfs, devtmpfs, squashfs)
    """

    name = "disk_monitor"
    interval = 300  # Five minutes between collections unless configured

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Read the plugin settings and prepare the I/O delta state.

        Args:
            config: Optional configuration dict with keys:
                - interval: Collection interval in seconds (default: 300)
                - partitions: List of specific mount points to monitor
                - include_io: Include I/O statistics (default: True)
                - exclude_types: List of filesystem types to exclude

        Raises:
            ImportError: If psutil could not be imported.
        """
        super().__init__(config)
        settings = self.config
        # A missing/None value means "report every partition".
        self.partitions = settings.get('partitions', None)
        self.include_io = settings.get('include_io', True)
        default_excluded = ['tmpfs', 'devtmpfs', 'squashfs']
        self.exclude_types = set(settings.get('exclude_types', default_excluded))
        self.interval = settings.get('interval', 300)

        if psutil is None:
            raise ImportError("psutil library is required for disk_monitor plugin")

        # Counters seen at the previous collection; used to compute deltas.
        self._prev_io = {}

    async def initialize(self):
        """Report readiness and take the first I/O counter snapshot.

        Returns:
            False when psutil is unavailable, True otherwise.
        """
        if psutil is None:
            logger.error("psutil not available - disk_monitor cannot run")
            return False

        logger.info(f"Disk monitor initialized (interval: {self.interval}s, io: {self.include_io})")

        if not self.include_io:
            return True

        # Seed the baseline so the first collect() can already report deltas.
        try:
            self._prev_io = psutil.disk_io_counters(perdisk=True)
        except Exception as e:
            logger.warning(f"Could not initialize disk I/O counters: {e}")
        return True

    async def collect(self) -> Dict[str, Any]:
        """
        Gather the current disk statistics.

        Returns:
            An empty dict when psutil is unavailable, ``{"error": <message>}``
            when collection raises, otherwise a dictionary with:
                - partitions: Dict keyed by mount point, each entry holding
                  device, fstype, total, used, free and percent
                - io_counters: Dict keyed by disk name (only if include_io),
                  each entry holding read_count, write_count, read_bytes,
                  write_bytes, any of read_time / write_time / busy_time that
                  psutil provides, and the *_delta fields for disks already
                  seen in the previous collection
        """
        if psutil is None:
            logger.error("psutil not available")
            return {}

        try:
            collected = await self._collect_metrics()
            partition_count = len(collected.get('partitions', {}))
            logger.debug(f"Collected disk metrics: {partition_count} partitions")
            return collected
        except Exception as e:
            logger.error(f"Error collecting disk metrics: {e}")
            return {"error": str(e)}

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Build the metrics dictionary from psutil partition and I/O data."""
        counter_fields = ('read_count', 'write_count', 'read_bytes', 'write_bytes')
        timing_fields = ('read_time', 'write_time', 'busy_time')
        delta_fields = ('read_bytes', 'write_bytes', 'read_count', 'write_count')

        # --- Partition usage -------------------------------------------------
        usage_by_mount = {}
        for part in psutil.disk_partitions(all=False):
            # Drop excluded filesystem types and, when a whitelist is
            # configured, mount points that are not on it.
            if part.fstype in self.exclude_types or (
                self.partitions and part.mountpoint not in self.partitions
            ):
                continue

            try:
                usage = psutil.disk_usage(part.mountpoint)
                usage_by_mount[part.mountpoint] = {
                    'device': part.device,
                    'fstype': part.fstype,
                    'total': usage.total,
                    'used': usage.used,
                    'free': usage.free,
                    'percent': usage.percent,
                }
            except PermissionError:
                logger.debug(f"Permission denied accessing {part.mountpoint}")
            except Exception as e:
                logger.warning(f"Error reading {part.mountpoint}: {e}")

        metrics = {'partitions': usage_by_mount}

        if not self.include_io:
            return metrics

        # --- I/O counters ----------------------------------------------------
        try:
            current = psutil.disk_io_counters(perdisk=True)
            per_disk = {}

            for disk_name, counters in current.items():
                stats = {field: getattr(counters, field) for field in counter_fields}

                # Timing fields are only copied when psutil exposes them.
                for field in timing_fields:
                    if hasattr(counters, field):
                        stats[field] = getattr(counters, field)

                # Deltas need a previous sample for the same disk.
                if disk_name in self._prev_io:
                    earlier = self._prev_io[disk_name]
                    for field in delta_fields:
                        stats[f'{field}_delta'] = (
                            getattr(counters, field) - getattr(earlier, field)
                        )

                per_disk[disk_name] = stats

            metrics['io_counters'] = per_disk
            # Only replaced after a fully successful pass.
            self._prev_io = current
        except Exception as e:
            logger.warning(f"Could not collect disk I/O statistics: {e}")

        return metrics

    async def cleanup(self):
        """Log shutdown; the plugin holds no resources to release."""
        logger.info("Disk monitor cleanup")
|
||||||
|
|
||||||
|
|
||||||
|
# Plugin instance for automatic discovery
|
||||||
|
plugin = DiskMonitorPlugin
|
||||||
@@ -0,0 +1,168 @@
|
|||||||
|
"""
|
||||||
|
Filesystem information plugin for Heartbeat.
|
||||||
|
|
||||||
|
Collects static filesystem and partition information using psutil.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import Dict, Any, Optional
|
||||||
|
|
||||||
|
try:
|
||||||
|
import psutil
|
||||||
|
except ImportError:
|
||||||
|
psutil = None
|
||||||
|
|
||||||
|
from hbd.client.plugin import InfoPlugin
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class FilesystemInfoPlugin(InfoPlugin):
    """
    One-shot collector describing the mounted filesystems.

    Declared with ``interval = 0`` as an InfoPlugin, i.e. the data is meant
    to be gathered once rather than periodically.

    Only the partitions psutil returns for ``all=False`` are reported unless
    include_pseudo is enabled, in which case ``all=True`` is used and pseudo
    filesystems (proc, sysfs, tmpfs, etc.) are included too.

    Collects:
    - Every mounted filesystem with device, mount point, type and options
    - Filename / path length limits where ``os.pathconf`` provides them
    - The sorted set of filesystem types and the system boot time

    Configuration:
        include_pseudo: Include pseudo/virtual filesystems (default: False)
        exclude_types: List of additional filesystem types to exclude (default: [])
    """

    name = "filesystem_info"
    interval = 0  # InfoPlugin - collect once

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Read the plugin settings.

        Args:
            config: Optional configuration dict with keys:
                - include_pseudo: Include pseudo/virtual filesystems (default: False)
                - exclude_types: List of filesystem types to exclude (default: [])

        Raises:
            ImportError: If psutil could not be imported.
        """
        super().__init__(config)
        settings = self.config
        self.include_pseudo = settings.get('include_pseudo', False)
        # Empty by default: the all=False listing is already filtered, so
        # this set only carries types the user explicitly wants dropped.
        self.exclude_types = set(settings.get('exclude_types', []))

        if psutil is None:
            raise ImportError("psutil library is required for filesystem_info plugin")

    async def initialize(self):
        """Report readiness.

        Returns:
            False when psutil is unavailable, True otherwise.
        """
        if psutil is None:
            logger.error("psutil not available - filesystem_info cannot run")
            return False

        logger.info(f"Filesystem info initialized (pseudo: {self.include_pseudo})")
        return True

    async def collect(self) -> Dict[str, Any]:
        """
        Gather the filesystem description.

        Returns:
            An empty dict when psutil is unavailable, ``{"error": <message>}``
            when collection raises, otherwise a dictionary with:
                - filesystems: List of dicts with device, mountpoint, fstype,
                  opts and, when available, maxfile (maximum filename
                  length) and maxpath (maximum path length)
                - filesystem_types: Sorted list of unique filesystem types
                - mount_count: Number of entries in ``filesystems``
                - boot_time: Value of ``psutil.boot_time()`` when obtainable
        """
        if psutil is None:
            logger.error("psutil not available")
            return {}

        try:
            collected = await self._collect_info()
            fs_count = len(collected.get('filesystems', []))
            logger.info(f"Collected filesystem info: {fs_count} filesystems")
            return collected
        except Exception as e:
            logger.error(f"Error collecting filesystem info: {e}")
            return {"error": str(e)}

    async def _collect_info(self) -> Dict[str, Any]:
        """Build the filesystem description from psutil and os.pathconf."""
        import os

        # Result key -> pathconf name, queried in this order for each mount.
        path_limits = (('maxfile', 'PC_NAME_MAX'), ('maxpath', 'PC_PATH_MAX'))

        entries = []
        seen_types = set()

        # include_pseudo maps directly onto psutil's ``all`` switch.
        for part in psutil.disk_partitions(all=self.include_pseudo):
            if part.fstype in self.exclude_types:
                continue

            entry = {
                'device': part.device,
                'mountpoint': part.mountpoint,
                'fstype': part.fstype,
                'opts': part.opts,
            }

            # Best effort: limits are added only when pathconf exists on this
            # platform and returns a non-zero value for the mount point.
            try:
                if hasattr(os, 'pathconf'):
                    for key, conf_name in path_limits:
                        try:
                            limit = os.pathconf(part.mountpoint, conf_name)
                        except (OSError, ValueError):
                            continue
                        if limit:
                            entry[key] = limit
            except Exception as e:
                logger.debug(f"Could not get pathconf for {part.mountpoint}: {e}")

            entries.append(entry)
            seen_types.add(part.fstype)

        info = {
            'filesystems': entries,
            'filesystem_types': sorted(seen_types),
            'mount_count': len(entries),
        }

        # Boot time is attached when psutil can provide it.
        try:
            info['boot_time'] = psutil.boot_time()
        except Exception as e:
            logger.debug(f"Could not get boot time: {e}")

        return info

    async def cleanup(self):
        """Log shutdown; the plugin holds no resources to release."""
        logger.info("Filesystem info cleanup")
|
||||||
|
|
||||||
|
|
||||||
|
# Plugin instance for automatic discovery
|
||||||
|
plugin = FilesystemInfoPlugin
|
||||||
@@ -0,0 +1,175 @@
|
|||||||
|
"""
|
||||||
|
Memory monitoring plugin for Heartbeat.
|
||||||
|
|
||||||
|
Collects memory and swap usage statistics using psutil.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import Dict, Any, Optional
|
||||||
|
|
||||||
|
try:
|
||||||
|
import psutil
|
||||||
|
except ImportError:
|
||||||
|
psutil = None
|
||||||
|
|
||||||
|
from hbd.client.plugin import MonitorPlugin
|
||||||
|
|
||||||
|
|
||||||
|
def _zfs_arc_bytes() -> int:
|
||||||
|
"""Return current ZFS ARC size in bytes, or 0 if ZFS is not present.
|
||||||
|
|
||||||
|
ZFS ARC is reclaimable but is not included in MemAvailable by the Linux
|
||||||
|
kernel (it is not in SReclaimable), so it would otherwise be counted as
|
||||||
|
used memory.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
with open("/proc/spl/kstat/zfs/arcstats") as fh:
|
||||||
|
for line in fh:
|
||||||
|
parts = line.split()
|
||||||
|
if len(parts) >= 3 and parts[0] == "size":
|
||||||
|
return int(parts[2])
|
||||||
|
except (OSError, ValueError):
|
||||||
|
pass
|
||||||
|
return 0
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class MemoryMonitorPlugin(MonitorPlugin):
    """
    Periodic collector for RAM and swap figures, backed by psutil.

    Each collection reports:
    - Physical memory totals (total, available, used, free, percent)
    - Platform-dependent breakdown (active, inactive, buffers, cached,
      shared) for whichever of those fields psutil exposes
    - Swap totals and swap-in/out counters when include_swap is enabled

    The available/used/percent figures treat the ZFS ARC as available
    memory (see ``_collect_metrics``).

    Configuration:
        interval: Collection interval in seconds (default: 300)
        include_swap: Include swap statistics (default: True)
    """

    name = "memory_monitor"
    interval = 300  # Five minutes between collections unless configured

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Read the plugin settings.

        Args:
            config: Optional configuration dict with keys:
                - interval: Collection interval in seconds (default: 300)
                - include_swap: Include swap statistics (default: True)

        Raises:
            ImportError: If psutil could not be imported.
        """
        super().__init__(config)
        settings = self.config
        self.include_swap = settings.get('include_swap', True)
        self.interval = settings.get('interval', 300)

        if psutil is None:
            raise ImportError("psutil library is required for memory_monitor plugin")

    async def initialize(self):
        """Report readiness.

        Returns:
            False when psutil is unavailable, True otherwise.
        """
        if psutil is None:
            logger.error("psutil not available - memory_monitor cannot run")
            return False

        logger.info(f"Memory monitor initialized (interval: {self.interval}s, swap: {self.include_swap})")
        return True

    async def collect(self) -> Dict[str, Any]:
        """
        Gather the current memory statistics.

        Returns:
            An empty dict when psutil is unavailable, ``{"error": <message>}``
            when collection raises, otherwise a dictionary with:
                - memory_total, memory_available, memory_used, memory_free:
                  sizes in bytes
                - memory_percent: used share of total, one decimal place
                - memory_active, memory_inactive, memory_buffers,
                  memory_cached, memory_shared: present when psutil exposes
                  the corresponding field
                - swap_total, swap_used, swap_free, swap_percent
                  (if include_swap)
                - swap_sin, swap_sout: swap-in/out counters, when exposed
                  (if include_swap)
        """
        if psutil is None:
            logger.error("psutil not available")
            return {}

        try:
            collected = await self._collect_metrics()
            logger.debug(f"Collected memory metrics: {len(collected)} fields")
            return collected
        except Exception as e:
            logger.error(f"Error collecting memory metrics: {e}")
            return {"error": str(e)}

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Build the metrics dictionary from psutil memory and swap data."""
        vmem = psutil.virtual_memory()
        total = vmem.total

        # The ZFS ARC is added on top of psutil's "available" figure because
        # the kernel leaves it out of MemAvailable although it can be
        # reclaimed. The sum is capped at the total so "used" cannot go
        # negative.
        reclaimable_arc = _zfs_arc_bytes()
        available = min(vmem.available + reclaimable_arc, total)
        used = total - available
        if total:
            percent = round(used / total * 100, 1)
        else:
            percent = 0.0

        metrics = {
            'memory_total': total,
            'memory_available': available,
            'memory_used': used,
            'memory_free': vmem.free,
            'memory_percent': percent,
        }

        # These fields exist only on some platforms; copy the ones present.
        for field in ('active', 'inactive', 'buffers', 'cached', 'shared'):
            if hasattr(vmem, field):
                metrics[f'memory_{field}'] = getattr(vmem, field)

        if not self.include_swap:
            return metrics

        try:
            swap = psutil.swap_memory()
            for field in ('total', 'used', 'free', 'percent'):
                metrics[f'swap_{field}'] = getattr(swap, field)

            # Swap in/out counters may be missing on some platforms.
            for field in ('sin', 'sout'):
                if hasattr(swap, field):
                    metrics[f'swap_{field}'] = getattr(swap, field)
        except Exception as e:
            logger.warning(f"Could not collect swap statistics: {e}")

        return metrics

    async def cleanup(self):
        """Log shutdown; the plugin holds no resources to release."""
        logger.info("Memory monitor cleanup")
|
||||||
|
|
||||||
|
|
||||||
|
# Plugin instance for automatic discovery
|
||||||
|
plugin = MemoryMonitorPlugin
|
||||||
@@ -0,0 +1,287 @@
|
|||||||
|
"""Nagios Plugin Runner for Heartbeat.
|
||||||
|
|
||||||
|
Executes Nagios-compatible monitoring plugins and parses their output.
|
||||||
|
|
||||||
|
Nagios Plugin Standard:
|
||||||
|
- Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
|
||||||
|
- Output format: Single line status message, optional performance data
|
||||||
|
- Performance data format: 'label'=value[UOM];[warn];[crit];[min];[max]
|
||||||
|
|
||||||
|
Example configuration in ~/.hb.yaml:
|
||||||
|
```yaml
|
||||||
|
nagios_runner:
|
||||||
|
interval: 60
|
||||||
|
commands:
|
||||||
|
- name: check_disk_root
|
||||||
|
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
|
||||||
|
- name: check_procs
|
||||||
|
command: /usr/lib/nagios/plugins/check_procs -w 250 -c 400
|
||||||
|
- name: check_load
|
||||||
|
command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
|
||||||
|
```
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import shlex
|
||||||
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
|
from hbd.client.plugin import MonitorPlugin
|
||||||
|
|
||||||
|
|
||||||
|
# Exit code used to report a check whose state could not be determined.
NAGIOS_UNKNOWN = 3

# Label for each of the four standard Nagios exit codes (0 through 3).
STATUS_NAMES = dict(enumerate(("OK", "WARNING", "CRITICAL", "UNKNOWN")))
|
||||||
|
|
||||||
|
|
||||||
|
class NagiosRunnerPlugin(MonitorPlugin):
    """Run Nagios-compatible monitoring plugins.

    This plugin executes external Nagios plugins and collects their output,
    including status codes, messages, and performance data.

    Configuration:
        interval: Collection interval in seconds (default: 300)
        commands: List of command definitions with 'name' and 'command' keys
        timeout: Command execution timeout in seconds (default: 30)

    Example:
        nagios_runner:
          interval: 300  # Check every 5 minutes
          timeout: 30
          commands:
            - name: check_disk
              command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10%
            - name: check_load
              command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
    """

    name = "nagios_runner"
    version = "1.0.0"
    description = "Execute Nagios-compatible monitoring plugins"
    interval = 300  # MonitorPlugin: collect every 5 minutes by default

    # Signed decimal number as used for perfdata values and thresholds.
    _PERF_NUMBER = r"[-+]?(?:\d+(?:\.\d*)?|\.\d+)"
    # One perfdata item at a token boundary: a quoted label (which may hold
    # spaces and '' as an escaped quote) or a bare label, '=', then the
    # semicolon-separated value fields up to the next whitespace.
    _PERF_ITEM_RE = re.compile(
        r"(?:^|(?<=\s))(?:'((?:[^'=]|'')+)'|([^\s'=]+))=(\S*)"
    )
    # Value field: number followed by an optional unit of measurement.
    _PERF_VALUE_RE = re.compile(rf"({_PERF_NUMBER})([a-zA-Z%]*)")
    # Threshold/min/max field that is a plain number (not a range).
    _PERF_THRESHOLD_RE = re.compile(_PERF_NUMBER)

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Read the plugin settings.

        Args:
            config: Optional configuration dict with keys 'commands',
                'timeout' and 'interval' (see the class docstring).
        """
        super().__init__(config)

        settings = config or {}
        # ``or []`` also covers a key that is present but empty (None), so
        # later iteration over self.commands is always safe.
        self.commands: List[Dict[str, str]] = settings.get("commands") or []
        self.timeout: int = settings.get("timeout", 30)
        self.interval = settings.get("interval", 300)

    async def initialize(self) -> bool:
        """Initialize the Nagios runner plugin.

        Logs the configured commands and warns about absolute executable
        paths that do not exist or are not executable. These warnings do not
        prevent the plugin from starting.

        Returns:
            True if at least one command is configured, False otherwise
            (``skip_reason`` is set in that case).
        """
        self.logger.info(f"Initializing {self.name} plugin")

        if not self.commands:
            self.skip_reason = "no commands configured (add nagios_runner.commands to config)"
            return False

        self.logger.info(f"Configured to run {len(self.commands)} Nagios plugin(s)")
        for cmd_config in self.commands:
            name = cmd_config.get("name", "unnamed")
            self.logger.info(f"  - {name}: {cmd_config.get('command', 'N/A')}")

        # Validate absolute command paths early
        for cmd_config in self.commands:
            name = cmd_config.get("name", "unnamed")
            command = cmd_config.get("command", "")
            if not command:
                continue
            try:
                tokens = shlex.split(command)
            except ValueError:
                continue  # malformed command string; skip validation
            if not tokens:
                continue
            exe = tokens[0]
            # Relative names are resolved by the shell at run time, so only
            # absolute paths can be checked here.
            if not os.path.isabs(exe):
                continue
            if not os.path.isfile(exe):
                self.logger.warning(
                    f"Command '{name}': executable not found: {exe}"
                )
            elif not os.access(exe, os.X_OK):
                self.logger.warning(
                    f"Command '{name}': executable not executable: {exe}"
                )

        return True

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Collect metrics from all configured Nagios plugins.

        Commands run one after another. For a command named ``N`` the result
        holds ``N_status``, ``N_status_code`` and ``N_output`` plus one
        ``N_<key>`` entry for every parsed performance-data key.

        Returns:
            Dictionary with results from all plugins
        """
        results = {}

        for cmd_config in self.commands:
            name = cmd_config.get("name")
            command = cmd_config.get("command")

            if not name or not command:
                self.logger.warning("Skipping command with missing name or command")
                continue

            try:
                status_code, output, perfdata = await self._run_nagios_plugin(command)
                status_name = STATUS_NAMES.get(status_code, "UNKNOWN")

                results[f"{name}_status"] = status_name
                results[f"{name}_status_code"] = status_code
                results[f"{name}_output"] = output

                for metric_name, metric_value in perfdata.items():
                    results[f"{name}_{metric_name}"] = metric_value

                self.logger.info(
                    f"Executed {name}: {status_name} - {output[:50]}"
                )

            except Exception as e:
                self.logger.error(f"Error running {name}: {e}", exc_info=True)
                results[f"{name}_status"] = "ERROR"
                results[f"{name}_status_code"] = NAGIOS_UNKNOWN
                results[f"{name}_output"] = str(e)

        return results

    async def _run_nagios_plugin(
        self,
        command: str
    ) -> Tuple[int, str, Dict[str, Any]]:
        """Execute a Nagios plugin and parse its output.

        The command string is passed to the shell as-is.

        Args:
            command: Command line to execute.

        Returns:
            Tuple of (status code, status message, performance data). The
            status code is NAGIOS_UNKNOWN when the command times out, is
            killed by a signal, exits with a code above 3, or cannot be
            executed.
        """
        try:
            proc = await asyncio.create_subprocess_shell(
                command,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
            try:
                stdout_bytes, stderr_bytes = await asyncio.wait_for(
                    proc.communicate(), timeout=self.timeout
                )
            except asyncio.TimeoutError:
                try:
                    proc.kill()
                except ProcessLookupError:
                    # The process exited on its own just after the timeout;
                    # this is still reported as a timeout.
                    pass
                await proc.communicate()  # reap the process and drain pipes
                self.logger.error(f"Command timed out: {command}")
                return NAGIOS_UNKNOWN, f"Command timed out after {self.timeout}s", {}

            status_code = proc.returncode

            # A negative return code means the process was ended by a signal.
            if status_code < 0:
                return NAGIOS_UNKNOWN, f"Process killed by signal {-status_code}", {}

            # Anything outside the four defined exit codes counts as UNKNOWN.
            if status_code > 3:
                status_code = NAGIOS_UNKNOWN

            stdout = stdout_bytes.decode(errors="replace").strip()
            stderr = stderr_bytes.decode(errors="replace").strip()

            # Parse perfdata from stdout before mixing in stderr
            perfdata = self._parse_perfdata(stdout)

            # Status text is everything before the first pipe character.
            status_part = stdout.partition('|')[0].strip()

            if not stdout and stderr:
                output_msg = stderr
            elif stdout and stderr:
                output_msg = f"{status_part} [stderr: {stderr}]"
            else:
                output_msg = status_part

            return status_code, output_msg, perfdata

        except Exception as e:
            self.logger.error(f"Error executing command: {e}")
            return NAGIOS_UNKNOWN, f"Execution error: {str(e)}", {}

    def _parse_perfdata(self, output: str) -> Dict[str, Any]:
        """Parse Nagios performance data from plugin output.

        Nagios performance data format:
            'label'=value[UOM];[warn];[crit];[min];[max]

        Multiple metrics are separated by whitespace. Labels containing
        spaces must be single-quoted. Values may be negative. Items whose
        value is not a number (for example ``U`` for "unknown") are skipped.
        warn/crit/min/max are stored only when they are plain numbers; range
        expressions such as ``10:20`` or ``@5:`` are left out rather than
        reduced to a misleading single number.

        Args:
            output: Plugin output string

        Returns:
            Dictionary mapping ``label`` to the float value, plus optional
            ``label_uom`` (str) and ``label_warn`` / ``label_crit`` /
            ``label_min`` / ``label_max`` (float) entries.
        """
        perfdata: Dict[str, Any] = {}

        # Performance data comes after the first pipe character.
        _, pipe, perf_section = output.partition('|')
        if not pipe:
            return perfdata

        for item in self._PERF_ITEM_RE.finditer(perf_section):
            quoted, bare, raw_fields = item.groups()
            label = bare if quoted is None else quoted.replace("''", "'")
            label = label.strip()
            if not label:
                continue

            fields = raw_fields.split(';')
            value_match = self._PERF_VALUE_RE.fullmatch(fields[0])
            if value_match is None:
                continue

            perfdata[label] = float(value_match.group(1))

            uom = value_match.group(2)
            if uom:
                perfdata[f"{label}_uom"] = uom

            # Fields after the value are, in order: warn, crit, min, max.
            for suffix, raw in zip(("warn", "crit", "min", "max"), fields[1:]):
                if self._PERF_THRESHOLD_RE.fullmatch(raw):
                    perfdata[f"{label}_{suffix}"] = float(raw)

        return perfdata
|
||||||
@@ -0,0 +1,240 @@
|
|||||||
|
"""
|
||||||
|
Network monitoring plugin for Heartbeat.
|
||||||
|
|
||||||
|
Collects network interface statistics and connection information using psutil.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import Dict, Any, Optional, List
|
||||||
|
|
||||||
|
try:
|
||||||
|
import psutil
|
||||||
|
except ImportError:
|
||||||
|
psutil = None
|
||||||
|
|
||||||
|
from hbd.client.plugin import MonitorPlugin
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class NetworkMonitorPlugin(MonitorPlugin):
    """
    Heartbeat monitor plugin that reports network metrics gathered with psutil.

    Each collection can contain up to four sections:
    - interfaces: per-interface I/O counters plus deltas since the previous run
    - connections: number of inet connections per connection status
    - addresses: addresses configured on each interface
    - interface_stats: up/down flag, duplex, speed and MTU per interface

    Configuration keys:
        interval: Collection interval in seconds (default: 300)
        interfaces: List of interface names to report (default: all)
        include_connections: Report connection counts (default: True)
        include_addresses: Report interface addresses (default: False)
    """

    name = "network_monitor"
    interval = 300  # Collect every 5 minutes by default

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Read the plugin settings from the configuration.

        Args:
            config: Optional configuration dict; see the class docstring for
                the recognised keys.

        Raises:
            ImportError: If psutil could not be imported.
        """
        super().__init__(config)
        settings = self.config
        self.interfaces = settings.get('interfaces', None)  # None = all interfaces
        self.include_connections = settings.get('include_connections', True)
        self.include_addresses = settings.get('include_addresses', False)
        self.interval = settings.get('interval', 300)

        if psutil is None:
            raise ImportError("psutil library is required for network_monitor plugin")

        # Counters seen by the previous collection, keyed by interface name;
        # used as the baseline for the *_delta values.
        self._prev_io = {}

    async def initialize(self):
        """Take the first counter snapshot; return False when psutil is missing."""
        if psutil is None:
            logger.error("psutil not available - network_monitor cannot run")
            return False

        summary = (f"Network monitor initialized (interval: {self.interval}s, "
                   f"connections: {self.include_connections})")
        logger.info(summary)

        # Seed the baseline so the first collection can already report deltas.
        try:
            self._prev_io = psutil.net_io_counters(pernic=True)
        except Exception as e:
            logger.warning(f"Could not initialize network I/O counters: {e}")

        return True

    async def collect(self) -> Dict[str, Any]:
        """
        Collect current network statistics.

        Returns:
            The dict built by _collect_metrics(), {} when psutil is missing,
            or {"error": <message>} when collection raised.
        """
        if psutil is None:
            logger.error("psutil not available")
            return {}

        try:
            data = await self._collect_metrics()
            iface_count = len(data.get('interfaces', {}))
            logger.debug(f"Collected network metrics: {iface_count} interfaces")
            return data
        except Exception as e:
            logger.error(f"Error collecting network metrics: {e}")
            return {"error": str(e)}

    def _is_selected(self, iface_name) -> bool:
        """Return True when the interface passes the 'interfaces' filter."""
        return not self.interfaces or iface_name in self.interfaces

    def _add_io_counters(self, metrics: Dict[str, Any]) -> None:
        """Store per-interface counters and deltas under metrics['interfaces']."""
        totals = ('bytes_sent', 'bytes_recv', 'packets_sent', 'packets_recv',
                  'errin', 'errout', 'dropin', 'dropout')
        with_delta = totals[:4]
        try:
            current = psutil.net_io_counters(pernic=True)
            per_iface = {}
            for iface_name, counters in current.items():
                if not self._is_selected(iface_name):
                    continue
                entry = {field: getattr(counters, field) for field in totals}
                # Deltas exist only for interfaces present in the last snapshot.
                if iface_name in self._prev_io:
                    before = self._prev_io[iface_name]
                    for field in with_delta:
                        entry[f'{field}_delta'] = (
                            getattr(counters, field) - getattr(before, field)
                        )
                per_iface[iface_name] = entry
            metrics['interfaces'] = per_iface
            # Remember this snapshot as the baseline for the next run.
            self._prev_io = current
        except Exception as e:
            logger.warning(f"Could not collect network I/O counters: {e}")

    def _add_connections(self, metrics: Dict[str, Any]) -> None:
        """Store the number of inet connections per status under metrics['connections']."""
        try:
            per_state = {}
            for conn in psutil.net_connections(kind='inet'):
                per_state.setdefault(conn.status, 0)
                per_state[conn.status] += 1
            metrics['connections'] = per_state
        except (PermissionError, psutil.AccessDenied):
            logger.debug("Permission denied for net_connections (requires root/admin)")
        except Exception as e:
            logger.warning(f"Could not collect connection statistics: {e}")

    def _add_addresses(self, metrics: Dict[str, Any]) -> None:
        """Store the addresses of each selected interface under metrics['addresses']."""
        try:
            per_iface = {}
            for iface_name, addrs in psutil.net_if_addrs().items():
                if not self._is_selected(iface_name):
                    continue
                described = []
                for addr in addrs:
                    item = {'family': str(addr.family), 'address': addr.address}
                    # netmask / broadcast are reported only when set.
                    if addr.netmask:
                        item['netmask'] = addr.netmask
                    if addr.broadcast:
                        item['broadcast'] = addr.broadcast
                    described.append(item)
                per_iface[iface_name] = described
            metrics['addresses'] = per_iface
        except Exception as e:
            logger.warning(f"Could not collect interface addresses: {e}")

    def _add_interface_stats(self, metrics: Dict[str, Any]) -> None:
        """Store link state, duplex, speed and MTU under metrics['interface_stats']."""
        try:
            per_iface = {}
            for iface_name, stats in psutil.net_if_stats().items():
                if not self._is_selected(iface_name):
                    continue
                duplex = str(stats.duplex) if hasattr(stats, 'duplex') else None
                per_iface[iface_name] = {
                    'isup': stats.isup,
                    'duplex': duplex,
                    'speed': stats.speed,
                    'mtu': stats.mtu,
                }
            metrics['interface_stats'] = per_iface
        except Exception as e:
            logger.warning(f"Could not collect interface stats: {e}")

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Collect network metrics from psutil.

        Every section is gathered independently: a failing section is logged
        and left out, the others are still returned.
        """
        metrics = {}
        self._add_io_counters(metrics)
        if self.include_connections:
            self._add_connections(metrics)
        if self.include_addresses:
            self._add_addresses(metrics)
        self._add_interface_stats(metrics)
        return metrics

    async def cleanup(self):
        """Cleanup (nothing to release; only logs)."""
        logger.info("Network monitor cleanup")


# Plugin class exported under the name `plugin` for automatic discovery
plugin = NetworkMonitorPlugin
|
||||||
@@ -0,0 +1,142 @@
|
|||||||
|
"""OS Information Plugin for Heartbeat.
|
||||||
|
|
||||||
|
Collects static operating system information including OS name, version,
|
||||||
|
kernel, architecture, and distribution details.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import platform
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, Optional
|
||||||
|
|
||||||
|
# Import from parent package
|
||||||
|
from hbd.client.plugin import InfoPlugin
|
||||||
|
|
||||||
|
|
||||||
|
class OSInfoPlugin(InfoPlugin):
    """Collect operating system information.

    Reports values from the stdlib ``platform`` module (system, node, release,
    version, machine, processor, architecture, Python version/implementation),
    the hbd package version, an optional ``owner`` taken from the plugin
    config, and platform-specific extras:

    - Linux: distribution fields parsed from /etc/os-release, falling back to
      /etc/lsb-release
    - macOS: ``macos_version``
    - Windows: ``windows_release`` / ``windows_version`` / ``windows_sp`` /
      ``windows_type``
    """

    name = "os_info"
    version = "1.0.0"
    description = "Operating system and platform information"
    interval = 0  # InfoPlugin: collect once at startup

    # /etc/os-release key -> reported field name
    _OS_RELEASE_FIELDS = {
        "NAME": "distro_name",
        "VERSION": "distro_version",
        "ID": "distro_id",
        "VERSION_ID": "distro_version_id",
        "PRETTY_NAME": "distro_pretty_name",
    }

    # /etc/lsb-release key -> reported field name
    _LSB_RELEASE_FIELDS = {
        "DISTRIB_ID": "distro_id",
        "DISTRIB_RELEASE": "distro_version",
        "DISTRIB_DESCRIPTION": "distro_name",
    }

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        super().__init__(config)

    async def initialize(self) -> bool:
        """Initialize the OS info plugin.

        Returns:
            True (always succeeds - platform module is stdlib)
        """
        self.logger.info(f"Initializing {self.name} plugin")
        return True

    async def _collect_info(self) -> Dict[str, Any]:
        """Collect OS information.

        Returns:
            Dictionary with OS details, or {} if anything raised while
            collecting (the error is logged with its traceback).
        """
        try:
            from hbd import __version__ as hbc_version

            # Queried once and reused for the platform-specific branches below.
            system = platform.system()
            data = {
                "system": system,  # e.g., "Linux", "Darwin", "Windows"
                "node": platform.node(),  # hostname
                "release": platform.release(),  # kernel version
                "version": platform.version(),  # detailed version
                "machine": platform.machine(),  # e.g., "x86_64", "arm64"
                "processor": platform.processor(),  # processor name
                "architecture": platform.architecture()[0],  # e.g., "64bit"
                "python_version": platform.python_version(),
                "python_implementation": platform.python_implementation(),
                "hbc_version": hbc_version,
                "hbc_type": "full",
            }

            owner = self.config.get("owner")
            if owner:
                self.logger.debug(f"Adding owner from config: {owner}")
                data["owner"] = owner

            if system == "Linux":
                data.update(self._get_linux_distro())
            elif system == "Darwin":
                data["macos_version"] = platform.mac_ver()[0]
            elif system == "Windows":
                win_ver = platform.win32_ver()
                data["windows_release"] = win_ver[0]
                data["windows_version"] = win_ver[1]
                data["windows_sp"] = win_ver[2]
                data["windows_type"] = win_ver[3]

            self.logger.debug(f"Collected OS info: {data['system']} {data['release']}")
            return data

        except Exception as e:
            self.logger.error(f"Error collecting OS info: {e}", exc_info=True)
            return {}

    @staticmethod
    def _read_key_values(path) -> Dict[str, str]:
        """Parse a shell-style ``KEY=value`` file into a dict.

        Comment lines (starting with ``#``) and lines without ``=`` are
        skipped; surrounding double and single quotes are removed from values.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        pairs: Dict[str, str] = {}
        # os-release is specified as UTF-8; undecodable bytes are replaced so
        # one bad byte does not discard the whole file.
        with open(path, encoding="utf-8", errors="replace") as f:
            for line in f:
                line = line.strip()
                if "=" not in line or line.startswith("#"):
                    continue
                key, value = line.split("=", 1)
                pairs[key] = value.strip('"').strip("'")
        return pairs

    def _get_linux_distro(self) -> Dict[str, str]:
        """Get Linux distribution information.

        Tries /etc/os-release first (standard on modern Linux) and falls back
        to /etc/lsb-release (older systems) when the first file is missing,
        unreadable, or yields none of the known keys.

        Returns:
            Dictionary with distribution details (possibly empty)
        """
        sources = (
            (Path("/etc/os-release"), self._OS_RELEASE_FIELDS),
            (Path("/etc/lsb-release"), self._LSB_RELEASE_FIELDS),
        )
        for path, fields in sources:
            if not path.exists():
                continue
            try:
                pairs = self._read_key_values(path)
            except OSError as e:
                self.logger.warning(f"Could not read {path}: {e}")
                continue
            distro_info = {fields[k]: v for k, v in pairs.items() if k in fields}
            if distro_info:
                return distro_info
        return {}
|
||||||
@@ -0,0 +1,147 @@
|
|||||||
|
"""Ping Monitor Plugin for Heartbeat.
|
||||||
|
|
||||||
|
Pings one or more hosts and reports round-trip time. Results are sent as
|
||||||
|
plugin metrics so the server-side threshold system can raise WARNING/CRITICAL
|
||||||
|
alerts using the same RTT threshold configuration format used for heartbeat RTT.
|
||||||
|
|
||||||
|
Example configuration in ~/.hbc.yaml (or the plugins section of ~/.hb.yaml):
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
plugins:
|
||||||
|
ping_monitor:
|
||||||
|
interval: 60 # ping every 60 seconds (default)
|
||||||
|
count: 3 # ICMP packets per ping run (default 3)
|
||||||
|
timeout: 5 # seconds before a host is considered unreachable (default 5)
|
||||||
|
hosts:
|
||||||
|
- 8.8.8.8
|
||||||
|
- 192.168.1.1
|
||||||
|
```
|
||||||
|
|
||||||
|
Reported metrics per host (metric key uses the hostname with dots/colons replaced
|
||||||
|
by underscores so it is a valid identifier):
|
||||||
|
|
||||||
|
<host_key>_rtt_avg – average RTT in ms (float, or inf if unreachable)
|
||||||
|
<host_key>_rtt_min – minimum RTT in ms
|
||||||
|
<host_key>_rtt_max – maximum RTT in ms
|
||||||
|
<host_key>_loss – packet loss percentage (0–100)
|
||||||
|
|
||||||
|
Server-side threshold config example:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
threshold_configs:
|
||||||
|
default:
|
||||||
|
thresholds:
|
||||||
|
ping_monitor:
|
||||||
|
8_8_8_8_rtt_avg:
|
||||||
|
warning: 20.0
|
||||||
|
critical: 100.0
|
||||||
|
```
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from typing import Any, Dict, Optional
|
||||||
|
|
||||||
|
from hbd.client.plugin import MonitorPlugin
|
||||||
|
|
||||||
|
|
||||||
|
def _host_key(host: str) -> str:
|
||||||
|
"""Convert a hostname/IP to a safe metric key (replace . and : with _)."""
|
||||||
|
return re.sub(r"[^a-zA-Z0-9_]", "_", host)
|
||||||
|
|
||||||
|
|
||||||
|
class PingMonitorPlugin(MonitorPlugin):
    """Ping one or more configured hosts and report RTT metrics.

    For every host the system ``ping`` command is run and four values are
    reported under keys ``<host_key>_rtt_min``, ``<host_key>_rtt_avg``,
    ``<host_key>_rtt_max`` and ``<host_key>_loss``. RTT values are ``inf``
    when no round-trip summary could be parsed.

    Configuration keys:
        interval: seconds between collections (default 60)
        count: packets sent per run (default 3)
        timeout: per-reply wait in seconds (default 5)
        hosts: a host name, a list of host names, or a mapping keyed by host
            name
    """

    name = "ping_monitor"
    version = "1.0.0"
    description = "ICMP ping latency monitoring"
    interval = 60

    # "0% packet loss" (Linux/busybox), "0.0% packet loss" (macOS/BSD),
    # "(0% loss)" (Windows).
    _LOSS_RE = re.compile(r"(\d+(?:\.\d+)?)\s*%\s*(?:packet\s*)?loss")
    # Linux:   "rtt min/avg/max/mdev = x/x/x/x ms"
    # macOS:   "round-trip min/avg/max/stddev = x/x/x/x ms"
    # busybox: "round-trip min/avg/max = x/x/x ms"
    _RTT_RE = re.compile(
        r"(?:rtt|round-trip)\s+min/avg/max(?:/\S+)?\s*=\s*([\d.]+)/([\d.]+)/([\d.]+)"
    )
    # Windows (English): "Minimum = 14ms, Maximum = 15ms, Average = 14ms"
    _WIN_RTT_RE = re.compile(
        r"Minimum\s*=\s*(\d+)\s*ms,\s*Maximum\s*=\s*(\d+)\s*ms,\s*Average\s*=\s*(\d+)\s*ms"
    )

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        super().__init__(config)
        cfg = config or {}
        self.interval = cfg.get("interval", 60)
        self.count = int(cfg.get("count", 3))
        self.timeout = int(cfg.get("timeout", 5))
        # hosts: dict of {hostname: {warning: x, critical: y}}, list of
        # hostnames, or a single hostname. An empty YAML value arrives as
        # None and is treated as "no hosts".
        raw_hosts = cfg.get("hosts") or {}
        if isinstance(raw_hosts, str):
            raw_hosts = [raw_hosts]
        if isinstance(raw_hosts, (list, tuple)):
            self.hosts = {h: {} for h in raw_hosts}
        else:
            self.hosts = dict(raw_hosts)

    async def initialize(self) -> bool:
        """Return False (plugin disabled) when no hosts are configured."""
        if not self.hosts:
            self.logger.warning("ping_monitor: no hosts configured, plugin disabled")
            return False
        self.logger.info(
            "ping_monitor initialized: %d host(s), interval=%ds, count=%d, timeout=%ds",
            len(self.hosts), self.interval, self.count, self.timeout,
        )
        return True

    @staticmethod
    def _unreachable(loss: float = 100.0) -> Dict[str, float]:
        """Build the result used when no RTT figures are available."""
        inf = float("inf")
        return {"rtt_min": inf, "rtt_avg": inf, "rtt_max": inf, "loss": loss}

    def _ping_command(self, host: str) -> list:
        """Build the platform-specific ping argument list for *host*."""
        count = str(self.count)
        if sys.platform == "win32":
            # Windows: -n count, -w per-reply timeout in milliseconds.
            return ["ping", "-n", count, "-w", str(self.timeout * 1000), host]
        if sys.platform == "darwin" or sys.platform.startswith("freebsd"):
            # macOS/FreeBSD ping: -W is a per-reply wait in MILLISECONDS.
            return ["ping", "-c", count, "-W", str(self.timeout * 1000), host]
        # Linux (iputils/busybox): -W is a per-reply wait in seconds.
        return ["ping", "-c", count, "-W", str(self.timeout), host]

    @classmethod
    def _parse_output(cls, output: str) -> Dict[str, float]:
        """Extract loss and min/avg/max RTT from ping's textual summary.

        Loss defaults to 100.0 when no loss line is found; RTT values are
        ``inf`` when no round-trip summary is found.
        """
        loss = 100.0
        loss_match = cls._LOSS_RE.search(output)
        if loss_match:
            loss = float(loss_match.group(1))

        rtt_match = cls._RTT_RE.search(output)
        if rtt_match:
            return {
                "rtt_min": float(rtt_match.group(1)),
                "rtt_avg": float(rtt_match.group(2)),
                "rtt_max": float(rtt_match.group(3)),
                "loss": loss,
            }

        win_match = cls._WIN_RTT_RE.search(output)
        if win_match:
            # Windows prints min, max, avg in that order.
            return {
                "rtt_min": float(win_match.group(1)),
                "rtt_avg": float(win_match.group(3)),
                "rtt_max": float(win_match.group(2)),
                "loss": loss,
            }

        # Host unreachable or all packets lost
        return cls._unreachable(loss)

    async def _ping(self, host: str) -> Dict[str, float]:
        """Run a system ping command and return rtt_min/avg/max/loss."""
        proc = None
        try:
            proc = await asyncio.create_subprocess_exec(
                *self._ping_command(host),
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
            stdout, _ = await asyncio.wait_for(
                proc.communicate(),
                timeout=self.timeout * self.count + 2,
            )
        except (asyncio.TimeoutError, FileNotFoundError, OSError) as e:
            # wait_for() only cancels communicate(); the child keeps running
            # unless it is killed and reaped here.
            if proc is not None and proc.returncode is None:
                try:
                    proc.kill()
                    await proc.wait()
                except ProcessLookupError:
                    pass  # already exited between the check and kill()
            self.logger.warning("ping_monitor: ping failed for %s: %s", host, e)
            return self._unreachable()

        return self._parse_output(stdout.decode(errors="replace"))

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Ping all hosts concurrently and flatten the results into one dict."""
        data: Dict[str, Any] = {}
        # Start every ping first so they run in parallel, then await in order.
        tasks = {host: asyncio.create_task(self._ping(host)) for host in self.hosts}
        for host, task in tasks.items():
            try:
                result = await task
            except Exception as e:
                self.logger.error("ping_monitor: error pinging %s: %s", host, e)
                result = self._unreachable()
            key = _host_key(host)
            for metric, value in result.items():
                data[f"{key}_{metric}"] = value
            status = "unreachable" if result["loss"] == 100.0 else f"{result['rtt_avg']:.1f}ms"
            self.logger.debug("ping_monitor: %s -> %s", host, status)
        return data
|
||||||
@@ -0,0 +1,140 @@
|
|||||||
|
"""
|
||||||
|
ZFS pool monitoring plugin for Heartbeat.
|
||||||
|
|
||||||
|
Collects per-pool health, capacity, and cumulative I/O statistics via zpool(8).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import shutil
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
from hbd.client.plugin import MonitorPlugin
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _int(s: str) -> Optional[int]:
|
||||||
|
try:
|
||||||
|
return int(s.strip().rstrip("KMGTkBkmgt%x"))
|
||||||
|
except (ValueError, AttributeError):
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _float(s: str) -> Optional[float]:
|
||||||
|
try:
|
||||||
|
return float(s.strip().rstrip("%x"))
|
||||||
|
except (ValueError, AttributeError):
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
class ZFSMonitorPlugin(MonitorPlugin):
    """Monitor ZFS pool health, capacity, and I/O statistics.

    Collects per pool (from ``zpool list`` and ``zpool iostat``):
    - health: the health string reported by zpool (ONLINE, DEGRADED, ...)
    - status: numeric code derived from health
      (0 = ONLINE, 1 = degraded, 2 = faulted/offline/unavailable, 3 = other)
    - size / alloc / free
    - capacity: percentage used
    - frag: fragmentation percentage
    - dedup: deduplication ratio
    - read_ops / write_ops / read_bw / write_bw: I/O columns of
      ``zpool iostat``

    Configuration:
        interval: collection interval in seconds (default: 300)
        pools: list of pool names to monitor (default: all)
    """

    name = "zfs_monitor"
    description = "ZFS pool health, capacity, and I/O statistics"
    interval = 300

    # Seconds a single zpool invocation may take before it is killed.
    _COMMAND_TIMEOUT = 15

    # zpool health string -> numeric status; anything else maps to 3 (unknown).
    _HEALTH_STATUS = {
        "ONLINE": 0,
        "DEGRADED": 1,
        "ONLINE with errors": 1,
        "FAULTED": 2,
        "OFFLINE": 2,
        "UNAVAIL": 2,
    }

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        super().__init__(config)
        self.interval = self.config.get("interval", 300)
        self._pools_filter: Optional[List[str]] = self.config.get("pools", None)

    async def initialize(self) -> bool:
        """Return False and record a skip reason when zpool is not on PATH."""
        if not shutil.which("zpool"):
            self.skip_reason = "zpool not found"
            return False
        logger.info("ZFS monitor initialized (interval: %ds)", self.interval)
        return True

    async def _run(self, *args: str) -> List[str]:
        """Run a command and return its stdout lines, or [] on error.

        Errors are a failure to start the process (any OSError, e.g. missing
        binary or permission denied) and exceeding the command timeout; both
        are logged as warnings.
        """
        proc = None
        try:
            proc = await asyncio.create_subprocess_exec(
                *args,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.DEVNULL,
            )
            stdout, _ = await asyncio.wait_for(
                proc.communicate(), timeout=self._COMMAND_TIMEOUT
            )
        except (OSError, asyncio.TimeoutError) as exc:
            # wait_for() only cancels communicate(); kill and reap the child
            # so a hung zpool does not linger.
            if proc is not None and proc.returncode is None:
                try:
                    proc.kill()
                    await proc.wait()
                except ProcessLookupError:
                    pass  # exited between the check and kill()
            # TimeoutError has an empty message; fall back to its type name.
            reason = str(exc) or type(exc).__name__
            logger.warning("zfs_monitor: %s: %s", args[0], reason)
            return []
        return stdout.decode(errors="replace").splitlines()

    async def _zpool_list(self) -> Dict[str, Dict]:
        """Return per-pool health and capacity from `zpool list`."""
        lines = await self._run(
            "zpool", "list", "-H", "-p",
            "-o", "name,health,size,alloc,free,cap,frag,dedup",
        )
        pools: Dict[str, Dict] = {}
        for line in lines:
            parts = line.split("\t")
            if len(parts) < 8:
                continue  # not a full 8-column data row
            name = parts[0].strip()
            if self._pools_filter and name not in self._pools_filter:
                continue
            health = parts[1].strip()
            pools[name] = {
                "health": health,
                "status": self._HEALTH_STATUS.get(health, 3),  # 3 = unknown status
                "size": _int(parts[2]),
                "alloc": _int(parts[3]),
                "free": _int(parts[4]),
                "capacity": _float(parts[5]),
                "frag": _float(parts[6]),
                "dedup": _float(parts[7]),
            }
        return pools

    async def _zpool_iostat(self) -> Dict[str, Dict]:
        """Return per-pool I/O counters from `zpool iostat`.

        Columns 3-6 of each row are read as read ops, write ops, read
        bandwidth and write bandwidth.
        """
        lines = await self._run("zpool", "iostat", "-H", "-p")
        io: Dict[str, Dict] = {}
        for line in lines:
            parts = line.split("\t")
            if len(parts) < 7:
                continue
            name = parts[0].strip()
            if not name:
                continue
            io[name] = {
                "read_ops": _int(parts[3]),
                "write_ops": _int(parts[4]),
                "read_bw": _int(parts[5]),
                "write_bw": _int(parts[6]),
            }
        return io

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Run both zpool queries concurrently and merge I/O into each pool."""
        pools, io = await asyncio.gather(self._zpool_list(), self._zpool_iostat())
        # Only pools that passed the filter in _zpool_list() receive I/O data.
        for name, stats in io.items():
            if name in pools:
                pools[name].update(stats)
        return {"pools": pools}


plugin = ZFSMonitorPlugin
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
"""Common utilities shared between hbc and hbd."""
|
||||||
|
|
||||||
|
from hbd import __version__
|
||||||
@@ -0,0 +1,162 @@
|
|||||||
|
"""Message encoding/decoding utilities for hbd protocol.
|
||||||
|
|
||||||
|
Message Types:
|
||||||
|
HTB: Heartbeat message (client -> server)
|
||||||
|
ACK: Acknowledgment (server -> client)
|
||||||
|
CMD: Command message (server -> client)
|
||||||
|
UPD: Update message (server -> client)
|
||||||
|
PLG: Plugin data message (client -> server)
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import Dict, Any, Union
|
||||||
|
import json
|
||||||
|
import zlib
|
||||||
|
|
||||||
|
|
||||||
|
def encode_value(v: Any) -> str:
    """Render a Python value as its protocol string.

    Args:
        v: Value to encode (int, float, str, bool, list, dict, etc.)

    Returns:
        - float: fixed-point text with five decimals
        - list / dict: "@" followed by the JSON encoding
        - bool: "1" or "0"
        - anything else: ``str(v)``
    """
    if isinstance(v, bool):
        # True->1, False->0 (plain str() would give "True"/"False")
        return str(int(v))
    if isinstance(v, float):
        return format(v, "0.5f")
    if isinstance(v, (list, dict)):
        # Complex types travel as JSON, marked with a leading @
        encoded = json.dumps(v)
        return "@" + encoded
    return str(v)
|
||||||
|
|
||||||
|
|
||||||
|
def decode_value(val: str) -> Any:
    """Decode a value from protocol format.

    Args:
        val: String value from protocol

    Returns:
        - empty input: returned unchanged
        - "@..." : the JSON-decoded object, or the text after "@" when it is
          not valid JSON
        - "inf", "-inf", "nan": the corresponding float (this is what
          encode_value() emits for non-finite floats)
        - text starting with a digit (or "-" and a digit): int if it parses
          as one, else float, else the original string
        - anything else: the original string
    """
    if not val:
        return val

    # Check for JSON-encoded complex types
    if val.startswith("@"):
        try:
            return json.loads(val[1:])
        except (ValueError, RecursionError):
            return val[1:]  # Return as string without @

    # Non-finite floats do not start with a digit, so the numeric branch
    # below would otherwise hand them back as strings.
    if val in ("inf", "-inf", "nan"):
        return float(val)

    # Try numeric conversion (avoid eval to prevent SyntaxWarnings on version strings)
    if val[0].isdigit() or (val[0] == '-' and len(val) > 1 and val[1].isdigit()):
        # int()/float() accept "1_000" as 1000; the encoder never writes
        # underscores, so such text is a string and must stay one.
        if "_" in val:
            return val
        for convert in (int, float):
            try:
                return convert(val)
            except ValueError:
                continue
        return val

    return val
|
||||||
|
|
||||||
|
|
||||||
|
def dicttos(ID: str, d: Dict[str, Any], compress: bool = True) -> bytes:
    """Serialize a dict to protocol message bytes.

    The payload is ``key=value;key=value;...`` with each value rendered by
    encode_value().

    Args:
        ID: Message ID placed in the header (e.g. "HTB", "PLG").
        d: Key/value pairs to send.
        compress: If True (the default) the payload is zlib-compressed and the
            message is prefixed with ``!ID:``. Otherwise the message is the
            plain ``ID:key=value;...`` form.

    Returns:
        The encoded message.
    """
    pk = ";".join(f"{k}={encode_value(v)}" for k, v in d.items())
    if not compress:
        return f"{ID}:{pk}".encode()
    # Leading "!" marks the payload as compressed for stodict().
    hdr = ("!" + ID + ":").encode()
    return hdr + zlib.compress(pk.encode(), 6)
|
||||||
|
|
||||||
|
|
||||||
|
def stodict(msg: bytes) -> Dict[str, Any]:
    """Deserialize a protocol message into a dict.

    Two wire formats are accepted:
    - compressed: ``b"!" + ID + b":" + zlib-compressed payload``
    - plain: ``ID + b":" + payload``

    The payload is ``key=value;key=value;...``; values are passed through
    decode_value() and a key without "=" maps to None.

    Returns:
        A dict with key 'ID' set to the message ID plus the parsed key/value
        pairs, or {} when the message is malformed (no ":" separator,
        undecodable text, or a payload that does not decompress).
    """
    d: Dict[str, Any] = {}
    try:
        if msg[:1] == b"!":
            # The header ends at the first colon, so IDs of any length work.
            sep = msg.find(b":", 1)
            if sep < 0:
                return {}
            pk = zlib.decompress(msg[sep + 1:]).decode()
            d["ID"] = msg[1:sep].decode()
        else:
            head, colon, body = msg.partition(b":")
            if not colon:
                return {}
            pk = body.decode()
            d["ID"] = head.decode()
    except Exception:
        # malformed header, compressed payload or text encoding
        return {}

    if not pk:
        return d

    for item in pk.split(";"):
        if not item:
            continue  # tolerate empty segments such as a trailing ";"
        key, eq, raw = item.partition("=")
        key = key.strip()
        d[key] = decode_value(raw.strip()) if eq else None
    return d
|
||||||
|
|
||||||
|
|
||||||
|
def oldmtodict(msg: bytes):
    """Decode an old-style message that carries no ID prefix.

    The payload is given an ``HTB:`` header and then parsed by stodict().
    """
    with_header = b"HTB:" + msg
    return stodict(with_header)
|
||||||
|
|
||||||
|
|
||||||
|
def encode_plugin_data(plugin_name: str, data: Dict[str, Any]) -> bytes:
    """Encode plugin data into a PLG message.

    Args:
        plugin_name: Name of the plugin (e.g., "os_info", "cpu_monitor")
        data: Plugin data dictionary

    Returns:
        Encoded message bytes as produced by dicttos() with ID "PLG"
    """
    # "plugin" goes first; a "plugin" key inside data would override it.
    payload = {"plugin": plugin_name}
    payload.update(data)
    return dicttos("PLG", payload)
|
||||||
|
|
||||||
|
|
||||||
|
def decode_plugin_data(msg: bytes) -> Dict[str, Any]:
    """Decode a PLG message into plugin data.

    Args:
        msg: Raw message bytes

    Returns:
        Whatever stodict() parses from the message: the 'ID' entry plus the
        key/value pairs it carried (including 'plugin' when present).
    """
    decoded = stodict(msg)
    return decoded
|
||||||
|
|
||||||
@@ -1,5 +1,4 @@
|
|||||||
"""Utility helpers extracted from the original script."""
|
"""Utility helpers extracted from the original script."""
|
||||||
import time
|
|
||||||
|
|
||||||
|
|
||||||
def shortname(name: str) -> str:
|
def shortname(name: str) -> str:
|
||||||
@@ -1,56 +0,0 @@
|
|||||||
"""Configuration loader and defaults for hbd."""
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
|
|
||||||
try:
|
|
||||||
import yaml
|
|
||||||
except Exception:
|
|
||||||
yaml = None
|
|
||||||
|
|
||||||
# Built-in settings. load_config() starts from a copy of this mapping, so
# these values stay in effect for every key the YAML file does not set, and
# only keys listed here are accepted from the file.
DEFAULTS = {
    "hb_port": 50003,
    "hbd_port": 50004,
    "hbd_host": "",
    "pickfile": "/tmp/hb.pick",
    "logfile": "/var/log/heartbeat.log",
    "logfmt": "text",
    "pushsrv": "pushover",
    "pushover_token": "",
    "pushover_user": "",
    "interval": 20,
    "grace": 2,
    "dyndomains": ["wrede.org"],
    "watchhosts": [],
    "dyndnshosts": [],
    "drophosts": [],
    "nsupdate_bin": "/usr/bin/nsupdate",
    "foreground": False,
    "verbose": False,
    "debug": 0,
}
|
|
||||||
|
|
||||||
|
|
||||||
def load_config(path=None):
    """Load configuration from a YAML file and merge it with the defaults.

    Args:
        path: Path of the YAML file. When falsy, ``~/.hb.yaml`` is used.

    Returns:
        A shallow copy of ``DEFAULTS`` in which every known key found in the
        file replaces its default. The plain defaults are returned when the
        file does not exist, when the ``yaml`` module is not available, or
        when the file does not contain a top-level mapping (for example an
        empty file).
    """
    cfg = DEFAULTS.copy()
    if not path:
        # default path (~/.hb.yaml)
        path = os.path.join(os.path.expanduser("~"), ".hb.yaml")

    if not os.path.exists(path):
        return cfg
    if not yaml:
        # yaml not installed: do not attempt to parse; user must ensure defaults
        return cfg

    with open(path) as fh:
        data = yaml.safe_load(fh)

    # safe_load() yields None for an empty document and may yield a list or
    # scalar; neither has .items(), so fall back to the defaults instead of
    # raising AttributeError.
    if data is None:
        return cfg
    if not isinstance(data, dict):
        logging.warning("ignoring %s: top-level YAML value is not a mapping", path)
        return cfg

    # only keep known keys
    for k, v in data.items():
        if k in cfg:
            cfg[k] = v
        else:
            logging.warning("unknown config key %s in %s", k, path)
    return cfg
|
|
||||||
@@ -0,0 +1,196 @@
|
|||||||
|
# Example Heartbeat Client Configuration
|
||||||
|
# This file demonstrates all available configuration options for the heartbeat client (hbc)
|
||||||
|
# and its plugin system.
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# Server Configuration
|
||||||
|
# ==============================================================================
|
||||||
|
server: hbd.example.com # Heartbeat server hostname or IP
|
||||||
|
port: 50003 # Server UDP port (default: 50003)
|
||||||
|
interval: 30 # Heartbeat interval in seconds (default: 30)
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# Plugin Configuration
|
||||||
|
# ==============================================================================
|
||||||
|
# Plugins are configured under the "plugins" section. Each plugin can be enabled/disabled
|
||||||
|
# and configured with plugin-specific settings.
|
||||||
|
|
||||||
|
plugins:
|
||||||
|
# --------------------------------------------------------------------------
|
||||||
|
# OS Information Plugin (InfoPlugin - runs once at startup)
|
||||||
|
# --------------------------------------------------------------------------
|
||||||
|
os_info:
|
||||||
|
enabled: true
|
||||||
|
# No additional configuration needed
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------
|
||||||
|
# CPU Monitor Plugin (MonitorPlugin - periodic collection)
|
||||||
|
# --------------------------------------------------------------------------
|
||||||
|
cpu_monitor:
|
||||||
|
enabled: true
|
||||||
|
interval: 300 # Collection interval in seconds (default: 300 = 5 minutes)
|
||||||
|
per_core: false # Collect per-core CPU statistics (default: false)
|
||||||
|
# When per_core is true, will report CPU usage for each core separately
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------
|
||||||
|
# Memory Monitor Plugin (MonitorPlugin)
|
||||||
|
# --------------------------------------------------------------------------
|
||||||
|
memory_monitor:
|
||||||
|
enabled: true
|
||||||
|
interval: 300 # Collection interval in seconds (default: 300 = 5 minutes)
|
||||||
|
include_swap: true # Include swap memory statistics (default: true)
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------
|
||||||
|
# Disk Monitor Plugin (MonitorPlugin)
|
||||||
|
# --------------------------------------------------------------------------
|
||||||
|
disk_monitor:
|
||||||
|
enabled: true
|
||||||
|
interval: 300 # Collection interval in seconds (default: 300 = 5 minutes)
|
||||||
|
include_io: true # Include I/O statistics (default: true)
|
||||||
|
# Optional: Monitor only specific partitions
|
||||||
|
# partitions:
|
||||||
|
# - /
|
||||||
|
# - /home
|
||||||
|
# - /var
|
||||||
|
# Optional: Exclude specific filesystem types
|
||||||
|
exclude_types:
|
||||||
|
- tmpfs
|
||||||
|
- devtmpfs
|
||||||
|
- squashfs
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------
|
||||||
|
# Network Monitor Plugin (MonitorPlugin)
|
||||||
|
# --------------------------------------------------------------------------
|
||||||
|
network_monitor:
|
||||||
|
enabled: true
|
||||||
|
interval: 300 # Collection interval in seconds (default: 300 = 5 minutes)
|
||||||
|
include_connections: true # Include connection statistics (default: true)
|
||||||
|
include_addresses: false # Include interface addresses (default: false)
|
||||||
|
# Optional: Monitor only specific interfaces
|
||||||
|
# interfaces:
|
||||||
|
# - eth0
|
||||||
|
# - wlan0
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------
|
||||||
|
# Filesystem Info Plugin (InfoPlugin - runs once at startup)
|
||||||
|
# --------------------------------------------------------------------------
|
||||||
|
filesystem_info:
|
||||||
|
enabled: true
|
||||||
|
include_pseudo: false # Include pseudo/virtual filesystems (default: false)
|
||||||
|
# When false (default), only reports physical mounted filesystems (ext4, zfs, xfs, etc.)
|
||||||
|
# When true, also includes pseudo filesystems (proc, sysfs, tmpfs, devtmpfs, etc.)
|
||||||
|
# Optional: Exclude additional specific filesystem types
|
||||||
|
# exclude_types:
|
||||||
|
# - squashfs
|
||||||
|
# - iso9660
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------
|
||||||
|
# Nagios Runner Plugin (MonitorPlugin)
|
||||||
|
# --------------------------------------------------------------------------
|
||||||
|
nagios_runner:
|
||||||
|
enabled: true
|
||||||
|
interval: 300 # Collection interval in seconds (default: 300 = 5 minutes)
|
||||||
|
timeout: 30 # Plugin execution timeout in seconds (default: 30)
|
||||||
|
|
||||||
|
# List of Nagios plugins to execute
|
||||||
|
# Each command is executed as-is, so provide full paths and arguments
|
||||||
|
commands:
|
||||||
|
# System load monitoring
|
||||||
|
- /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
|
||||||
|
|
||||||
|
# Disk space monitoring
|
||||||
|
- /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
|
||||||
|
- /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /home
|
||||||
|
|
||||||
|
# Process monitoring
|
||||||
|
- /usr/lib/nagios/plugins/check_procs -w 250 -c 400 -s RSZDT
|
||||||
|
|
||||||
|
# Swap usage
|
||||||
|
- /usr/lib/nagios/plugins/check_swap -w 20% -c 10%
|
||||||
|
|
||||||
|
# Custom script example
|
||||||
|
# - /usr/local/bin/check_my_app.sh
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# Advanced Options
|
||||||
|
# ==============================================================================
|
||||||
|
# These options control client behavior
|
||||||
|
|
||||||
|
# Compression: Enable zlib compression for heartbeat messages (default: true)
|
||||||
|
compress: true
|
||||||
|
|
||||||
|
# Hostname: Override the system hostname (default: auto-detect)
|
||||||
|
# hostname: myhost.example.com
|
||||||
|
|
||||||
|
# Message: Custom message included in heartbeat (optional)
|
||||||
|
# message: "Production web server"
|
||||||
|
|
||||||
|
# Logging
|
||||||
|
log_level: INFO # Log level: DEBUG, INFO, WARNING, ERROR (default: INFO)
|
||||||
|
# logfile: /var/log/hbc.log # Optional log file path
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# Example Profiles
|
||||||
|
# ==============================================================================
|
||||||
|
# Below are example configuration profiles for different use cases
|
||||||
|
|
||||||
|
# Minimal Configuration (default settings):
|
||||||
|
# -----------------------------------------
|
||||||
|
# server: hbd.example.com
|
||||||
|
# interval: 30
|
||||||
|
|
||||||
|
# Monitoring Server (comprehensive metrics):
|
||||||
|
# ------------------------------------------
|
||||||
|
# server: monitoring.example.com
|
||||||
|
# interval: 30
|
||||||
|
# plugins:
|
||||||
|
# cpu_monitor:
|
||||||
|
# enabled: true
|
||||||
|
# interval: 15
|
||||||
|
# per_core: true
|
||||||
|
# memory_monitor:
|
||||||
|
# enabled: true
|
||||||
|
# interval: 15
|
||||||
|
# disk_monitor:
|
||||||
|
# enabled: true
|
||||||
|
# interval: 60
|
||||||
|
# network_monitor:
|
||||||
|
# enabled: true
|
||||||
|
# interval: 30
|
||||||
|
# include_connections: true
|
||||||
|
|
||||||
|
# Nagios Integration (leverage existing plugins):
|
||||||
|
# -----------------------------------------------
|
||||||
|
# server: hbd.example.com
|
||||||
|
# plugins:
|
||||||
|
# nagios_runner:
|
||||||
|
# enabled: true
|
||||||
|
# interval: 300 # Check every 5 minutes
|
||||||
|
# commands:
|
||||||
|
# - /usr/lib/nagios/plugins/check_http -H localhost -p 80
|
||||||
|
# - /usr/lib/nagios/plugins/check_mysql -H localhost -u monitor -p password
|
||||||
|
# - /usr/lib/nagios/plugins/check_smtp -H mail.example.com
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# Threshold Configuration (for Heartbeat Daemon)
|
||||||
|
# ==============================================================================
|
||||||
|
# NOTE: Thresholds are configured on the SERVER side (hbd), not the client (hbc).
|
||||||
|
# This is just an example - see config_thresholds_example.yaml for comprehensive examples.
|
||||||
|
#
|
||||||
|
# Basic threshold example:
|
||||||
|
# thresholds:
|
||||||
|
# cpu_monitor:
|
||||||
|
# cpu_percent:
|
||||||
|
# warning: 80.0
|
||||||
|
# critical: 90.0
|
||||||
|
# memory_monitor:
|
||||||
|
# percent:
|
||||||
|
# warning: 85.0
|
||||||
|
# critical: 95.0
|
||||||
|
# disk_monitor:
|
||||||
|
# partitions:
|
||||||
|
# /:
|
||||||
|
# percent:
|
||||||
|
# warning: 80.0
|
||||||
|
# critical: 90.0
|
||||||
|
|
||||||
@@ -0,0 +1,296 @@
|
|||||||
|
# ==============================================================================
|
||||||
|
# Heartbeat Daemon Multi-Threshold Configuration Example
|
||||||
|
# ==============================================================================
|
||||||
|
# This file demonstrates the new multi-threshold configuration feature that allows
|
||||||
|
# different threshold settings for different hosts/clients.
|
||||||
|
#
|
||||||
|
# Features:
|
||||||
|
# - Define multiple named threshold configurations
|
||||||
|
# - Map specific hosts to specific threshold configurations
|
||||||
|
# - Set a default configuration for unmapped hosts
|
||||||
|
# - Backward compatible with single threshold configuration
|
||||||
|
# ==============================================================================
|
||||||
|
|
||||||
|
# Global threshold settings
|
||||||
|
threshold_renotify_interval: 3600 # Re-notify every hour for ongoing alerts (seconds)
|
||||||
|
|
||||||
|
# Optional: Set default threshold config (defaults to "default" if not specified)
|
||||||
|
default_threshold_config: "default"
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
# Multiple Named Threshold Configurations
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
# Define multiple threshold configurations with different sensitivity levels
|
||||||
|
threshold_configs:
|
||||||
|
|
||||||
|
# Default configuration - moderate thresholds for most servers
|
||||||
|
default:
|
||||||
|
thresholds:
|
||||||
|
cpu_monitor:
|
||||||
|
cpu_percent:
|
||||||
|
warning: 80.0
|
||||||
|
critical: 90.0
|
||||||
|
operator: ">"
|
||||||
|
load_1min:
|
||||||
|
warning: 4.0
|
||||||
|
critical: 8.0
|
||||||
|
operator: ">"
|
||||||
|
|
||||||
|
memory_monitor:
|
||||||
|
percent:
|
||||||
|
warning: 85.0
|
||||||
|
critical: 95.0
|
||||||
|
operator: ">"
|
||||||
|
|
||||||
|
disk_monitor:
|
||||||
|
partitions:
|
||||||
|
/:
|
||||||
|
percent:
|
||||||
|
warning: 85.0
|
||||||
|
critical: 95.0
|
||||||
|
operator: ">"
|
||||||
|
|
||||||
|
rtt:
|
||||||
|
# RTT thresholds (applies to all hosts)
|
||||||
|
warning: 50.0 # ms
|
||||||
|
critical: 200.0
|
||||||
|
|
||||||
|
# High sensitivity configuration - lower thresholds for critical systems
|
||||||
|
high_sensitivity:
|
||||||
|
thresholds:
|
||||||
|
cpu_monitor:
|
||||||
|
cpu_percent:
|
||||||
|
warning: 60.0 # Alert earlier
|
||||||
|
critical: 75.0
|
||||||
|
operator: ">"
|
||||||
|
hysteresis: 0.15 # More hysteresis to reduce flapping
|
||||||
|
load_1min:
|
||||||
|
warning: 2.0
|
||||||
|
critical: 4.0
|
||||||
|
operator: ">"
|
||||||
|
|
||||||
|
memory_monitor:
|
||||||
|
percent:
|
||||||
|
warning: 75.0 # Alert at lower memory usage
|
||||||
|
critical: 85.0
|
||||||
|
operator: ">"
|
||||||
|
display: "(threshold: {op_symbol} {threshold_value}%, total: {total_gb} GB)"
|
||||||
|
|
||||||
|
disk_monitor:
|
||||||
|
partitions:
|
||||||
|
/:
|
||||||
|
percent:
|
||||||
|
warning: 75.0
|
||||||
|
critical: 85.0
|
||||||
|
operator: ">"
|
||||||
|
/var:
|
||||||
|
percent:
|
||||||
|
warning: 80.0
|
||||||
|
critical: 90.0
|
||||||
|
operator: ">"
|
||||||
|
|
||||||
|
rtt:
|
||||||
|
warning: 30.0
|
||||||
|
critical: 100.0
|
||||||
|
|
||||||
|
# Low sensitivity configuration - higher thresholds for development/test systems
|
||||||
|
low_sensitivity:
|
||||||
|
thresholds:
|
||||||
|
cpu_monitor:
|
||||||
|
cpu_percent:
|
||||||
|
warning: 90.0 # Only alert at very high usage
|
||||||
|
critical: 95.0
|
||||||
|
operator: ">"
|
||||||
|
|
||||||
|
memory_monitor:
|
||||||
|
percent:
|
||||||
|
warning: 90.0
|
||||||
|
critical: 98.0
|
||||||
|
operator: ">"
|
||||||
|
|
||||||
|
disk_monitor:
|
||||||
|
partitions:
|
||||||
|
/:
|
||||||
|
percent:
|
||||||
|
warning: 90.0
|
||||||
|
critical: 95.0
|
||||||
|
operator: ">"
|
||||||
|
|
||||||
|
rtt:
|
||||||
|
warning: 100.0
|
||||||
|
critical: 500.0
|
||||||
|
|
||||||
|
# Production database servers - specialized thresholds
|
||||||
|
database:
|
||||||
|
thresholds:
|
||||||
|
cpu_monitor:
|
||||||
|
cpu_percent:
|
||||||
|
warning: 70.0
|
||||||
|
critical: 85.0
|
||||||
|
operator: ">"
|
||||||
|
|
||||||
|
memory_monitor:
|
||||||
|
percent:
|
||||||
|
warning: 90.0 # Databases can use high memory
|
||||||
|
critical: 97.0
|
||||||
|
operator: ">"
|
||||||
|
display: "(threshold: {op_symbol} {threshold_value}%, total: {total_gb} GB, available: {available_gb} GB)"
|
||||||
|
|
||||||
|
disk_monitor:
|
||||||
|
partitions:
|
||||||
|
/:
|
||||||
|
percent:
|
||||||
|
warning: 80.0
|
||||||
|
critical: 90.0
|
||||||
|
operator: ">"
|
||||||
|
/var/lib/mysql: # Database data partition
|
||||||
|
percent:
|
||||||
|
warning: 75.0 # Alert earlier for DB partition
|
||||||
|
critical: 85.0
|
||||||
|
operator: ">"
|
||||||
|
|
||||||
|
rtt:
|
||||||
|
warning: 20.0 # Stricter latency requirements
|
||||||
|
critical: 50.0
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
# Host to Threshold Configuration Mapping
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
# Hosts are mapped to threshold configurations via "threshold_config" in the "hosts" section below
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
# Notification Channels
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
# Define notification providers centrally with their credentials
|
||||||
|
# Each channel has a type (pushover, email, signal, mattermost) and type-specific config
|
||||||
|
notification_channels:
|
||||||
|
# Signal notifications
|
||||||
|
signal_ops:
|
||||||
|
type: signal
|
||||||
|
cli_path: /usr/local/bin/signal-cli
|
||||||
|
user: "+1234567890"  # quoted: a bare +digits value is read as an integer and loses the "+"
|
||||||
|
recipient: "+1234567890"
|
||||||
|
|
||||||
|
signal_oncall:
|
||||||
|
type: signal
|
||||||
|
cli_path: /usr/local/bin/signal-cli
|
||||||
|
user: "+1234567890"  # quoted: a bare +digits value is read as an integer and loses the "+"
|
||||||
|
recipient: "+0987654321"
|
||||||
|
|
||||||
|
# Email notifications
|
||||||
|
email_ops:
|
||||||
|
type: email
|
||||||
|
recipients: [ops@example.com, alerts@example.com]
|
||||||
|
sender: heartbeat@example.com
|
||||||
|
smtp_server: smtp.example.com
|
||||||
|
smtp_port: 587
|
||||||
|
smtp_user: heartbeat@example.com
|
||||||
|
smtp_password: your-smtp-password
|
||||||
|
|
||||||
|
# Pushover notifications
|
||||||
|
pushover_urgent:
|
||||||
|
type: pushover
|
||||||
|
token: your-pushover-app-token
|
||||||
|
user: your-pushover-user-key
|
||||||
|
|
||||||
|
# Mattermost notifications
|
||||||
|
mattermost_devops:
|
||||||
|
type: mattermost
|
||||||
|
host: mattermost.example.com
|
||||||
|
token: your-webhook-token
|
||||||
|
channel: devops-alerts
|
||||||
|
username: heartbeat-bot
|
||||||
|
icon: https://example.com/heartbeat-icon.png
|
||||||
|
|
||||||
|
# Default notification channels (used if host doesn't specify channels)
|
||||||
|
default_notification_channels: [email_ops]
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
# Host Definitions (New Unified Format)
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
# Define hosts with threshold configs, monitoring, DNS, and notification settings
|
||||||
|
hosts:
|
||||||
|
# Critical production servers - high sensitivity, multiple notification channels
|
||||||
|
prod-web-01:
|
||||||
|
threshold_config: high_sensitivity
|
||||||
|
watch: true
|
||||||
|
notification_channels: [signal_oncall, pushover_urgent, email_ops]
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
prod-web-02:
|
||||||
|
threshold_config: high_sensitivity
|
||||||
|
watch: true
|
||||||
|
notification_channels: [signal_oncall, pushover_urgent, email_ops]
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
prod-api-01:
|
||||||
|
threshold_config: high_sensitivity
|
||||||
|
watch: true
|
||||||
|
notification_channels: [signal_oncall, email_ops]
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
# Database servers - database-specific thresholds
|
||||||
|
prod-db-01:
|
||||||
|
threshold_config: database
|
||||||
|
watch: true
|
||||||
|
notification_channels: [signal_ops, email_ops]
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
prod-db-02:
|
||||||
|
threshold_config: database
|
||||||
|
watch: true
|
||||||
|
notification_channels: [signal_ops, email_ops]
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
prod-db-replica:
|
||||||
|
threshold_config: database
|
||||||
|
watch: true
|
||||||
|
notification_channels: [email_ops] # Replica gets email only
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
# Development servers - low sensitivity, minimal notifications
|
||||||
|
dev-server-01:
|
||||||
|
threshold_config: low_sensitivity
|
||||||
|
watch: false # Don't monitor dev servers closely
|
||||||
|
notification_channels: [email_ops]
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
dev-server-02:
|
||||||
|
threshold_config: low_sensitivity
|
||||||
|
watch: false
|
||||||
|
notification_channels: [email_ops]
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
# Test servers
|
||||||
|
test-server-01:
|
||||||
|
threshold_config: low_sensitivity
|
||||||
|
watch: false
|
||||||
|
dyndns: false
|
||||||
|
# No notification channels - uses default_notification_channels
|
||||||
|
|
||||||
|
# Home server with dynamic DNS
|
||||||
|
home-server:
|
||||||
|
threshold_config: default
|
||||||
|
watch: true
|
||||||
|
notification_channels: [signal_ops]
|
||||||
|
dyndns: true # Update DNS when IP changes
|
||||||
|
|
||||||
|
# Hosts not listed in the hosts section will use:
|
||||||
|
# - default_threshold_config for thresholds (falls back to "default")
|
||||||
|
# - default_notification_channels for notifications
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
# Notes on Configuration Structure
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
#
|
||||||
|
# All per-host configuration is centralized in the hosts section. Each host can specify:
|
||||||
|
# - threshold_config: Name of threshold configuration to use
|
||||||
|
# - watch: Whether to monitor this host actively (send notifications)
|
||||||
|
# - notification_channels: List of channels to use for this host
|
||||||
|
# - dyndns: Whether to update DNS when IP address changes
|
||||||
|
#
|
||||||
|
# Notification channels are defined once at the top level and referenced
|
||||||
|
# by name in host definitions, allowing easy reuse and updates.
|
||||||
|
#
|
||||||
|
# For hosts not explicitly listed, the system will still accept heartbeats
|
||||||
|
# and track their state, but won't apply thresholds or send notifications
|
||||||
|
# unless default settings are configured.
|
||||||
@@ -0,0 +1,111 @@
|
|||||||
|
# Heartbeat Configuration Example with Nagios Plugin Runner
|
||||||
|
|
||||||
|
# This example shows how to configure the Nagios Runner plugin
|
||||||
|
# to execute existing Nagios-compatible monitoring plugins
|
||||||
|
|
||||||
|
# Basic server settings (existing config)
|
||||||
|
hb_port: 50003
|
||||||
|
hbd_port: 50004
|
||||||
|
interval: 20
|
||||||
|
grace: 2
|
||||||
|
|
||||||
|
# Plugin configuration
|
||||||
|
# Each plugin can have its own configuration section
|
||||||
|
|
||||||
|
# CPU Monitor Plugin
|
||||||
|
cpu_monitor:
|
||||||
|
interval: 300 # Collect every 5 minutes (default)
|
||||||
|
per_core: false # Set to true to get per-core CPU usage
|
||||||
|
|
||||||
|
# Nagios Runner Plugin
|
||||||
|
nagios_runner:
|
||||||
|
interval: 300 # Run Nagios plugins every 5 minutes (default)
|
||||||
|
timeout: 30 # Command execution timeout in seconds
|
||||||
|
shell: true # Execute commands via shell
|
||||||
|
|
||||||
|
# List of Nagios plugins to run
|
||||||
|
commands:
|
||||||
|
|
||||||
|
# Example 1: Check disk space
|
||||||
|
- name: check_disk_root
|
||||||
|
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
|
||||||
|
|
||||||
|
# Example 2: Check disk space for /home
|
||||||
|
- name: check_disk_home
|
||||||
|
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /home
|
||||||
|
|
||||||
|
# Example 3: Check system load
|
||||||
|
- name: check_load
|
||||||
|
command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
|
||||||
|
|
||||||
|
# Example 4: Check process count
|
||||||
|
- name: check_procs
|
||||||
|
command: /usr/lib/nagios/plugins/check_procs -w 250 -c 400
|
||||||
|
|
||||||
|
# Example 5: Check SSH service
|
||||||
|
- name: check_ssh
|
||||||
|
command: /usr/lib/nagios/plugins/check_ssh localhost
|
||||||
|
|
||||||
|
# Example 6: Check HTTP service
|
||||||
|
- name: check_http
|
||||||
|
command: /usr/lib/nagios/plugins/check_http -H localhost
|
||||||
|
|
||||||
|
# Example 7: Check swap usage
|
||||||
|
- name: check_swap
|
||||||
|
command: /usr/lib/nagios/plugins/check_swap -w 20% -c 10%
|
||||||
|
|
||||||
|
# Example 8: Custom script (Nagios plugin format)
|
||||||
|
- name: check_custom
|
||||||
|
command: /usr/local/bin/my_custom_check.sh
|
||||||
|
|
||||||
|
# Example 9: Check specific log file
|
||||||
|
- name: check_logs
|
||||||
|
command: /usr/lib/nagios/plugins/check_log -F /var/log/syslog -O /var/tmp/check_log.old -q "ERROR"
|
||||||
|
|
||||||
|
# Notes:
|
||||||
|
#
|
||||||
|
# 1. Nagios Plugin Output Format:
|
||||||
|
# - Single line: STATUS - Message | performance_data
|
||||||
|
# - Performance data format: 'label'=value[UOM];[warn];[crit];[min];[max]
|
||||||
|
#
|
||||||
|
# 2. Exit Codes:
|
||||||
|
# - 0 = OK
|
||||||
|
# - 1 = WARNING
|
||||||
|
# - 2 = CRITICAL
|
||||||
|
# - 3 = UNKNOWN
|
||||||
|
#
|
||||||
|
# 3. Performance Data:
|
||||||
|
# - Automatically parsed and included in heartbeat data
|
||||||
|
# - Metrics are stored as: {plugin_name}_{metric_name}
|
||||||
|
# - Example: check_disk_root_/ will contain the disk usage percentage
|
||||||
|
#
|
||||||
|
# 4. Overall Status:
|
||||||
|
# - The plugin reports the worst status from all commands
|
||||||
|
# - Useful for quick health checks
|
||||||
|
#
|
||||||
|
# 5. Plugin Paths:
|
||||||
|
# Common Nagios plugin directories:
|
||||||
|
# - Debian/Ubuntu: /usr/lib/nagios/plugins/
|
||||||
|
# - RHEL/CentOS: /usr/lib64/nagios/plugins/
|
||||||
|
# - Custom installs: /usr/local/nagios/libexec/
|
||||||
|
#
|
||||||
|
# 6. Installing Nagios Plugins:
|
||||||
|
# Debian/Ubuntu: sudo apt-get install nagios-plugins
|
||||||
|
# RHEL/CentOS: sudo yum install nagios-plugins-all
|
||||||
|
# Arch Linux: sudo pacman -S monitoring-plugins
|
||||||
|
#
|
||||||
|
# 7. Writing Custom Nagios Plugins:
|
||||||
|
# Any script can be a Nagios plugin if it:
|
||||||
|
# - Returns appropriate exit codes (0-3)
|
||||||
|
# - Prints status message to stdout
|
||||||
|
# - Optionally includes performance data after "|"
|
||||||
|
#
|
||||||
|
# Example custom plugin (save as /usr/local/bin/check_example.sh):
|
||||||
|
# #!/bin/bash
|
||||||
|
# if [ "$(who | wc -l)" -gt 50 ]; then
|
||||||
|
# echo "CRITICAL - Too many users | users=52;40;50;0"
|
||||||
|
# exit 2
|
||||||
|
# else
|
||||||
|
# echo "OK - Normal user count | users=25;40;50;0"
|
||||||
|
# exit 0
|
||||||
|
# fi
|
||||||
@@ -0,0 +1,279 @@
|
|||||||
|
# ==============================================================================
|
||||||
|
# Heartbeat Daemon Threshold Configuration Example
|
||||||
|
# ==============================================================================
|
||||||
|
# This file demonstrates threshold configuration for the Heartbeat monitoring system.
|
||||||
|
# Thresholds can be defined for any metric collected by monitoring plugins.
|
||||||
|
#
|
||||||
|
# Threshold levels:
|
||||||
|
# - WARNING: First level of concern, typically for early notification
|
||||||
|
# - CRITICAL: Severe condition requiring immediate attention
|
||||||
|
#
|
||||||
|
# Alert notifications are sent when:
|
||||||
|
# - A metric crosses from OK to WARNING or CRITICAL
|
||||||
|
# - A metric crosses from WARNING to CRITICAL
|
||||||
|
# - A metric recovers (returns to a lower severity level)
|
||||||
|
#
|
||||||
|
# Re-notifications are sent for ongoing alerts based on threshold_renotify_interval.
|
||||||
|
# ==============================================================================
|
||||||
|
|
||||||
|
# Global threshold settings
|
||||||
|
threshold_renotify_interval: 3600 # Re-notify every hour for ongoing alerts (seconds)
|
||||||
|
|
||||||
|
# Threshold definitions per plugin
|
||||||
|
thresholds:
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
# CPU Monitor Thresholds
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
cpu_monitor:
|
||||||
|
# Overall CPU usage percentage (0-100)
|
||||||
|
cpu_percent:
|
||||||
|
warning: 80.0 # Warn when CPU usage exceeds 80%
|
||||||
|
critical: 90.0 # Critical when CPU usage exceeds 90%
|
||||||
|
operator: ">" # Alert when value is GREATER than threshold
|
||||||
|
hysteresis: 0.1 # 10% hysteresis to prevent flapping
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# 1-minute load average
|
||||||
|
load_1min:
|
||||||
|
warning: 4.0 # Warn when 1-min load exceeds 4.0
|
||||||
|
critical: 8.0 # Critical when 1-min load exceeds 8.0
|
||||||
|
operator: ">"
|
||||||
|
hysteresis: 0.15 # 15% hysteresis
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# 5-minute load average
|
||||||
|
load_5min:
|
||||||
|
warning: 3.0
|
||||||
|
critical: 6.0
|
||||||
|
operator: ">"
|
||||||
|
hysteresis: 0.15
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# 15-minute load average
|
||||||
|
load_15min:
|
||||||
|
warning: 2.0
|
||||||
|
critical: 4.0
|
||||||
|
operator: ">"
|
||||||
|
hysteresis: 0.15
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
# Memory Monitor Thresholds
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
memory_monitor:
|
||||||
|
# Memory usage percentage
|
||||||
|
percent:
|
||||||
|
warning: 85.0 # Warn at 85% memory usage
|
||||||
|
critical: 95.0 # Critical at 95% memory usage
|
||||||
|
operator: ">"
|
||||||
|
hysteresis: 0.1
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# Available memory in MB (inverse threshold - alert when LOW)
|
||||||
|
available_mb:
|
||||||
|
warning: 1000 # Warn when less than 1GB available
|
||||||
|
critical: 500 # Critical when less than 500MB available
|
||||||
|
operator: "<" # Alert when value is LESS than threshold
|
||||||
|
hysteresis: 0.1
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# Swap usage percentage
|
||||||
|
swap_percent:
|
||||||
|
warning: 50.0 # Warn at 50% swap usage
|
||||||
|
critical: 80.0 # Critical at 80% swap usage
|
||||||
|
operator: ">"
|
||||||
|
hysteresis: 0.1
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
# Disk Monitor Thresholds
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
disk_monitor:
|
||||||
|
# Partition-specific thresholds
|
||||||
|
# Use the mount point as the key
|
||||||
|
partitions:
|
||||||
|
# Root filesystem
|
||||||
|
/:
|
||||||
|
percent:
|
||||||
|
warning: 80.0 # Warn at 80% disk usage
|
||||||
|
critical: 90.0 # Critical at 90% disk usage
|
||||||
|
operator: ">"
|
||||||
|
hysteresis: 0.05 # 5% hysteresis for disk (more stable)
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
free_gb:
|
||||||
|
warning: 10.0 # Warn when less than 10GB free
|
||||||
|
critical: 5.0 # Critical when less than 5GB free
|
||||||
|
operator: "<"
|
||||||
|
hysteresis: 0.1
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# Home filesystem (if separate partition)
|
||||||
|
/home:
|
||||||
|
percent:
|
||||||
|
warning: 85.0
|
||||||
|
critical: 95.0
|
||||||
|
operator: ">"
|
||||||
|
hysteresis: 0.05
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# Var filesystem (logs, etc.)
|
||||||
|
/var:
|
||||||
|
percent:
|
||||||
|
warning: 80.0
|
||||||
|
critical: 90.0
|
||||||
|
operator: ">"
|
||||||
|
hysteresis: 0.05
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
free_gb:
|
||||||
|
warning: 5.0 # Var needs space for logs
|
||||||
|
critical: 2.0
|
||||||
|
operator: "<"
|
||||||
|
hysteresis: 0.1
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
# ZFS Monitor Thresholds
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
zfs_monitor:
|
||||||
|
# Pool health check — built-in default; shown here for reference/override.
|
||||||
|
# status is 0 (ONLINE), 1 (DEGRADED), or 2 (SUSPENDED, FAULTED, UNAVAIL…).
|
||||||
|
# Use '*' to apply the same rule to every pool, or name a specific pool.
|
||||||
|
pools:
|
||||||
|
'*':
|
||||||
|
status:
|
||||||
|
warning: 1 # Alert WARNING when pool is DEGRADED
|
||||||
|
critical: 2 # Alert CRITICAL when pool is SUSPENDED/FAULTED/UNAVAIL
|
||||||
|
operator: ">="
|
||||||
|
hysteresis: 0.0 # No hysteresis — a degraded pool is always alerting
|
||||||
|
grace: 0 # Fire immediately — don't wait for a second collection
|
||||||
|
display: "ZFS pool {pool_name} is {health}"
|
||||||
|
|
||||||
|
# Per-pool capacity thresholds (optional; add pools you care about)
|
||||||
|
# tank:
|
||||||
|
# capacity:
|
||||||
|
# warning: 75.0 # Warn at 75% used
|
||||||
|
# critical: 90.0 # Critical at 90% used
|
||||||
|
# operator: ">"
|
||||||
|
# hysteresis: 0.05
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
# Network Monitor Thresholds
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
network_monitor:
|
||||||
|
# Total error count across all interfaces
|
||||||
|
errors_total:
|
||||||
|
warning: 100 # Warn at 100 errors
|
||||||
|
critical: 1000 # Critical at 1000 errors
|
||||||
|
operator: ">"
|
||||||
|
hysteresis: 0.2 # 20% hysteresis for counters
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# Total dropped packets (inbound and outbound drops have separate thresholds)
|
||||||
|
dropin_total:
|
||||||
|
warning: 50
|
||||||
|
critical: 200
|
||||||
|
operator: ">"
|
||||||
|
hysteresis: 0.2
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
dropout_total:
|
||||||
|
warning: 50
|
||||||
|
critical: 200
|
||||||
|
operator: ">"
|
||||||
|
hysteresis: 0.2
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# TCP connections in TIME_WAIT state
|
||||||
|
connections_TIME_WAIT:
|
||||||
|
warning: 1000 # Warn at 1000 TIME_WAIT connections
|
||||||
|
critical: 5000 # Critical at 5000 TIME_WAIT connections
|
||||||
|
operator: ">"
|
||||||
|
hysteresis: 0.2
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# Total established connections
|
||||||
|
connections_ESTABLISHED:
|
||||||
|
warning: 500
|
||||||
|
critical: 1000
|
||||||
|
operator: ">"
|
||||||
|
hysteresis: 0.1
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
# Nagios Plugin Thresholds (if using nagios_runner)
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
nagios_runner:
|
||||||
|
# Nagios plugins report exit codes:
|
||||||
|
# 0 = OK, 1 = WARNING, 2 = CRITICAL, 3 = UNKNOWN
|
||||||
|
# We can threshold on the exit_code directly
|
||||||
|
exit_code:
|
||||||
|
warning: 1 # Map Nagios WARNING to our WARNING
|
||||||
|
critical: 2 # Map Nagios CRITICAL to our CRITICAL
|
||||||
|
operator: ">=" # Alert when exit code >= threshold
|
||||||
|
hysteresis: 0.0 # No hysteresis for exit codes
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# Notification Configuration
|
||||||
|
# ==============================================================================
|
||||||
|
# Configure notification methods (email, pushover, etc.)
|
||||||
|
# These are used when threshold violations occur
|
||||||
|
|
||||||
|
# Email notifications
|
||||||
|
toemail:
|
||||||
|
- admin@example.com
|
||||||
|
- oncall@example.com
|
||||||
|
fromemail: heartbeat@example.com
|
||||||
|
smtpserver: smtp.example.com
|
||||||
|
smtpport: 587
|
||||||
|
smtpuser: heartbeat@example.com
|
||||||
|
smtppassword: your-password-here
|
||||||
|
|
||||||
|
# Pushover notifications (optional)
|
||||||
|
# pushover_token: your-pushover-app-token
|
||||||
|
# pushover_user: your-pushover-user-key
|
||||||
|
|
||||||
|
# Mattermost webhook (optional)
|
||||||
|
# mattermost_url: https://mattermost.example.com/hooks/your-webhook-id
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# Watched Hosts
|
||||||
|
# ==============================================================================
|
||||||
|
# Hosts in this list will trigger notifications for:
|
||||||
|
# - Heartbeat timeouts/overdue
|
||||||
|
# - Threshold violations
|
||||||
|
# - Boot messages
|
||||||
|
watchhosts:
|
||||||
|
- webserver01
|
||||||
|
- database01
|
||||||
|
- mailserver
|
||||||
|
- critical-app
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# Additional Server Settings
|
||||||
|
# ==============================================================================
|
||||||
|
hb_port: 50003 # UDP port for heartbeat messages
|
||||||
|
hbd_port: 50004 # HTTP port for web interface
|
||||||
|
grace: 10 # Grace period for overdue detection (seconds)
|
||||||
|
debug: 0 # Debug level (0-3)
|
||||||
|
verbose: false # Verbose output
|
||||||
|
|
||||||
|
# Journal settings (message logging)
|
||||||
|
journal_enabled: true
|
||||||
|
journal_path: /var/log/heartbeat/messages.journal
|
||||||
|
journal_max_size: 104857600 # 100MB before rotation
|
||||||
|
journal_max_backups: 10
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# Example: Production Configuration with Conservative Thresholds
|
||||||
|
# ==============================================================================
|
||||||
|
# For production systems, consider:
|
||||||
|
# - Higher warning thresholds to reduce alert fatigue
|
||||||
|
# - Appropriate hysteresis values (5-15% typical)
|
||||||
|
# - Re-notification intervals matching on-call rotation
|
||||||
|
# - Multiple escalation contacts
|
||||||
|
# - Integration with incident management systems
|
||||||
|
# ==============================================================================
|
||||||
-593
@@ -1,593 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
# $Id: hbc,v 1.9 2012/03/29 02:08:36 andreas Exp $
|
|
||||||
# NEW
|
|
||||||
import argparse
|
|
||||||
import sys
|
|
||||||
import time
|
|
||||||
import socket
|
|
||||||
import os
|
|
||||||
import signal
|
|
||||||
import getopt
|
|
||||||
import string
|
|
||||||
import select
|
|
||||||
import errno
|
|
||||||
import traceback
|
|
||||||
from hashlib import md5
|
|
||||||
import shutil
|
|
||||||
import zlib
|
|
||||||
import subprocess
|
|
||||||
import syslog
|
|
||||||
import codecs
|
|
||||||
|
|
||||||
from .config import load_config
|
|
||||||
|
|
||||||
# --- Protocol defaults -------------------------------------------------------
PORT = 50003  # default daemon port; main() uses it when the config has no "hb_port"
INTERVAL = 10  # default seconds between heartbeats (config key "interval")
REOPENC = 6  # Conn.sendto() closes and reopens its socket every REOPENC sends
PIDFILE = "/tmp/hbc.pid"
VER = 6  # sent as the "ver" field of every message
MAXRECV = 32767  # buffer size passed to recvfrom()

# --- Run state shared by process(), cleanup() and the signal handler ----------
running = True
dorestart = False  # set by process() after a successful self-update
warned1 = False  # True once a socket send error has been logged

# --- Option state, filled in by main() ---------------------------------------
msgonly = False
helpflag = False
verbose = False
fdaemon = False
daemonized = False
optlist = []
msgboot = {}
# expanduser("~") returns $HOME when it is set and otherwise falls back to the
# password database, so importing this module no longer raises KeyError when
# the process is started without HOME in its environment.
home = os.path.expanduser("~")
configfile = "%s/.hbrc" % home
cmdargs = []
iam = socket.gethostname()
|
|
||||||
|
|
||||||
def log(msg):
    """Write *msg* to syslog when in daemon mode, otherwise print it.

    The module-level ``fdaemon`` flag selects the destination; syslog
    entries are emitted at ``LOG_ERR`` priority.
    """
    if not fdaemon:
        print(msg)
        return
    syslog.syslog(syslog.LOG_ERR, msg)
|
|
||||||
|
|
||||||
|
|
||||||
def handler(signum, frame):
    """Signal handler: run cleanup() on SIGTERM, ignore any other signal.

    *frame* is accepted because the signal API passes it; it is not used.
    """
    if signum != signal.SIGTERM:
        return
    cleanup()
|
|
||||||
|
|
||||||
|
|
||||||
class NullDevice:
    """Write-only sink: everything passed to write() is discarded."""

    def write(self, s):
        """Accept *s* and do nothing with it."""
        return None
|
|
||||||
|
|
||||||
|
|
||||||
class Conn:
    """One UDP destination (address, port, family) of a heartbeat daemon.

    Holds the socket together with the send/ACK counters and the most recent
    round-trip samples for that destination.
    """

    def __init__(self, conId, addr, port, af):
        """Remember the destination; the socket is opened lazily by sendto()."""
        self.conId = conId
        self.addr = addr
        self.port = port
        self.af = af

        self.ackcount = 0  # number of ACKs received
        self.lastack = 0  # timestamp associated with the last ACK
        self.send = 0  # number of successful sends
        self.lastsend = 0  # time() the last message was sent
        self.rtts = [0]  # recent round-trip times in ms, newest last
        self.sock = None

    def __str__(self):
        return "Con(%s, %s %s)" % (self.addr, self.port, self.af)

    def open(self):
        """Create the datagram socket and turn on SO_REUSEADDR."""
        self.sock = socket.socket(self.af, socket.SOCK_DGRAM)
        self.sock.setsockopt(
            socket.SOL_SOCKET,
            socket.SO_REUSEADDR,
            self.sock.getsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR) | 1,
        )

    def sendto(self, msg, ID="HTB"):  # default ID is HearTBeat
        """Stamp *msg* with name/id/ver/time and send it compressed.

        *msg* is modified in place.  A send error is logged once per process
        (module flag ``warned1``), the socket is closed, and the counters are
        left untouched.
        """
        global warned1

        # Recycle the socket every REOPENC successful sends (and on the first).
        if self.send % REOPENC == 0:
            self.close()
        if not self.sock:
            self.open()
        msg["name"] = shortname(iam)
        msg["id"] = self.conId
        msg["ver"] = VER
        msg["time"] = time.time()
        m = dicttos(ID, msg)  # always compress
        if verbose:
            log("conn.send('%s', (%s:%s) %s)" % (msg, self.addr, self.port, len(m)))
        try:
            self.sock.sendto(m, (self.addr, self.port))
        except socket.error as e:
            if not warned1:
                log("socket error: %s %s:%s" % (e, self.addr, self.port))
                warned1 = True
            self.close()
            return
        self.send += 1
        self.lastsend = time.time()

    def ack(self, msgDict, now):
        """Record an ACK and append the derived round-trip time (ms).

        When the ACK carries a numeric ``time`` field, that value is used and
        the difference to the last send is doubled (presumably a one-way delay
        turned into a round trip -- confirm against the daemon).  Otherwise
        *now* is used with no doubling.  A missing or non-numeric ``time`` no
        longer raises: previously a string there caused a TypeError in the
        subtraction below.
        """
        try:
            stamp = msgDict["time"]
        except (KeyError, TypeError):
            stamp = None
        if isinstance(stamp, (int, float)) and not isinstance(stamp, bool):
            self.lastack = stamp
            mul = 2
        else:
            self.lastack = now
            mul = 1
        rtt = (self.lastack - self.lastsend) * mul
        if verbose:
            log("ack RTT: %0.1f ms (now %s)" % (rtt * 1000.0, now))
        self.rtts.append(rtt * 1000.0)
        if len(self.rtts) > 10:  # keep only the ten newest samples
            del self.rtts[0]
        self.ackcount += 1

    def close(self):
        """Close the socket if one is open; safe to call repeatedly."""
        if self.sock:
            self.sock.close()
        self.sock = None
|
|
||||||
|
|
||||||
|
|
||||||
def shortname(name):
    """Return the part of *name* before its first dot (all of it if no dot)."""
    return name.partition(".")[0]
|
|
||||||
|
|
||||||
|
|
||||||
def dicttos(ID, d):
    """Serialize dict *d* into the compressed wire format and return bytes.

    Every item becomes ``key=value`` (floats with five decimals), the items
    are joined with ``;``, zlib-compressed at level 6 and prefixed with
    ``!<ID>:``.  Keys and values are not escaped, so a ``;`` inside a value
    ends up in the payload verbatim.
    """
    fields = [
        ("%s=%0.5f" if type(value) is float else "%s=%s") % (key, value)
        for key, value in d.items()
    ]
    payload = zlib.compress(";".join(fields).encode(), 6)
    header = "!" + ID + ":"
    return header.encode() + payload
|
|
||||||
|
|
||||||
|
|
||||||
def stodict(msg):
    """Parse a received datagram into a dict.

    Two formats are accepted:

    * compressed -- ``b"!XXX:"`` followed by a zlib payload (what dicttos()
      produces; the ID is the three characters after ``!``);
    * plain -- ``XXX:key=value;key=value``.

    The result holds the message ID under ``"ID"`` plus one entry per
    ``key=value`` field.  A field without ``=`` maps to ``None``.  Values that
    are Python literals (numbers, quoted strings, bytes literals, ...) are
    converted; anything else stays a stripped string.

    *msg* may be bytes (as returned by ``recvfrom``) or str.

    Raises:
        ValueError: a plain message without a ``:`` separator.
        zlib.error / UnicodeDecodeError: a payload that cannot be decoded.
    """
    import ast  # local import so this block does not depend on the file header

    if isinstance(msg, str):
        msg = msg.encode()

    d = {}
    if msg[:1] == b"!":
        pk = zlib.decompress(msg[5:]).decode()
        d["ID"] = msg[1:4].decode()
    else:
        # Decode before splitting: bytes.split() with a str separator raises
        # TypeError, which made every plain message fail to parse.
        head, sep, pk = msg.decode().partition(":")
        if not sep:
            raise ValueError("no ':' separator in message")
        d["ID"] = head

    for field in pk.split(";"):
        vr = field.split("=", 1)
        k = vr[0].strip()
        if len(vr) == 1:
            d[k] = None
            continue
        v = vr[1].strip()
        # SECURITY: this text comes off the network.  It used to be passed to
        # eval(), which executes arbitrary expressions; literal_eval() only
        # builds literals and never calls or looks up anything.
        try:
            v = ast.literal_eval(v)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            pass  # not a literal: keep the raw string
        d[k] = v
    if verbose:
        print("msg is %s" % d)
    return d
|
|
||||||
|
|
||||||
|
|
||||||
def XXstodict(msg):
    """Older message parser; nothing in the visible part of this file calls it.

    Splits *msg* at the first ``:`` into an ID and a ``;``-separated list of
    ``key=value`` fields and returns them as a dict (ID under ``"ID"``), or
    ``None`` when there is no ``:``.  A leading ``!`` on the ID marks a
    zlib-compressed payload.

    NOTE(review): values starting with a digit are passed to eval(); do not
    feed this function untrusted input.
    """
    parsed = {}
    parts = msg.split(":", 1)
    if len(parts) == 1:
        return None
    head = parts[0]
    if head[0] == "!":  # compressed
        payload = zlib.decompress(msg[len(head) + 1 :])
        parsed["ID"] = head[1:]
    else:
        payload = parts[1]
        parsed["ID"] = head
    for field in payload.split(";"):
        pieces = field.split("=", 1)
        key = pieces[0].strip()
        if len(pieces) == 1:
            parsed[key] = None
            continue
        value = pieces[1].strip()
        try:
            if value[0].isdigit():
                value = eval(value)
        except BaseException:
            # Same reach as the original bare except: keep the raw text.
            pass
        parsed[key] = value
    return parsed
|
|
||||||
|
|
||||||
|
|
||||||
def syslogtrace(note):
    """Report the exception currently being handled, labelled with *note*.

    The formatted traceback goes through log() as one message, then line by
    line to syslog at LOG_ERR, and is also printed when ``verbose`` is set.
    """
    report = "%s hbc died: \n%s" % (note, traceback.format_exc())
    log(report)
    for line in report.split("\n"):
        syslog.syslog(syslog.LOG_ERR, " tb: %s" % line)
    if verbose:
        print(report)
|
|
||||||
|
|
||||||
|
|
||||||
conId = 1  # next connection id; createConnections() increments it per Conn


def createConnections(hosts):
    """Resolve each name in *hosts* and register one Conn per address.

    New connections are stored in the module-level ``conns`` dict under
    consecutive ids taken from ``conId``.  If a name fails to resolve the
    function returns immediately (hosts later in the list are not tried);
    main() retries while ``conns`` is still empty.

    Exits the process with status 1 if getaddrinfo() reports an address
    family other than IPv4 or IPv6.
    """
    global conId
    for host in hosts:
        if verbose:
            log("createConnections for %s" % host)
        try:
            rs = socket.getaddrinfo(host, hb_port, 0, 0, socket.SOL_UDP)
        except socket.gaierror:
            logm = "%s hbc died: \n%s" % ("createConnections", traceback.format_exc())
            if verbose:
                log(logm)
            return None
        for r in rs:
            if verbose:
                log("address %s" % str(r))
            family = r[0]
            # Compare against the platform's own constants instead of the
            # Linux/NetBSD/FreeBSD numbers that were hard-coded here.
            if family == socket.AF_INET6:
                af = socket.AF_INET6
            elif family == socket.AF_INET:
                af = socket.AF_INET
            else:
                # family is an integer; indexing it (r[0][0]) raised TypeError
                # before this message could be printed.
                print("dont know this net type: %s" % family)
                sys.exit(1)

            addr = r[4][0]
            conns[conId] = Conn(conId, addr, hb_port, af)
            if verbose:
                print("cons[%s] = %s" % (conId, str(conns[conId])))
            conId += 1
|
|
||||||
|
|
||||||
|
|
||||||
def doexec(conn, data):
    """Run *data* as a shell command and send the outcome back on *conn*.

    The reply is a ``command`` service message whose text is a status word
    followed by the command's combined stdout/stderr (or the error text).

    NOTE(review): *data* is whatever process() received from the network and
    it is executed with ``shell=True`` -- anyone who can reach this socket
    with a well-formed packet can run commands as this user.
    """
    try:
        output = subprocess.check_output(
            data, stderr=subprocess.STDOUT, shell=True
        ).decode()
    except subprocess.CalledProcessError as exc:
        output = str(exc)
        status = "CalledProcessError"
    except Exception as exc:
        syslogtrace("System")
        output = "N/A"
        status = "cmd failed: %s" % exc
    else:
        status = "OK"
    reply = {"service": "command", "msg": status + " " + output}
    conns[conn].sendto(reply)
|
|
||||||
|
|
||||||
|
|
||||||
def doupdate(conn, msgDict):
    """Handle an UPD message: install the shipped code and report back.

    ``msgDict["code"]`` is base64-decoded and handed to doupdateone() together
    with ``msgDict["csum"]``.  An ``update`` service message carrying either
    ``OK`` or the error text is sent on *conn*.

    Returns ``None`` on success, otherwise the error text.

    NOTE(review): the update is accepted from the network with only an MD5
    checksum, which detects corruption but does not authenticate the sender.
    """
    code = csum = None
    try:
        code = codecs.decode(msgDict["code"], "base64").decode()
        csum = msgDict["csum"]
    except Exception as exc:
        error = "csum/code missing: %s" % exc
    else:
        error = None

    if not error:
        error = doupdateone(code, csum)

    conns[conn].sendto({"service": "update", "msg": error if error else "OK"})
    if not error:
        log("hc updates, fs = %s" % (len(code)))

    return error
|
|
||||||
|
|
||||||
|
|
||||||
def doupdateone(code, csum):
    """Replace the running script (``sys.argv[0]``) with *code*.

    Steps: verify that the MD5 hex digest of *code* equals *csum*, copy the
    current script to ``<script>.sav``, then overwrite the script.

    Returns ``None`` on success or a short error string naming the step that
    failed.
    """
    if md5(code.encode()).hexdigest() != csum:
        return "checksum error"

    fn = sys.argv[0]
    ofn = "%s.sav" % fn
    try:
        shutil.copy2(fn, ofn)
    except Exception as e:
        return "cannot make backup copy: %s" % e

    try:
        # The context manager closes the file even when write() fails.
        with open(fn, "w") as fh:
            fh.write(code)
    except Exception as e:
        return "cannot write new code: %s" % e

    return None
|
|
||||||
|
|
||||||
|
|
||||||
def restart():
    """Re-execute this script in place with the saved command-line arguments.

    On success ``os.execv`` does not return.  If it raises, the failure is
    printed and logged and the function returns normally.
    """
    if verbose:
        print("restart: execv %s %s" % (sys.argv[0], [sys.argv[0]] + cmdargs))
    syslog.syslog(syslog.LOG_ERR, "restart %s" % (sys.argv[0]))
    err = "fallthrough"
    try:
        os.execv(sys.argv[0], [sys.argv[0]] + cmdargs)
    except Exception as e:
        # Copy to another name: Python unbinds the "as" target when the except
        # clause ends, so using `e` below raised UnboundLocalError.
        err = e
    print("should not be here:", str(err))
    log("restart failed: %s" % err)
|
|
||||||
|
|
||||||
|
|
||||||
def process():
    """Main loop: send a heartbeat every ``interval`` seconds, serve replies.

    Between heartbeats the loop waits in select() on every open connection
    socket and dispatches each datagram by its ``ID``: ``ACK`` updates the
    connection's RTT statistics, ``UPD`` installs new code and requests a
    restart, ``CMD`` runs the supplied command.  The loop ends when the
    module-level ``running`` flag becomes False (interrupt, exit, error in
    select, or a successful update).
    """
    global running, dorestart

    nextReport = time.time()

    while running:
        # Inner loop: handle incoming packets until the next heartbeat is due.
        while time.time() < nextReport:
            # Rebuilt on every pass because Conn.sendto() reopens sockets.
            ifiles = {}
            conIds = {}
            for conn in conns:
                if conns[conn].sock:
                    ifiles[conns[conn].sock.fileno()] = conns[conn].sock
                    conIds[conns[conn].sock.fileno()] = conn

            sleep = nextReport - time.time()
            if sleep <= 0:
                break
            try:
                r = select.select(list(ifiles.keys()), [], [], sleep)
                now = (
                    time.time()
                )  # nb: delay from actual packet arrival to select is ca. 105ms!
            except KeyboardInterrupt:
                running = False
                break
            except SystemExit:
                log("daemon exit, running was %s" % running)
                if running:
                    running = False
                break
            except:
                # Anything else raised while waiting: log it unless shutdown
                # is already in progress, then stop.
                if running:
                    syslogtrace("select")
                running = False
                break
            for rfh in r[0]:
                conn = conIds[rfh]
                data, addr = ifiles[rfh].recvfrom(MAXRECV)
                if verbose:
                    print("sock.recvfrom: %s (%s) %s" % (addr, len(data), data[:4]))
                try:
                    msgDict = stodict(data)
                except Exception as e:
                    print(
                        "failed to parse incoming data from %s: %s (%s)"
                        % (addr, data, e)
                    )
                    continue

                if verbose:
                    print(
                        "sock.recvfrom: %s (%s) %s"
                        % (addr, len(data), str(msgDict)[:80])
                    )
                if msgDict == None:
                    print("bad backet from %s (%s) %s" % (addr, len(data), data))
                elif msgDict["ID"] == "ACK":
                    conns[conn].ack(msgDict, now)
                elif msgDict["ID"] == "UPD":
                    # doupdate() returns None on success.
                    if doupdate(conn, msgDict) == None:
                        if verbose:
                            print("process: restart after update")
                        dorestart = True
                        break
                elif msgDict["ID"] == "CMD":
                    doexec(conn, msgDict["cmd"])
                else:
                    # NOTE(review): any other ID causes the raw datagram to be
                    # executed as a shell command by doexec().
                    doexec(conn, data)  # deprecated until no more VER - hbc
            if dorestart:
                running = False
                break
        if not running:
            break
        # Heartbeat: report the ACK count and the latest RTT on every connection.
        for conn in conns:
            msg = {"acks": conns[conn].ackcount, "rtt": conns[conn].rtts[-1]}
            conns[conn].sendto(msg)
            time.sleep(
                0.1
            )  # N.B. Linux (i.e. Rasperry Pi 3 drops the second pkg unless delayed
        # Keep a fixed cadence when on time; otherwise restart it from now.
        if nextReport + interval >= time.time():
            nextReport += interval
        else:
            nextReport = time.time() + interval

    if verbose:
        log("process: done running")
|
|
||||||
|
|
||||||
|
|
||||||
def cleanup():
    """Announce shutdown on every connection and close all sockets.

    Does nothing when ``running`` is already False, so only the first call
    after startup has any effect.
    """
    global running
    if not running:
        return
    if verbose:
        log("cleanup")
    running = False
    for connection in conns.values():
        connection.sendto({"shutdown": 1, "acks": connection.ackcount})
        connection.close()
    time.sleep(1)
    closeall()
|
|
||||||
|
|
||||||
|
|
||||||
def closeall():
    """Close the socket of every connection in ``conns``."""
    if verbose:
        syslog.syslog(syslog.LOG_ERR, "closecall")
    for connection in conns.values():
        connection.close()
|
|
||||||
|
|
||||||
|
|
||||||
def daemonize(
    working_dir="/", stdin="/dev/zero", stdout="/dev/null", stderr="/dev/null"
):
    """Detach from the controlling terminal using the double-fork technique.

    After the first fork the child changes to *working_dir*, starts a new
    session and clears the umask; after the second fork the surviving process
    redirects its standard streams to the files named by *stdin*, *stdout*
    and *stderr*.  Both intermediate parents exit with status 0.

    See Stevens, "Advanced Programming in the UNIX Environment"
    (ISBN 0201563177).
    """
    # first fork: the original process exits
    try:
        if os.fork() > 0:
            os._exit(0)
    except OSError as exc:
        sys.stderr.write("fork #1 failed: %d (%s)\n" % (exc.errno, exc.strerror))
        os._exit(1)

    # decouple from parent environment
    os.chdir(working_dir)
    os.setsid()
    os.umask(0)

    # second fork: the session leader exits
    try:
        if os.fork() > 0:
            os._exit(0)
    except OSError as exc:
        sys.stderr.write("fork #2 failed: %d (%s)\n" % (exc.errno, exc.strerror))
        sys.exit(1)

    # redirect the standard file descriptors
    for stream in (sys.stdout, sys.stderr):
        stream.flush()
    new_in = open(stdin, "r")
    new_out = open(stdout, "a+")
    new_err = open(stderr, "a+")
    for replacement, standard in (
        (new_in, sys.stdin),
        (new_out, sys.stdout),
        (new_err, sys.stderr),
    ):
        os.dup2(replacement.fileno(), standard.fileno())
|
|
||||||
|
|
||||||
#
|
|
||||||
# Main program
|
|
||||||
#
|
|
||||||
def build_parser():
    """Build and return the argparse parser for the ``hbc`` command line."""
    parser = argparse.ArgumentParser(
        prog="hbc",
        description="HeartBeatClient - send a heatbeat message to a HeartBeatDaemon",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    add = parser.add_argument

    # switches
    add("-b", "--boot", action="store_true", help="Send a boot message")
    # options taking a value
    add("-c", "--config", dest="configfile", help="Config file path (YAML)")
    add("-m", "--message", dest="message", help="Send a message")
    add("-n", "--name", dest="name", help="Name to use in heartbeat message")
    # more switches
    add("-f", "--daemon", action="store_true", help="Run in daemon mode")
    add("-v", "--verbose", action="store_true", help="Verbose output")
    add("-x", "--debug", action="count", default=0, help="Increase debug level")
    # one or more daemon hosts
    add("hosts", nargs="+", help="Heartbeat daemon hosts to send to")
    return parser
|
|
||||||
|
|
||||||
def main(argv=None):
    """Entry point of the heartbeat client.

    Parses *argv* (``sys.argv[1:]`` when None), loads the configuration,
    creates the connections, sends the boot/one-shot message and -- unless
    only a message was requested -- runs the heartbeat loop until it is told
    to stop, then cleans up and restarts the script if an update asked for it.
    """
    # `running` is listed so that the assignment before process() updates the
    # module flag read by process() and cleanup() instead of binding a local.
    global msgonly, helpflag, verbose, fdaemon, daemonized, optlist, msgboot, home, configfile, cmdargs, iam, hb_port, conns, interval, hb_hosts, running
    parser = build_parser()
    args = parser.parse_args(argv)

    config = load_config(args.configfile)

    # Apply CLI overrides
    if args.boot:
        msgboot["boot"] = 1
    if args.message:
        msgboot["service"] = "service"
        msgboot["msg"] = args.message
        msgonly = True
    if args.name:
        iam = args.name
    if args.daemon:
        fdaemon = True
    if args.verbose:
        verbose = True
    if args.debug:
        config.setdefault("debug", 0)
        config["debug"] += args.debug

    # Remember the arguments for restart().  argv is None when called from
    # the __main__ guard; `cmdargs += None` raised TypeError there.
    cmdargs += sys.argv[1:] if argv is None else list(argv)
    if verbose:
        print("cmdargs for restart are %s" % cmdargs)

    #
    # set defaults

    hb_hosts = args.hosts
    hb_port = config.get("hb_port", PORT)
    interval = config.get("interval", INTERVAL)

    #
    if verbose:
        print("notice: hb_hosts: %s" % str(hb_hosts))
        print("notice: hb_port: %s" % hb_port)
        print("notice: interval: %s" % interval)
        print("notice: iam: %s" % iam)
        print("notice: msgonly: %s" % msgonly)
        print("notice: msgboot: %s" % msgboot)

    if not msgonly:
        msgboot["interval"] = interval

    conns = {}
    # Retry until at least one daemon address could be resolved.
    while True:
        if verbose:
            log("create connections")
        createConnections(hb_hosts)
        if len(conns) != 0:
            break
        if verbose:
            log("no connections yet, sleep a bit")
        time.sleep(2)

    if verbose:
        log("%s connections created" % (len(conns)))

    if len(msgboot) > 0:
        if verbose:
            print("on boot")
        msgboot["acks"] = 0
        for conn in conns:
            conns[conn].sendto(msgboot)

    if msgonly:
        if verbose:
            print("msgboot done msgonly=%s" % msgonly)
        closeall()
        sys.exit(0)

    #
    syslog.openlog("hbc", syslog.LOG_PID, syslog.LOG_DAEMON)
    if fdaemon:
        print("daemoinizing.")
        daemonize()
        daemonized = True
    syslog.syslog(syslog.LOG_ERR, "starting heartbeat to %s" % ",".join(hb_hosts))

    signal.signal(signal.SIGTERM, handler)
    running = True
    try:
        process()
    except Exception as e:
        syslogtrace("process")
        if verbose:
            print("err: process exit: %s" % e)

    if verbose:
        log("main: cleanup")
    cleanup()
    if dorestart:
        restart()


if __name__ == "__main__":
    main()
|
|
||||||
-380
@@ -1,380 +0,0 @@
|
|||||||
"""
|
|
||||||
host and connection class shared between hbd and
|
|
||||||
the website's heartbeat.py
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
import time
|
|
||||||
import json
|
|
||||||
import copy
|
|
||||||
import queue
|
|
||||||
|
|
||||||
num = 0  # counter incremented by Host.__init__ for every host created with a name

MAXRTTS = 10  # Connection.newaddr keeps at most this many round-trip samples

DEBUG = 2  # log() prints only while this is truthy
|
|
||||||
|
|
||||||
|
|
||||||
def log(host, m):
    """Print debug message *m* for *host* while the module DEBUG flag is truthy."""
    if not DEBUG:
        return
    print("class log: %s %s" % (host, m))
|
|
||||||
|
|
||||||
|
|
||||||
class Connection:
    """State of one address family (IPv4 or IPv6) of a monitored host.

    Keeps the current address, the recent round-trip samples, the time of the
    last heartbeat and the up/down state, and can render all of that as a
    dict of display strings.
    """

    # map of addrs to names
    htab = {}

    # possible values of self.state
    UNKNOWN = "unknown"
    UP = "up"
    DOWN = "down"
    OVERDUE = "overdue"

    def __init__(self, host, cid, addr, afam):
        """Create the connection and, when *host* is given, register *addr*.

        An IPv4-mapped IPv6 prefix (``::ffff:``) is stripped from *addr*.
        """
        self.host = host
        self.cid = cid
        if addr[0:7] == "::ffff:":
            addr = addr[7:]
        self.addr = addr
        self.afam = afam
        self.rtts = [0]
        self.lastbeat = time.time()
        self.statetime = self.lastbeat
        self.deltastatetime = "computed"
        self.state = Connection.UNKNOWN

        if host:
            Connection.htab[addr] = self.host.name
            if self.host.isDynDns():
                log(self.host.name, "dns update %s" % self.addr)
                Host.dnsQ.put((self.host.name, self.addr))

    def registerDns(self):
        """Queue this connection's (host name, address) pair on Host.dnsQ."""
        Host.dnsQ.put((self.host.name, self.addr))

    def clearstate(self):
        """Return a state dict with every display field set to ``""``."""
        return {
            "addr": "",
            "rtt": "",
            "lastbeat": "",
            "state": "",
            "statetime": "",
            "deltastatetime": "",
            "rttstate": "",
        }

    def statedict(self, Null=False):
        """Return the display fields of this connection as a dict.

        With ``Null=True`` the all-empty dict from clearstate() is returned.
        The same happens for a connection that is still UNKNOWN more than ten
        days after its last heartbeat.
        """
        d = self.clearstate()
        now = time.time()
        if Null:
            return d

        d["addr"] = self.addr
        if self.rtts[-1]:
            d["rtt"] = "%0.1f" % self.rtts[-1]
        elif self.state == Connection.UNKNOWN:
            d["rtt"] = ""
        else:
            d["rtt"] = "?"
        d["lastbeat"] = self.lastbeat
        if self.state == Connection.OVERDUE:
            d["state"] = "<b>%s</b>" % self.state
        else:
            d["state"] = self.state
        # "rttstate" shows the RTT while up, nothing while overdue, and the
        # state name otherwise.
        if self.state == Connection.UP:
            d["rttstate"] = d["rtt"]
        elif self.state == Connection.OVERDUE:
            d["rttstate"] = ""
        else:
            d["rttstate"] = d["state"]
        d["statetime"] = time.strftime(
            "%Y-%m-%d %H:%M:%S", time.localtime(self.statetime)
        )
        delta = now - self.statetime

        if self.state == Connection.UNKNOWN:
            d["deltastatetime"] = ""
        elif delta > 86400:
            d["deltastatetime"] = "%0.1f days" % (delta / 86400.0)
        elif delta > 3600:
            # Formatted by hand: the "%k" strftime directive used before is a
            # GNU extension.  "%2d" reproduces its blank-padded hour.
            parts = time.gmtime(delta)
            d["deltastatetime"] = "%2d:%02d hrs" % (parts.tm_hour, parts.tm_min)
        elif delta > 60:
            d["deltastatetime"] = time.strftime("%M:%S mins", time.gmtime(delta))
        else:
            d["deltastatetime"] = "%i secs" % (delta)
        if self.state == Connection.UNKNOWN and now - self.lastbeat > 86400 * 10:
            d = self.clearstate()

        return d

    def headerdict(self, afam):
        """Return the column titles matching the keys of statedict()."""
        return {
            "addr": "%s Addr" % afam,
            "rtt": "Latencey",
            "lastbeat": "Last Contact",
            "state": "State",
            "statetime": "Last State",
            "rttstate": "Reach",
            "deltastatetime": "Last State",
        }

    def jsons(self):
        """Return the instance attributes as a JSON string.

        The ``host`` back-pointer is replaced by the host's name (``null``
        when there is no host); dumping the Host object itself raised
        TypeError because it is not JSON serializable.
        """
        state = dict(self.__dict__)
        state["host"] = getattr(self.host, "name", None)
        return json.dumps(state)

    # set new state, return number of secs in previous state
    def newstate(self, state, now, when=0):
        """Switch to *state* as of ``now - when``.

        Returns the number of seconds spent in the previous state.
        """
        self.state = state
        since = now - when
        elapsed = since - self.statetime
        self.statetime = since
        return elapsed

    def getstate(self):
        """Return the current state string."""
        return self.state

    def newaddr(self, addr, rtt, now):
        """Record a heartbeat received from *addr* with round-trip *rtt*.

        Updates ``lastbeat`` and the bounded RTT history.  If *addr* differs
        from the stored address, the address table is updated (and the change
        queued for DNS when the host is dynamic) and a description of the
        change is returned; otherwise ``None`` is returned.
        """
        self.lastbeat = now
        self.rtts.append(rtt)
        if len(self.rtts) > MAXRTTS:
            del self.rtts[0]

        if self.addr == addr:
            return None

        r = "changed from %s to %s" % (self.addr, addr)
        # The old address may already be gone from the table; that is fine.
        Connection.htab.pop(self.addr, None)
        self.addr = addr
        Connection.htab[addr] = self.host.name
        if self.host.isDynDns():
            Host.dnsQ.put((self.host.name, self.addr))
        return r
|
|
||||||
|
|
||||||
|
|
||||||
#
|
|
||||||
class Host:
|
|
||||||
# Table of Hosts
|
|
||||||
hosts = {}
|
|
||||||
dnsQ = queue.Queue()
|
|
||||||
|
|
||||||
def __init__(self, name):
    """Create a host record; a host with a name is counted and registered.

    A truthy *name* increments the module counter ``num`` and stores the host
    in ``Host.hosts`` under that name.
    """
    global num
    self.name = name
    if name:
        num += 1
        Host.hosts[name] = self
    self.num = num

    # flags
    self.dyn = False
    self.watched = False

    # counters and protocol details
    self.upcount = 0
    self.interval = 0
    self.doesack = -1
    self.cver = 0
    self.hdwcounts = [[0, 0], [0, 0], [0, 0]]

    # containers
    self.cmds = []
    self.connections = {}
|
|
||||||
|
|
||||||
def statedict(self):
    """Return this host's display row as a flat dict.

    Holds ``name`` (suffixed with ``*`` for dynamic hosts, bold for watched
    ones), ``dyn``, ``ver``, ``num`` and, for each of IPv4 and IPv6, the
    connection's state fields under ``"<family>.<field>"``.  A family without
    a connection is filled from ``ubConnection.statedict(True)``.
    """
    label = self.name
    if self.dyn:
        label += "*"
    if self.watched:
        label = "<b>%s</b>" % label
    row = {
        "name": label,
        "dyn": str(self.dyn),
        "ver": str(self.cver),
        "num": self.num,
    }
    for family in ("IPv4", "IPv6"):
        if family in self.connections:
            cells = self.connections[family].statedict()
        else:
            cells = ubConnection.statedict(True)
        for field, value in cells.items():
            row["%s.%s" % (family, field)] = value

    return row
|
|
||||||
|
|
||||||
def headerdict(self):
    """Return the column titles matching the keys of statedict().

    The per-family titles come from ``ubConnection.headerdict(<family>)``.
    """
    headers = {"name": "Name", "dyn": "Dyn", "ver": "Ver", "num": "??"}
    for family in ("IPv4", "IPv6"):
        for field, title in ubConnection.headerdict(family).items():
            headers["%s.%s" % (family, field)] = title
    return headers
|
|
||||||
|
|
||||||
def registerDns(self):
    """Call registerDns() on every connection of this host."""
    for connection in self.connections.values():
        connection.registerDns()
|
|
||||||
|
|
||||||
def stateinfo(self):
    """Return the host's attributes as a dict ready for JSON encoding.

    All attributes are passed through unchanged except ``connections``, which
    becomes a list with one dict per connection.  In those dicts the ``host``
    back-pointer is replaced by the host's name and every other value is
    deep-copied, so the result does not share mutable state with the
    connection.
    """
    ddict = {}
    for attr, value in self.__dict__.items():
        if attr != "connections":
            ddict[attr] = value
            continue
        # Swap the back-pointer for the name before copying: deep-copying the
        # connection's whole __dict__ also copied the Host and everything
        # reachable from it, only for that copy to be discarded.
        ddict[attr] = [
            {
                key: (val.name if key == "host" else copy.deepcopy(val))
                for key, val in conn.__dict__.items()
            }
            for conn in self.connections.values()
        ]
    return ddict
|
|
||||||
|
|
||||||
def jsons(self):
    """Return stateinfo() encoded as a JSON string."""
    state = self.stateinfo()
    return json.dumps(state)
|
|
||||||
|
|
||||||
def setcver(self, cver):
    """Store *cver* in ``self.cver`` (reported as ``ver`` by statedict())."""
    self.cver = cver
|
|
||||||
|
|
||||||
def isDynDns(self):
    """Return the host's ``dyn`` flag."""
    return self.dyn
|
|
||||||
|
|
||||||
def isIPv4(self, addr):
    """Return True when *addr* contains a dot after its first character.

    *addr* is either an address string or a tuple whose first item is one.
    """
    text = addr[0] if isinstance(addr, tuple) else addr
    return text.find(".") > 0
|
|
||||||
|
|
||||||
def conndata(self, cid, addr, rtt, now):
    """Feed one observation into the connection for *addr*'s address family.

    A leading ``::ffff:`` is stripped from *addr* first.  The family is
    ``"IPv4"`` when ``self.isIPv4(addr)`` is true, otherwise ``"IPv6"``.
    A ``Connection`` is created for the family on first use.

    Returns ``(connection, result)`` where *result* is whatever
    ``connection.newaddr(addr, rtt, now)`` returned.
    """
    if addr[0:7] == "::ffff:":
        addr = addr[7:]
    family = "IPv4" if self.isIPv4(addr) else "IPv6"

    try:
        connection = self.connections[family]
    except KeyError:
        connection = Connection(self, cid, addr, family)
        self.connections[family] = connection

    outcome = connection.newaddr(addr, rtt, now)
    return connection, outcome
|
|
||||||
|
|
||||||
# called when reloading class from pickle, add new fields here
|
|
||||||
def fixup(self):
    """Strip a leading ``::ffff:`` from the stored address of each connection."""
    for family in ("IPv4", "IPv6"):
        if family not in self.connections:
            continue
        conn = self.connections[family]
        current = conn.addr
        if current[0:7] == "::ffff:":
            conn.addr = current[7:]
|
|
||||||
|
|
||||||
# def dispstate(self):
|
|
||||||
# if self.state in ["down", "overdue"]:
|
|
||||||
# state = "<b>%s</b>" % self.state
|
|
||||||
# elif self.state in ["up", "UP"]:
|
|
||||||
# state = ""
|
|
||||||
# for x in list(self.connections.keys()):
|
|
||||||
# try:
|
|
||||||
# state += " %5.1f" % (self.connections[x].rtts[-1])
|
|
||||||
# except:
|
|
||||||
# state += " %5s" % (self.connections[x].rtts[-1])
|
|
||||||
# elif self.state in ["unknown", "UNKNOWN"]:
|
|
||||||
# state = ""
|
|
||||||
# else:
|
|
||||||
# state = "%s" % self.state
|
|
||||||
# return state
|
|
||||||
|
|
||||||
def dispstats(self):
    """Return three HTML ``<td>`` cells summarising acknowledgement stats.

    * ``self.doesack == -1``: an ``N/A`` cell followed by two empty cells.
    * ``self.upcount <= 0``: ``(doesack)`` followed by two empty cells.
    * otherwise: one right-aligned cell per entry of ``self.hdwcounts[0:3]``.
      Each entry is an ``(a, u)`` pair; the cell shows
      ``100 - (doesack - a) * 100 / (upcount - u)`` rounded to an integer,
      an empty string when that rounds to ``0``, or ``-`` when the
      denominator is zero.
    """
    if self.doesack == -1:
        # Fixed: the original emitted the malformed tail "</td<td></td>>".
        return '<td align="right">N/A</td><td></td><td></td>'
    if self.upcount <= 0:
        return "<td>(%s)</td><td></td><td></td>" % (self.doesack)

    cells = []
    for idx in range(3):
        a, u = self.hdwcounts[idx]
        span = self.upcount - u
        if span != 0:
            text = "%0.0f" % (100.0 - (((self.doesack - a) * 100.0) / span))
            if text == "0":
                text = ""
        else:
            text = "-"
        cells.append('<td align="right">%s</td>' % text)
    return "".join(cells)
|
|
||||||
|
|
||||||
# Column layouts used by htmltable().  A bare string is a key into the dict
# being rendered; a (key, attribute-text) tuple additionally carries the
# attribute text passed to gene() for that cell.
hostfields_long = (
    ["name"]
    + [
        field
        for family in ("IPv4", "IPv6")
        for field in (
            family + ".addr",
            family + ".state",
            (family + ".rtt", 'style="text-align: right;"'),
            (family + ".statetime", 'style="text-align: right;"'),
        )
    ]
    + ["ver"]
)

hostfields_short = ["name"] + [
    ("%s.%s" % (family, column), 'style="text-align: right;"')
    for family in ("IPv4", "IPv6")
    for column in ("rttstate", "deltastatetime")
]
|
|
||||||
|
|
||||||
def gene(self, tag, v, attrib=None):
    """Return *v* wrapped in an HTML element named *tag*.

    A truthy *attrib* is inserted verbatim after the tag name, separated
    by one space.  Nothing is escaped.
    """
    attr_text = (" %s" % attrib) if attrib else ""
    return "<%s%s>%s</%s>" % (tag, attr_text, v, tag)
|
|
||||||
|
|
||||||
def htmltable(self, tag, hd, short):
    """Render one ``<tr>`` row from the dict *hd*.

    Columns come from ``Host.hostfields_short`` when *short* is truthy,
    otherwise from ``Host.hostfields_long``.  Each cell is a *tag* element
    built by ``gene()``; tuple columns pass their second item as the
    attribute text.  Cells are joined with newlines.
    """
    columns = Host.hostfields_short if short else Host.hostfields_long
    cells = [
        self.gene(tag, hd[col[0]], col[1]) if isinstance(col, tuple) else self.gene(tag, hd[col])
        for col in columns
    ]
    return self.gene("tr", "\n".join(cells))
|
|
||||||
|
|
||||||
def buildhosttable(self, short=False):
    """Return the host table as a list of HTML fragments.

    The list holds the opening ``<table>`` tag, a header row, one row per
    entry of ``Host.hosts`` in sorted key order, and the closing tag.
    Debug output is printed when the module-level ``DEBUG`` exceeds 1.
    """
    if DEBUG > 1:
        print("DBG buildhosttable: start")
    rows = [
        '<table id="ntable" class="sortable">',
        ubHost.htmltable("th", ubHost.headerdict(), short),
    ]
    for hostname in sorted(Host.hosts.keys()):
        rows.append(ubHost.htmltable("td", Host.hosts[hostname].statedict(), short))
    rows.append("</table>")
    if DEBUG > 1:
        print("DBG buildhosttable: %s" % rows)
    return rows
|
|
||||||
|
|
||||||
def buildmsgtable(self, msgs):
    """Return the event log as a list of HTML fragments.

    The first item is a heading; it is followed by the most recent
    messages, each suffixed with ``<BR>``.  The number of messages shown
    is ``max(40 - len(Host.hosts), 3)``.
    """
    limit = max(40 - len(Host.hosts), 3)
    rows = ["<h4>Log of Events</h4>"]
    # Fixed: the original sliced with msgs[len(msgs) - limit:], whose start
    # index goes negative when fewer than `limit` messages exist and then
    # drops the oldest ones (29 messages with limit 30 showed just one).
    # limit is always >= 3, so msgs[-limit:] is "the last `limit`, or all".
    rows.extend("%s<BR>" % m for m in msgs[-limit:])
    return rows
|
|
||||||
|
|
||||||
|
|
||||||
# Placeholder instances ("unbound objects") used to call Host/Connection
# methods that do not depend on a real host or connection, e.g.
# ubHost.headerdict() or ubConnection.statedict(True).
# Original note: remove in Python 3.0.
ubHost = Host(None)
ubConnection = Connection(None, "", "", "")
|
|
||||||
-199
@@ -1,199 +0,0 @@
|
|||||||
"""HTTP server implementation using aiohttp and jinja2."""
|
|
||||||
import asyncio
|
|
||||||
import json
|
|
||||||
import time
|
|
||||||
import urllib.parse
|
|
||||||
import os
|
|
||||||
import logging
|
|
||||||
from aiohttp import web
|
|
||||||
from fastapi.templating import Jinja2Templates
|
|
||||||
import jinja2
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
def _render_template(html_str: str, **context) -> str:
    """Render the Jinja2 template source *html_str* with *context*."""
    return jinja2.Template(html_str).render(**context)
|
|
||||||
|
|
||||||
async def start(
    host: str,
    port: int,
    config,
    hbdclass,
    msgs_getter,
    log=None,
    email=None,
    pushmsg=None,
    msg_to_websockets=None,
    tcss=None,
    DEBUG=0,
    verbose=False,
    get_now=None,
    VER="",
):
    """Start an aiohttp web server and block until cancelled.

    This function is intended to be awaited inside the main asyncio event loop.

    Routes registered (all GET):
      /                 static HTML status page
      /api/0/hosts      JSON list of host state
      /api/0/messages   JSON list of the last 30 messages
      /c?h=&c=          queue a command for a host
      /d?h=             drop a host
      /n?h=             call registerDns() on a host
      /u?h=&c=          queue an update for one host or for "All"
      /r                log a restart request (no restart is performed here)
      /live             Jinja2-rendered live page
      /static/<path>    files from the package ``static`` directory

    *msgs_getter* is called without arguments and must return the message
    list.  *log*, when given, is called as ``log(hostname, text)``.
    *get_now* defaults to ``time.time``.  *email*, *pushmsg*,
    *msg_to_websockets* and *DEBUG* are accepted but not used here.
    """
    get_now = get_now or time.time

    async def index(request):
        res = [
            '<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">',
            "<html>",
            "<head>",
            "<title>Heartbeat</title>",
        ]
        if tcss:
            res.append(tcss)
        res.append("</head>")
        res.append('<body BGCOLOR = "#FFFFFF" LINK = "#008000" VLINK = "#008000">')
        res.append(f"<H2>Heartbeat status {VER}</h2>")
        res += hbdclass.ubHost.buildhosttable()
        res += hbdclass.ubHost.buildmsgtable(msgs_getter())
        res.append(
            "<p> %s (%s)</p>" % (time.strftime("%H:%M:%S", time.localtime(get_now())), config.get("tz", "CET-1CDT"))
        )
        res.append("</body></html>")
        body = "\n".join(res)
        return web.Response(text=body, content_type="text/html")

    async def api_hosts(request):
        lst = [hbdclass.Host.hosts[h].jsons() for h in hbdclass.Host.hosts]
        return web.json_response(json.loads("[" + ",".join(lst) + "]"))

    async def api_messages(request):
        lst = msgs_getter()[-30:]
        return web.json_response(lst)

    async def cmd(request):
        qa = request.rel_url.query
        uname = qa.get("h")
        ucmd = qa.get("c")
        if not ucmd or not uname:
            return web.Response(status=400, text="need h= and c= arguments")
        if uname not in hbdclass.Host.hosts:
            return web.Response(status=400, text=f"h={uname} not found")
        hbdclass.Host.hosts[uname].cmds.append(("CMD", {"cmd": urllib.parse.unquote(ucmd)}))
        return web.Response(text=f"cmd {uname} queued")

    async def drop(request):
        qa = request.rel_url.query
        uname = qa.get("h")
        if not uname:
            return web.Response(status=400, text="need h= argument")
        if uname not in hbdclass.Host.hosts:
            return web.Response(status=400, text=f"h={uname} not found")
        if log:
            log(uname, "dropped")
        del hbdclass.Host.hosts[uname]
        return web.Response(text="Done")

    async def register(request):
        qa = request.rel_url.query
        uname = qa.get("h")
        if not uname:
            return web.Response(status=400, text="need h= argument")
        if uname not in hbdclass.Host.hosts:
            return web.Response(status=400, text=f"h={uname} not found")
        ll = hbdclass.Host.hosts[uname].registerDns()
        if log:
            log(uname, ll)
        return web.Response(text=str(ll))

    async def update(request):
        qa = request.rel_url.query
        uname = urllib.parse.unquote(qa.get("h", ""))
        ucode = qa.get("c")
        if not ucode or not uname:
            return web.Response(status=400, text="need h= and c= arguments")
        if uname != "All" and uname not in hbdclass.Host.hosts:
            return web.Response(status=400, text=f"h={uname} not found")
        if uname != "All":
            names = [uname]
        else:
            # "All" only addresses hosts whose cver is at least 2.
            names = [n for n in hbdclass.Host.hosts if hbdclass.Host.hosts[n].cver >= 2]
        out = []
        for n in names:
            err = None
            try:
                r = {"csum": None, "code": ucode}
                hbdclass.Host.hosts[n].cmds.append(("UPD", r))
            except Exception as e:
                err = str(e)
            out.append(f"update started for {n}: {err if err else 'OK'}")
        return web.Response(text="\n".join(out))

    async def restart(request):
        # signal main application to perform restart if needed
        # not implemented here - return OK
        if log:
            log(None, "restart request")
        return web.Response(text="restart request")

    async def live(request):
        # render template from hbd/templates/live.html using Jinja2
        # Resolve templates directory relative to the hbd package
        pkg_dir = os.path.dirname(__file__)
        templates_dir = config.get("templates_dir", os.path.join(pkg_dir, "templates"))
        env = jinja2.Environment(loader=jinja2.FileSystemLoader(templates_dir))
        extra_scripts = config.get("http_extra_scripts", "")
        # The websocket URL reuses the host name the browser addressed us by.
        # A bracketed IPv6 literal ("[::1]:50004") must keep its brackets;
        # splitting on the first ":" would leave just "[".
        netloc = request.host
        if netloc.startswith("["):
            ws_host = netloc.partition("]")[0] + "]"
        else:
            ws_host = netloc.split(":")[0]
        heartbeat_ws_url = f"ws://{ws_host}:{config.get('ws_port', 50005)}/hbd"
        tmpl = env.get_template("live.html")
        body = tmpl.render(
            title="Heartbeat",
            header="Heartbeat",
            request=request,
            heartbeat_ws_url=heartbeat_ws_url,
            extra_scripts=extra_scripts,
            hosts=[hbdclass.Host.hosts[h].stateinfo() for h in sorted(hbdclass.Host.hosts)],
            messages=msgs_getter()[-30:],
        )
        return web.Response(text=body, content_type="text/html")

    async def static(request):
        """Serve files from the package static directory.

        URL form: /static/<path>
        """
        p = request.match_info.get("path", "")
        base = os.path.abspath(os.path.join(os.path.dirname(__file__), "static"))
        # normalize and prevent directory traversal
        target = os.path.abspath(os.path.normpath(os.path.join(base, p)))
        if not target.startswith(base + os.sep) and target != base:
            return web.Response(status=403, text="Forbidden")
        if not os.path.exists(target) or not os.path.isfile(target):
            return web.Response(status=404, text="Not Found")
        logger.info("serving static file: %s", target)
        return web.FileResponse(path=target)

    app = web.Application()
    app.add_routes(
        [
            web.get("/", index),
            web.get("/api/0/hosts", api_hosts),
            web.get("/api/0/messages", api_messages),
            web.get("/c", cmd),
            web.get("/d", drop),
            web.get("/n", register),
            web.get("/u", update),
            web.get("/r", restart),
            web.get("/live", live),
            web.get("/static/{path:.*}", static),
        ]
    )

    runner = web.AppRunner(app)
    await runner.setup()
    site = web.TCPSite(runner, host, port)
    await site.start()

    if verbose:
        print(f"HTTP server started on {host}:{port}")

    try:
        # Never completes on its own; only cancellation ends the wait.
        await asyncio.Future()
    finally:
        await runner.cleanup()
|
|
||||||
|
|
||||||
@@ -1,46 +0,0 @@
|
|||||||
"""monitor helper and thread for heartbeat daemon."""
|
|
||||||
from __future__ import annotations
|
|
||||||
import asyncio
|
|
||||||
import threading
|
|
||||||
import subprocess
|
|
||||||
import time
|
|
||||||
from subprocess import Popen, PIPE, STDOUT
|
|
||||||
from typing import Optional
|
|
||||||
from . import hbdclass
|
|
||||||
DROPOVERDUE = 7 * 24 * 3600
|
|
||||||
|
|
||||||
def checkoverdue(config: dict, hbdclass, log: callable, email: callable, pushmsg: callable, msg_to_websockets: callable):
    """Move connections with missing heartbeats to OVERDUE / UNKNOWN.

    For every host and each of its connections that is not DOWN:

    * an UP connection whose last beat is older than the host's
      ``interval`` plus ``config["grace"]`` (default 10) becomes OVERDUE;
    * an OVERDUE connection whose last beat is older than ``DROPOVERDUE``
      becomes UNKNOWN.

    When at least one connection of a host went OVERDUE in this pass, the
    event is logged and the host state is sent to the websockets; hosts
    listed in ``config["watchhosts"]`` additionally trigger an email and a
    push message.
    """
    now = time.time()
    for name in list(hbdclass.Host.hosts.keys()):
        host = hbdclass.Host.hosts[name]
        newly_overdue = []
        for family in host.connections:
            conn = host.connections[family]
            if conn.state == hbdclass.Connection.DOWN:
                continue
            timeout = host.interval + config.get("grace", 10)
            if conn.state == hbdclass.Connection.UP and (now - conn.lastbeat) > timeout:
                conn.newstate(hbdclass.Connection.OVERDUE, now, config.get("grace", 10))
                newly_overdue.append(conn.afam)
            # Evaluated after the check above, so it sees the updated state.
            if conn.state == hbdclass.Connection.OVERDUE and (now - conn.lastbeat) > DROPOVERDUE:
                conn.newstate(hbdclass.Connection.UNKNOWN, conn.lastbeat)
        if newly_overdue == []:
            continue
        families = " and ".join(newly_overdue)
        # NOTE(review): nesting reconstructed from flattened source - confirm
        # that only email/push (not log/websocket) are limited to watched hosts.
        if name in config.get("watchhosts", []):
            email("overdue", "%s overdue" % families)
            pushmsg("%s %s overdue" % (name, families))
        log(name, "%s overdue" % families)
        msg_to_websockets("host", hbdclass.Host.hosts[name].stateinfo())
|
|
||||||
|
|
||||||
async def start(
    config: dict,
    hbdclass: callable,
    log=None,
    email=None,
    pushmsg=None,
    msg_to_websockets=None,
):
    """Run the overdue check forever, sleeping 15 seconds before each pass."""
    pause = 15  # 15 seconds between checks
    while True:
        await asyncio.sleep(pause)
        checkoverdue(config, hbdclass, log, email, pushmsg, msg_to_websockets)
|
|
||||||
-155
@@ -1,155 +0,0 @@
|
|||||||
"""Notification helpers: email, pushover, mattermost, signal and dispatcher."""
|
|
||||||
import logging
|
|
||||||
from typing import Optional
|
|
||||||
import http.client
|
|
||||||
import urllib.parse
|
|
||||||
import subprocess
|
|
||||||
import smtplib
|
|
||||||
import time
|
|
||||||
import traceback
|
|
||||||
|
|
||||||
DEFAULT_PUSHPROVIDERS = ["all", "pushover", "mattermost", "signal"]
|
|
||||||
|
|
||||||
# module-level configuration set via setup()
|
|
||||||
_config = {}
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
def setup(cfg: dict):
|
|
||||||
"""Initialize notifier defaults from a configuration dict."""
|
|
||||||
global _config
|
|
||||||
_config = dict(cfg)
|
|
||||||
|
|
||||||
|
|
||||||
def send_email(aemail, smtpserver, sender, subject, body, debug=0):
    """Send a plain email via SMTP. Returns True on success.

    *aemail* is the recipient (list) handed to ``SMTP.sendmail`` and *body*
    is the complete message including headers.  *subject* is accepted for
    interface compatibility but not used here.  Any failure is logged as a
    warning and reported as ``False``; errors while closing the session
    are ignored.
    """
    # Kept as None until the connection exists, so the cleanup below never
    # touches an unbound name when connecting itself fails.
    server = None
    try:
        server = smtplib.SMTP(smtpserver)
        if debug > 0:
            server.set_debuglevel(1)
        server.sendmail(sender, aemail, body)
    except Exception as e:
        logger.warning("email send failed: %s", e)
        return False
    finally:
        if server is not None:
            try:
                server.quit()
            except Exception:
                # Best effort: a failed QUIT does not change the outcome.
                pass
    return True
|
|
||||||
|
|
||||||
|
|
||||||
def email(subject: str, msg: str, debug: int = 0) -> bool:
    """Convenience wrapper exposed to the rest of the application.

    Uses module-level configuration to supply recipient list, smtp server
    and sender address:

    * recipients: first truthy of ``AEMAIL``, ``aemail``, ``email_to``
      (a single address string is accepted as well as a list);
    * sender: first truthy of ``fromemail``, ``sender``, else
      ``aew.heartbeat@<domain>`` with ``domain`` defaulting to ``local``;
    * server: first truthy of ``SMTPSERVER``, ``smtpserver``, else
      ``localhost``.

    The ``To:`` header carries the first recipient only; the message is
    sent to all of them.  Returns the result of ``send_email``.
    """
    toaddrs = _config.get("AEMAIL") or _config.get("aemail") or _config.get("email_to") or []
    if isinstance(toaddrs, str):
        # A bare string would otherwise put only its first character
        # into the To: header below.
        toaddrs = [toaddrs]
    fromemail = _config.get("fromemail") or _config.get("sender") or f"aew.heartbeat@{_config.get('domain','local') }"
    # The original ended with a second lookup of "SMTPSERVER", which handed
    # back a configured-but-empty value instead of the default.
    smtpserver = _config.get("SMTPSERVER") or _config.get("smtpserver") or "localhost"
    date = time.strftime("%a, %d %b %Y %H:%M:%S %z", time.localtime())
    body = "To: %s\nFrom: %s\nSubject: %s\nDate: %s\n\n%s" % (
        toaddrs[0] if toaddrs else "",
        fromemail,
        subject,
        date,
        msg,
    )
    return send_email(toaddrs, smtpserver, fromemail, subject, body, debug=debug)
|
|
||||||
|
|
||||||
|
|
||||||
def pushover(token: str, user: str, msg: str, debug: int = 0) -> bool:
    """Send message via Pushover API.

    POSTs *token*, *user* and *msg* form-encoded to ``/1/messages.json`` on
    ``api.pushover.net``.  Returns True when the response status is 200;
    any exception is logged and reported as False.  *debug* is unused.
    """
    conn = http.client.HTTPSConnection("api.pushover.net:443")
    try:
        conn.request(
            "POST",
            "/1/messages.json",
            urllib.parse.urlencode({"token": token, "user": user, "message": msg}),
            {"Content-type": "application/x-www-form-urlencoded"},
        )
        r = conn.getresponse()
        logger.debug("pushover response: %s %s", r.status, r.reason)
        return r.status == 200
    except Exception as e:
        logger.error("pushover error: %s", e)
        return False
    finally:
        # Release the socket on every path; the original never closed it.
        conn.close()
|
|
||||||
|
|
||||||
|
|
||||||
def pushmattermost(host: str, token: str, channel: str, msg: str, username: str = "hbd", icon: Optional[str] = None, debug: int = 0) -> bool:
    """Post *msg* to a Mattermost webhook through ``mattermostdriver``.

    Returns False when ``mattermostdriver`` cannot be imported or when the
    webhook call raises.  Otherwise returns True only if the call's return
    value is ``None`` or an empty string.  The driver is configured for
    plain ``http`` on port 8065 with base path ``/api/v4``.  *debug* is
    unused.
    """
    try:
        from mattermostdriver import Driver
    except Exception:
        return False

    driver = Driver({"url": host, "scheme": "http", "basepath": "/api/v4", "port": 8065})
    payload = {"text": msg, "channel": channel, "username": username}
    if icon:
        payload["icon_url"] = icon

    try:
        rc = driver.webhooks.call_webhook(token, payload)
        logger.debug("mattermost rc: %s", rc)
        succeeded = bool(rc is None or rc == "")
        return succeeded
    except Exception as e:
        logger.error("mattermost error: %s", e)
        return False
|
|
||||||
|
|
||||||
|
|
||||||
def pushsignal(signal_cli_bin: str, user: str, recipient: str, msg: str, debug: int = 0) -> bool:
    """Send a message via signal-cli (requires local installation).

    Runs ``<signal_cli_bin> -u <user> send -m <msg> <recipient>`` as a
    subprocess (argument list, no shell).  Returns True if the command
    exited with status 0, False on a non-zero status or on any exception.
    *debug* is unused.
    """
    CLI = [signal_cli_bin, "-u", user, "send", "-m", msg, recipient]
    logger.debug("signal cli: %s", CLI)
    try:
        res = subprocess.run(CLI, capture_output=True)
        if res.returncode != 0:
            # Fixed: the original had "." instead of "," after the format
            # string, i.e. an attribute lookup on a str that raised
            # AttributeError and hid the real stderr output.
            logger.error("signal failed: %s", res.stderr.decode(errors="replace"))
            return False
        # errors="replace": undecodable output must not turn a successful
        # send into a reported failure.
        logger.debug("signal sent: %s", res.stdout.decode(errors="replace"))
        return True
    except Exception as e:
        logger.exception("signal exception: %s", e)
        return False
|
|
||||||
|
|
||||||
|
|
||||||
def pushmsg(cfg: dict, msg: str, debug: int = 0):
    """Dispatch push notifications according to `cfg['pushsrv']`.

    ``cfg['pushsrv']`` (default ``'pushover'``) selects the provider:
    ``'pushover'``, ``'mattermost'``, ``'signal'`` or ``'all'``.  Keys read
    per provider, each with a default:

    - pushover: ``pushover_token``, ``pushover_user``
    - mattermost: ``matter_host``, ``matter_token``, ``matter_channel``,
      ``matter_username``, ``matter_icon``
    - signal: ``signal_cli``, ``signal_user``, ``signal_recipient``

    Returns a dict mapping each provider that was tried to its result.
    """
    outcome = {}
    selected = cfg.get("pushsrv", "pushover")

    if selected in ("all", "pushover"):
        outcome["pushover"] = pushover(
            cfg.get("pushover_token", ""),
            cfg.get("pushover_user", ""),
            msg,
            debug=debug,
        )

    if selected in ("all", "mattermost"):
        outcome["mattermost"] = pushmattermost(
            cfg.get("matter_host", ""),
            cfg.get("matter_token", ""),
            cfg.get("matter_channel", ""),
            msg,
            username=cfg.get("matter_username", "hbd"),
            icon=cfg.get("matter_icon"),
            debug=debug,
        )

    if selected in ("all", "signal"):
        outcome["signal"] = pushsignal(
            cfg.get("signal_cli", "/usr/local/bin/signal-cli"),
            cfg.get("signal_user", ""),
            cfg.get("signal_recipient", ""),
            msg,
            debug=debug,
        )

    logger.debug("push results: %s", outcome)
    return outcome
|
|
||||||
|
|
||||||
|
|
||||||
def pushmsg_from_config(msg: str, debug: int = 0) -> dict:
    """Dispatch *msg* through ``pushmsg`` using the module-level configuration."""
    results = pushmsg(_config, msg, debug=debug)
    return results
|
|
||||||
|
|
||||||
@@ -1,81 +0,0 @@
|
|||||||
"""Message encoding/decoding utilities for hbd protocol."""
|
|
||||||
from typing import Dict, Any
|
|
||||||
import zlib
|
|
||||||
|
|
||||||
|
|
||||||
def dicttos(ID: str, d: Dict[str, Any], compress: bool = False):
    """Serialize a dict to protocol message bytes.

    Entries are written as ``key=value`` joined by ``;``; float values use
    five decimal places.  Without compression the result is
    ``ID:payload``; with *compress* the payload is zlib-compressed
    (level 6) and prefixed with ``!ID:``.
    """
    fields = [
        f"{key}={value:0.5f}" if isinstance(value, float) else f"{key}={value}"
        for key, value in d.items()
    ]
    payload = ";".join(fields)
    if not compress:
        return (ID + ":" + payload).encode()
    header = ("!" + ID + ":").encode()
    return header + zlib.compress(payload.encode(), 6)
|
|
||||||
|
|
||||||
|
|
||||||
def stodict(msg: bytes):
    """Deserialize a protocol message into a dict.

    Accepts both forms produced by ``dicttos``: plain ``ID:key=value;...``
    and compressed ``!ID:<zlib payload>``.  Returns a dict with key
    ``'ID'`` set to the message ID plus the parsed key/value pairs, or an
    empty dict when the message is malformed.

    A key without ``=`` maps to ``None``.  A value starting with a digit
    is parsed as a Python literal (int, float, ...) when possible and kept
    as a string otherwise; every other value stays a string.
    """
    # Function-scope import keeps this block self-contained.
    import ast

    d = {}
    if len(msg) > 0 and chr(msg[0]) == "!":
        # message is: b'!ID:' + compressed_payload.  The header colon is
        # located instead of assuming a 3-character ID, matching what
        # dicttos() emits for IDs of any length.
        sep = msg.find(b":")
        if sep < 0:
            return {}
        try:
            pk = zlib.decompress(msg[sep + 1:]).decode()
            d["ID"] = msg[1:sep].decode()
        except Exception:
            # malformed compressed payload or undecodable ID
            return {}
    else:
        try:
            r0 = msg.split(b":", 1)
            pk = r0[1].decode()
            d["ID"] = r0[0].decode()
        except Exception:
            return {}
    if not pk:
        return d
    for part in pk.split(";"):
        if not part:
            continue
        kv = part.split("=", 1)
        k = kv[0].strip()
        if len(kv) == 1:
            d[k] = None
            continue
        val = kv[1].strip()
        if val and val[0].isdigit():
            # SECURITY: messages arrive from the network.  The original
            # used eval() here, which executes arbitrary expressions such
            # as "1 if __import__('os').system(...) else 0".
            # ast.literal_eval accepts literals only; anything else is
            # kept as the raw string.
            try:
                d[k] = ast.literal_eval(val)
            except Exception:
                d[k] = val
        else:
            d[k] = val
    return d
|
|
||||||
|
|
||||||
|
|
||||||
def oldmtodict(msg: bytes):
    """Parse an old-style message that carries no ID prefix.

    The message is prefixed with ``HTB:`` and handed to ``stodict``.
    """
    prefixed = b"HTB:" + msg
    return stodict(prefixed)
|
|
||||||
-323
@@ -1,323 +0,0 @@
|
|||||||
"""Server runtime: starts UDP listener, HTTP server and websocket stubs."""
|
|
||||||
import asyncio
|
|
||||||
import logging
|
|
||||||
import atexit
|
|
||||||
import time
|
|
||||||
import signal
|
|
||||||
import sys
|
|
||||||
from . import __version__
|
|
||||||
|
|
||||||
from . import udp
|
|
||||||
from . import hbdclass
|
|
||||||
|
|
||||||
from . import ws as ws_mod
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
msg_to_websockets = ws_mod.broadcast
|
|
||||||
|
|
||||||
logf = None
|
|
||||||
lastfm = ["", "", ""]
|
|
||||||
|
|
||||||
# shared runtime collections and helpers
|
|
||||||
msgs = []
|
|
||||||
|
|
||||||
def initlog(logfile):
    """Open *logfile* for appending and return the file object.

    If the file cannot be opened, a notice is written to standard error
    and ``sys.stderr`` is returned instead, so the caller always gets a
    writable stream.
    """
    try:
        return open(logfile, "a+")
    except Exception as e:
        import sys

        # Fixed the "loffile" typo; the notice now goes to the stream it
        # announces rather than to stdout.
        print("cannot open logfile %s, using STDERR: %s" % (logfile, e), file=sys.stderr)
        return sys.stderr
|
|
||||||
|
|
||||||
def log(host, m, service=None):
    """Record one event line everywhere events are kept.

    The line is ``"<YYYY-MM-DD HH:MM:SS> <host> <m>"`` in local time, with
    an empty host field when *host* is falsy.  It is appended to the
    module-level ``msgs`` list, logged at INFO level, written to the log
    file when one is open (write errors are only warned about), and sent
    to the websockets as a ``"message"``.  *service* is unused.
    """
    stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
    line = f"{stamp} {host or ''} {m}"
    msgs.append(line)
    logger.info(line)
    if logf:
        try:
            logf.write(line + "\n")
            logf.flush()
        except Exception as e:
            logger.warning("failed to write to logfile: %s", e)
    msg_to_websockets("message", line)
|
|
||||||
|
|
||||||
def cleanup_function(config):
    """This function will be executed upon program exit.

    Pickles, in this order, ``hbdclass.Host.hosts``, the message list and
    ``lastfm`` into ``config["pickfile"]`` (default ``hbd.pickle``).
    """
    logger.info("Running cleanup function...")
    import pickle

    pickfile = config.get("pickfile", "hbd.pickle")

    # "with" closes the file even when one of the dumps raises.
    with open(pickfile, "wb") as pickf:
        pick = pickle.Pickler(pickf)
        pick.dump(hbdclass.Host.hosts)
        pick.dump(msgs)
        pick.dump(lastfm)

    logger.info("Cleanup complete.")
|
|
||||||
|
|
||||||
async def _run_async(config):
    """Run all services until SIGINT/SIGTERM, then shut them down.

    Starts, in order: the UDP heartbeat endpoint, the HTTP server task,
    the DNS update worker, the websocket server task and the monitor task.
    A failure to start one of the optional services is logged and the
    remaining ones keep running.  On shutdown the UDP transport is closed,
    the tasks are cancelled (waiting up to 2 seconds) and the DNS worker
    is asked to exit via a ``None`` sentinel on ``hbdclass.Host.dnsQ``.
    """
    loop = asyncio.get_running_loop()
    shutdown_event = asyncio.Event()

    # Signal handlers for graceful shutdown
    def signal_handler(signum, frame):
        sig_name = signal.Signals(signum).name if hasattr(signal, 'Signals') else signum
        logger.info(f"Received {sig_name}, initiating shutdown...")
        loop.call_soon_threadsafe(shutdown_event.set)

    # Register signal handlers
    loop.add_signal_handler(signal.SIGINT, signal_handler, signal.SIGINT, None)
    loop.add_signal_handler(signal.SIGTERM, signal_handler, signal.SIGTERM, None)

    # prepare runtime dependencies
    from . import http as http_mod
    from . import dns as dns_mod
    from . import notify as notify_mod
    from . import monitor as monitor_mod

    notify_mod.setup(config)

    email = notify_mod.email
    pushmsg = notify_mod.pushmsg_from_config

    # Pre-bound so the shutdown code below can test them even when the
    # corresponding start-up step failed (the original raised NameError).
    http_task = None
    dns_task = None
    ws_task = None
    monitor_task = None

    # UDP server endpoint (handler wired to handle_datagram with context)
    bind_addr = ("0.0.0.0", config.get("hb_port", 50003))
    logger.info("Starting UDP server on %s:%s", *bind_addr)

    def udp_handler(msg, addr, transport):
        ctx = dict(
            config=config,
            hbdclass=hbdclass,
            log=log,
            email=email,
            pushmsg=pushmsg,
            msg_to_websockets=msg_to_websockets,
            DEBUG=config.get("debug", 0),
            verbose=config.get("verbose", False),
        )
        udp.handle_datagram(msg, addr, transport, ctx)

    transport, protocol = await loop.create_datagram_endpoint(
        lambda: udp.EchoServerProtocol(config=config, handler=udp_handler),
        local_addr=bind_addr,
    )

    # HTTP server (asyncio-based via aiohttp)
    try:
        http_task = asyncio.create_task(
            http_mod.start(
                host=config.get("hbd_host", ""),
                port=config.get("hbd_port", 50004),
                config=config,
                hbdclass=hbdclass,
                msgs_getter=lambda: msgs,
                log=log,
                email=email,
                pushmsg=pushmsg,
                msg_to_websockets=msg_to_websockets,
                tcss=None,
                DEBUG=config.get("debug", 0),
                verbose=config.get("verbose", False),
                get_now=lambda: time.time(),
                VER="",
            )
        )
        logger.info("HTTP server started on %s:%s", config.get("hbd_host", ""), config.get("hbd_port", 50004))
    except Exception as e:
        logger.exception("failed to start HTTP server: %s", e)

    # start dns update worker (async)
    try:
        dns_task = dns_mod.start_dns_worker(hbdclass, config, log=log, email=email, loop=loop)
        logger.info("dns update worker started")
    except Exception as e:
        logger.exception("dns worker failed to start: %s", e)

    # Start the websocket servers as a background task
    try:
        ws_task = asyncio.create_task(
            ws_mod.start(
                host=config.get("hbd_host", ""),
                ws_port=config.get("ws_port", 50005),
                wss_port=config.get("wss_port", None),
                ssl_context=None,
                get_hosts=lambda: [hbdclass.Host.hosts[h].stateinfo() for h in sorted(hbdclass.Host.hosts)],
                get_msgs=lambda: msgs,
                verbose=config.get("verbose", False),
            )
        )
        logger.info("WebSocket task started")
    except Exception as e:
        logger.exception("websocket server failed to start: %s", e)

    # Start the monitor loop as a background task
    try:
        monitor_task = asyncio.create_task(
            monitor_mod.start(
                config=config,
                hbdclass=hbdclass,
                log=log,
                email=email,
                pushmsg=pushmsg,
                msg_to_websockets=msg_to_websockets,
            )
        )
        logger.info("Monitor task started")
    except Exception as e:
        logger.exception("monitor task failed to start: %s", e)

    try:
        # run forever until shutdown event is set
        await shutdown_event.wait()
        logger.info("Shutdown signal received, stopping services...")
    except Exception as e:
        logger.exception("Error in main loop: %s", e)
    finally:
        # Cancel all running tasks
        logger.info("Cancelling tasks...")
        try:
            transport.close()
        except Exception as e:
            logger.warning("Error closing UDP transport: %s", e)

        tasks_to_cancel = [http_task, ws_task, monitor_task]
        for task in tasks_to_cancel:
            if task:
                try:
                    task.cancel()
                    logger.debug("Cancelled task: %s", task)
                except Exception as e:
                    logger.warning("Error cancelling task: %s", e)

        # Wait for tasks to finish cancellation with timeout
        remaining_tasks = [t for t in tasks_to_cancel if t]
        if remaining_tasks:
            try:
                await asyncio.wait_for(asyncio.gather(*remaining_tasks, return_exceptions=True), timeout=2.0)
            except asyncio.TimeoutError:
                logger.warning("Timeout waiting for tasks to cancel")
            except Exception as e:
                logger.debug("Exception during task cancellation: %s", e)

        # Signal DNS worker to exit and await it
        try:
            if dns_task:
                try:
                    hbdclass.Host.dnsQ.put(None)
                except Exception:
                    pass
                try:
                    await asyncio.wait_for(dns_task, timeout=2.0)
                    logger.info("DNS worker finished")
                except asyncio.TimeoutError:
                    logger.warning("Timeout waiting for DNS worker to finish")
                    dns_task.cancel()
                except asyncio.CancelledError:
                    logger.info("DNS worker was cancelled")
                except Exception as e:
                    logger.warning("Error awaiting DNS worker: %s", e)
                finally:
                    # Clear queue bridge to release any held references
                    hbdclass.Host.dnsQ = None
        except Exception as e:
            logger.warning("Error stopping DNS worker: %s", e)

        logger.info("All tasks cancelled")
|
|
||||||
|
|
||||||
|
|
||||||
def load_pickled_hosts(config, hbdclass):
    """Restore host state from the pickle file named in *config*, if present.

    The file is read as consecutive pickles: the hosts mapping (stored in
    ``hbdclass.Host.hosts``), the message list (module global ``msgs``) and,
    optionally, the ``lastfm`` record (module global ``lastfm``; falls back to
    three empty strings when it cannot be read).  A file that fails to load is
    logged and deleted.

    After loading, ``hbdclass.Connection.htab`` is reset, every host has its
    ``dyn`` / ``watched`` flags recomputed from the config and ``fixup()``
    called, and hosts listed under ``drophosts`` are removed.

    Args:
        config: Mapping read for ``pickfile``, ``dyndnshosts``, ``watchhosts``,
            ``drophosts`` and ``verbose``.
        hbdclass: Namespace exposing ``Host.hosts`` and ``Connection.htab``.
    """
    global lastfm, msgs
    import os
    import pickle

    verbose = config.get("verbose", False)
    pickfile = config.get("pickfile", "hbd.pickle")
    dyndnshosts = config.get("dyndnshosts", [])
    watchhosts = config.get("watchhosts", [])
    drophosts = config.get("drophosts", [])

    if not os.path.exists(pickfile):
        if verbose:
            logger.info("no pickled data")
        return

    if verbose:
        logger.info("opening pickls %s", pickfile)

    try:
        # SECURITY: unpickling executes code embedded in the file.  The pickle
        # file must only ever be writable by the account running the daemon.
        with open(pickfile, "rb") as pickf:
            pick = pickle.Unpickler(pickf)
            hbdclass.Host.hosts = pick.load()
            msgs = pick.load()
            try:
                lastfm = pick.load()
            except Exception:
                # Third record is optional; fall back to an empty one.
                lastfm = ["", "", ""]
    except Exception as e:
        # The file is closed by the ``with`` block before it is removed.
        logger.exception("load pickled failed: %s", e)
        os.unlink(pickfile)

    hbdclass.Connection.htab = {}
    # Iterate over a snapshot: fixup() is opaque and might touch the mapping.
    for name, host in list(hbdclass.Host.hosts.items()):
        host.dyn = name in dyndnshosts
        host.watched = name in watchhosts
        host.fixup()
    for name in drophosts:
        hbdclass.Host.hosts.pop(name, None)

    if verbose:
        logger.info("%s pickled hosts loaded", len(hbdclass.Host.hosts))
|
|
||||||
|
|
||||||
def run(config):
    """Start the hbd service (blocking); the process exits when it finishes.

    Manually manages the event loop to ensure clean shutdown: the loop is
    created here, driven by ``_run_async(config)``, drained of any remaining
    tasks and closed, after which the process is terminated with
    ``os._exit(0)``.

    Args:
        config: Mapping read for ``debug`` and ``logfile``; also passed on to
            the pickle loader, the async main routine and the cleanup hook.
    """
    global logf
    import os

    logging.basicConfig(level=logging.DEBUG if config.get("debug", 0) > 0 else logging.INFO)
    load_pickled_hosts(config, hbdclass)

    logf = initlog(logfile=config.get("logfile", "messages.log"))
    log(None, f"hbd version {__version__} starting up")

    # Create and set the event loop manually
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    try:
        loop.run_until_complete(_run_async(config))
    except KeyboardInterrupt:
        logger.info("Received KeyboardInterrupt, shutting down...")
    except Exception as e:
        logger.exception("Unhandled exception in main: %s", e)
    finally:
        cleanup_function(config)
        logger.info("hbd shutdown complete")
        if logf and logf != sys.stderr:
            try:
                logf.close()
            except Exception:
                # Best effort: a failing close must not block shutdown.
                logger.debug("Error closing message log", exc_info=True)
        # Explicitly close the loop
        try:
            # Cancel all remaining tasks
            pending = asyncio.all_tasks(loop)
            for task in pending:
                task.cancel()
            # Run one more cycle to process cancellations
            if pending:
                loop.run_until_complete(asyncio.gather(*pending, return_exceptions=True))
        except Exception:
            # Best effort: the loop is closed regardless of drain errors.
            logger.debug("Error draining pending tasks", exc_info=True)
        finally:
            loop.close()

    # Exit immediately, bypassing interpreter-level cleanup handlers.
    os._exit(0)
|
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
"""HeartBeat Daemon (hbd) - Server/daemon component."""
|
||||||
|
|
||||||
|
from hbd import __version__
|
||||||
@@ -0,0 +1,302 @@
|
|||||||
|
"""Command line interface for hbd package."""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import getpass
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from .config import load_config
|
||||||
|
from .main import run as run_server
|
||||||
|
|
||||||
|
PUSHSRVS = ["all", "pushover", "mattermost"]


def build_parser():
    """Build the ``hbd`` command line parser.

    Sub-commands: ``serve`` (default), ``passwd``, ``notify``, ``stop``,
    ``reload`` and ``restart``.  The serve options are also accepted at top
    level, without a sub-command, for backward compatibility.

    Options that exist both at top level and on a sub-command use
    ``argparse.SUPPRESS`` as the sub-command default.  argparse copies every
    attribute of the sub-command namespace onto the main one, so a regular
    default there would silently overwrite a value given before the
    sub-command (``hbd -c x.yaml serve`` would lose the config path).

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    inherit = argparse.SUPPRESS

    def add_config(target, **extra):
        target.add_argument("-c", "--config", dest="configfile",
                            help="Config file path (YAML)", **extra)

    parser = argparse.ArgumentParser(
        prog="hbd",
        description="HeartBeatDaemon - Wait for heartbeat messages and act on them (or their absence)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    subparsers = parser.add_subparsers(dest="command")

    # --- serve (default) ---
    serve_p = subparsers.add_parser("serve", help="Start the hbd server (default)")
    add_config(serve_p, default=inherit)
    serve_p.add_argument("-f", "--foreground", action="store_true", default=inherit,
                         help="Run in foreground")
    serve_p.add_argument("-v", "--verbose", action="store_true", default=inherit,
                         help="Verbose output")
    serve_p.add_argument("-p", "--pushsrv", dest="pushsrv", choices=PUSHSRVS, default=inherit,
                         help="Push service to use")
    serve_p.add_argument("-x", "--debug", action="count", default=inherit,
                         help="Increase debug level")

    # Legacy top-level flags (no subcommand) — kept for backward compatibility.
    # These carry the real defaults for every attribute shared with sub-commands.
    add_config(parser)
    parser.add_argument("-f", "--foreground", action="store_true", help="Run in foreground")
    parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output")
    parser.add_argument("-p", "--pushsrv", dest="pushsrv", choices=PUSHSRVS,
                        help="Push service to use")
    parser.add_argument("-x", "--debug", action="count", default=0, help="Increase debug level")

    # --- passwd ---
    passwd_p = subparsers.add_parser(
        "passwd",
        help="Generate a password hash for use in the config file",
    )
    passwd_p.add_argument(
        "username",
        nargs="?",
        help="Username (informational only, for display)",
    )

    # --- notify ---
    notify_p = subparsers.add_parser(
        "notify",
        help="Send a test message via a configured notification channel",
    )
    add_config(notify_p, default=inherit)
    notify_p.add_argument(
        "channel",
        help="Channel name as defined in notification_channels",
    )
    notify_p.add_argument(
        "message",
        nargs="?",
        default="Test notification from hbd",
        help="Message body (default: 'Test notification from hbd')",
    )
    notify_p.add_argument(
        "--level",
        default="WARNING",
        choices=["INFO", "WARNING", "CRITICAL", "RECOVER"],
        help="Notification level (default: WARNING)",
    )
    notify_p.add_argument(
        "--title",
        default=None,
        help="Notification title (default: '[LEVEL] test')",
    )

    # --- stop ---
    stop_p = subparsers.add_parser("stop", help="Stop the running hbd instance")
    add_config(stop_p, default=inherit)

    # --- reload ---
    reload_p = subparsers.add_parser("reload", help="Reload configuration (SIGHUP)")
    add_config(reload_p, default=inherit)

    # --- restart ---
    restart_p = subparsers.add_parser("restart", help="Restart the running hbd instance")
    add_config(restart_p, default=inherit)
    restart_p.add_argument("-f", "--foreground", action="store_true", default=inherit,
                           help="Run in foreground after restart")
    restart_p.add_argument("-v", "--verbose", action="store_true", default=inherit,
                           help="Verbose output after restart")

    return parser
|
||||||
|
|
||||||
|
|
||||||
|
def cmd_passwd(args):
    """Prompt for a password twice and print its hash for the config file.

    Keeps asking until a non-empty password has been entered and confirmed.
    ``args.username`` is optional and only affects the prompt and the hint
    printed above the hash.
    """
    from .users import hash_password

    username = args.username or ""
    if username:
        prompt = f"New password for {username}: "
    else:
        prompt = "New password: "

    chosen = None
    while chosen is None:
        candidate = getpass.getpass(prompt)
        if not candidate:
            print("Password must not be empty.", file=sys.stderr)
        elif candidate != getpass.getpass("Confirm password: "):
            print("Passwords do not match, try again.", file=sys.stderr)
        else:
            chosen = candidate

    hashed = hash_password(chosen)
    header = (
        f"\nAdd the following to your config under users: -> {username}:"
        if username
        else "\nPassword hash (paste into config file under the user's 'password' key):"
    )
    print(header)
    print(f" password: {hashed}")
|
||||||
|
|
||||||
|
|
||||||
|
def cmd_notify(args):
    """Send a test message via a single notification channel.

    Loads the config, initialises the notify module, looks up ``args.channel``
    under ``notification_channels`` and sends one ``Notification`` through the
    driver matching the channel's ``type``.  Prints ``OK`` on success.

    Exits with status 1 when the channel is not configured, its type has no
    driver, or the driver reports failure.
    """
    import asyncio

    from .notify import (
        Notification,
        _DRIVERS,
        _send_matrix_async,
        _send_sms_voipms_async,
        setup,
    )

    config = load_config(args.configfile)
    setup(config)

    # An empty ``notification_channels:`` key in YAML loads as None.
    channels = config.get("notification_channels") or {}
    if args.channel not in channels:
        available = ", ".join(channels.keys()) if channels else "(none)"
        print(f"Error: channel '{args.channel}' not found in notification_channels.", file=sys.stderr)
        print(f"Available channels: {available}", file=sys.stderr)
        sys.exit(1)

    channel_cfg = channels[args.channel]
    level = args.level.upper()
    title = args.title or f"[{level}] test"
    base_url = config.get("base_url", "").rstrip("/")

    notif = Notification(
        title=title,
        body=args.message,
        level=level,
        url=f"{base_url}/plugins" if base_url else "",
    )

    ch_type = channel_cfg.get("type", "")
    print(f"Sending via {args.channel} ({ch_type}): {title} — {args.message}")

    # These two channel types are coroutines; every other type goes through
    # the synchronous driver table.
    async_senders = {
        "matrix": _send_matrix_async,
        "sms_voipms": _send_sms_voipms_async,
    }
    if ch_type in async_senders:
        ok = asyncio.run(async_senders[ch_type](channel_cfg, notif))
    else:
        driver = _DRIVERS.get(ch_type)
        if driver is None:
            print(f"Error: unknown channel type '{ch_type}'", file=sys.stderr)
            sys.exit(1)
        ok = driver(channel_cfg, notif)

    if ok:
        print("OK")
    else:
        print("FAILED — check logs for details", file=sys.stderr)
        sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
def _read_pid(configfile) -> int | None:
    """Return the PID of the running hbd instance, or None.

    The PID is read from the ``pidfile`` named in the loaded config and probed
    with signal 0.  Every failure (no pidfile configured, file missing,
    process gone, any other error) is reported on stderr and yields None.
    """
    import os

    pidfile = load_config(configfile).get("pidfile", "")
    if not pidfile:
        print("Error: no pidfile configured.", file=sys.stderr)
        return None

    try:
        with open(pidfile) as handle:
            pid = int(handle.read().strip())
        # Signal 0 delivers nothing; it only checks that the process exists.
        os.kill(pid, 0)
    except FileNotFoundError:
        print(f"PID file not found ({pidfile}). Is hbd running?", file=sys.stderr)
        return None
    except ProcessLookupError:
        print(f"PID file exists but process {pid} is not running.", file=sys.stderr)
        return None
    except Exception as e:
        print(f"Error reading pidfile: {e}", file=sys.stderr)
        return None
    else:
        return pid
|
||||||
|
|
||||||
|
|
||||||
|
def cmd_stop(args):
    """Stop the running hbd instance with SIGTERM and wait for it to exit.

    Polls every 0.5 s, for up to 10 s, until the process is gone.  Exits with
    status 1 when no running instance is found, the process may not be
    signalled, or it is still alive after the wait.
    """
    import os
    import signal as _signal
    import time

    pid = _read_pid(args.configfile)
    if pid is None:
        sys.exit(1)

    print(f"Stopping hbd (pid {pid})...")
    try:
        os.kill(pid, _signal.SIGTERM)
    except ProcessLookupError:
        # The process exited between the PID check and the signal.
        print("hbd stopped.")
        return
    except PermissionError as e:
        print(f"Error: cannot signal hbd (pid {pid}): {e}", file=sys.stderr)
        sys.exit(1)

    # Wait up to 10 s for the process to exit
    for _ in range(20):
        time.sleep(0.5)
        try:
            os.kill(pid, 0)
        except ProcessLookupError:
            print("hbd stopped.")
            return

    print("Warning: hbd did not stop within 10 seconds.", file=sys.stderr)
    sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
def cmd_reload(args):
    """Send SIGHUP to the running hbd instance.

    Exits with status 1 when no running instance can be found.
    """
    import os
    import signal as _signal

    target = _read_pid(args.configfile)
    if target is None:
        sys.exit(1)

    print(f"Sending SIGHUP to hbd (pid {target})...")
    os.kill(target, _signal.SIGHUP)
    print("Reload signal sent.")
|
||||||
|
|
||||||
|
|
||||||
|
def cmd_restart(args):
    """Stop the running hbd instance (if any) and start a new one.

    A running instance is sent SIGTERM and given up to 10 s to exit; if it is
    still alive afterwards the command exits with status 1.  The server is then
    relaunched as ``python -m hbd.server.cli serve`` with the same ``-c`` /
    ``-f`` / ``-v`` options: in the foreground by replacing this process, or
    otherwise as a detached child in a new session.
    """
    import os
    import signal as _signal
    import subprocess
    import time

    pid = _read_pid(args.configfile)
    if pid is None:
        print("hbd does not appear to be running — starting fresh.")
    else:
        print(f"Stopping hbd (pid {pid})...")
        try:
            os.kill(pid, _signal.SIGTERM)
        except ProcessLookupError:
            # The process exited between the PID check and the signal.
            print("hbd stopped.")
        else:
            for _ in range(20):
                time.sleep(0.5)
                try:
                    os.kill(pid, 0)
                except ProcessLookupError:
                    print("hbd stopped.")
                    break
            else:
                print("Warning: hbd did not stop within 10 seconds.", file=sys.stderr)
                sys.exit(1)

    foreground = getattr(args, "foreground", False)

    # Re-launch hbd with the same config
    cmd = [sys.executable, "-m", "hbd.server.cli", "serve"]
    if args.configfile:
        cmd += ["-c", args.configfile]
    if foreground:
        cmd += ["-f"]
    if getattr(args, "verbose", False):
        cmd += ["-v"]

    if foreground:
        # Run in foreground — replace current process
        os.execv(sys.executable, cmd)
    else:
        subprocess.Popen(cmd, start_new_session=True)
        print("hbd restarted.")
|
||||||
|
|
||||||
|
|
||||||
|
def main(argv=None):
    """Parse the command line and run the selected sub-command.

    Without a sub-command (or with ``serve``) the server is started with the
    loaded config, after applying the ``-f`` / ``-v`` / ``-p`` / ``-x``
    overrides from the command line.

    Args:
        argv: Argument list to parse; None lets argparse read ``sys.argv``.
    """
    args = build_parser().parse_args(argv)

    handlers = {
        "passwd": cmd_passwd,
        "notify": cmd_notify,
        "stop": cmd_stop,
        "reload": cmd_reload,
        "restart": cmd_restart,
    }
    handler = handlers.get(args.command)
    if handler is not None:
        handler(args)
        return

    # Default: run the server (supports both `hbd serve ...` and `hbd ...`)
    config = load_config(args.configfile)

    # Apply CLI overrides; only options actually given replace config values.
    if args.foreground:
        config["foreground"] = True
    if args.verbose:
        config["verbose"] = True
    if args.pushsrv:
        config["pushsrv"] = args.pushsrv
    if args.debug > 0:
        config["debug"] = args.debug

    # The config path is handed over so the server can reload it later.
    run_server(config, config_path=args.configfile)
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: lets the module be run as ``python -m hbd.server.cli``.
if __name__ == "__main__":
    main()
|
||||||
@@ -0,0 +1,332 @@
|
|||||||
|
"""Configuration loader and defaults for hbd (HeartBeat Daemon/Server)."""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
|
||||||
|
try:
|
||||||
|
import yaml
|
||||||
|
except Exception:
|
||||||
|
yaml = None
|
||||||
|
|
||||||
|
# Built-in value for every server option; load_config() overlays the YAML file on these.
SERVER_DEFAULTS = dict(
    # Network settings
    hb_port=50003,  # Port to listen for heartbeats
    hbd_port=50004,  # HTTP API port
    hbd_host="",  # Bind address (empty = all interfaces)

    # Persistence
    pickfile=os.path.join(os.path.expanduser("~"), ".hb.pick"),  # File to store host state between restarts
    pidfile=os.path.join(os.path.expanduser("~"), ".hb.pid"),  # PID file for stop/restart/reload

    # Logging
    logfile=os.path.join(os.path.expanduser("~"), ".hb.log"),

    # Notification channels
    notification_channels={},  # Named channels with type and credentials
    base_url="",  # Base URL for notification links (e.g. https://hbd.example.com)

    # Monitoring settings
    interval=20,  # Expected heartbeat interval (for server checks)
    grace=2,  # Grace period (extra seconds before notifying after a missed heartbeat)
    threshold_renotify_interval=3600,  # Seconds between threshold re-notifications

    # User management
    users={},  # username -> {full_name, avatar, password, admin, notification_channels}
    default_owner=None,  # Username that owns hosts with no explicit owner

    # OAuth2 providers
    oauth={},  # oauth.gitea.{url,client_id,client_secret}

    # Host management
    hosts={},  # Unified host definitions
    dyndomains=["example.org"],  # Domains to update via nsupdate when a host with dyndns: true is updated

    # DNS updates
    nsupdate_bin="/usr/bin/nsupdate",  # Path to nsupdate binary

    # WebSocket settings
    ws_port=50005,
    wss_port=None,
    cert_path="/usr/local/etc/ssl/",
    wss_pem="fullchain.pem",
    wss_key="privkey.pem",

    # Message journal configuration
    journal_enabled=True,
    journal_dir="/var/log/heartbeat",
    journal_file="messages.journal",
    journal_max_size=100 * 1024 * 1024,  # 100MB
    journal_max_backups=10,

    # Runtime flags
    foreground=False,
    verbose=False,
    debug=0,

    # Plugin/threshold configs (for clients reporting to this server)
    plugins={},
    thresholds={},
)
|
||||||
|
|
||||||
|
# Default warning/critical limits, keyed by plugin name and then by metric.
THRESHOLD_DEFAULTS = {
    "thresholds": {
        "cpu_monitor": {
            "cpu_percent": {"warning": 80.0, "critical": 90.0},
        },
        "memory_monitor": {
            "memory_percent": {"warning": 85.0, "critical": 95.0},
            "swap_percent": {"warning": 40.0, "critical": 75.0},
        },
        "disk_monitor": {
            "partitions": {
                "/": {
                    "percent": {"warning": 85.0, "critical": 90.0},
                },
            },
        },
        "rtt": {
            "warning": 200,
            "critical": 250.0,
            "count": 3,  # Optional: number of consecutive breaches before alerting
        },
        "nagios_runner": {
            "status_code": {
                "display": "{check_name} {output}",
                "operator": "nagios",
            },
        },
        "zfs_monitor": {
            "pools": {
                "*": {
                    "status": {
                        "warning": 1,
                        "critical": 2,
                        "operator": ">=",
                        "hysteresis": 0.0,
                        "grace": 0,
                        "display": "ZFS pool {pool_name} is {health}",
                    },
                    "capacity": {"warning": 80.0, "critical": 90.0},
                },
            },
        },
    },
}
|
||||||
|
|
||||||
|
|
||||||
|
def load_config(path=None):
    """Load configuration from a YAML file and merge it over the server defaults.

    Top-level keys from the file replace the corresponding defaults; keys that
    are unknown to the defaults are kept as well, to support plugin configs
    and future extensions.  The defaults are deep-copied, so changing the
    returned dict never alters ``SERVER_DEFAULTS``.

    If the file does not exist, is empty, or PyYAML is not installed, the
    defaults are returned.

    Args:
        path: Path to YAML config file (default: ~/.hb.yaml)

    Returns:
        Dictionary with configuration

    Raises:
        ValueError: if the file's top level is not a YAML mapping.
    """
    import copy

    # Deep copy: the defaults contain dicts and lists that must not be shared
    # between (or mutated through) the configs handed out by this function.
    cfg = copy.deepcopy(SERVER_DEFAULTS)
    if not path:
        # default path (~/.hb.yaml)
        path = os.path.join(os.path.expanduser("~"), ".hb.yaml")

    if not os.path.exists(path):
        return cfg

    if yaml is None:
        # yaml not installed: the file cannot be parsed, defaults apply.
        logging.getLogger(__name__).warning(
            "PyYAML is not installed; ignoring %s and using built-in defaults", path
        )
        return cfg

    with open(path, encoding="utf-8") as fh:
        data = yaml.safe_load(fh)

    if data is None:
        # Empty (or comment-only) file: nothing to merge.
        return cfg
    if not isinstance(data, dict):
        raise ValueError(
            f"Config file {path} must contain a mapping at top level, "
            f"got {type(data).__name__}"
        )

    cfg.update(data)
    return cfg
|
||||||
|
|
||||||
|
|
||||||
|
class ReloadableConfig:
    """Configuration wrapper that supports runtime reloading.

    This class wraps the configuration dictionary and provides:
    - Reloading from file (e.g. in response to SIGHUP)
    - Backward-compatible dict-like read access
    - An asyncio lock so that concurrent ``reload()`` calls are serialised

    A reload swaps in a freshly loaded dict; it does not update the previous
    one in place, so a dict obtained earlier through ``config`` keeps the old
    values.
    """

    def __init__(self, initial_config, config_path=None):
        """Initialize with initial configuration.

        Args:
            initial_config: Initial configuration dictionary
            config_path: Path to config file for reloading (optional)
        """
        self._config = initial_config
        self._config_path = config_path
        self._lock = asyncio.Lock()
        self._logger = logging.getLogger(__name__)

    async def reload(self, config_path=None):
        """Reload configuration from file.

        Args:
            config_path: Path to config file (uses stored path if not provided)

        Returns:
            New configuration dictionary

        Raises:
            ValueError: if neither *config_path* nor a stored path is available.
            Exception: whatever loading the file raises; the existing config is
                kept in that case.
        """
        path = config_path or self._config_path
        if not path:
            raise ValueError("No config path specified for reload")

        async with self._lock:
            try:
                new_config = load_config(path)
            except Exception as e:
                # Nothing has been assigned yet, so the existing config stays.
                self._logger.error(
                    "Failed to reload config from %s: %s", path, e, exc_info=True
                )
                raise

            self._config = new_config
            self._logger.info("Configuration reloaded from %s", path)
            return new_config

    def get(self, key, default=None):
        """Get a config value (dict-compatible)."""
        return self._config.get(key, default)

    def __getitem__(self, key):
        """Get a config value via subscript (dict-compatible)."""
        return self._config[key]

    def __contains__(self, key):
        """Check if key exists (dict-compatible)."""
        return key in self._config

    def keys(self):
        """Return config keys (dict-compatible)."""
        return self._config.keys()

    def items(self):
        """Return config items (dict-compatible)."""
        return self._config.items()

    def values(self):
        """Return config values (dict-compatible)."""
        return self._config.values()

    @property
    def config(self):
        """Get the underlying config dict (for components that need full dict)."""
        return self._config
|
||||||
|
|
||||||
|
|
||||||
|
def get_watchhosts(config):
    """Collect the hostnames from the ``hosts`` section that are watched.

    A host counts as watched when its entry is a mapping whose ``watch`` value
    is truthy; an entry without a ``watch`` key is treated as watched.

    Returns:
        List of hostnames to watch (empty if ``hosts`` is not a mapping)
    """
    hosts_config = config.get("hosts", {})
    if not isinstance(hosts_config, dict):
        return []
    return [
        name
        for name, attrs in hosts_config.items()
        if isinstance(attrs, dict) and attrs.get("watch", True)
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def get_dyndnshosts(config):
    """List the hosts whose entry in the ``hosts`` section has a truthy ``dyndns``.

    Returns:
        List of hostnames (empty if ``hosts`` is not a mapping)
    """
    selected = []
    hosts_config = config.get("hosts", {})
    if isinstance(hosts_config, dict):
        for name, attrs in hosts_config.items():
            if not isinstance(attrs, dict):
                continue
            if attrs.get("dyndns"):
                selected.append(name)
    return selected
|
||||||
|
|
||||||
|
|
||||||
|
def get_host_config(config, hostname):
    """Look up the attributes configured for *hostname* in the ``hosts`` section.

    Returns:
        The host's attribute dict, or an empty dict when the section is not a
        mapping, the host is not listed, or its entry is not a mapping.
    """
    hosts_config = config.get("hosts", {})
    if not isinstance(hosts_config, dict):
        return {}
    entry = hosts_config.get(hostname)
    if isinstance(entry, dict):
        return entry
    return {}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# User / host-access helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def get_default_owner(config) -> str | None:
|
||||||
|
"""Return the configured default_owner username, or the first admin user, or None."""
|
||||||
|
explicit = config.get("default_owner")
|
||||||
|
if explicit:
|
||||||
|
return explicit
|
||||||
|
# Fall back to first admin user found in config
|
||||||
|
users_cfg = config.get("users", {})
|
||||||
|
if isinstance(users_cfg, dict):
|
||||||
|
for username, attrs in users_cfg.items():
|
||||||
|
if isinstance(attrs, dict) and attrs.get("admin", False):
|
||||||
|
return username
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def get_host_access(config, hostname) -> dict:
    """Return the access dict for *hostname*: owner, managers, monitors.

    ``owner`` is taken from the host's own config only and is None when the
    host names no owner (the ``default_owner`` fallback is not applied here).
    ``managers`` and ``monitors`` accept a single username or a list; a
    missing or empty (null) entry yields an empty list.

    Returns:
        {
            "owner": str | None,
            "managers": list[str],
            "monitors": list[str],
        }
    """
    def _as_list(value):
        # An empty YAML key loads as None; a bare string is a single username.
        if value is None:
            return []
        if isinstance(value, str):
            return [value]
        return list(value)

    host_cfg = get_host_config(config, hostname)

    return {
        "owner": host_cfg.get("owner"),  # or get_default_owner(config)
        "managers": _as_list(host_cfg.get("managers", [])),
        "monitors": _as_list(host_cfg.get("monitors", [])),
    }
|
||||||
@@ -0,0 +1,136 @@
|
|||||||
|
"""YAML round-trip read/write for .hb.yaml, with backup and atomic writes."""
|
||||||
|
|
||||||
|
import glob
|
||||||
|
import os
|
||||||
|
import threading
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
from ruamel.yaml import YAML
|
||||||
|
|
||||||
|
_write_lock = threading.Lock()
|
||||||
|
|
||||||
|
|
||||||
|
def _make_yaml() -> YAML:
    """Create a fresh ruamel ``YAML`` instance with ``preserve_quotes`` enabled."""
    roundtrip = YAML()
    roundtrip.preserve_quotes = True
    return roundtrip
|
||||||
|
|
||||||
|
# Top-level keys managed by the 'server' logical section
_SERVER_KEYS = [
    # network
    "hbd_port",
    "hbd_host",
    "ws_port",
    "wss_port",
    "hb_port",
    # monitoring / notification
    "interval",
    "grace",
    "base_url",
    "threshold_renotify_interval",
    # files and journal
    "logfile",
    "pidfile",
    "pickfile",
    "journal_enabled",
    "journal_dir",
    "journal_max_size",
    "journal_max_backups",
    # ownership / thresholds
    "default_owner",
    "default_threshold_config",
]

# Top-level keys managed by the 'dns' logical section
_DNS_KEYS = [
    "nsupdate_bin",
    "rndc_key",
    "dyndomains",
]
|
||||||
|
|
||||||
|
|
||||||
|
def read_roundtrip(path: str):
    """Parse the YAML file at *path* with the round-trip loader.

    The document is returned as loaded by ruamel.yaml, which is used here so
    that comments and key order survive a later write.
    """
    with open(path, "r", encoding="utf-8") as stream:
        document = _make_yaml().load(stream)
    return document
|
||||||
|
|
||||||
|
|
||||||
|
def write_config(path: str, data) -> None:
    """Backup current file then atomically write data.

    Backup naming: {path}.bak.YYYYMMDD-HHMMSS (a ``-N`` suffix is appended if
    that name is already taken).
    Rotation: keep the 10 most recent backups, delete older ones.
    Atomic write: write to {path}.tmp then os.replace({path}.tmp, path).
    The permission bits of an existing file are carried over to both the
    backup and the new file.
    Acquires _write_lock for the full backup+write sequence.

    Args:
        path: Config file to (over)write.
        data: Document to dump with the round-trip YAML writer.

    Raises:
        Whatever dumping or the file operations raise; the temporary file is
        removed before the error propagates.
    """
    import stat

    with _write_lock:
        ts = datetime.now().strftime("%Y%m%d-%H%M%S")
        backup_path = f"{path}.bak.{ts}"
        n = 0
        while os.path.exists(backup_path):
            n += 1
            backup_path = f"{path}.bak.{ts}-{n}"

        orig_mode = None
        if os.path.exists(path):
            # Permission bits only; st_mode also carries the file-type bits.
            orig_mode = stat.S_IMODE(os.stat(path).st_mode)
            with open(path, "rb") as src, open(backup_path, "wb") as dst:
                dst.write(src.read())
            os.chmod(backup_path, orig_mode)
            # Escape the path: characters such as '[' or '*' in it would
            # otherwise be interpreted as glob syntax.
            backups = sorted(glob.glob(f"{glob.escape(path)}.bak.*"), reverse=True)
            for old in backups[10:]:
                os.unlink(old)

        tmp = f"{path}.tmp"
        try:
            with open(tmp, "w", encoding="utf-8") as f:
                _make_yaml().dump(data, f)
                # Push the content to disk before the rename makes it live.
                f.flush()
                os.fsync(f.fileno())
            if orig_mode is not None:
                os.chmod(tmp, orig_mode)
            os.replace(tmp, path)
        except Exception:
            try:
                os.unlink(tmp)
            except OSError:
                pass
            raise
|
||||||
|
|
||||||
|
|
||||||
|
def list_backups(path: str) -> list:
    """Return backup paths sorted newest-first.

    Backups are the files named ``{path}.bak.*``.  Sorting is by name in
    descending order, which puts the latest timestamp suffix first.
    """
    # Escape the path so that glob metacharacters in it ('[', '*', '?') are
    # matched literally instead of being treated as a pattern.
    return sorted(glob.glob(f"{glob.escape(path)}.bak.*"), reverse=True)
|
||||||
|
|
||||||
|
|
||||||
|
def apply_structured_section(data, section: str, values: dict) -> None:
    """Write *values* into *data* for one logical config section.

    - ``server``: each known server key present in *values* is set
      individually; all other keys in *data* are left untouched.
    - ``dns``: each known DNS key is set from *values*, or removed from *data*
      when *values* does not contain it.
    - ``users`` / ``hosts``: the whole entry is replaced by *values*.

    Raises:
        ValueError: for any other section name.
    """
    if section == "server":
        for key in (k for k in _SERVER_KEYS if k in values):
            data[key] = values[key]
        return

    if section == "dns":
        for key in _DNS_KEYS:
            if key not in values:
                data.pop(key, None)
                continue
            data[key] = values[key]
        return

    if section == "users":
        data["users"] = values
        return

    if section == "hosts":
        data["hosts"] = values
        return

    raise ValueError(f"Unknown structured section: {section!r}")
|
||||||
|
|
||||||
|
|
||||||
|
def apply_channel(data, name: str, channel_cfg: dict) -> None:
    """Add or overwrite the notification channel *name*, leaving the others as they are.

    A missing or empty ``notification_channels`` entry is replaced by a new
    dict before the channel is stored.
    """
    channels = data.get("notification_channels")
    if not channels:
        channels = data["notification_channels"] = {}
    channels[name] = channel_cfg
|
||||||
|
|
||||||
|
|
||||||
|
def delete_channel(data, name: str) -> None:
    """Drop the notification channel *name* from *data*; do nothing if it is absent."""
    channels = data.get("notification_channels")
    if channels:
        channels.pop(name, None)
|
||||||
|
|
||||||
|
|
||||||
|
def apply_yaml_section(data, section: str, yaml_text: str) -> None:
    """Parse *yaml_text* and store the result as the named logical section.

    - ``notification_channels`` and ``hosts`` replace the key of the same name.
    - ``thresholds`` is stored under the ``threshold_configs`` key.
    - ``dns``: the known DNS keys found in the parsed document are copied into
      *data*; if the document is empty, all known DNS keys are removed.

    The text is parsed before the section name is checked.

    Raises:
        ValueError: for any other section name.
    """
    parsed = _make_yaml().load(yaml_text)

    # Sections stored wholesale, mapped to the config key that receives them.
    whole_section_keys = {
        "notification_channels": "notification_channels",
        "thresholds": "threshold_configs",
        "hosts": "hosts",
    }
    if section in whole_section_keys:
        data[whole_section_keys[section]] = parsed
        return

    if section != "dns":
        raise ValueError(f"Unknown YAML section: {section!r}")

    if not parsed:
        for key in _DNS_KEYS:
            data.pop(key, None)
        return

    for key in _DNS_KEYS:
        if key in parsed:
            data[key] = parsed[key]
|
||||||
@@ -0,0 +1,12 @@
|
|||||||
|
msgs = [] # in-memory list of recent messages for new websocket clients; also logged to file via notify.eventlog
|
||||||
|
class Data:
    """Pairs a configuration object with a mutable key/value store."""

    def __init__(self, config):
        """Keep *config* and start with an empty store."""
        self.config = config
        self.data = dict()

    def update(self, new_data):
        """Merge *new_data* into the store, overwriting existing keys."""
        store = self.data
        store.update(new_data)

    def get(self, key, default=None):
        """Return the stored value for *key*, or *default* when it is absent."""
        store = self.data
        if key in store:
            return store[key]
        return default
|
||||||
@@ -1,13 +1,26 @@
|
|||||||
"""DNS update helper and pure asyncio worker for heartbeat daemon."""
|
"""DNS update helper and pure asyncio worker for heartbeat daemon."""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
import subprocess
|
|
||||||
from subprocess import Popen, PIPE, STDOUT
|
from subprocess import Popen, PIPE, STDOUT
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
import asyncio
|
import asyncio
|
||||||
|
import logging
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def create_nsupdate_payload(hostname: str, newip: str, dyndomain: str, dnsttl: str = "5") -> str:
|
def create_nsupdate_payload(
|
||||||
D = {"domain": dyndomain, "fqdn": f"{hostname}.dy.{dyndomain}", "dnsttl": dnsttl, "newip": newip, "ts": __import__("time").strftime("%Y-%m-%d.%H:%M:%S", __import__("time").gmtime())}
|
hostname: str, newip: str, dyndomain: str, dnsttl: str = "5"
|
||||||
|
) -> str:
|
||||||
|
D = {
|
||||||
|
"domain": dyndomain,
|
||||||
|
"fqdn": f"{hostname}.dy.{dyndomain}",
|
||||||
|
"dnsttl": dnsttl,
|
||||||
|
"newip": newip,
|
||||||
|
"ts": __import__("time").strftime(
|
||||||
|
"%Y-%m-%d.%H:%M:%S", __import__("time").gmtime()
|
||||||
|
),
|
||||||
|
}
|
||||||
if ":" in newip:
|
if ":" in newip:
|
||||||
nsup = (
|
nsup = (
|
||||||
"""update delete %(fqdn)s AAAA
|
"""update delete %(fqdn)s AAAA
|
||||||
@@ -17,7 +30,8 @@ update add %(fqdn)s %(dnsttl)s TXT "Created: %(ts)s"
|
|||||||
send
|
send
|
||||||
answer
|
answer
|
||||||
|
|
||||||
""" % D
|
"""
|
||||||
|
% D
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
nsup = (
|
nsup = (
|
||||||
@@ -28,12 +42,19 @@ update add %(fqdn)s %(dnsttl)s TXT "Created: %(ts)s"
|
|||||||
send
|
send
|
||||||
answer
|
answer
|
||||||
|
|
||||||
""" % D
|
"""
|
||||||
|
% D
|
||||||
)
|
)
|
||||||
return nsup
|
return nsup
|
||||||
|
|
||||||
|
|
||||||
def nsupdate(hostname: str, newip: str, dyndomain: str, nsupdate_bin: str = "/usr/local/bin/nsupdate", rndc_key: str = "/etc/dhcpc/rndc-key") -> Optional[str]:
|
def nsupdate(
|
||||||
|
hostname: str,
|
||||||
|
newip: str,
|
||||||
|
dyndomain: str,
|
||||||
|
nsupdate_bin: str = "/usr/local/bin/nsupdate",
|
||||||
|
rndc_key: str = "/etc/dhcpc/rndc-key",
|
||||||
|
) -> Optional[str]:
|
||||||
"""Perform DNS update via nsupdate command.
|
"""Perform DNS update via nsupdate command.
|
||||||
|
|
||||||
Returns None on success, else returns combined stdout/stderr as a string.
|
Returns None on success, else returns combined stdout/stderr as a string.
|
||||||
@@ -54,7 +75,14 @@ def nsupdate(hostname: str, newip: str, dyndomain: str, nsupdate_bin: str = "/us
|
|||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
async def dns_update_worker(hbdclass, cfg: dict, async_queue=None, log: Optional[callable] = None, email: Optional[callable] = None, loop: Optional[asyncio.AbstractEventLoop] = None):
|
async def dns_update_worker(
|
||||||
|
hbdclass,
|
||||||
|
cfg: dict,
|
||||||
|
async_queue=None,
|
||||||
|
log: Optional[callable] = None,
|
||||||
|
pushmsg: Optional[callable] = None,
|
||||||
|
loop: Optional[asyncio.AbstractEventLoop] = None,
|
||||||
|
):
|
||||||
"""Pure async DNS worker that processes updates from asyncio.Queue.
|
"""Pure async DNS worker that processes updates from asyncio.Queue.
|
||||||
|
|
||||||
Exits when it receives a None sentinel.
|
Exits when it receives a None sentinel.
|
||||||
@@ -66,7 +94,9 @@ async def dns_update_worker(hbdclass, cfg: dict, async_queue=None, log: Optional
|
|||||||
if not dnsq:
|
if not dnsq:
|
||||||
if log:
|
if log:
|
||||||
try:
|
try:
|
||||||
await loop.run_in_executor(None, log, None, "dns_update_worker: no queue available")
|
await loop.run_in_executor(
|
||||||
|
None, log, None, "dns_update_worker: no queue available"
|
||||||
|
)
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
return
|
return
|
||||||
@@ -77,7 +107,9 @@ async def dns_update_worker(hbdclass, cfg: dict, async_queue=None, log: Optional
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
if log:
|
if log:
|
||||||
try:
|
try:
|
||||||
await loop.run_in_executor(None, log, None, f"dns_update_worker: error getting item: {e}")
|
await loop.run_in_executor(
|
||||||
|
None, log, None, f"dns_update_worker: error getting item: {e}"
|
||||||
|
)
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
break
|
break
|
||||||
@@ -94,38 +126,47 @@ async def dns_update_worker(hbdclass, cfg: dict, async_queue=None, log: Optional
|
|||||||
pass
|
pass
|
||||||
continue
|
continue
|
||||||
|
|
||||||
m = f"changed address to {addr}"
|
|
||||||
for dyndomain in cfg.get("dyndomains", []):
|
for dyndomain in cfg.get("dyndomains", []):
|
||||||
err = await loop.run_in_executor(None, nsupdate, name, addr, dyndomain, cfg.get("nsupdate_bin", "/usr/local/bin/nsupdate"), cfg.get("rndc_key", "/etc/dhcpc/rndc-key"))
|
err = await loop.run_in_executor(
|
||||||
|
None,
|
||||||
|
nsupdate,
|
||||||
|
name,
|
||||||
|
addr,
|
||||||
|
dyndomain,
|
||||||
|
cfg.get("nsupdate_bin", "/usr/local/bin/nsupdate"),
|
||||||
|
cfg.get("rndc_key", "/etc/dhcpc/rndc-key"),
|
||||||
|
)
|
||||||
if err:
|
if err:
|
||||||
m += f", DNS update failed: {err}"
|
m = f"DNS update failed for {addr} ({dyndomain}): {err}"
|
||||||
if email:
|
logger.error("DNS update failed for %s: %s", name, err)
|
||||||
|
if log:
|
||||||
try:
|
try:
|
||||||
await loop.run_in_executor(None, email, "error: nsupdate failed", f"{name}.dy.{dyndomain}: {m}")
|
await loop.run_in_executor(None, log, name, "ERROR", m)
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
m += ", DNS updated."
|
m = f"DNS updated {name}.dy.{dyndomain} → {addr}"
|
||||||
|
if log:
|
||||||
|
try:
|
||||||
|
await loop.run_in_executor(None, log, name, "INFO", m)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
if not cfg.get("dyndomains"):
|
||||||
|
logger.warning("DNS update triggered for %s but no dyndomains configured", name)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
dnsq.task_done()
|
dnsq.task_done()
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
if log:
|
|
||||||
try:
|
|
||||||
await loop.run_in_executor(None, log, name, m)
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
if log:
|
def start_dns_worker(
|
||||||
try:
|
hbdclass,
|
||||||
await loop.run_in_executor(None, log, None, "dns_update_worker exiting")
|
cfg: dict,
|
||||||
except Exception:
|
log: Optional[callable] = None,
|
||||||
pass
|
loop: Optional[asyncio.AbstractEventLoop] = None,
|
||||||
|
):
|
||||||
|
|
||||||
def start_dns_worker(hbdclass, cfg: dict, log: Optional[callable] = None, email: Optional[callable] = None, loop: Optional[asyncio.AbstractEventLoop] = None):
|
|
||||||
"""Start the async DNS worker and return the Task.
|
"""Start the async DNS worker and return the Task.
|
||||||
|
|
||||||
Replaces Host.dnsQ with an asyncio.Queue wrapped in a thread-safe bridge
|
Replaces Host.dnsQ with an asyncio.Queue wrapped in a thread-safe bridge
|
||||||
@@ -139,6 +180,7 @@ def start_dns_worker(hbdclass, cfg: dict, log: Optional[callable] = None, email:
|
|||||||
|
|
||||||
class _QueueBridge:
|
class _QueueBridge:
|
||||||
"""Thread-safe wrapper around asyncio.Queue for synchronous callers."""
|
"""Thread-safe wrapper around asyncio.Queue for synchronous callers."""
|
||||||
|
|
||||||
def __init__(self, loop, aq):
|
def __init__(self, loop, aq):
|
||||||
self._loop = loop
|
self._loop = loop
|
||||||
self._aq = aq
|
self._aq = aq
|
||||||
@@ -167,5 +209,9 @@ def start_dns_worker(hbdclass, cfg: dict, log: Optional[callable] = None, email:
|
|||||||
bridge = _QueueBridge(loop, async_q)
|
bridge = _QueueBridge(loop, async_q)
|
||||||
hbdclass.Host.dnsQ = bridge
|
hbdclass.Host.dnsQ = bridge
|
||||||
|
|
||||||
task = loop.create_task(dns_update_worker(hbdclass, cfg, async_queue=async_q, log=log, email=email, loop=loop))
|
task = loop.create_task(
|
||||||
|
dns_update_worker(
|
||||||
|
hbdclass, cfg, async_queue=async_q, log=log, loop=loop
|
||||||
|
)
|
||||||
|
)
|
||||||
return task
|
return task
|
||||||
@@ -0,0 +1,638 @@
|
|||||||
|
"""
|
||||||
|
host and connection class shared between hbd and
|
||||||
|
the website's heartbeat.py
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
import time
|
||||||
|
import json
|
||||||
|
import copy
|
||||||
|
import queue
|
||||||
|
|
||||||
|
num = 0
|
||||||
|
|
||||||
|
MAXRTTS = 10
|
||||||
|
|
||||||
|
DEBUG = 2
|
||||||
|
|
||||||
|
|
||||||
|
def log(host, m):
    """Print a debug line for *host* when the module-level DEBUG is truthy."""
    if not DEBUG:
        return
    print("class log: {} {}".format(host, m))
|
||||||
|
|
||||||
|
|
||||||
|
class Connection:
    """State of one address-family connection (e.g. IPv4 or IPv6) of a host.

    Tracks the current address, recent round-trip times, the time of the
    last heartbeat and a reachability state, and owns an optional asyncio
    timer used to flag the connection as overdue.
    """

    # map of addrs to names
    htab = {}

    UNKNOWN = "unknown"
    UP = "up"
    DOWN = "down"
    OVERDUE = "overdue"

    # Instance attributes that hold live timer state.  They are blanked when
    # pickling and left out of the JSON representation.
    _TIMER_FIELDS = ("overdue_timer", "overdue_callback", "timeout_duration")

    def __init__(self, host, cid, addr, afam):
        """Create a connection record.

        Args:
            host: Owning host object, or a falsy value for a detached
                placeholder instance (no table registration, no DNS update).
            cid: Connection identifier, stored as given.
            addr: Address string; a leading ``::ffff:`` prefix is stripped.
            afam: Address-family label, stored as given.
        """
        self.host = host
        self.cid = cid
        if addr[0:7] == "::ffff:":
            addr = addr[7:]
        self.addr = addr
        self.afam = afam
        self.rtts = [0]
        self.lastbeat = time.time()
        self.statetime = self.lastbeat
        self.deltastatetime = "computed"
        self.state = Connection.UNKNOWN

        # Timer-based reachability monitoring
        self.overdue_timer = None
        self.overdue_callback = None
        self.timeout_duration = None

        if host:
            Connection.htab[addr] = self.host.name
            if self.host.isDynDns():
                log(self.host.name, "dns update %s" % self.addr)
                Host.dnsQ.put((self.host.name, self.addr))

    def __getstate__(self):
        """Prepare Connection for pickling by blanking the timer fields."""
        state = self.__dict__.copy()
        # Timer objects can't be pickled; they are set up again by the next
        # reset_overdue_timer() call after unpickling.
        for field in self._TIMER_FIELDS:
            state[field] = None
        return state

    def __setstate__(self, state):
        """Restore Connection from pickle, making sure the timer fields exist."""
        self.__dict__.update(state)
        # Pickles written before the timer fields existed lack them.
        for field in self._TIMER_FIELDS:
            if not hasattr(self, field):
                setattr(self, field, None)

    def registerDns(self):
        """Queue a DNS update for this connection's current address."""
        Host.dnsQ.put((self.host.name, self.addr))

    def clearstate(self):
        """Return a display dict with every field set to the empty string."""
        return dict.fromkeys(
            (
                "addr",
                "rtt",
                "lastbeat",
                "state",
                "statetime",
                "deltastatetime",
                "rttstate",
            ),
            "",
        )

    def statedict(self, Null=False):
        """Return the display fields of this connection as a dict.

        Args:
            Null: When true, return the all-empty dict from clearstate().

        Returns:
            Dict with the keys ``addr``, ``rtt``, ``lastbeat``, ``state``,
            ``statetime``, ``deltastatetime`` and ``rttstate``.
        """
        d = self.clearstate()
        now = time.time()
        if not Null:
            d["addr"] = self.addr
            if self.rtts[-1]:
                d["rtt"] = "%d" % round(self.rtts[-1])
            elif self.state == Connection.UNKNOWN:
                d["rtt"] = ""
            else:
                d["rtt"] = "?"
            d["lastbeat"] = self.lastbeat
            if self.state == Connection.OVERDUE:
                d["state"] = "<b>%s</b>" % self.state
            else:
                d["state"] = self.state
            if self.state == Connection.UP:
                d["rttstate"] = d["rtt"]
            elif self.state == Connection.OVERDUE:
                d["rttstate"] = ""
            else:
                d["rttstate"] = d["state"]
            d["statetime"] = time.strftime(
                "%Y-%m-%d %H:%M:%S", time.localtime(self.statetime)
            )
            delta = now - self.statetime

            if self.state == Connection.UNKNOWN:
                d["deltastatetime"] = ""
            elif delta > 86400:
                d["deltastatetime"] = "%0.1f days" % (delta / 86400.0)
            elif delta > 3600:
                # Formatted by hand instead of strftime("%k:%M hrs"): "%k"
                # is a platform-specific extension.  The result is the same
                # space-padded hour and zero-padded minute.
                hours, rest = divmod(int(delta), 3600)
                d["deltastatetime"] = "%2d:%02d hrs" % (hours % 24, rest // 60)
            elif delta > 60:
                d["deltastatetime"] = time.strftime("%M:%S mins", time.gmtime(delta))
            else:
                d["deltastatetime"] = "%i secs" % (delta)
            # A connection that is still UNKNOWN more than ten days after its
            # last beat is shown completely blank.
            if self.state == Connection.UNKNOWN and now - self.lastbeat > 86400 * 10:
                d = self.clearstate()

        return d

    def headerdict(self, afam):
        """Return column titles keyed like statedict(); *afam* prefixes the address title."""
        d = {}
        d["addr"] = "%s Addr" % afam
        d["rtt"] = "Latency"
        d["lastbeat"] = "Last Contact"
        d["state"] = "State"
        d["statetime"] = "Last State"
        d["rttstate"] = "Reach"
        d["deltastatetime"] = "Last State"
        return d

    def jsons(self):
        """Serialize connection to JSON, excluding the timer fields.

        The ``host`` back-pointer is replaced by the host's name (or None).
        """
        data = {}
        for key, value in self.__dict__.items():
            if key in self._TIMER_FIELDS:
                continue
            if key == "host":
                data[key] = value.name if value else None
            else:
                data[key] = value
        return json.dumps(data)

    # set new state, return number of secs in previous state
    def newstate(self, state, now, when=0):
        """Switch to *state*, effective at ``now - when``.

        Returns:
            Difference between the new and the previous state-change time.
        """
        self.state = state
        delta = now - when
        s = delta - self.statetime
        self.statetime = delta
        return s

    def getstate(self):
        """Return the current state value."""
        return self.state

    def newaddr(self, addr, rtt, now):
        """Record a heartbeat seen at time *now* from *addr*.

        Appends *rtt* (when not None) to the bounded RTT history.  When the
        address differs from the stored one, the address table is updated
        and, for dynamic-DNS hosts, a DNS update is queued.

        Returns:
            None when the address is unchanged, otherwise a message string
            describing the change.
        """
        self.lastbeat = now
        if rtt is not None:
            self.rtts.append(rtt)
            if len(self.rtts) > MAXRTTS:
                del self.rtts[0]

        if self.addr == addr:
            return None

        r = "changed from %s to %s" % (self.addr, addr)
        # The old address may already be gone from the table.
        Connection.htab.pop(self.addr, None)
        self.addr = addr
        Connection.htab[addr] = self.host.name
        if self.host.isDynDns():
            Host.dnsQ.put((self.host.name, self.addr))
        return r

    def reset_overdue_timer(self, timeout_seconds, callback):
        """Reset the overdue timer for this connection.

        Cancels any existing timer and schedules a new one that awaits
        ``callback(self)`` once *timeout_seconds* have passed.

        Args:
            timeout_seconds: Seconds before the callback is triggered
            callback: Async function to call when the timer expires
        """
        import asyncio

        # Cancel existing timer if any
        if self.overdue_timer and not self.overdue_timer.cancelled():
            self.overdue_timer.cancel()

        # Store parameters for later reference
        self.timeout_duration = timeout_seconds
        self.overdue_callback = callback

        async def timer_expired():
            await callback(self)

        try:
            loop = asyncio.get_event_loop()
            # call_later needs a plain callable, so the coroutine is wrapped
            # in a task when the timer fires.
            self.overdue_timer = loop.call_later(
                timeout_seconds, lambda: asyncio.create_task(timer_expired())
            )
        except RuntimeError:
            # No event loop available yet: the parameters stay stored but
            # no timer is armed.
            pass

    def cancel_overdue_timer(self):
        """Cancel the overdue timer if it exists and clear all timer references."""
        if self.overdue_timer:
            try:
                if not self.overdue_timer.cancelled():
                    self.overdue_timer.cancel()
            except Exception:
                # Best effort: a failing cancel must not prevent the cleanup.
                pass
        # Clear all timer-related references
        self.overdue_timer = None
        self.overdue_callback = None
        self.timeout_duration = None

    def get_avg_rtt(self):
        """Return the mean of the positive RTT samples, or 0 when there are none."""
        valid_rtts = [r for r in self.rtts if r > 0]
        if valid_rtts:
            return sum(valid_rtts) / len(valid_rtts)
        return 0

    def get_current_rtt(self):
        """Return the most recent RTT value, or 0 when the history is empty."""
        return self.rtts[-1] if self.rtts else 0

    def check_rtt_threshold(self, warning_threshold=None, critical_threshold=None):
        """Check if the most recent RTT exceeds the given thresholds.

        Args:
            warning_threshold: RTT above which the level is 'WARNING'
            critical_threshold: RTT above which the level is 'CRITICAL'

        Returns:
            Tuple of (level, rtt_value) where level is None, 'WARNING', or
            'CRITICAL'.  A non-positive RTT never triggers a level.
        """
        rtt = self.get_current_rtt()
        if rtt <= 0:
            return (None, rtt)

        if critical_threshold and rtt > critical_threshold:
            return ("CRITICAL", rtt)
        if warning_threshold and rtt > warning_threshold:
            return ("WARNING", rtt)

        return (None, rtt)
|
||||||
|
|
||||||
|
|
||||||
|
#
|
||||||
|
class Host:
    """A monitored host together with its per-address-family connections.

    Besides connection state, an instance carries received plugin data,
    alert states and access-control lists, and can render itself as a dict,
    as JSON or as HTML table rows.
    """

    # Table of Hosts
    hosts = {}
    # Queue receiving (host name, address) tuples for DNS updates.
    dnsQ = queue.Queue()

    def __init__(self, name):
        """Create a host; a truthy *name* also registers it in ``Host.hosts``.

        Instances created with a falsy name are not registered and do not
        advance the module-level host counter.
        """
        global num
        self.name = name
        if name:
            num += 1
            Host.hosts[name] = self
        self.num = num
        self.dyn = False
        self.watched = False
        self.upcount = 0
        self.interval = 0
        self.doesack = -1
        self.cmds = []
        self.connections = {}
        # Plugin data storage: {plugin_name: [(timestamp, data), ...]}
        self.plugin_data = {}
        self.plugin_retention = 100  # Keep last N samples per plugin
        # Alert state tracking: {metric_path: AlertState}
        self.alert_states = {}
        # User access control
        self.owner: str | None = None  # username of owner
        self.managers: list = []  # usernames with manager role
        self.monitors: list = []  # usernames with monitor role

    def _alert_counts(self):
        """Count this host's alerts by level and acknowledgement status.

        Returns:
            Dict with the keys ``alert_warning_unacked``,
            ``alert_warning_acked``, ``alert_critical_unacked`` and
            ``alert_critical_acked``, in that order.
        """
        counts = {
            "alert_warning_unacked": 0,
            "alert_warning_acked": 0,
            "alert_critical_unacked": 0,
            "alert_critical_acked": 0,
        }
        alert_states = getattr(self, "alert_states", None)
        if not alert_states:
            return counts

        # Imported here to avoid circular imports, and only when there is
        # something to classify.
        from .threshold import AlertLevel

        for alert_state in alert_states.values():
            if alert_state.level == AlertLevel.WARNING:
                prefix = "alert_warning"
            elif alert_state.level == AlertLevel.CRITICAL:
                prefix = "alert_critical"
            else:
                continue
            suffix = "_acked" if alert_state.acknowledged else "_unacked"
            counts[prefix + suffix] += 1
        return counts

    def statedict(self):
        """Return the display fields of this host and both connections.

        Connection fields are stored under ``"IPv4.<field>"`` and
        ``"IPv6.<field>"``; a missing connection contributes empty values.
        """
        d = {}
        d["raw_name"] = self.name
        d["name"] = self.name
        if self.dyn:
            d["name"] += "*"
        if self.watched:
            d["name"] = "<b>%s</b>" % d["name"]
        d["dyn"] = str(self.dyn)
        d["num"] = self.num

        # Add alert counts (split by acknowledged status)
        d.update(self._alert_counts())

        for c in ["IPv4", "IPv6"]:
            if c in self.connections:
                cs = self.connections[c].statedict()
            else:
                cs = ubConnection.statedict(True)
            for csv in cs:
                d["%s.%s" % (c, csv)] = cs[csv]

        return d

    def headerdict(self):
        """Return column titles keyed like statedict()."""
        d = {}
        d["name"] = "Name"
        d["dyn"] = "Dyn"
        d["num"] = "??"
        for c in ["IPv4", "IPv6"]:
            cs = ubConnection.headerdict(c)
            for csv in cs:
                d["%s.%s" % (c, csv)] = cs[csv]
        return d

    def registerDns(self):
        """Queue a DNS update for every connection of this host."""
        for af in self.connections:
            self.connections[af].registerDns()

    def stateinfo(self):
        """Return a dict snapshot of this host.

        ``alert_states`` and ``plugin_data`` are left out.  ``connections``
        becomes a list of per-connection dicts without timer fields and
        with the host back-pointer replaced by its name.  Alert counts, the
        access lists and ``hbc_version`` (from the latest ``os_info``
        plugin sample) are added.
        """
        timer_fields = ("overdue_timer", "overdue_callback", "timeout_duration")
        ddict = {}
        for d in self.__dict__:
            if d in ["alert_states", "plugin_data"]:
                continue
            if d != "connections":
                ddict[d] = self.__dict__[d]
                continue

            cl = []
            for c in ["IPv4", "IPv6"]:
                if c not in self.connections:
                    continue
                conn = self.connections[c]
                cld = {}
                for key, value in conn.__dict__.items():
                    # Skip timer-related fields that can't be serialized
                    if key in timer_fields:
                        continue
                    # Handle host backpointer by converting to name
                    if key == "host":
                        cld[key] = value.name if value else None
                        continue
                    try:
                        cld[key] = copy.deepcopy(value)
                    except Exception:
                        # If deepcopy fails, fall back to the value itself
                        cld[key] = value
                cl.append(cld)
            ddict[d] = cl

        # Add alert counts (computed from alert_states)
        ddict.update(self._alert_counts())

        # User access
        ddict["owner"] = getattr(self, "owner", None)
        ddict["managers"] = list(getattr(self, "managers", []))
        ddict["monitors"] = list(getattr(self, "monitors", []))

        # hbc version from latest os_info plugin data
        hbc_version = None
        latest_os = self.get_latest_plugin_data("os_info")
        if latest_os:
            _, os_data = latest_os
            hbc_version = os_data.get("hbc_version")
        ddict["hbc_version"] = hbc_version

        return ddict

    def jsons(self):
        """Return stateinfo() encoded as a JSON string."""
        return json.dumps(self.stateinfo())

    def isDynDns(self):
        """Return the ``dyn`` flag of this host."""
        return self.dyn

    def isIPv4(self, addr):
        """Return True when *addr* (a string, or a tuple whose first item is
        the address string) contains a dot after its first character."""
        if isinstance(addr, tuple):
            return addr[0].find(".") > 0
        return addr.find(".") > 0

    def conndata(self, cid, addr, rtt, now):
        """Record a heartbeat from *addr* on the matching connection.

        A leading ``::ffff:`` prefix is stripped from *addr*; the
        connection for its address family is created on first use.

        Returns:
            Tuple ``(connection, change_message)`` where the message is
            whatever ``Connection.newaddr`` returned.
        """
        if addr[0:7] == "::ffff:":
            addr = addr[7:]
        afam = "IPv4" if self.isIPv4(addr) else "IPv6"

        if afam not in self.connections:
            self.connections[afam] = Connection(self, cid, addr, afam)

        conn = self.connections[afam]
        res = conn.newaddr(addr, rtt, now)
        return conn, res

    # called when reloading class from pickle, add new fields here
    def fixup(self):
        """Normalize stored addresses and add attributes missing from old pickles."""
        for c in ["IPv4", "IPv6"]:
            if c in self.connections:
                addr = self.connections[c].addr
                if addr[0:7] == "::ffff:":
                    addr = addr[7:]
                self.connections[c].addr = addr

        # Add plugin_data if missing (for backward compatibility)
        if not hasattr(self, "plugin_data"):
            self.plugin_data = {}
        if not hasattr(self, "plugin_retention"):
            self.plugin_retention = 100
        if not hasattr(self, "alert_states"):
            self.alert_states = {}
        # User access fields (added in user-management feature)
        if not hasattr(self, "owner"):
            self.owner = None
        if not hasattr(self, "managers"):
            self.managers = []
        if not hasattr(self, "monitors"):
            self.monitors = []

    def add_plugin_data(self, plugin_name, data, timestamp=None):
        """Store plugin data with timestamp.

        Args:
            plugin_name: Name of the plugin (e.g., "cpu_monitor")
            data: Dict of plugin data
            timestamp: Optional timestamp (default: current time)
        """
        if timestamp is None:
            timestamp = time.time()

        samples = self.plugin_data.setdefault(plugin_name, [])
        samples.append((timestamp, data))

        # Enforce retention limit (keep last N samples)
        if len(samples) > self.plugin_retention:
            self.plugin_data[plugin_name] = samples[-self.plugin_retention:]

    def get_plugin_data(self, plugin_name, limit=None):
        """Retrieve plugin data for a specific plugin.

        Args:
            plugin_name: Name of the plugin
            limit: Optional limit on number of recent samples to return

        Returns:
            List of (timestamp, data) tuples, most recent last
        """
        data = self.plugin_data.get(plugin_name, [])
        if limit and len(data) > limit:
            return data[-limit:]
        return data

    def get_latest_plugin_data(self, plugin_name):
        """Get the most recent plugin data for a plugin.

        Args:
            plugin_name: Name of the plugin

        Returns:
            (timestamp, data) tuple or None if no data
        """
        data = self.plugin_data.get(plugin_name, [])
        return data[-1] if data else None

    def get_all_plugin_data(self):
        """Get all plugin data for this host.

        Returns:
            Dict of {plugin_name: [(timestamp, data), ...]}
        """
        return self.plugin_data

    # ------------------------------------------------------------------
    # User-role helpers
    # ------------------------------------------------------------------

    def apply_access(self, owner, managers, monitors):
        """Set owner/managers/monitors on this host (called from config load)."""
        self.owner = owner
        self.managers = list(managers)
        self.monitors = list(monitors)

    def is_owner(self, username: str) -> bool:
        """Return True when *username* is the owner."""
        return self.owner == username

    def is_manager(self, username: str) -> bool:
        """Return True when *username* is a manager or the owner."""
        return username in self.managers or self.is_owner(username)

    def is_monitor(self, username: str) -> bool:
        """Return True when *username* is a monitor, a manager or the owner."""
        return username in self.monitors or self.is_manager(username)

    def access_dict(self) -> dict:
        """Return owner and copies of the manager/monitor lists as a dict."""
        return {
            "owner": self.owner,
            "managers": list(self.managers),
            "monitors": list(self.monitors),
        }

    # Columns of the HTML host table.  An entry is either a statedict() key
    # or a (key, html attribute string) tuple.
    hostfields_long = [
        "name",
        "IPv4.addr",
        "IPv4.state",
        ("IPv4.rtt", 'style="text-align: right;"'),
        ("IPv4.statetime", 'style="text-align: right;"'),
        "IPv6.addr",
        "IPv6.state",
        ("IPv6.rtt", 'style="text-align: right;"'),
        ("IPv6.statetime", 'style="text-align: right;"'),
    ]

    hostfields_short = [
        "name",
        ("IPv4.rttstate", 'style="text-align: right;"'),
        ("IPv4.deltastatetime", 'style="text-align: right;"'),
        ("IPv6.rttstate", 'style="text-align: right;"'),
        ("IPv6.deltastatetime", 'style="text-align: right;"'),
    ]

    def gene(self, tag, v, attrib=None):
        """Wrap *v* in an HTML element *tag*, with *attrib* as its attributes.

        Neither *v* nor *attrib* is escaped.
        """
        a = " %s" % attrib if attrib else ""
        return "<%s%s>%s</%s>" % (tag, a, v, tag)

    def htmltable(self, tag, hd, short):
        """Render one table row from the dict *hd*.

        Args:
            tag: Cell element name (``"th"`` or ``"td"`` in this class).
            hd: Dict keyed like statedict()/headerdict().
            short: Use the short column set when true, else the long one.
        """
        hostfields = Host.hostfields_short if short else Host.hostfields_long
        h = []
        for f in hostfields:
            if isinstance(f, tuple):
                h.append(self.gene(tag, hd[f[0]], f[1]))
            else:
                h.append(self.gene(tag, hd[f]))
        return self.gene("tr", "\n".join(h))

    def buildhosttable(self, short=False):
        """Return the HTML host table as a list of strings, hosts sorted by name."""
        if DEBUG > 1:
            print("DBG buildhosttable: start")
        res = []
        res.append('<table id="ntable" class="sortable">')
        res.append(ubHost.htmltable("th", ubHost.headerdict(), short))
        for h in sorted(Host.hosts):
            res.append(ubHost.htmltable("td", Host.hosts[h].statedict(), short))
        res.append("</table>")
        if DEBUG > 1:
            print("DBG buildhosttable: %s" % res)
        return res

    def buildmsgtable(self, msgs):
        """Return an HTML fragment (list of strings) with the latest messages.

        The number of messages shown shrinks as the host table grows, but
        never drops below three.
        """
        res = []
        le = max(40 - len(Host.hosts), 3)
        res.append("<h4>Log of Events</h4>")
        # msgs[-le:] yields the last `le` entries, or all of them when there
        # are fewer.  Slicing from len(msgs) - le would go negative for
        # short lists and silently drop messages.
        for m in msgs[-le:]:
            res.append("%s<BR>" % m)
        return res
|
||||||
|
|
||||||
|
|
||||||
|
# create fake "unbound objects", remove in Python 3.0
|
||||||
|
ubHost = Host(None)
|
||||||
|
ubConnection = Connection(None, "", "", "")
|
||||||
+1738
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,342 @@
|
|||||||
|
"""
|
||||||
|
Journal logging for heartbeat messages.
|
||||||
|
|
||||||
|
Provides size-based rotating log files for all received heartbeat messages.
|
||||||
|
Messages are logged in JSON format for easy parsing and analysis.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import asyncio
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, Any, Optional
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class MessageJournal:
    """
    Journal logger for heartbeat messages with size-based rotation.

    Features:
    - Logs received messages (except those with ``ID == 'HTB'``) as one JSON
      object per line
    - Automatic rotation when a write would push the file past the threshold
    - Keeps a configurable number of rotated logs
    - Writers are serialized with an ``asyncio.Lock``: safe across coroutines
      of one event loop, but this is not a thread lock
    - Configurable log directory and file naming

    Configuration keys (read from the ``config`` mapping):
        journal_dir: Directory for journal files (default: /var/log/heartbeat)
        journal_file: Base filename (default: messages.journal)
        journal_max_size: Maximum file size in bytes before rotation
            (default: 100MB)
        journal_max_backups: Number of backup files to keep (default: 10)
        journal_enabled: Enable/disable journaling (default: True)
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Initialize the message journal.

        No file is touched here; call ``initialize()`` to open the journal.

        Args:
            config: Configuration dictionary with journal settings
        """
        self.config = config or {}

        # Configuration options
        self.journal_dir = Path(self.config.get('journal_dir', '/var/log/heartbeat'))
        self.journal_file = self.config.get('journal_file', 'messages.journal')
        self.max_size = self.config.get('journal_max_size', 100 * 1024 * 1024)  # 100MB default
        self.max_backups = self.config.get('journal_max_backups', 10)
        self.enabled = self.config.get('journal_enabled', True)

        # Runtime state
        self._file_handle = None
        self._current_size = 0
        self._lock = asyncio.Lock()
        self._initialized = False

        # Full path to current journal file
        self.journal_path = self.journal_dir / self.journal_file

    async def initialize(self) -> bool:
        """
        Initialize the journal.

        Creates the journal directory if needed and opens the journal file in
        append mode. On failure journaling is disabled (``enabled = False``).

        Returns:
            True if initialization succeeded or journaling is disabled by
            configuration, False if opening the journal failed
        """
        if not self.enabled:
            logger.info("Message journal disabled in configuration")
            return True

        try:
            # Create journal directory if it doesn't exist
            self.journal_dir.mkdir(parents=True, exist_ok=True)

            # Open journal file in append mode
            self._file_handle = open(self.journal_path, 'a', encoding='utf-8')

            # Seed the size counter from the existing file so rotation also
            # accounts for data written by a previous run.
            try:
                self._current_size = os.path.getsize(self.journal_path)
            except OSError:
                self._current_size = 0

            self._initialized = True
            logger.info(f"Message journal initialized: {self.journal_path} "
                        f"(current size: {self._current_size:,} bytes, "
                        f"max: {self.max_size:,} bytes)")
            return True

        except Exception as e:
            logger.error(f"Failed to initialize message journal: {e}")
            self.enabled = False
            return False

    @staticmethod
    def _split_addr(addr):
        """Return ``(ip, port)`` from an address; tolerate non-sequence input."""
        if isinstance(addr, (tuple, list)):
            ip = addr[0] if addr else None
            port = addr[1] if len(addr) > 1 else None
            return ip, port
        return str(addr), None

    async def _write_line(self, line: str) -> bool:
        """Append one line, rotating first if needed. Caller holds ``_lock``.

        Returns:
            True if the line was written, False if no file handle is open
            (for example after a rotation that could not reopen the journal).
        """
        encoded_len = len(line.encode('utf-8'))

        # Never rotate an empty journal: a single entry larger than max_size
        # would otherwise produce an empty backup on every write.
        if self._current_size > 0 and self._current_size + encoded_len > self.max_size:
            await self._rotate()

        if not self._file_handle:
            return False

        self._file_handle.write(line)
        self._file_handle.flush()  # Ensure data is written
        self._current_size += encoded_len
        return True

    async def log_message(
        self,
        msg: Dict[str, Any],
        addr: tuple,
        timestamp: Optional[float] = None
    ):
        """
        Log a received message to the journal.

        Messages whose ``ID`` is ``'HTB'`` are skipped. Errors are logged and
        swallowed so journaling never breaks message handling.

        Args:
            msg: Parsed message dictionary
            addr: Source address (ip, port) tuple
            timestamp: Message timestamp (defaults to current time)
        """
        if not self.enabled or not self._initialized:
            return

        # Skip HTB (heartbeat) messages - too verbose
        if msg.get('ID', '') == 'HTB':
            return

        import time

        async with self._lock:
            try:
                if timestamp is None:
                    timestamp = time.time()

                source_ip, source_port = self._split_addr(addr)
                entry = {
                    'timestamp': timestamp,
                    'datetime': datetime.fromtimestamp(timestamp).isoformat(),
                    'source_ip': source_ip,
                    'source_port': source_port,
                    'message': msg
                }

                # Serialize to JSON (one line per entry)
                json_line = json.dumps(entry, separators=(',', ':')) + '\n'

                if await self._write_line(json_line):
                    # Use the parsed ip rather than addr[0]: indexing a
                    # non-sequence addr here would report a write error for
                    # an entry that was in fact written.
                    logger.debug(f"Logged message from {source_ip}: {msg.get('ID', 'UNKNOWN')}")

            except Exception as e:
                logger.error(f"Error writing to journal: {e}")

    async def _rotate(self):
        """
        Rotate the journal file. Caller holds ``_lock``.

        Renames the current file to ``<journal_file>.<YYYYmmdd-HHMMSS>``,
        opens a fresh journal, and removes old backups exceeding the
        ``max_backups`` limit. If the backup name is already taken (two
        rotations within one second) a numeric suffix is appended so no
        earlier backup is overwritten.
        """
        try:
            # Close current file
            if self._file_handle:
                self._file_handle.close()
                self._file_handle = None

            # Generate backup filename with timestamp
            timestamp_str = datetime.now().strftime('%Y%m%d-%H%M%S')
            backup_path = self.journal_dir / f"{self.journal_file}.{timestamp_str}"
            suffix = 0
            while backup_path.exists():
                suffix += 1
                # Zero-padded so lexicographic order stays chronological.
                backup_path = self.journal_dir / (
                    f"{self.journal_file}.{timestamp_str}.{suffix:03d}"
                )

            # Rename current file to backup
            if self.journal_path.exists():
                self.journal_path.rename(backup_path)
                logger.info(f"Rotated journal: {backup_path} "
                            f"(size: {self._current_size:,} bytes)")

            # Open new journal file
            self._file_handle = open(self.journal_path, 'a', encoding='utf-8')
            self._current_size = 0

            # Clean up old backups
            await self._cleanup_old_backups()

        except Exception as e:
            logger.error(f"Error rotating journal: {e}")
            # Try to reopen the file even if rotation failed, unless a usable
            # handle is already in place.
            if self._file_handle is None or self._file_handle.closed:
                try:
                    self._file_handle = open(self.journal_path, 'a', encoding='utf-8')
                except Exception as e2:
                    logger.error(f"Failed to reopen journal after rotation error: {e2}")
                    self._file_handle = None
                    self.enabled = False

    async def _cleanup_old_backups(self):
        """
        Remove old backup files exceeding max_backups limit.

        Keeps only the most recent backups based on filename (which includes
        the timestamp). Every file matching ``<journal_file>.*`` in the
        journal directory is treated as a backup.
        """
        try:
            # Find all backup files
            backup_pattern = f"{self.journal_file}.*"
            backup_files = sorted(self.journal_dir.glob(backup_pattern))

            # Remove oldest backups if we have too many
            excess = len(backup_files) - self.max_backups
            if excess > 0:
                for backup_file in backup_files[:excess]:
                    try:
                        backup_file.unlink()
                        logger.info(f"Removed old backup: {backup_file.name}")
                    except Exception as e:
                        logger.warning(f"Failed to remove old backup {backup_file}: {e}")

        except Exception as e:
            logger.error(f"Error cleaning up old backups: {e}")

    async def log_threshold_event(
        self,
        host_name: str,
        metric_path: str,
        old_level: str,
        new_level: str,
        value: Any,
        timestamp: Optional[float] = None
    ):
        """
        Log a threshold state change event.

        Uses the same rotation rule as ``log_message``. Errors are logged and
        swallowed.

        Args:
            host_name: Name of the host
            metric_path: Full metric path (e.g., "cpu_monitor.cpu_percent")
            old_level: Previous alert level
            new_level: New alert level
            value: Current metric value
            timestamp: Event timestamp (default: current time)
        """
        if not self.enabled or not self._initialized:
            return

        import time

        try:
            if timestamp is None:
                timestamp = time.time()

            event = {
                'timestamp': timestamp,
                'iso_time': datetime.fromtimestamp(timestamp).isoformat(),
                'event_type': 'threshold',
                'host': host_name,
                'metric': metric_path,
                'old_level': old_level,
                'new_level': new_level,
                'value': value,
            }
            line = json.dumps(event) + '\n'

            async with self._lock:
                await self._write_line(line)

        except Exception as e:
            logger.error(f"Error logging threshold event: {e}")

    async def close(self):
        """
        Close the journal and release resources.

        Should be called during shutdown. Afterwards the journal reports
        itself as not initialized and further log calls are no-ops.
        """
        async with self._lock:
            if self._file_handle:
                try:
                    self._file_handle.close()
                    logger.info("Message journal closed")
                except Exception as e:
                    logger.error(f"Error closing journal: {e}")
                finally:
                    self._file_handle = None
            self._initialized = False

    def get_stats(self) -> Dict[str, Any]:
        """
        Get journal statistics.

        Returns:
            Dictionary with journal stats; ``rotation_threshold`` is the
            current size as a percentage of ``max_size`` (``0.0%`` when
            ``max_size`` is zero or unset).
        """
        if self.max_size:
            fill_pct = self._current_size / self.max_size * 100
        else:
            fill_pct = 0.0
        return {
            'enabled': self.enabled,
            'initialized': self._initialized,
            'current_file': str(self.journal_path),
            'current_size': self._current_size,
            'max_size': self.max_size,
            'max_backups': self.max_backups,
            'rotation_threshold': f"{fill_pct:.1f}%"
        }
|
||||||
|
|
||||||
|
|
||||||
|
# Process-wide journal singleton, created lazily by get_journal().
_journal_instance: Optional[MessageJournal] = None


def get_journal(config: Optional[Dict[str, Any]] = None) -> MessageJournal:
    """
    Return the shared MessageJournal, creating it on first use.

    Args:
        config: Configuration dictionary. Only the first call's value is
            used; it is ignored once the instance exists.

    Returns:
        The global MessageJournal instance
    """
    global _journal_instance
    journal = _journal_instance
    if journal is None:
        journal = _journal_instance = MessageJournal(config)
    return journal
|
||||||
|
|
||||||
|
|
||||||
|
async def log_message(msg: Dict[str, Any], addr: tuple, timestamp: Optional[float] = None):
    """
    Log a message through the global journal (see ``get_journal``).

    Args:
        msg: Parsed message dictionary
        addr: Source address (ip, port) tuple
        timestamp: Message timestamp (defaults to current time)
    """
    await get_journal().log_message(msg, addr, timestamp)
|
||||||
@@ -0,0 +1,535 @@
|
|||||||
|
"""Server runtime: starts UDP listener, HTTP server and websocket stubs."""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import socket
|
||||||
|
import time
|
||||||
|
import signal
|
||||||
|
import sys
|
||||||
|
import ssl
|
||||||
|
from . import __version__
|
||||||
|
|
||||||
|
from . import udp
|
||||||
|
from . import hbdclass
|
||||||
|
|
||||||
|
from . import ws as ws_mod
|
||||||
|
from . import notify as notify_mod
|
||||||
|
from . import data
|
||||||
|
from . import users as users_mod
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
msg_to_websockets = ws_mod.broadcast
|
||||||
|
eventlog = notify_mod.eventlog
|
||||||
|
|
||||||
|
# shared runtime collections and helpers
|
||||||
|
|
||||||
|
def save_state(config, hbdclass):
    """Persist hosts, messages and user sessions to the pickle file.

    The data is written to ``<pickfile>.tmp`` and then moved over the real
    file with ``os.replace``. Failures are logged, never raised.

    Args:
        config: Mapping providing ``pickfile`` (default ``"hbd.pickle"``).
        hbdclass: Module/object exposing ``Host.hosts``.
    """
    import pickle
    import os
    from . import users as users_mod

    # Timer handles and callbacks can't be serialized, so drop them first.
    # NOTE(review): this cancels the overdue timer of every connection on
    # every call, including periodic autosaves; nothing in this function
    # re-arms them. Confirm they are restored elsewhere (e.g. on the next
    # heartbeat), otherwise overdue detection pauses after each save.
    timer_attrs = ('overdue_timer', 'overdue_callback', 'timeout_duration')
    for host in list(hbdclass.Host.hosts.values()):
        for conn in host.connections.values():
            if hasattr(conn, 'cancel_overdue_timer'):
                conn.cancel_overdue_timer()
            for attr in timer_attrs:
                if hasattr(conn, attr):
                    setattr(conn, attr, None)

    target = config.get("pickfile", "hbd.pickle")
    scratch = target + ".tmp"

    try:
        with open(scratch, "wb") as out:
            # One Pickler for all three dumps: its memo is shared across
            # them, so the file must be read back with a single Unpickler.
            pickler = pickle.Pickler(out)
            pickler.dump(hbdclass.Host.hosts)
            pickler.dump(data.msgs)
            pickler.dump(users_mod.save_sessions())
        os.replace(scratch, target)
    except Exception as e:
        logger.error("Failed to save state: %s", e)
        # Best effort: don't leave a partial temp file behind.
        try:
            os.unlink(scratch)
        except Exception:
            pass
|
||||||
|
|
||||||
|
|
||||||
|
def cleanup_function(config, hbdclass):
    """Exit hook: persist the current state via ``save_state``.

    Args:
        config: Configuration mapping forwarded to ``save_state``.
        hbdclass: Module/object exposing ``Host.hosts``.
    """
    logger.info("Running cleanup function...")
    save_state(config, hbdclass)
    logger.info("Cleanup complete.")
|
||||||
|
|
||||||
|
|
||||||
|
async def reload_configuration(config_obj, config_path, components):
    """Reload the configuration file and push it to the running components.

    Steps, in order: reload the config object, update the notify module,
    reload users, re-apply per-host flags and access lists, then reload the
    threshold checker (if present) and purge its stale alerts.

    Not reloadable (restart required): hb_port / hbd_port / ws_port,
    SSL certificates, pickfile, journal settings.
    Effective immediately: notification_channels, threshold_configs, hosts
    (watchhosts, dyndns, notification_channels), grace period, debug/verbose.

    Args:
        config_obj: ReloadableConfig instance
        config_path: Path to config file
        components: Dict with ``threshold_checker`` and other components

    Returns:
        True if reload succeeded, False otherwise. Note that a failure part
        way through leaves the earlier steps already applied.
    """
    try:
        logger.info("Starting configuration reload...")

        new_config = await config_obj.reload(config_path)

        notify_mod.reload_config(new_config)
        users_mod.load_users(new_config)

        # Re-apply host attributes from the updated config to all known hosts
        from . import config as config_mod

        dyn_names = config_mod.get_dyndnshosts(new_config)
        watch_names = config_mod.get_watchhosts(new_config)
        for name, host in hbdclass.Host.hosts.items():
            host.dyn = name in dyn_names
            host.watched = name in watch_names
            acl = config_mod.get_host_access(new_config, name)
            host.apply_access(acl["owner"], acl["managers"], acl["monitors"])

        # Reload threshold checker and prune alerts orphaned by the new config
        if 'threshold_checker' in components:
            checker = components['threshold_checker']
            checker.reload(new_config)
            checker.purge_stale_alerts(hbdclass)

    except Exception as e:
        banner = "=" * 60
        logger.error(banner)
        logger.error(f"Failed to reload configuration: {e}", exc_info=True)
        logger.error("Keeping previous configuration")
        logger.error(banner)
        return False

    logger.info("Configuration reload completed successfully")
    return True
|
||||||
|
|
||||||
|
|
||||||
|
async def _run_async(config, config_path=None):
    """Run the daemon inside the current event loop until shutdown.

    Startup order: signal handlers, notify setup, message journal, threshold
    checker, UDP socket (dual-stack, optional kernel timestamps), connection
    timer restore, HTTP task, DNS worker, WebSocket registration, autosave
    task. The coroutine then waits for a shutdown signal (SIGINT/SIGTERM) or
    reload signal (SIGHUP) and finally tears everything down and saves state.

    Args:
        config: Configuration mapping or ReloadableConfig; a plain mapping is
            wrapped in a ReloadableConfig.
        config_path: Path to the config file, needed for SIGHUP reloads.
    """
    from .config import ReloadableConfig

    if not isinstance(config, ReloadableConfig):
        config = ReloadableConfig(config, config_path)

    loop = asyncio.get_running_loop()
    shutdown_event = asyncio.Event()
    reload_event = asyncio.Event()

    # Signal handlers for graceful shutdown and reload
    def signal_handler(signum, frame):
        sig_name = signal.Signals(signum).name if hasattr(signal, "Signals") else signum
        logger.info(f"Received {sig_name}, initiating shutdown...")
        loop.call_soon_threadsafe(shutdown_event.set)

    def reload_handler(signum, frame):
        sig_name = signal.Signals(signum).name if hasattr(signal, "Signals") else signum
        logger.info(f"Received {sig_name}, initiating config reload...")
        loop.call_soon_threadsafe(reload_event.set)

    # Register signal handlers
    loop.add_signal_handler(signal.SIGINT, signal_handler, signal.SIGINT, None)
    loop.add_signal_handler(signal.SIGTERM, signal_handler, signal.SIGTERM, None)
    loop.add_signal_handler(signal.SIGHUP, reload_handler, signal.SIGHUP, None)

    from . import http as http_mod
    from . import dns as dns_mod
    from . import notify as notify_mod
    from . import journal as journal_mod
    from . import threshold as threshold_mod

    notify_mod.setup(config, loop=loop)

    # Initialize message journal
    msg_journal = journal_mod.get_journal(config)
    await msg_journal.initialize()

    # Initialize threshold checker
    threshold_checker = threshold_mod.ThresholdChecker(
        config=config,
        renotify_interval=config.get("threshold_renotify_interval", 3600),
        journal=msg_journal,
    )
    logger.info("Threshold checker initialized")

    # Components dict for reload orchestration
    components = {
        'threshold_checker': threshold_checker,
        'msg_journal': msg_journal,
    }

    sock = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM)
    # Disable IPV6_V6ONLY to enable dual-stack (listen on IPv4 as well).
    # This option is system-dependent; failure is only a warning.
    try:
        sock.setsockopt(socket.IPPROTO_IPV6, socket.IPV6_V6ONLY, False)
    except OSError as e:
        logger.warning(
            f"Warning: Could not reset IPV6_V6ONLY not supported or dual-stack is unavailable. Error: {e}"
        )

    bind_addr = ("::", config.get("hb_port", 50003))
    sock.bind(bind_addr)
    logger.info("Starting UDP server on %s:%s", *bind_addr)

    # Try to enable kernel receive timestamps (Linux SO_TIMESTAMP).
    # If supported, read datagrams via recvmsg() so RTT uses the kernel
    # timestamp rather than the time.time() call after asyncio scheduling.
    use_kernel_ts = udp.enable_kernel_timestamps(sock)
    if use_kernel_ts:
        logger.info("SO_TIMESTAMP enabled: using kernel receive timestamps for RTT")
    else:
        logger.info("SO_TIMESTAMP not available: using time.time() for RTT")

    def udp_handler(msg, addr, transport, recv_ts=None):
        # The context is rebuilt per datagram so debug/verbose changes from a
        # config reload take effect on the next message.
        ctx = dict(
            config=config,
            hbdclass=hbdclass,
            msg_to_websockets=msg_to_websockets,
            msg_journal=msg_journal,
            threshold_checker=threshold_checker,
            DEBUG=config.get("debug", 0),
            verbose=config.get("verbose", False),
            recv_ts=recv_ts,
        )
        udp.handle_datagram(msg, addr, transport, ctx)

    if use_kernel_ts:
        # recvmsg path: manage the socket ourselves with loop.add_reader()
        sock.setblocking(False)
        transport = udp.RecvmsgTransport(loop, sock)
        reader = udp.make_recvmsg_reader(sock, udp_handler, transport)
        loop.add_reader(sock.fileno(), reader)
    else:
        transport, _protocol = await loop.create_datagram_endpoint(
            lambda: udp.EchoServerProtocol(config=config, handler=udp_handler),
            sock=sock,
        )

    # Restore connection timers for hosts loaded from pickle
    restore_ctx = dict(
        config=config,
        hbdclass=hbdclass,
        msg_to_websockets=msg_to_websockets,
        threshold_checker=threshold_checker,
    )
    udp.restore_connection_timers(hbdclass, restore_ctx)

    # Drop alert states that no longer have a matching threshold (stale after
    # upgrade or config change between runs).
    threshold_checker.purge_stale_alerts(hbdclass)

    async def _http_reload_callback():
        await reload_configuration(config, config_path, components)

    def _log_http_task_failure(task):
        # Without this, a failure inside the HTTP task (e.g. the port is
        # already in use) would only be collected by gather(...,
        # return_exceptions=True) at shutdown and never be reported.
        if task.cancelled():
            return
        exc = task.exception()
        if exc is not None:
            logger.error("HTTP server task failed: %s", exc, exc_info=exc)

    # HTTP server (asyncio-based via aiohttp).
    # Bound up front so the shutdown path below can always reference it.
    http_task = None
    try:
        http_task = asyncio.create_task(
            http_mod.start(
                host=config.get("hbd_host", ""),
                port=config.get("hbd_port", 50004),
                config=config,
                hbdclass=hbdclass,
                tcss=None,
                threshold_checker=threshold_checker,
                verbose=config.get("verbose", False),
                get_now=lambda: time.time(),
                VER="",
                reload_callback=_http_reload_callback,
            )
        )
        http_task.add_done_callback(_log_http_task_failure)
        logger.info(
            "HTTP server started on %s:%s",
            config.get("hbd_host", ""),
            config.get("hbd_port", 50004),
        )
    except Exception as e:
        logger.exception("failed to start HTTP server: %s", e)

    # start dns update worker (async)
    dns_task = None
    try:
        dns_task = dns_mod.start_dns_worker(
            hbdclass, config, log=eventlog, loop=loop
        )
        logger.info("dns update worker started")
    except Exception as e:
        logger.exception("dns worker failed to start: %s", e)

    # Register WebSocket state — connections are served through /ws on the HTTP port
    ws_mod.setup(
        loop=loop,
        get_hosts=lambda: [
            hbdclass.Host.hosts[h].stateinfo()
            for h in sorted(hbdclass.Host.hosts)
        ],
        verbose=config.get("verbose", False),
    )
    logger.info("WebSocket handler registered on /ws (HTTP port %s)", config.get("hbd_port", 50004))

    # Periodic autosave task
    autosave_interval = config.get("autosave_interval", 300)  # default: 5 minutes

    async def autosave_task():
        while True:
            await asyncio.sleep(autosave_interval)
            logger.debug("Autosaving state...")
            save_state(config, hbdclass)
            logger.debug("Autosave complete (%d hosts)", len(hbdclass.Host.hosts))

    autosave = asyncio.create_task(autosave_task())
    logger.info("Autosave task started (interval: %ds)", autosave_interval)

    # Main event loop - monitor shutdown and reload events
    try:
        while True:
            # Wait for either shutdown or reload event
            waiters = [
                asyncio.create_task(shutdown_event.wait()),
                asyncio.create_task(reload_event.wait()),
            ]
            _done, pending = await asyncio.wait(
                waiters, return_when=asyncio.FIRST_COMPLETED
            )
            # The waiter that did not fire is recreated on the next pass.
            for task in pending:
                task.cancel()

            # Shutdown wins if both events are set.
            if shutdown_event.is_set():
                logger.info("Shutdown signal received, stopping services...")
                break

            if reload_event.is_set():
                # Clear the event for next reload
                reload_event.clear()

                # Perform reload if config_path is available
                if config_path:
                    await reload_configuration(config, config_path, components)
                else:
                    logger.warning("Cannot reload: no config path available")

    except Exception as e:
        logger.exception("Error in main loop: %s", e)
    finally:
        # Cancel all running tasks
        logger.info("Cancelling tasks...")
        try:
            transport.close()
        except Exception as e:
            logger.warning("Error closing UDP transport: %s", e)

        tasks_to_cancel = [t for t in (http_task, autosave) if t]
        for task in tasks_to_cancel:
            try:
                task.cancel()
                logger.debug("Cancelled task: %s", task)
            except Exception as e:
                logger.warning("Error cancelling task: %s", e)

        # Wait for tasks to finish cancellation with timeout
        if tasks_to_cancel:
            try:
                await asyncio.wait_for(
                    asyncio.gather(*tasks_to_cancel, return_exceptions=True),
                    timeout=2.0,
                )
            except asyncio.TimeoutError:
                logger.warning("Timeout waiting for tasks to cancel")
            except Exception as e:
                logger.debug("Exception during task cancellation: %s", e)

        # Close message journal
        try:
            await msg_journal.close()
        except Exception as e:
            logger.warning("Error closing message journal: %s", e)

        # Signal DNS worker to exit and await it
        try:
            if dns_task:
                try:
                    # None is the stop sentinel put on the worker queue.
                    hbdclass.Host.dnsQ.put(None)
                except Exception as e:
                    logger.debug("Could not signal DNS worker queue: %s", e)
                try:
                    await asyncio.wait_for(dns_task, timeout=2.0)
                    logger.info("DNS worker finished")
                except asyncio.TimeoutError:
                    logger.warning("Timeout waiting for DNS worker to finish")
                    dns_task.cancel()
                except asyncio.CancelledError:
                    logger.info("DNS worker was cancelled")
                except Exception as e:
                    logger.warning("Error awaiting DNS worker: %s", e)
                finally:
                    # Clear queue bridge to release any held references
                    hbdclass.Host.dnsQ = None
        except Exception as e:
            logger.warning("Error stopping DNS worker: %s", e)

        # Save state (hosts + sessions) on clean shutdown
        try:
            save_state(config, hbdclass)
            logger.info("State saved on shutdown")
        except Exception as e:
            logger.warning("Error saving state on shutdown: %s", e)

        logger.info("All tasks cancelled")
|
||||||
|
|
||||||
|
|
||||||
|
def load_pickled_hosts(config, hbdclass):
    """Load pickled hosts, messages and sessions from file, if available.

    The pickle stream holds three objects in order: the hosts mapping, the
    message list, and (in newer files only) saved user sessions. If loading
    the hosts or messages fails the pickle file is deleted. Afterwards every
    loaded host gets its ``dyn`` / ``watched`` flags and access lists
    re-applied from ``config``.

    Args:
        config: Configuration mapping (``pickfile``, ``verbose``, host lists).
        hbdclass: Module/object exposing ``Host`` and ``Connection``.
    """
    import os
    import pickle
    from . import config as config_mod
    from . import users as users_mod

    pickfile = config.get("pickfile", "hbd.pickle")
    dyndnshosts = config_mod.get_dyndnshosts(config)
    watchhosts = config_mod.get_watchhosts(config)
    verbose = config.get("verbose", False)

    if not os.path.exists(pickfile):
        if verbose:
            logger.info("no pickled data")
        return

    if verbose:
        logger.info("opening pickls %s", pickfile)

    load_failed = False
    # NOTE: pickle executes arbitrary code on load; the pickfile must only
    # ever be written by this service and be protected by file permissions.
    with open(pickfile, "rb") as pickf:
        pick = pickle.Unpickler(pickf)
        try:
            hbdclass.Host.hosts = pick.load()
            data.msgs = pick.load()
            try:
                users_mod.load_sessions(pick.load())
            except Exception as e:
                # older pickle without sessions — fine
                logger.debug("no sessions restored from pickle: %s", e)
        except Exception as e:
            logger.exception("load pickled failed: %s", e)
            load_failed = True

    # Remove the unreadable file only after the handle is closed; unlinking
    # an open file fails on some platforms and the handle used to leak here.
    if load_failed:
        os.unlink(pickfile)

    hbdclass.Connection.htab = {}
    for name, host in list(hbdclass.Host.hosts.items()):
        host.dyn = name in dyndnshosts
        host.watched = name in watchhosts
        host.fixup()
        access = config_mod.get_host_access(config, name)
        host.apply_access(
            access["owner"], access["managers"], access["monitors"]
        )

    if verbose:
        logger.info("%s pickled hosts loaded", len(hbdclass.Host.hosts))
|
||||||
|
|
||||||
|
|
||||||
|
def run(config, config_path=None):
    """Start the hbd service (blocking).

    Configures logging, loads persisted state and users, writes the pidfile,
    then drives ``_run_async`` on a manually managed event loop so shutdown
    is deterministic. The process is terminated with ``os._exit(0)`` at the
    end.

    Args:
        config: Configuration dictionary
        config_path: Path to config file (for reload support)
    """
    import os

    debug = config.get("debug", 0)
    if debug > 0:
        log_level = logging.DEBUG
    elif config.get("verbose", False):
        log_level = logging.INFO
    else:
        log_level = logging.WARNING
    logging.basicConfig(level=log_level)
    if not debug:
        # Keep aiohttp access lines out of the root logger unless debugging.
        logging.getLogger("aiohttp.access").propagate = False

    load_pickled_hosts(config, hbdclass)

    notify_mod.initlog(logfile=config.get("logfile", "messages.log"))
    users_mod.load_users(config)

    # Write pidfile
    pidfile = config.get("pidfile", "")
    if pidfile:
        try:
            with open(pidfile, "w") as fh:
                fh.write(str(os.getpid()))
        except Exception as e:
            logger.warning("Failed to write pidfile %s: %s", pidfile, e)

    eventlog(None, "INFO", f"hbd version {__version__} starting up")

    if config_path:
        logger.info(f"Config file: {config_path} (reload with SIGHUP)")
    else:
        logger.warning("No config path provided - reload via SIGHUP disabled")

    # Create and set the event loop manually
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    try:
        loop.run_until_complete(_run_async(config, config_path=config_path))
    except KeyboardInterrupt:
        logger.info("Received KeyboardInterrupt, shutting down...")
    except Exception as e:
        logger.exception("Unhandled exception in main: %s", e)
    finally:
        cleanup_function(config, hbdclass)
        logger.info("hbd shutdown complete")
        eventlog(None, "INFO", f"hbd version {__version__} shutdown")
        notify_mod.closelog()

        # Remove pidfile
        if pidfile:
            try:
                os.unlink(pidfile)
            except Exception:
                pass

        # Explicitly close the loop
        try:
            # Cancel whatever is still scheduled, then give the loop one more
            # cycle to process the cancellations.
            leftovers = asyncio.all_tasks(loop)
            for task in leftovers:
                task.cancel()
            if leftovers:
                loop.run_until_complete(
                    asyncio.gather(*leftovers, return_exceptions=True)
                )
        except Exception:
            pass
        finally:
            loop.close()

    # Exit
    os._exit(0)
|
||||||
@@ -0,0 +1,28 @@
|
|||||||
|
"""Monitor helper for heartbeat daemon.
|
||||||
|
|
||||||
|
This module provides monitoring tasks for the heartbeat daemon.
|
||||||
|
The primary reachability monitoring is now event-driven (timers set/reset
|
||||||
|
on HTB arrival in udp.py) rather than periodic polling.
|
||||||
|
|
||||||
|
This module can be extended for additional monitoring tasks.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
import asyncio
|
||||||
|
import time
|
||||||
|
from . import notify as notify_mod
|
||||||
|
|
||||||
|
# Seven days expressed in seconds.
# NOTE(review): not referenced in the visible part of this module; presumably
# an overdue-drop threshold used elsewhere — confirm before removing.
DROPOVERDUE = 7 * 24 * 3600
# Module-level alias for the notify module's event logger.
eventlog = notify_mod.eventlog
|
||||||
|
|
||||||
|
|
||||||
|
async def cleanup_connections(hbdclass):
    """Cancel every connection's overdue timer as part of shutdown.

    Walks a snapshot of ``hbdclass.Host.hosts`` and, for each connection that
    exposes ``cancel_overdue_timer``, calls it. Connections without that
    method are left untouched.
    """
    for host in list(hbdclass.Host.hosts.values()):
        for conn in host.connections.values():
            if not hasattr(conn, 'cancel_overdue_timer'):
                continue
            conn.cancel_overdue_timer()
|
||||||
|
|
||||||
@@ -0,0 +1,495 @@
|
|||||||
|
"""Notification helpers: email, pushover, matrix, mattermost, signal, sms and dispatcher.
|
||||||
|
|
||||||
|
Channel types supported:
|
||||||
|
pushover - Pushover app notifications
|
||||||
|
email - SMTP email
|
||||||
|
matrix - Matrix (via matrix-nio)
|
||||||
|
mattermost - Mattermost webhook
|
||||||
|
signal - Signal via signal-cli subprocess
|
||||||
|
sms_voipms - SMS via voip.ms REST API
|
||||||
|
|
||||||
|
Each channel can specify ``min_level: WARNING|CRITICAL`` (default: WARNING).
|
||||||
|
|
||||||
|
Notifications are dispatched to the owner + managers of the host, each via
|
||||||
|
their own ``notification_channels`` list. When no users are configured the
|
||||||
|
server runs silently (no notifications sent).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import smtplib
|
||||||
|
import subprocess
|
||||||
|
import time
|
||||||
|
import sys
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from . import data
|
||||||
|
from . import ws as ws_mod
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
msg_to_websockets = ws_mod.broadcast
|
||||||
|
|
||||||
|
# Module-level state set via setup() and replaced wholesale by reload_config().
_config: dict = {}

# Tracks which channels fired a WARNING/CRITICAL per host.
# {host_name: set of channel_names} — used to route RECOVER to the same channels.
_alerted_channels: dict = {}

# Event-log file object installed by initlog() (sys.stderr when the file
# cannot be opened); None until initlog() has been called.
logf = None
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Level ordering
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
_LEVEL_ORDER = {"RECOVER": 0, "INFO": 0, "WARNING": 1, "CRITICAL": 2}
|
||||||
|
|
||||||
|
def _level_value(level: str) -> int:
|
||||||
|
return _LEVEL_ORDER.get(level.upper(), 0)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Notification dataclass
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@dataclass
class Notification:
    """Payload handed to every channel driver.

    ``title`` and ``body`` carry the text, ``level`` the severity, and ``url``
    an optional link that drivers append only when it is non-empty.
    """

    # Short headline, e.g. "[CRITICAL] webserver01".
    title: str
    # Detail message.
    body: str
    # One of RECOVER | WARNING | CRITICAL | INFO.
    level: str
    # Link to the plugin metrics page; empty string means "no link".
    url: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Module setup
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def setup(cfg: dict, loop: Optional[asyncio.AbstractEventLoop] = None):
|
||||||
|
"""Initialize notifier from configuration dict."""
|
||||||
|
global _config
|
||||||
|
_config = dict(cfg)
|
||||||
|
|
||||||
|
|
||||||
|
def reload_config(cfg: dict):
    """Swap in a fresh shallow copy of *cfg* (called on SIGHUP) and log the reload."""
    global _config
    refreshed = dict(cfg)
    _config = refreshed
    logger.info("Notification configuration reloaded")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Event log (websocket + file + in-memory)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def initlog(logfile):
    """Open *logfile* in append mode and install it as the event-log target.

    When the file cannot be opened, a message is printed and ``sys.stderr``
    is installed instead. Returns whichever file object was installed.
    """
    global logf
    try:
        handle = open(logfile, "a+")
    except Exception as e:
        print("cannot open logfile %s, using STDERR: %s" % (logfile, e))
        handle = sys.stderr
    logf = handle
    return logf
|
||||||
|
|
||||||
|
|
||||||
|
def closelog():
    """Close the event-log file installed by initlog() and forget the handle.

    ``sys.stderr`` is never closed and stays installed. For a real file, the
    handle is closed on a best-effort basis and ``logf`` is reset to ``None``
    so that later ``eventlog()`` calls skip the file write instead of hitting
    a closed file object on every call.
    """
    global logf
    if not logf or logf is sys.stderr:
        return
    try:
        logf.close()
    except Exception:
        # Best-effort shutdown: a failing close must not abort cleanup.
        pass
    finally:
        logf = None
|
||||||
|
|
||||||
|
|
||||||
|
def eventlog(host, lvl, m, service=None):
    """Record one event everywhere events are kept.

    The event is appended to the in-memory list ``data.msgs``, emitted through
    the module logger, written to the event-log file when one is installed,
    and finally broadcast to websocket clients as a ``"message"``.

    Args:
        host: Host name the event refers to; falsy values are stored as None
            and omitted from the text line.
        lvl: Level label placed in the record and the text line.
        m: Message text.
        service: Optional service name stored in the record.
    """
    now = time.time()
    record = {
        "ts": now,
        "host": host or None,
        "level": lvl,
        "service": service,
        "message": m,
    }
    data.msgs.append(record)

    stamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(now))
    host_part = f"{host} " if host else ""
    line = f"{stamp} {lvl} " + host_part + m
    logger.info(line)

    if logf:
        try:
            logf.write(line + "\n")
            logf.flush()
        except Exception as e:
            # A broken log file must not stop the event from being broadcast.
            logger.warning("failed to write to logfile: %s", e)

    msg_to_websockets("message", record)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Low-level channel drivers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _send_pushover(channel_cfg: dict, notif: Notification) -> bool:
    """Send *notif* through the Pushover HTTPS API (blocking).

    Args:
        channel_cfg: Channel settings. ``token`` and ``user`` are required;
            ``sound`` is optional; ``timeout`` (seconds, default 15) bounds
            the socket operations.
        notif: Notification to deliver. A non-empty ``url`` is attached with
            the title "Heartbeat".

    Returns:
        True when the API answers HTTP 200, False on missing credentials, any
        other status, or any exception.
    """
    import http.client
    import urllib.parse

    token = channel_cfg.get("token", "")
    user = channel_cfg.get("user", "")
    if not token or not user:
        logger.warning("pushover: missing token or user")
        return False

    params: dict = {"token": token, "user": user, "title": notif.title, "message": notif.body}
    if channel_cfg.get("sound"):
        params["sound"] = channel_cfg["sound"]
    if notif.url:
        params["url"] = notif.url
        params["url_title"] = "Heartbeat"

    # A socket timeout keeps a stalled API call from pinning the worker
    # thread this blocking driver runs in.
    conn = http.client.HTTPSConnection(
        "api.pushover.net:443", timeout=channel_cfg.get("timeout", 15)
    )
    try:
        conn.request(
            "POST",
            "/1/messages.json",
            urllib.parse.urlencode(params),
            {"Content-type": "application/x-www-form-urlencoded"},
        )
        r = conn.getresponse()
        logger.debug("pushover response: %s %s", r.status, r.reason)
        return r.status == 200
    except Exception as e:
        logger.error("pushover error: %s", e)
        return False
    finally:
        # Always release the socket, on success and on failure alike.
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def _send_email(channel_cfg: dict, notif: Notification) -> bool:
    """Send *notif* as a plain-text e-mail over SMTP (blocking).

    Args:
        channel_cfg: Channel settings. ``recipients`` (list, or a
            comma-separated string), ``sender`` and ``smtp_server`` are
            required. ``smtp_port`` defaults to 587, and STARTTLS is used on
            that port. ``smtp_user``/``smtp_password`` enable login when both
            are set.
        notif: Notification to deliver; a non-empty ``url`` is appended to
            the body.

    Returns:
        True when the message was handed to the server, False on missing
        settings or any failure.
    """
    # Local imports, matching the other drivers in this module.
    from email.message import EmailMessage
    from email.utils import formatdate

    recipients = channel_cfg.get("recipients", [])
    sender = channel_cfg.get("sender", "")
    smtp_server = channel_cfg.get("smtp_server", "")
    smtp_port = channel_cfg.get("smtp_port", 587)
    smtp_user = channel_cfg.get("smtp_user")
    smtp_password = channel_cfg.get("smtp_password")

    # Accept a comma-separated string as well as a list of addresses.
    if isinstance(recipients, str):
        recipients = [r.strip() for r in recipients.split(",") if r.strip()]
    else:
        recipients = list(recipients)

    if not recipients or not sender or not smtp_server:
        logger.warning("email: missing recipients, sender, or smtp_server")
        return False

    body_text = notif.body
    if notif.url:
        body_text += f"\n\n{notif.url}"

    try:
        # EmailMessage produces correctly encoded headers and body, so
        # non-ASCII text no longer breaks the send, and every recipient
        # appears in the To: header.
        msg = EmailMessage()
        msg["To"] = ", ".join(recipients)
        msg["From"] = sender
        # Header values may not contain line breaks; collapse them.
        msg["Subject"] = " ".join(notif.title.splitlines())
        msg["Date"] = formatdate(localtime=True)
        msg.set_content(body_text)

        # The context manager issues QUIT and closes the socket on every
        # path, including failures after the connection is established.
        with smtplib.SMTP(smtp_server, smtp_port) as server:
            if smtp_port == 587:
                server.starttls()
            server.ehlo()
            if smtp_user and smtp_password:
                server.login(smtp_user, smtp_password)
            server.send_message(msg, from_addr=sender, to_addrs=recipients)
        return True
    except Exception as e:
        logger.warning("email send failed: %s", e)
        return False
|
||||||
|
|
||||||
|
|
||||||
|
def _send_mattermost(channel_cfg: dict, notif: Notification) -> bool:
    """Post *notif* to a Mattermost incoming webhook (blocking).

    Args:
        channel_cfg: Channel settings. ``host``, ``token`` (the webhook id)
            and ``channel`` are required. ``username`` defaults to "hbd".
            ``icon`` is optional. ``scheme`` and ``port`` default to "http"
            and 8065, the values that were previously fixed.
        notif: Notification to deliver; a non-empty ``url`` is appended.

    Returns:
        True when the webhook call returns None or an empty string, False on
        a missing driver, missing settings, any other return value, or an
        exception.
    """
    try:
        from mattermostdriver import Driver
    except ImportError:
        logger.error("mattermostdriver not installed")
        return False

    host = channel_cfg.get("host", "")
    token = channel_cfg.get("token", "")
    channel = channel_cfg.get("channel", "")
    if not host or not token or not channel:
        logger.warning("mattermost: missing host, token, or channel")
        return False

    text = f"**{notif.title}**\n{notif.body}"
    if notif.url:
        text += f"\n[Plugin metrics] {notif.url}"

    # Scheme and port are overridable so servers behind TLS or on a
    # non-default port can be reached; the defaults keep existing behaviour.
    ses = {
        "url": host,
        "scheme": channel_cfg.get("scheme", "http"),
        "basepath": "/api/v4",
        "port": channel_cfg.get("port", 8065),
    }
    mm = Driver(ses)
    payload: dict = {"text": text, "channel": channel, "username": channel_cfg.get("username", "hbd")}
    icon = channel_cfg.get("icon")
    if icon:
        payload["icon_url"] = icon
    try:
        rc = mm.webhooks.call_webhook(token, payload)
        return rc is None or rc == ""
    except Exception as e:
        logger.error("mattermost error: %s", e)
        return False
|
||||||
|
|
||||||
|
|
||||||
|
def _send_signal(channel_cfg: dict, notif: Notification) -> bool:
|
||||||
|
cli = channel_cfg.get("cli_path", "/usr/local/bin/signal-cli")
|
||||||
|
user = channel_cfg.get("user", "")
|
||||||
|
recipient = channel_cfg.get("recipient", "")
|
||||||
|
if not user or not recipient:
|
||||||
|
logger.warning("signal: missing user or recipient")
|
||||||
|
return False
|
||||||
|
msg = f"{notif.title}\n{notif.body}"
|
||||||
|
if notif.url:
|
||||||
|
msg += f"\n{notif.url}"
|
||||||
|
try:
|
||||||
|
res = subprocess.run([cli, "-u", user, "send", "-m", msg, recipient], capture_output=True)
|
||||||
|
if res.returncode != 0:
|
||||||
|
logger.error("signal failed: %s", res.stderr.decode())
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
logger.exception("signal exception: %s", e)
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
async def _send_sms_voipms_async(channel_cfg: dict, notif: Notification) -> bool:
    """Send *notif* as an SMS through the voip.ms REST API.

    The request is a multipart form-data POST. The message is
    "<title>: <body>", cut to 160 characters (157 plus "...") when longer.

    Args:
        channel_cfg: Channel settings; ``api_user``, ``api_password``,
            ``did`` and ``dst`` are all required.
        notif: Notification to deliver.

    Returns:
        True when the API answers HTTP 200 with ``"status": "success"``,
        False otherwise (including missing settings and any exception).
    """
    import json
    import aiohttp

    api_user = channel_cfg.get("api_user", "")
    api_password = channel_cfg.get("api_password", "")
    did = channel_cfg.get("did", "")
    dst = channel_cfg.get("dst", "")
    if not (api_user and api_password and did and dst):
        logger.warning("sms_voipms: missing api_user, api_password, did, or dst")
        return False

    # SMS body: title + body, truncated to 160 chars
    text = f"{notif.title}: {notif.body}"
    if len(text) > 160:
        text = text[:157] + "..."

    fields = (
        ("api_username", api_user),
        ("api_password", api_password),
        ("method", "sendSMS"),
        ("did", did),
        ("dst", dst),
        ("message", text),
    )

    try:
        async with aiohttp.ClientSession() as session:
            with aiohttp.MultipartWriter("form-data") as mp:
                for field_name, field_value in fields:
                    part = mp.append(field_value)
                    part.set_content_disposition("form-data", name=field_name)
            async with session.post("https://voip.ms/api/v1/rest.php", data=mp) as resp:
                body = await resp.text()
                if resp.status != 200:
                    logger.error("sms_voipms HTTP %s: %s", resp.status, body)
                    return False
                result = json.loads(body)
                if result.get("status") != "success":
                    logger.error("sms_voipms error: %s", result.get("status"))
                    return False
                return True
    except Exception as e:
        logger.error("sms_voipms exception: %s", e)
        return False
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
async def _send_matrix_async(channel_cfg: dict, notif: Notification) -> bool:
    """Send *notif* to a Matrix room using matrix-nio.

    The message carries a plain-text body and an HTML ``formatted_body``.
    Title, body and URL are HTML-escaped before being placed in the HTML
    variant, so characters such as ``<`` or ``&`` in the notification text
    cannot break or inject markup.

    Args:
        channel_cfg: Channel settings; ``homeserver``, ``access_token`` and
            ``room_id`` are all required.
        notif: Notification to deliver; a non-empty ``url`` is appended (as a
            "Plugin metrics" link in the HTML variant).

    Returns:
        True when the response object has an ``event_id`` attribute, False on
        a missing library, missing settings, any other response, or an
        exception.
    """
    import html as html_mod

    try:
        from nio import AsyncClient
    except ImportError:
        logger.error("matrix-nio not installed; pip install matrix-nio")
        return False

    homeserver = channel_cfg.get("homeserver", "")
    access_token = channel_cfg.get("access_token", "")
    room_id = channel_cfg.get("room_id", "")
    if not homeserver or not access_token or not room_id:
        logger.warning("matrix: missing homeserver, access_token, or room_id")
        return False

    text = f"{notif.title}\n{notif.body}"
    if notif.url:
        text += f"\n{notif.url}"

    formatted = (
        f"<strong>{html_mod.escape(notif.title)}</strong>"
        f"<br>{html_mod.escape(notif.body)}"
    )
    if notif.url:
        # quote=True also escapes '"', keeping the href attribute intact.
        formatted += f'<br><a href="{html_mod.escape(notif.url, quote=True)}">Plugin metrics</a>'

    client = AsyncClient(homeserver)
    client.access_token = access_token
    try:
        content = {
            "msgtype": "m.text",
            "body": text,
            "format": "org.matrix.custom.html",
            "formatted_body": formatted,
        }
        resp = await client.room_send(room_id, "m.room.message", content)
        if hasattr(resp, "event_id"):
            return True
        logger.error("matrix send failed: %s", resp)
        return False
    except Exception as e:
        logger.error("matrix exception: %s", e)
        return False
    finally:
        await client.close()
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Channel dispatcher (all async — sync drivers run in a thread executor)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Sync drivers kept for `hbd notify` CLI usage (asyncio.run wraps them there).
# Maps a channel's "type" value to its blocking driver; _dispatch_to_channel
# runs these through asyncio.to_thread. The async channel types ("matrix",
# "sms_voipms") are handled directly by the dispatcher and are not listed here.
_DRIVERS = {
    "pushover": _send_pushover,
    "email": _send_email,
    "mattermost": _send_mattermost,
    "signal": _send_signal,
}

_TIMEOUT = 15  # seconds per channel send
|
||||||
|
|
||||||
|
|
||||||
|
async def _dispatch_to_channel(channel_name: str, channel_cfg: dict, notif: Notification) -> bool:
    """Deliver *notif* to one named channel, applying its ``min_level`` filter.

    RECOVER notifications bypass the filter. Any other level below the
    channel's ``min_level`` (default WARNING) is skipped and reported as
    success. Async drivers are awaited directly; blocking drivers run in a
    worker thread. Every send is limited to ``_TIMEOUT`` seconds.

    Returns:
        The driver's result, True for an intentionally filtered
        notification, or False for an unknown channel type or a timeout.
    """
    # Strip ownership metadata — notifier drivers only need delivery credentials.
    channel_cfg = {
        key: value
        for key, value in channel_cfg.items()
        if key not in ("owner", "private")
    }

    level = notif.level.upper()
    if level != "RECOVER":
        min_level = channel_cfg.get("min_level", "WARNING").upper()
        if _level_value(level) < _level_value(min_level):
            logger.debug(
                "channel '%s': skipping level %s (min_level=%s)", channel_name, level, min_level
            )
            return True  # filtered intentionally

    kind = channel_cfg.get("type", "")
    try:
        if kind == "matrix":
            pending = _send_matrix_async(channel_cfg, notif)
        elif kind == "sms_voipms":
            pending = _send_sms_voipms_async(channel_cfg, notif)
        else:
            driver = _DRIVERS.get(kind)
            if driver is None:
                logger.warning("unknown channel type '%s' for channel '%s'", kind, channel_name)
                return False
            pending = asyncio.to_thread(driver, channel_cfg, notif)
        return await asyncio.wait_for(pending, timeout=_TIMEOUT)
    except asyncio.TimeoutError:
        logger.error("channel '%s' timed out after %ds", channel_name, _TIMEOUT)
        return False
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Central dispatch function
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _build_url(host_name: str) -> str:
|
||||||
|
base_url = _config.get("base_url", "").rstrip("/")
|
||||||
|
if not base_url:
|
||||||
|
return ""
|
||||||
|
return f"{base_url}/alerts?filter={host_name}"
|
||||||
|
|
||||||
|
|
||||||
|
async def send_notification(host_name: str, notif: Notification) -> dict:
    """Dispatch *notif* to the owner and managers of *host_name*.

    Looks up the host's owner + managers, resolves each user's
    ``notification_channels`` against the global channel definitions, and
    sends to every distinct channel once. Does nothing (returns ``{}``) when
    users are not enabled, the host is unknown, or it has no owner/managers.

    RECOVER handling: when channels are on record as having delivered a
    WARNING/CRITICAL for this host, the RECOVER goes to exactly those
    channels and the record is cleared. With no record, the RECOVER follows
    the normal per-user path.

    A channel is recorded as "alerted" only when the alert was actually
    delivered. A notification skipped by the channel's ``min_level`` still
    counts as success in the result, but is not recorded, so that channel
    does not later receive a RECOVER for an alert it never sent.

    Returns:
        A dict of {channel_name: bool} results.
    """
    from . import users as users_mod
    from . import hbdclass

    if not users_mod.users_enabled():
        return {}

    # Collect recipient usernames: owner + managers
    host = hbdclass.Host.hosts.get(host_name)
    if host is None:
        logger.debug("send_notification: host '%s' not found", host_name)
        return {}

    recipients: set[str] = set()
    owner = getattr(host, "owner", None)
    if owner:
        recipients.add(owner)
    recipients.update(getattr(host, "managers", []))

    if not recipients:
        logger.debug("send_notification: no owner/managers for '%s'", host_name)
        return {}

    # Fill url if not already set
    if not notif.url:
        notif.url = _build_url(host_name)

    global_channels: dict = _config.get("notification_channels", {})
    results: dict = {}
    level = notif.level.upper()
    is_alert = level in ("WARNING", "CRITICAL")
    is_recover = level == "RECOVER"

    # For RECOVER: send to every channel that previously fired an alert for this host,
    # regardless of that channel's min_level.
    if is_recover and host_name in _alerted_channels:
        # Take the record out before awaiting: an alert recorded while the
        # sends below are in flight is then kept instead of being deleted,
        # and a concurrent RECOVER cannot trigger a KeyError.
        alerted = _alerted_channels.pop(host_name)
        for channel_name in list(alerted):
            channel_cfg = global_channels.get(channel_name)
            if not channel_cfg:
                continue
            try:
                ok = await _dispatch_to_channel(channel_name, channel_cfg, notif)
                results[channel_name] = ok
                if ok:
                    logger.info("recover sent to channel '%s': %s", channel_name, notif.title)
            except Exception as e:
                logger.error("error sending recover to channel '%s': %s", channel_name, e)
                results[channel_name] = False
        return results

    for username in recipients:
        user = users_mod.get_user(username)
        if user is None:
            logger.debug("send_notification: user '%s' not found", username)
            continue
        for channel_name in user.notification_channels:
            if channel_name in results:
                # Channel shared by several recipients: send only once.
                continue
            channel_cfg = global_channels.get(channel_name)
            if not channel_cfg:
                logger.warning("channel '%s' not defined in notification_channels", channel_name)
                results[channel_name] = False
                continue
            try:
                ok = await _dispatch_to_channel(channel_name, channel_cfg, notif)
                results[channel_name] = ok
                if not ok:
                    logger.warning("failed to send notification to channel '%s'", channel_name)
                    continue
                # The dispatcher also returns True when it skipped the send
                # because of min_level; apply the same rule here so skipped
                # sends are neither logged as sent nor recorded as alerted.
                min_level = channel_cfg.get("min_level", "WARNING").upper()
                filtered = (not is_recover) and _level_value(level) < _level_value(min_level)
                if filtered:
                    continue
                logger.info("notification sent to channel '%s': %s", channel_name, notif.title)
                if is_alert:
                    _alerted_channels.setdefault(host_name, set()).add(channel_name)
            except Exception as e:
                logger.error("error sending to channel '%s': %s", channel_name, e)
                results[channel_name] = False

    return results
|
||||||
@@ -0,0 +1,254 @@
|
|||||||
|
"""OAuth2 provider support.
|
||||||
|
|
||||||
|
Config shape (in ~/.hb.yaml):
|
||||||
|
|
||||||
|
oauth:
|
||||||
|
my-gitea: # route slug → /login/oauth/my-gitea
|
||||||
|
type: gitea # "gitea" | "github" | "nextcloud"
|
||||||
|
# omit type to default to "gitea"
|
||||||
|
url: https://git.example.com # required for gitea and nextcloud
|
||||||
|
client_id: <client-id>
|
||||||
|
client_secret: <client-secret>
|
||||||
|
label: "Work Gitea" # optional display name on login button
|
||||||
|
logo: https://example.com/logo.png # optional logo URL
|
||||||
|
|
||||||
|
github:
|
||||||
|
type: github
|
||||||
|
client_id: <client-id>
|
||||||
|
client_secret: <client-secret>
|
||||||
|
|
||||||
|
nextcloud:
|
||||||
|
type: nextcloud
|
||||||
|
url: https://cloud.example.com
|
||||||
|
client_id: <client-id>
|
||||||
|
client_secret: <client-secret>
|
||||||
|
|
||||||
|
Register the OAuth app with each provider and set the redirect URI to:
|
||||||
|
https://<hbd-host>/login/oauth/<name>/callback
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import secrets
|
||||||
|
import time
|
||||||
|
import urllib.parse
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
import aiohttp
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
STATE_TTL = 600  # 10 minutes

# state_token -> expiry timestamp
_states: dict[str, float] = {}


def make_state() -> str:
    """Create a fresh CSRF state token, remember it for STATE_TTL seconds, and return it.

    Expired tokens are purged first, so the store is cleaned whenever a new
    token is issued.
    """
    _purge_states()
    fresh = secrets.token_hex(32)
    deadline = time.time() + STATE_TTL
    _states[fresh] = deadline
    return fresh


def validate_state(state: str) -> bool:
    """Consume *state* and report whether it was known and still unexpired.

    The token is removed from the store in every case, so it can be
    validated successfully at most once.
    """
    try:
        deadline = _states.pop(state)
    except KeyError:
        return False
    return time.time() < deadline


def _purge_states() -> None:
    """Drop every stored state token whose expiry time has passed."""
    cutoff = time.time()
    stale = [token for token, deadline in list(_states.items()) if deadline < cutoff]
    for token in stale:
        _states.pop(token)
|
||||||
|
|
||||||
|
|
||||||
|
class OAuthError(Exception):
    """Raised when the OAuth2 flow fails for any reason.

    In this module it is raised by ``exchange_code`` and ``fetch_user`` for
    non-200 responses, network errors, a missing access token, and an
    unexpected profile structure.
    """
|
||||||
|
|
||||||
|
|
||||||
|
# Static per-provider-type definitions, keyed by the config entry's "type".
#   *_url_tmpl        - str.format templates; "{url}" is replaced with the
#                       configured base URL (GitHub's are fixed URLs)
#   scope             - OAuth scope to request; "" means no scope parameter
#   field_map         - maps username/full_name/avatar to the provider's
#                       profile field names (None = provider has no such field)
#   profile_data_path - keys to descend through the profile JSON before
#                       reading the mapped fields ([] = top level)
#   requires_url      - whether the config entry must supply "url"
#   default_label     - label used when the config entry defines none
PROVIDER_DEFS: dict = {
    "gitea": {
        "authorize_url_tmpl": "{url}/login/oauth/authorize",
        "token_url_tmpl": "{url}/login/oauth/access_token",
        "profile_url_tmpl": "{url}/api/v1/user",
        "scope": "user:email",
        "field_map": {"username": "login", "full_name": "full_name", "avatar": "avatar_url"},
        "profile_data_path": [],
        "requires_url": True,
        "default_label": "Gitea",
    },
    "github": {
        "authorize_url_tmpl": "https://github.com/login/oauth/authorize",
        "token_url_tmpl": "https://github.com/login/oauth/access_token",
        "profile_url_tmpl": "https://api.github.com/user",
        "scope": "read:user",
        "field_map": {"username": "login", "full_name": "name", "avatar": "avatar_url"},
        "profile_data_path": [],
        "requires_url": False,
        "default_label": "GitHub",
    },
    "nextcloud": {
        "authorize_url_tmpl": "{url}/apps/oauth2/authorize",
        "token_url_tmpl": "{url}/apps/oauth2/api/v1/token",
        "profile_url_tmpl": "{url}/ocs/v2.php/cloud/user?format=json",
        "scope": "",
        "field_map": {"username": "id", "full_name": "display-name", "avatar": None},
        "profile_data_path": ["ocs", "data"],
        "requires_url": True,
        "default_label": "Nextcloud",
    },
}
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ResolvedProvider:
    """One configured OAuth2 provider with its URLs and credentials resolved."""

    # Config entry key; also the route slug (/login/oauth/<name>).
    name: str
    # Provider type, a key of PROVIDER_DEFS.
    type: str
    # Display name for the login button.
    label: str
    # Optional logo URL ("" when none is configured).
    logo: str
    # Endpoint URLs with the configured base URL filled in.
    authorize_url: str
    token_url: str
    profile_url: str
    # Scope to request; empty means no scope parameter is sent.
    scope: str
    # Application credentials registered with the provider.
    client_id: str
    client_secret: str
    # Maps username/full_name/avatar to the provider's profile field names.
    field_map: dict
    # Keys to descend through the profile JSON before reading fields.
    profile_data_path: list
|
||||||
|
|
||||||
|
|
||||||
|
def get_providers(config: dict) -> list[ResolvedProvider]:
    """Return a ResolvedProvider for every valid entry in config['oauth'].

    Entries with missing required fields or unknown types are skipped with
    a warning log; entries that are not mappings are skipped silently.
    Order follows config declaration order. An entry without ``type`` is
    treated as "gitea".

    A ``url`` that is present but null (an empty YAML value) is treated the
    same as a missing one instead of raising.
    """
    result = []
    oauth_cfg = config.get("oauth", {})
    if not isinstance(oauth_cfg, dict):
        return result

    for name, entry in oauth_cfg.items():
        if not isinstance(entry, dict):
            continue

        provider_type = entry.get("type", "gitea")
        defn = PROVIDER_DEFS.get(provider_type)
        if defn is None:
            logger.warning("OAuth: unknown provider type %r for %r, skipping", provider_type, name)
            continue

        client_id = entry.get("client_id", "")
        client_secret = entry.get("client_secret", "")
        if not client_id or not client_secret:
            logger.warning("OAuth: %r missing client_id or client_secret, skipping", name)
            continue

        # "or ''" covers a null url; str() covers a non-string scalar.
        url = str(entry.get("url") or "").rstrip("/")
        if defn["requires_url"] and not url:
            logger.warning("OAuth: %r requires url but none configured, skipping", name)
            continue

        label = entry.get("label") or defn["default_label"]
        logo = entry.get("logo", "")
        result.append(ResolvedProvider(
            name=name,
            type=provider_type,
            label=label,
            logo=logo,
            authorize_url=defn["authorize_url_tmpl"].format(url=url),
            token_url=defn["token_url_tmpl"].format(url=url),
            profile_url=defn["profile_url_tmpl"].format(url=url),
            scope=defn["scope"],
            client_id=client_id,
            client_secret=client_secret,
            # Copies, so callers cannot mutate the shared definitions.
            field_map=dict(defn["field_map"]),
            profile_data_path=list(defn["profile_data_path"]),
        ))
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def is_enabled(config: dict) -> bool:
    """Tell whether *config* defines at least one usable OAuth provider."""
    providers = get_providers(config)
    return len(providers) > 0
|
||||||
|
|
||||||
|
|
||||||
|
def build_auth_url(provider: ResolvedProvider, state: str, redirect_uri: str) -> str:
    """Build the authorization URL the browser is redirected to.

    The query carries client_id, redirect_uri, response_type=code and state,
    plus scope when the provider defines a non-empty one.
    """
    query = [
        ("client_id", provider.client_id),
        ("redirect_uri", redirect_uri),
        ("response_type", "code"),
        ("state", state),
    ]
    if provider.scope:
        query.append(("scope", provider.scope))
    encoded = urllib.parse.urlencode(query)
    return f"{provider.authorize_url}?{encoded}"
|
||||||
|
|
||||||
|
|
||||||
|
async def exchange_code(provider: ResolvedProvider, code: str, redirect_uri: str) -> str:
    """Exchange an authorization *code* for an access token.

    POSTs the client credentials and code to the provider's token endpoint
    with a 10-second total timeout.

    Returns:
        The access token string.

    Raises:
        OAuthError: on a non-200 response, a network error, a timeout, a
            response body that is not valid JSON or not a JSON object, or a
            response without ``access_token``.
    """
    # Local import: asyncio is only needed here for its TimeoutError.
    import asyncio

    payload = {
        "client_id": provider.client_id,
        "client_secret": provider.client_secret,
        "code": code,
        "grant_type": "authorization_code",
        "redirect_uri": redirect_uri,
    }
    timeout = aiohttp.ClientTimeout(total=10)
    try:
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.post(
                provider.token_url,
                json=payload,
                headers={"Accept": "application/json"},
            ) as resp:
                if resp.status != 200:
                    text = await resp.text()
                    raise OAuthError(f"Token exchange failed ({resp.status}): {text}")
                data = await resp.json()
    except asyncio.TimeoutError as exc:
        # The total timeout is not an aiohttp.ClientError; without this
        # clause it would escape as a bare TimeoutError.
        raise OAuthError("Token exchange timed out") from exc
    except aiohttp.ClientError as exc:
        raise OAuthError(f"Token exchange network error: {exc}") from exc
    except ValueError as exc:
        # Malformed JSON in a 200 response.
        raise OAuthError(f"Token exchange returned invalid JSON: {exc}") from exc

    token = data.get("access_token") if isinstance(data, dict) else None
    if not token:
        raise OAuthError(f"No access_token in response: {data}")
    return token
|
||||||
|
|
||||||
|
|
||||||
|
async def fetch_user(provider: ResolvedProvider, token: str) -> dict:
    """Fetch the authenticated user's profile from the provider.

    Sends a GET with the bearer *token* (10-second total timeout), descends
    through ``provider.profile_data_path`` in the JSON reply, and maps the
    provider's field names through ``provider.field_map``.

    Returns:
        A dict with keys: login, full_name, avatar_url. Fields missing from
        the profile are "". ``avatar_url`` is "" when the provider maps no
        avatar field.

    Raises:
        OAuthError: on a non-200 response, a network error, a timeout,
            invalid JSON, or an unexpected profile structure.
    """
    # Local import: asyncio is only needed here for its TimeoutError.
    import asyncio

    timeout = aiohttp.ClientTimeout(total=10)
    try:
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(
                provider.profile_url,
                headers={
                    "Authorization": f"Bearer {token}",
                    "Accept": "application/json",
                },
            ) as resp:
                if resp.status != 200:
                    text = await resp.text()
                    raise OAuthError(f"User fetch failed ({resp.status}): {text}")
                data = await resp.json()
    except asyncio.TimeoutError as exc:
        # The total timeout is not an aiohttp.ClientError; without this
        # clause it would escape as a bare TimeoutError.
        raise OAuthError("User fetch timed out") from exc
    except aiohttp.ClientError as exc:
        raise OAuthError(f"User fetch network error: {exc}") from exc
    except ValueError as exc:
        # Malformed JSON in a 200 response.
        raise OAuthError(f"User fetch returned invalid JSON: {exc}") from exc

    try:
        for key in provider.profile_data_path:
            data = data.get(key, {})
        avatar_field = provider.field_map.get("avatar")
        return {
            "login": data.get(provider.field_map["username"], ""),
            "full_name": data.get(provider.field_map["full_name"], ""),
            "avatar_url": data.get(avatar_field, "") if avatar_field else "",
        }
    except AttributeError as exc:
        # Some level of the reply was not a mapping (e.g. a list or null).
        raise OAuthError(
            f"Unexpected profile response structure from {provider.type}"
        ) from exc
|
||||||
@@ -0,0 +1,498 @@
|
|||||||
|
"""Settings descriptor: maps config keys to display metadata.
|
||||||
|
|
||||||
|
``get_settings_sections(config)`` returns an ordered list of sections, each
|
||||||
|
containing a list of field descriptors. The template iterates this structure
|
||||||
|
generically, so adding editability later is a matter of:
|
||||||
|
|
||||||
|
1. Setting ``"editable": True`` on a field.
|
||||||
|
2. Adding the matching ``<input>``/``<select>`` in the template
|
||||||
|
(guided by ``"type"``).
|
||||||
|
3. Wiring a POST handler in http.py.
|
||||||
|
|
||||||
|
Field descriptor keys
|
||||||
|
---------------------
|
||||||
|
key str Config key (for future form POST matching)
|
||||||
|
label str Human-readable label
|
||||||
|
description str One-line help text shown below the value
|
||||||
|
value any Sanitized display value (secrets replaced with "•••")
|
||||||
|
type str One of: text | number | port | boolean | path | duration |
|
||||||
|
list | secret | size | select
|
||||||
|
editable bool Whether the field is flagged as editable (set per field; defaults to False)
|
||||||
|
sensitive bool True when the raw value must never be shown
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Credential field names that should always be masked.
|
||||||
|
_SECRET_KEYS = frozenset({
|
||||||
|
"password", "token", "user_key", "api_key", "secret",
|
||||||
|
"smtp_password", "smtp_user", "api_password", "access_token",
|
||||||
|
})
|
||||||
|
|
||||||
|
CHANNEL_TYPE_SCHEMAS = {
|
||||||
|
"pushover": {
|
||||||
|
"label": "Pushover",
|
||||||
|
"fields": [
|
||||||
|
{"key": "token", "label": "App token", "type": "secret", "required": True},
|
||||||
|
{"key": "user", "label": "User key", "type": "secret", "required": True},
|
||||||
|
{"key": "sound", "label": "Sound", "type": "text", "required": False},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
"email": {
|
||||||
|
"label": "E-mail",
|
||||||
|
"fields": [
|
||||||
|
{"key": "recipients", "label": "Recipients (comma-separated)", "type": "list", "required": True},
|
||||||
|
{"key": "sender", "label": "From address", "type": "text", "required": True},
|
||||||
|
{"key": "smtp_server", "label": "SMTP server", "type": "text", "required": True},
|
||||||
|
{"key": "smtp_port", "label": "SMTP port", "type": "port", "required": False},
|
||||||
|
{"key": "smtp_user", "label": "SMTP username", "type": "text", "required": False},
|
||||||
|
{"key": "smtp_password", "label": "SMTP password", "type": "secret", "required": False},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
"signal": {
|
||||||
|
"label": "Signal",
|
||||||
|
"fields": [
|
||||||
|
{"key": "user", "label": "Sender number", "type": "text", "required": True},
|
||||||
|
{"key": "recipient", "label": "Recipient number", "type": "text", "required": True},
|
||||||
|
{"key": "cli_path", "label": "signal-cli path", "type": "text", "required": False},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
"matrix": {
|
||||||
|
"label": "Matrix",
|
||||||
|
"fields": [
|
||||||
|
{"key": "homeserver", "label": "Homeserver URL", "type": "text", "required": True},
|
||||||
|
{"key": "access_token", "label": "Access token", "type": "secret", "required": True},
|
||||||
|
{"key": "room_id", "label": "Room ID", "type": "text", "required": True},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
"sms_voipms": {
|
||||||
|
"label": "SMS (voip.ms)",
|
||||||
|
"fields": [
|
||||||
|
{"key": "api_user", "label": "API username", "type": "text", "required": True},
|
||||||
|
{"key": "api_password", "label": "API password", "type": "secret", "required": True},
|
||||||
|
{"key": "did", "label": "DID (from)", "type": "text", "required": True},
|
||||||
|
{"key": "dst", "label": "Destination", "type": "text", "required": True},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
"mattermost": {
|
||||||
|
"label": "Mattermost",
|
||||||
|
"fields": [
|
||||||
|
{"key": "host", "label": "Host", "type": "text", "required": True},
|
||||||
|
{"key": "token", "label": "Webhook token", "type": "secret", "required": True},
|
||||||
|
{"key": "channel", "label": "Channel", "type": "text", "required": True},
|
||||||
|
{"key": "username", "label": "Bot username", "type": "text", "required": False},
|
||||||
|
{"key": "icon", "label": "Icon URL", "type": "text", "required": False},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
_CHANNEL_TYPE_LABELS = {k: v["label"] for k, v in CHANNEL_TYPE_SCHEMAS.items()}
|
||||||
|
|
||||||
|
|
||||||
|
def _mask(value):
|
||||||
|
"""Return a masked placeholder for sensitive values."""
|
||||||
|
if not value:
|
||||||
|
return ""
|
||||||
|
return "•••"
|
||||||
|
|
||||||
|
|
||||||
|
def _fmt_size(n):
|
||||||
|
"""Format a byte count as a human-readable string."""
|
||||||
|
try:
|
||||||
|
n = int(n)
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
return str(n)
|
||||||
|
for unit in ("B", "KB", "MB", "GB"):
|
||||||
|
if n < 1024:
|
||||||
|
return f"{n} {unit}"
|
||||||
|
n //= 1024
|
||||||
|
return f"{n} TB"
|
||||||
|
|
||||||
|
|
||||||
|
def _fmt_duration(seconds):
|
||||||
|
"""Format seconds into a human-readable duration string."""
|
||||||
|
try:
|
||||||
|
s = int(seconds)
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
return str(seconds)
|
||||||
|
if s < 60:
|
||||||
|
return f"{s}s"
|
||||||
|
if s < 3600:
|
||||||
|
m, sec = divmod(s, 60)
|
||||||
|
return f"{m}m {sec}s" if sec else f"{m}m"
|
||||||
|
h, rem = divmod(s, 3600)
|
||||||
|
m = rem // 60
|
||||||
|
return f"{h}h {m}m" if m else f"{h}h"
|
||||||
|
|
||||||
|
|
||||||
|
def _sanitize_channel(name, cfg):
    """Return a copy of a channel config with secret values masked.

    Keys listed in ``_SECRET_KEYS`` have their value replaced via ``_mask``;
    every other value (lists included) is carried over unchanged.  ``name``
    is accepted but not used.
    """
    return {
        key: (_mask(val) if key in _SECRET_KEYS else val)
        for key, val in cfg.items()
    }
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Public API
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def get_settings_sections(config: dict, threshold_checker=None) -> list:
    """Return the ordered list of setting sections for the settings page.

    Args:
        config: Server configuration mapping.  Values are only read (via
            ``config.get``).
        threshold_checker: Optional object exposing ``threshold_configs``,
            ``threshold_raw_configs`` and ``thresholds`` mappings.  When it is
            ``None`` the thresholds section lists no configurations.

    Returns:
        A list of section dicts, each shaped like::

            {
                "id": str,
                "title": str,
                "description": str,
                "section_mode": str,
                "api_section": str | None,
                "fields": [field_descriptor, ...],
                # plus, where applicable, one section-specific list:
                # "users", "providers", "channels", "hosts",
                # "threshold_configs"
            }

        Each field_descriptor::

            {
                "key": str,
                "label": str,
                "description": str,
                "value": display_value,
                "raw": raw_config_value,   # None for sensitive
                "type": str,
                "editable": bool,
                "sensitive": bool,
            }
    """

    def _as_list(value):
        # Normalise a config value to a list.  A bare string must become a
        # one-element list: list("a.example") would split it into characters.
        if not value:
            return []
        if isinstance(value, list):
            return value
        if isinstance(value, (str, bytes)):
            return [value]
        try:
            return list(value)
        except TypeError:
            # Non-iterable scalar (e.g. a number): wrap instead of crashing.
            return [value]

    def field(key, label, ftype, description="", editable=False, sensitive=False):
        raw = config.get(key)
        if sensitive:
            display = _mask(raw)
            raw_out = None
        elif ftype == "size":
            display = _fmt_size(raw)
            raw_out = raw
        elif ftype == "duration":
            display = _fmt_duration(raw)
            raw_out = raw
        elif ftype == "boolean":
            display = bool(raw)
            raw_out = raw
        elif ftype == "list":
            display = _as_list(raw)
            raw_out = display
        else:
            display = raw if raw is not None else ""
            raw_out = raw
        return {
            "key": key,
            "label": label,
            "description": description,
            "value": display,
            "raw": raw_out,
            "type": ftype,
            "editable": editable,
            "sensitive": sensitive,
        }

    # ---- Notification channels (complex, built separately) ----------------
    _METADATA_KEYS = {"type", "owner", "private", "min_level"}
    notif_channels = []
    for ch_name, ch_cfg in sorted((config.get("notification_channels") or {}).items()):
        if not isinstance(ch_cfg, dict):
            continue
        # A null or non-string "type" must not break the label lookup below.
        ch_type = ch_cfg.get("type") or ""
        type_name = str(ch_type)
        # Fields the schema marks as "secret" for this channel type are masked
        # too, in addition to the globally known secret key names.
        schema_fields = CHANNEL_TYPE_SCHEMAS.get(type_name, {}).get("fields", [])
        schema_secrets = {
            spec["key"] for spec in schema_fields if spec.get("type") == "secret"
        }
        fields = []
        for k, v in ch_cfg.items():
            if k in _METADATA_KEYS:
                continue
            sensitive = k in _SECRET_KEYS or k in schema_secrets
            if sensitive:
                shown = _mask(v)
            elif isinstance(v, list):
                # Items may be non-strings (e.g. numbers from the config file).
                shown = ", ".join(str(item) for item in v)
            else:
                shown = str(v)
            fields.append({
                "key": k,
                "label": k.replace("_", " ").title(),
                "value": shown,
                "sensitive": sensitive,
            })
        notif_channels.append({
            "name": ch_name,
            "type": ch_type,
            "type_label": _CHANNEL_TYPE_LABELS.get(type_name, type_name.title()),
            "owner": ch_cfg.get("owner"),
            "private": bool(ch_cfg.get("private", False)),
            "min_level": ch_cfg.get("min_level", "WARNING"),
            "fields": fields,
        })

    # ---- Users (show metadata only, never password hashes) ----------------
    users_list = []
    for username, attrs in (config.get("users") or {}).items():
        if not isinstance(attrs, dict):
            continue
        users_list.append({
            "username": username,
            "full_name": attrs.get("full_name", ""),
            "admin": bool(attrs.get("admin", False)),
            "avatar": attrs.get("avatar", ""),
            "notification_channels": attrs.get("notification_channels", []),
        })

    # ---- Threshold configurations -----------------------------------------
    def _tc_to_row(tc):
        return {
            "metric": tc.metric_path,
            "operator": tc.operator.value,
            "warning": tc.warning,
            "critical": tc.critical,
            "hysteresis": tc.hysteresis,
            "count": tc.count,
            "enabled": tc.enabled,
            "display": tc.display or "",
            "grace": tc.grace,
        }

    threshold_config_list = []
    if threshold_checker is not None:
        if threshold_checker.threshold_configs:
            for cfg_name, cfg_metrics in sorted(threshold_checker.threshold_configs.items()):
                # For the default config use the merged effective set;
                # for named overrides use only the explicitly defined metrics
                # (threshold_raw_configs) so inherited defaults are not repeated.
                if cfg_name == "default":
                    display_metrics = cfg_metrics
                else:
                    display_metrics = threshold_checker.threshold_raw_configs.get(cfg_name, cfg_metrics)
                metrics = sorted(
                    [_tc_to_row(tc) for tc in display_metrics.values()],
                    key=lambda m: m["metric"],
                )
                threshold_config_list.append({"name": cfg_name, "metrics": metrics})
        elif threshold_checker.thresholds:
            metrics = sorted(
                [_tc_to_row(tc) for tc in threshold_checker.thresholds.values()],
                key=lambda m: m["metric"],
            )
            threshold_config_list.append({"name": "default", "metrics": metrics})

    # ---- Hosts summary ----------------------------------------------------
    hosts_list = []
    for hname, hcfg in sorted((config.get("hosts") or {}).items()):
        if not isinstance(hcfg, dict):
            continue
        # "threshold_config" may be a single name or a list of names.
        tc_ref = hcfg.get("threshold_config")
        if isinstance(tc_ref, list):
            tc_names = list(tc_ref)
        elif tc_ref:
            tc_names = [tc_ref]
        else:
            tc_names = []
        hosts_list.append({
            "name": hname,
            "watch": bool(hcfg.get("watch", True)),
            "dyndns": bool(hcfg.get("dyndns", False)),
            "owner": hcfg.get("owner", ""),
            "managers": hcfg.get("managers", []),
            "monitors": hcfg.get("monitors", []),
            "threshold_configs": tc_names,
            "notification_channels": hcfg.get("notification_channels", []),
        })

    # ---- OAuth providers -------------------------------------------------------
    oauth_providers = []
    for pname, pattrs in (config.get("oauth") or {}).items():
        if not isinstance(pattrs, dict):
            continue
        oauth_providers.append({
            "name": pname,
            "type": pattrs.get("type", "gitea"),
            "url": pattrs.get("url", ""),
            "client_id": pattrs.get("client_id", ""),
            "client_secret": _mask(pattrs.get("client_secret", "")),
            "label": pattrs.get("label", ""),
            "logo": pattrs.get("logo", ""),
        })

    return [
        {
            "id": "network",
            "title": "Network",
            "description": "Ports and bind addresses for all server sockets.",
            "section_mode": "form",
            "api_section": "server",
            "fields": [
                field("hb_port", "Heartbeat UDP port", "port",
                      "UDP port the server listens on for heartbeat datagrams.", editable=True),
                field("hbd_host", "HTTP bind address", "text",
                      "Interface to bind the HTTP server to. Empty = all interfaces.", editable=True),
                field("hbd_port", "HTTP API port", "port",
                      "TCP port for the HTTP API and web UI.", editable=True),
                field("ws_port", "WebSocket port", "port",
                      "TCP port for the plain WebSocket server.", editable=True),
                field("wss_port", "Secure WebSocket port", "port",
                      "TCP port for WSS (TLS WebSocket). Leave empty to disable.", editable=True),
            ],
        },
        {
            "id": "tls",
            "title": "TLS / WebSocket Security",
            "description": "Certificate paths used when wss_port is set.",
            "section_mode": "form",
            "api_section": None,
            "fields": [
                field("cert_path", "Certificate directory", "path",
                      "Directory containing the TLS certificate and key files."),
                field("wss_pem", "Certificate file", "text",
                      "Filename of the TLS certificate chain (PEM format)."),
                field("wss_key", "Key file", "text",
                      "Filename of the TLS private key (PEM format)."),
            ],
        },
        {
            "id": "monitoring",
            "title": "Monitoring",
            "description": "Heartbeat timing and alert re-notification behaviour.",
            "section_mode": "form",
            "api_section": "server",
            "fields": [
                field("interval", "Heartbeat interval", "duration",
                      "Expected time between heartbeat messages from each client.", editable=True),
                field("grace", "Grace period", "number",
                      "Extra seconds to wait after a missed heartbeat before sending notifications.", editable=True),
                field("threshold_renotify_interval", "Re-notify interval", "duration",
                      "How often to re-send notifications for ongoing threshold alerts.", editable=True),
                field("autosave_interval", "Autosave interval", "duration",
                      "How often the server saves its state to disk."),
                field("base_url", "Base URL", "text",
                      "Base URL for notification links.", editable=True),
            ],
        },
        {
            "id": "persistence",
            "title": "Persistence & Logging",
            "description": "State file and event log settings.",
            "section_mode": "form",
            "api_section": "server",
            "fields": [
                field("pickfile", "State file", "path",
                      "Path to the pickle file used to persist host state across restarts.", editable=True),
                field("logfile", "Event log", "path",
                      "Path to the event log file.", editable=True),
            ],
        },
        {
            "id": "journal",
            "title": "Message Journal",
            "description": "All received heartbeat and plugin messages are journalled here.",
            "section_mode": "form",
            "api_section": "server",
            "fields": [
                field("journal_enabled", "Enabled", "boolean",
                      "Turn journalling on or off.", editable=True),
                field("journal_dir", "Journal directory", "path",
                      "Directory where journal files are written.", editable=True),
                field("journal_file", "Journal filename", "text",
                      "Base filename for the journal (rotated copies get a numeric suffix)."),
                field("journal_max_size", "Max file size", "size",
                      "Rotate the journal when it exceeds this size.", editable=True),
                field("journal_max_backups", "Backup count", "number",
                      "Number of rotated journal files to keep.", editable=True),
            ],
        },
        {
            "id": "dns",
            "title": "Dynamic DNS",
            "description": "nsupdate-based DNS registration via nsupdate(8).",
            "section_mode": "form",
            "api_section": "dns",
            "fields": [
                field("nsupdate_bin", "nsupdate binary", "path",
                      "Path to the nsupdate binary.", editable=True),
                field("rndc_key", "RNDC key file", "path",
                      "Path to the rndc key file used to authenticate DNS updates.", editable=True),
                field("dyndomains", "Dynamic domains", "list",
                      "Domains updated via nsupdate when a host with dyndns: true reports in.",
                      editable=True),
            ],
        },
        {
            "id": "users",
            "title": "Users",
            "description": "Accounts defined in the config file. Password hashes are never shown.",
            "section_mode": "form",
            "api_section": "users",
            "users": users_list,
            "fields": [
                field("default_owner", "Default owner", "text",
                      "Username that owns hosts with no explicit owner. "
                      "Falls back to the first admin user.", editable=True),
            ],
        },
        {
            "id": "oauth",
            "title": "OAuth Providers",
            "description": "OAuth2 login providers. Client secrets are masked.",
            "section_mode": "form",
            "api_section": "oauth",
            "providers": oauth_providers,
            "fields": [],
        },
        {
            "id": "channels",
            "title": "Notification Channels",
            "description": "Named notification providers. Credentials are masked.",
            "section_mode": "channels",
            "api_section": "notification_channels",
            "channels": notif_channels,
            "fields": [
                field("default_notification_channels", "Default channels", "list",
                      "Channels used when a host does not specify its own."),
            ],
        },
        {
            "id": "hosts",
            "title": "Hosts",
            "description": "Host definitions loaded from the config file.",
            "section_mode": "hosts",
            "api_section": "hosts",
            "hosts": hosts_list,
            "fields": [],
        },
        {
            "id": "thresholds",
            "title": "Threshold Configurations",
            "description": "Named alert threshold sets. Each defines warning/critical levels per metric.",
            "section_mode": "thresholds",
            "api_section": "thresholds",
            "threshold_configs": threshold_config_list,
            "fields": [
                field("default_threshold_config", "Default config", "text",
                      "Threshold config used for hosts with no explicit mapping.", editable=True),
            ],
        },
        {
            "id": "runtime",
            "title": "Runtime",
            "description": "Flags set at startup (require restart to change).",
            "section_mode": "form",
            "api_section": None,
            "fields": [
                field("foreground", "Foreground mode", "boolean",
                      "Run in the foreground instead of daemonising."),
                field("verbose", "Verbose logging", "boolean",
                      "Enable verbose log output."),
                field("debug", "Debug level", "number",
                      "0 = off. Higher values increase log verbosity."),
            ],
        },
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def get_settings_data(config: dict, threshold_checker=None) -> dict:
    """Bundle the settings sections with the name lists the template needs.

    Returns a dict with ``"sections"`` (from ``get_settings_sections``) and
    the sorted key lists of the ``notification_channels``, ``users`` and
    ``threshold_configs`` config mappings (a missing or empty mapping yields
    an empty list).
    """

    def _sorted_keys(section_key):
        return sorted((config.get(section_key) or {}).keys())

    return {
        "sections": get_settings_sections(config, threshold_checker=threshold_checker),
        "all_channel_names": _sorted_keys("notification_channels"),
        "all_usernames": _sorted_keys("users"),
        "all_threshold_configs": _sorted_keys("threshold_configs"),
    }
|
||||||
Binary file not shown.
|
After Width: | Height: | Size: 181 KiB |
@@ -140,3 +140,68 @@
|
|||||||
float: left;
|
float: left;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* ── Responsive / mobile ── */
|
||||||
|
|
||||||
|
/* Suppress the global transition on mobile to avoid sluggish feel */
|
||||||
|
@media (max-width: 640px) {
|
||||||
|
* { transition: none !important; }
|
||||||
|
|
||||||
|
html, body {
|
||||||
|
overflow: auto;
|
||||||
|
height: auto;
|
||||||
|
font-size: 16px; /* prevent iOS auto-zoom on inputs */
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Pages that use flex-column full-viewport layout need to relax on mobile */
|
||||||
|
body[style*="height: 100vh"],
|
||||||
|
body {
|
||||||
|
height: auto !important;
|
||||||
|
min-height: 100vh;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Containers: full width, no fixed heights */
|
||||||
|
.container {
|
||||||
|
max-width: 100% !important;
|
||||||
|
max-height: none !important;
|
||||||
|
overflow: visible !important;
|
||||||
|
padding: 8px !important;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Log section: fixed reasonable height instead of flex-grow */
|
||||||
|
.log-section {
|
||||||
|
flex: none !important;
|
||||||
|
max-height: 40vh !important;
|
||||||
|
overflow-y: auto !important;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Table section: allow vertical scroll, cap height */
|
||||||
|
.table-section {
|
||||||
|
max-height: 55vh !important;
|
||||||
|
overflow-y: auto !important;
|
||||||
|
overflow-x: auto !important;
|
||||||
|
padding: 8px !important;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Slightly larger tap targets in tables */
|
||||||
|
#ntable td, #ntable th {
|
||||||
|
padding: 4px 6px !important;
|
||||||
|
font-size: 1.00em !important;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Cards on plugin/alerts pages */
|
||||||
|
.host-card, .alert-card, .card {
|
||||||
|
padding: 10px !important;
|
||||||
|
margin-bottom: 8px !important;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Settings page tables */
|
||||||
|
table { width: 100%; }
|
||||||
|
|
||||||
|
h1 { font-size: 1.2em !important; }
|
||||||
|
h2 { font-size: 1em !important; }
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Suppress nav-username text on very narrow screens — avatar/initials is enough */
|
||||||
|
@media (max-width: 400px) {
|
||||||
|
.nav-username { display: none; }
|
||||||
|
}
|
||||||
@@ -0,0 +1,212 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
{% include 'head.html' %}
|
||||||
|
|
||||||
|
<style>
|
||||||
|
html, body { overflow: visible; }
|
||||||
|
|
||||||
|
.container {
|
||||||
|
max-width: 700px;
|
||||||
|
margin: 0 auto;
|
||||||
|
}
|
||||||
|
|
||||||
|
h1 {
|
||||||
|
color: #333;
|
||||||
|
margin-bottom: 4px;
|
||||||
|
font-size: 1.5em;
|
||||||
|
}
|
||||||
|
|
||||||
|
.subtitle {
|
||||||
|
color: #666;
|
||||||
|
margin-bottom: 24px;
|
||||||
|
font-size: 0.9em;
|
||||||
|
}
|
||||||
|
|
||||||
|
.section {
|
||||||
|
background: #fff;
|
||||||
|
border-radius: 8px;
|
||||||
|
box-shadow: 0 1px 6px rgba(0,0,0,0.1);
|
||||||
|
padding: 20px 24px;
|
||||||
|
margin-bottom: 20px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.section h2 {
|
||||||
|
font-size: 1em;
|
||||||
|
font-weight: 700;
|
||||||
|
color: #333;
|
||||||
|
margin: 0 0 16px;
|
||||||
|
padding-bottom: 10px;
|
||||||
|
border-bottom: 1px solid #eee;
|
||||||
|
text-transform: uppercase;
|
||||||
|
letter-spacing: 0.5px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.info-row {
|
||||||
|
display: flex;
|
||||||
|
align-items: baseline;
|
||||||
|
padding: 8px 0;
|
||||||
|
border-bottom: 1px solid #f5f5f5;
|
||||||
|
font-size: 0.9em;
|
||||||
|
}
|
||||||
|
.info-row:last-child { border-bottom: none; }
|
||||||
|
|
||||||
|
.info-label {
|
||||||
|
width: 160px;
|
||||||
|
flex-shrink: 0;
|
||||||
|
color: #666;
|
||||||
|
font-size: 0.88em;
|
||||||
|
}
|
||||||
|
|
||||||
|
.info-value {
|
||||||
|
color: #222;
|
||||||
|
word-break: break-all;
|
||||||
|
}
|
||||||
|
|
||||||
|
.info-value a {
|
||||||
|
color: #0066cc;
|
||||||
|
text-decoration: none;
|
||||||
|
}
|
||||||
|
.info-value a:hover { text-decoration: underline; }
|
||||||
|
|
||||||
|
.version-badge {
|
||||||
|
display: inline-block;
|
||||||
|
padding: 3px 12px;
|
||||||
|
background: #e8f0fe;
|
||||||
|
color: #1a73e8;
|
||||||
|
border-radius: 12px;
|
||||||
|
font-size: 1.00em;
|
||||||
|
font-weight: 600;
|
||||||
|
font-family: monospace;
|
||||||
|
}
|
||||||
|
|
||||||
|
.hb-logo {
|
||||||
|
font-size: 2.5em;
|
||||||
|
font-weight: 700;
|
||||||
|
color: #0066cc;
|
||||||
|
letter-spacing: -1px;
|
||||||
|
margin-bottom: 6px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.hb-tagline {
|
||||||
|
color: #555;
|
||||||
|
font-size: 0.95em;
|
||||||
|
}
|
||||||
|
|
||||||
|
.logo-section {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 20px;
|
||||||
|
padding: 8px 0 4px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.logo-text { flex: 1; }
|
||||||
|
|
||||||
|
/* ── Dark mode ── */
|
||||||
|
html[data-theme="dark"] h1 { color: var(--text); }
|
||||||
|
html[data-theme="dark"] .subtitle { color: var(--text-sec); }
|
||||||
|
html[data-theme="dark"] .section { background: var(--surface); box-shadow: 0 1px 6px var(--shadow); }
|
||||||
|
html[data-theme="dark"] .section h2 { color: var(--text); border-bottom-color: var(--border); }
|
||||||
|
html[data-theme="dark"] .info-row { border-bottom-color: var(--border-4); }
|
||||||
|
html[data-theme="dark"] .info-label { color: var(--text-sec); }
|
||||||
|
html[data-theme="dark"] .info-value { color: var(--text); }
|
||||||
|
html[data-theme="dark"] .info-value a { color: var(--link); }
|
||||||
|
html[data-theme="dark"] .hb-logo { color: var(--link); }
|
||||||
|
html[data-theme="dark"] .hb-tagline { color: var(--text-sec); }
|
||||||
|
html[data-theme="dark"] .version-badge { background: #1a3255; color: #60a5fa; }
|
||||||
|
</style>
|
||||||
|
|
||||||
|
<body>
|
||||||
|
{% include 'nav.html' %}
|
||||||
|
|
||||||
|
<div class="container">
|
||||||
|
<h1>{{ header }}</h1>
|
||||||
|
<p class="subtitle">Heartbeat monitoring system</p>
|
||||||
|
|
||||||
|
<div class="section">
|
||||||
|
<div class="logo-section">
|
||||||
|
<div class="logo-text">
|
||||||
|
<div class="hb-logo">Heartbeat</div>
|
||||||
|
<div class="hb-tagline">Lightweight host monitoring over UDP</div>
|
||||||
|
</div>
|
||||||
|
<span class="version-badge">v{{ hbd_version }}</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="section">
|
||||||
|
<h2>Version</h2>
|
||||||
|
<div class="info-row">
|
||||||
|
<span class="info-label">Server version</span>
|
||||||
|
<span class="info-value">{{ hbd_version }}</span>
|
||||||
|
</div>
|
||||||
|
<div class="info-row">
|
||||||
|
<span class="info-label">Python</span>
|
||||||
|
<span class="info-value">{{ python_version }}</span>
|
||||||
|
</div>
|
||||||
|
<div class="info-row">
|
||||||
|
<span class="info-label">License</span>
|
||||||
|
<span class="info-value">MIT</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="section">
|
||||||
|
<h2>Runtime</h2>
|
||||||
|
<div class="info-row">
|
||||||
|
<span class="info-label">Host</span>
|
||||||
|
<span class="info-value">{{ server_hostname }}</span>
|
||||||
|
</div>
|
||||||
|
<div class="info-row">
|
||||||
|
<span class="info-label">Started</span>
|
||||||
|
<span class="info-value">{{ start_time_str }}</span>
|
||||||
|
</div>
|
||||||
|
<div class="info-row">
|
||||||
|
<span class="info-label">Uptime</span>
|
||||||
|
<span class="info-value" id="uptime-value">{{ uptime_str }}</span>
|
||||||
|
</div>
|
||||||
|
<div class="info-row">
|
||||||
|
<span class="info-label">Hosts monitored</span>
|
||||||
|
<span class="info-value">{{ host_count }}</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="section">
|
||||||
|
<h2>Contact & Source</h2>
|
||||||
|
<div class="info-row">
|
||||||
|
<span class="info-label">Author</span>
|
||||||
|
<span class="info-value">Andreas Wrede</span>
|
||||||
|
</div>
|
||||||
|
<div class="info-row">
|
||||||
|
<span class="info-label">Email</span>
|
||||||
|
<span class="info-value"><a href="mailto:aew.hbd@wrede.ca">aew.hbd@wrede.ca</a></span>
|
||||||
|
</div>
|
||||||
|
<div class="info-row">
|
||||||
|
<span class="info-label">Repository</span>
|
||||||
|
<span class="info-value"><a href="https://git.wrede.ca/andreas/heartbeat" target="_blank" rel="noopener">git.wrede.ca/andreas/heartbeat</a></span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
(function() {
|
||||||
|
var startEpoch = {{ start_epoch }};
|
||||||
|
var el = document.getElementById('uptime-value');
|
||||||
|
if (!el) return;
|
||||||
|
function fmt(s) {
|
||||||
|
var d = Math.floor(s / 86400);
|
||||||
|
var h = Math.floor((s % 86400) / 3600);
|
||||||
|
var m = Math.floor((s % 3600) / 60);
|
||||||
|
var sec = s % 60;
|
||||||
|
if (d > 0) return d + 'd ' + h + 'h ' + m + 'm';
|
||||||
|
if (h > 0) return h + 'h ' + m + 'm ' + sec + 's';
|
||||||
|
return m + 'm ' + sec + 's';
|
||||||
|
}
|
||||||
|
function tick() {
|
||||||
|
var up = Math.floor(Date.now() / 1000 - startEpoch);
|
||||||
|
el.textContent = fmt(up);
|
||||||
|
}
|
||||||
|
tick();
|
||||||
|
setInterval(tick, 1000);
|
||||||
|
})();
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
@@ -0,0 +1,623 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
{% include 'head.html' %}
|
||||||
|
|
||||||
|
<style>
|
||||||
|
|
||||||
|
html, body {
|
||||||
|
height: auto;
|
||||||
|
overflow-y: auto;
|
||||||
|
}
|
||||||
|
|
||||||
|
.container {
|
||||||
|
max-width: 1400px;
|
||||||
|
margin: 0 auto;
|
||||||
|
}
|
||||||
|
|
||||||
|
h1 { color: #333; margin-bottom: 5px; margin-top: 15px; font-size: 1.5em; }
|
||||||
|
|
||||||
|
.subtitle {
|
||||||
|
color: #666;
|
||||||
|
margin-bottom: 30px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.summary-cards {
|
||||||
|
display: flex;
|
||||||
|
flex-wrap: wrap;
|
||||||
|
gap: 10px;
|
||||||
|
margin-bottom: 16px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.summary-card {
|
||||||
|
background: white;
|
||||||
|
border-radius: 6px;
|
||||||
|
padding: 6px 14px;
|
||||||
|
box-shadow: 0 1px 4px rgba(0,0,0,0.1);
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 8px;
|
||||||
|
border-left: 4px solid #ddd;
|
||||||
|
}
|
||||||
|
|
||||||
|
.summary-card.critical { border-left-color: #ea1e0f; }
|
||||||
|
.summary-card.warning { border-left-color: #ff9800; }
|
||||||
|
.summary-card.ok { border-left-color: #4caf50; }
|
||||||
|
|
||||||
|
.summary-number {
|
||||||
|
font-size: 1.4em;
|
||||||
|
font-weight: bold;
|
||||||
|
line-height: 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
.summary-number.critical { color: #ea1e0f; }
|
||||||
|
.summary-number.warning { color: #ff9800; }
|
||||||
|
.summary-number.ok { color: #4caf50; }
|
||||||
|
|
||||||
|
.summary-label {
|
||||||
|
color: #666;
|
||||||
|
font-size: 1.00em;
|
||||||
|
}
|
||||||
|
|
||||||
|
.filters {
|
||||||
|
background: white;
|
||||||
|
border-radius: 8px;
|
||||||
|
padding: 15px;
|
||||||
|
margin-bottom: 20px;
|
||||||
|
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
|
||||||
|
display: flex;
|
||||||
|
gap: 15px;
|
||||||
|
align-items: center;
|
||||||
|
}
|
||||||
|
|
||||||
|
.filter-label {
|
||||||
|
font-weight: bold;
|
||||||
|
color: #555;
|
||||||
|
}
|
||||||
|
|
||||||
|
.filter-button {
|
||||||
|
padding: 8px 16px;
|
||||||
|
border: 2px solid #ddd;
|
||||||
|
background: white;
|
||||||
|
border-radius: 20px;
|
||||||
|
cursor: pointer;
|
||||||
|
transition: all 0.2s;
|
||||||
|
font-size: 0.9em;
|
||||||
|
}
|
||||||
|
|
||||||
|
.filter-button:hover {
|
||||||
|
border-color: #2196f3;
|
||||||
|
}
|
||||||
|
|
||||||
|
.filter-button.active {
|
||||||
|
background: #2196f3;
|
||||||
|
color: white;
|
||||||
|
border-color: #2196f3;
|
||||||
|
}
|
||||||
|
|
||||||
|
.filter-input {
|
||||||
|
padding: 7px 12px;
|
||||||
|
border: 2px solid #ddd;
|
||||||
|
border-radius: 20px;
|
||||||
|
font-size: 0.9em;
|
||||||
|
outline: none;
|
||||||
|
width: 200px;
|
||||||
|
transition: border-color 0.2s;
|
||||||
|
}
|
||||||
|
|
||||||
|
.filter-input:focus {
|
||||||
|
border-color: #2196f3;
|
||||||
|
}
|
||||||
|
|
||||||
|
.filter-input.invalid {
|
||||||
|
border-color: #f44336;
|
||||||
|
}
|
||||||
|
|
||||||
|
.alerts-container {
|
||||||
|
background: white;
|
||||||
|
border-radius: 8px;
|
||||||
|
padding: 20px;
|
||||||
|
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
|
||||||
|
}
|
||||||
|
|
||||||
|
.alert-item {
|
||||||
|
border-left: 5px solid #ddd;
|
||||||
|
padding: 15px;
|
||||||
|
margin-bottom: 15px;
|
||||||
|
background: #fafafa;
|
||||||
|
border-radius: 4px;
|
||||||
|
display: flex;
|
||||||
|
justify-content: space-between;
|
||||||
|
align-items: center;
|
||||||
|
transition: all 0.2s;
|
||||||
|
}
|
||||||
|
|
||||||
|
.alert-item.acknowledged {
|
||||||
|
opacity: 0.8;
|
||||||
|
background: #f0f0f0;
|
||||||
|
}
|
||||||
|
|
||||||
|
.alert-item:hover {
|
||||||
|
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
|
||||||
|
transform: translateX(5px);
|
||||||
|
}
|
||||||
|
|
||||||
|
.alert-item.critical {
|
||||||
|
border-left-color: #f44336;
|
||||||
|
background: #ffebee;
|
||||||
|
}
|
||||||
|
|
||||||
|
.alert-item.warning {
|
||||||
|
border-left-color: #ff9800;
|
||||||
|
background: #fff3e0;
|
||||||
|
}
|
||||||
|
|
||||||
|
.alert-item.unknown {
|
||||||
|
border-left-color: #9e9e9e;
|
||||||
|
background: #f5f5f5;
|
||||||
|
}
|
||||||
|
|
||||||
|
.alert-main {
|
||||||
|
flex: 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
.alert-header {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 15px;
|
||||||
|
margin-bottom: 8px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.alert-level {
|
||||||
|
padding: 4px 12px;
|
||||||
|
border-radius: 12px;
|
||||||
|
font-size: 0.75em;
|
||||||
|
font-weight: bold;
|
||||||
|
text-transform: uppercase;
|
||||||
|
letter-spacing: 0.5px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.alert-level.critical {
|
||||||
|
background: #f44336;
|
||||||
|
color: white;
|
||||||
|
}
|
||||||
|
|
||||||
|
.alert-level.warning {
|
||||||
|
background: #ff9800;
|
||||||
|
color: white;
|
||||||
|
}
|
||||||
|
|
||||||
|
.alert-level.unknown {
|
||||||
|
background: #9e9e9e;
|
||||||
|
color: white;
|
||||||
|
}
|
||||||
|
|
||||||
|
.alert-hostname {
|
||||||
|
font-weight: bold;
|
||||||
|
color: #0066cc;
|
||||||
|
font-size: 1.1em;
|
||||||
|
text-decoration: none;
|
||||||
|
}
|
||||||
|
.alert-hostname:hover {
|
||||||
|
text-decoration: underline;
|
||||||
|
}
|
||||||
|
|
||||||
|
.alert-metric {
|
||||||
|
color: #0066cc;
|
||||||
|
font-size: 1.1em;
|
||||||
|
font-weight: normal;
|
||||||
|
}
|
||||||
|
|
||||||
|
.alert-details {
|
||||||
|
display: flex;
|
||||||
|
gap: 20px;
|
||||||
|
color: #666;
|
||||||
|
font-size: 0.9em;
|
||||||
|
}
|
||||||
|
|
||||||
|
.alert-value {
|
||||||
|
font-weight: bold;
|
||||||
|
color: #333;
|
||||||
|
}
|
||||||
|
|
||||||
|
.alert-duration {
|
||||||
|
color: #999;
|
||||||
|
font-size: 1.00em;
|
||||||
|
}
|
||||||
|
|
||||||
|
.alert-actions {
|
||||||
|
display: flex;
|
||||||
|
flex-direction: column;
|
||||||
|
gap: 8px;
|
||||||
|
margin-left: 15px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.acknowledge-btn {
|
||||||
|
padding: 8px 16px;
|
||||||
|
background: #2196f3;
|
||||||
|
color: white;
|
||||||
|
border: none;
|
||||||
|
border-radius: 4px;
|
||||||
|
cursor: pointer;
|
||||||
|
font-size: 1.00em;
|
||||||
|
transition: all 0.2s;
|
||||||
|
white-space: nowrap;
|
||||||
|
}
|
||||||
|
|
||||||
|
.acknowledge-btn:hover {
|
||||||
|
background: #1976d2;
|
||||||
|
transform: scale(1.05);
|
||||||
|
}
|
||||||
|
|
||||||
|
.acknowledge-btn:disabled {
|
||||||
|
background: #ccc;
|
||||||
|
cursor: not-allowed;
|
||||||
|
transform: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
.acknowledged-badge {
|
||||||
|
padding: 4px 8px;
|
||||||
|
background: #4caf50;
|
||||||
|
color: white;
|
||||||
|
border-radius: 4px;
|
||||||
|
font-size: 0.75em;
|
||||||
|
text-align: center;
|
||||||
|
white-space: nowrap;
|
||||||
|
}
|
||||||
|
|
||||||
|
.no-alerts {
|
||||||
|
text-align: center;
|
||||||
|
padding: 60px 20px;
|
||||||
|
color: #999;
|
||||||
|
}
|
||||||
|
|
||||||
|
.no-alerts-icon {
|
||||||
|
font-size: 4em;
|
||||||
|
margin-bottom: 20px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.loading {
|
||||||
|
text-align: center;
|
||||||
|
padding: 40px;
|
||||||
|
color: #666;
|
||||||
|
}
|
||||||
|
|
||||||
|
.error {
|
||||||
|
background: #ffebee;
|
||||||
|
border-left: 4px solid #f44336;
|
||||||
|
padding: 20px;
|
||||||
|
margin: 20px 0;
|
||||||
|
border-radius: 4px;
|
||||||
|
color: #c62828;
|
||||||
|
}
|
||||||
|
|
||||||
|
.refresh-info {
|
||||||
|
text-align: center;
|
||||||
|
color: #999;
|
||||||
|
font-size: 1.00em;
|
||||||
|
margin-top: 20px;
|
||||||
|
padding-top: 20px;
|
||||||
|
border-top: 1px solid #e0e0e0;
|
||||||
|
}
|
||||||
|
|
||||||
|
.last-update {
|
||||||
|
color: #666;
|
||||||
|
font-size: 0.9em;
|
||||||
|
text-align: right;
|
||||||
|
margin-bottom: 15px;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ── Dark mode ── */
|
||||||
|
html[data-theme="dark"] h1 { color: var(--text); }
|
||||||
|
html[data-theme="dark"] .subtitle { color: var(--text-sec); }
|
||||||
|
html[data-theme="dark"] .summary-card { background: var(--surface); }
|
||||||
|
html[data-theme="dark"] .summary-label { color: var(--text-sec); }
|
||||||
|
html[data-theme="dark"] .filters { background: var(--surface); }
|
||||||
|
html[data-theme="dark"] .filter-label { color: var(--text-sec); }
|
||||||
|
html[data-theme="dark"] .filter-button { background: var(--surface-2); border-color: var(--border); color: var(--text); }
|
||||||
|
html[data-theme="dark"] .filter-button.active { background: #2196f3; color: #fff; border-color: #2196f3; }
|
||||||
|
html[data-theme="dark"] .filter-input { background: var(--input-bg); border-color: var(--input-border); color: var(--text); }
|
||||||
|
html[data-theme="dark"] .alerts-container { background: var(--surface); }
|
||||||
|
html[data-theme="dark"] .alert-item { background: var(--surface-2); }
|
||||||
|
html[data-theme="dark"] .alert-item.acknowledged { background: var(--surface-3); }
|
||||||
|
html[data-theme="dark"] .alert-item.critical { background: #2e0a0a; border-left-color: #f44336; }
|
||||||
|
html[data-theme="dark"] .alert-item.warning { background: #2e1a00; border-left-color: #ff9800; }
|
||||||
|
html[data-theme="dark"] .alert-item.unknown { background: var(--surface-2); }
|
||||||
|
html[data-theme="dark"] .alert-hostname { color: var(--link); }
|
||||||
|
html[data-theme="dark"] .alert-details { color: var(--text-sec); }
|
||||||
|
html[data-theme="dark"] .alert-value { color: var(--text); }
|
||||||
|
html[data-theme="dark"] .alert-duration { color: var(--text-muted); }
|
||||||
|
html[data-theme="dark"] .last-update { color: var(--text-sec); }
|
||||||
|
html[data-theme="dark"] .refresh-info { color: var(--text-muted); border-top-color: var(--border); }
|
||||||
|
html[data-theme="dark"] .no-alerts,
|
||||||
|
html[data-theme="dark"] .loading { color: var(--text-muted); }
|
||||||
|
</style>
|
||||||
|
|
||||||
|
<body>
|
||||||
|
{% include 'nav.html' %}
|
||||||
|
|
||||||
|
<div class="container">
|
||||||
|
<h1>{{ header }}</h1>
|
||||||
|
<p class="subtitle">Real-time monitoring alerts and threshold violations</p>
|
||||||
|
|
||||||
|
<div class="summary-cards" id="summary-cards">
|
||||||
|
<div class="summary-card critical">
|
||||||
|
<div class="summary-label">Critical</div>
|
||||||
|
<div class="summary-number critical" id="critical-count">-</div>
|
||||||
|
</div>
|
||||||
|
<div class="summary-card warning">
|
||||||
|
<div class="summary-label">Warning</div>
|
||||||
|
<div class="summary-number warning" id="warning-count">-</div>
|
||||||
|
</div>
|
||||||
|
<div class="summary-card ok">
|
||||||
|
<div class="summary-label">Total Hosts</div>
|
||||||
|
<div class="summary-number ok" id="host-count">-</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="filters">
|
||||||
|
<span class="filter-label">Show:</span>
|
||||||
|
<button class="filter-button active" onclick="filterAlerts('all')">All</button>
|
||||||
|
<button class="filter-button" onclick="filterAlerts('critical')">Critical Only</button>
|
||||||
|
<button class="filter-button" onclick="filterAlerts('warning')">Warning Only</button>
|
||||||
|
<input id="host-filter" class="filter-input" type="text" placeholder="host filter (regex)" oninput="onHostFilterInput(this)">
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="alerts-container">
|
||||||
|
<div class="last-update">Last updated: <span id="last-update-time">Never</span></div>
|
||||||
|
<div id="alerts-list">
|
||||||
|
<div class="loading">Loading alerts...</div>
|
||||||
|
</div>
|
||||||
|
<div class="refresh-info">
|
||||||
|
Auto-refreshing every 15 seconds
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
let currentFilter = 'all';
|
||||||
|
let allAlerts = [];
|
||||||
|
let hostFilterRe = null;
|
||||||
|
|
||||||
|
/**
 * Fetch the current alerts from /api/0/alerts, refresh the summary cards and
 * the "last updated" stamp, and re-render the alert list.
 *
 * On any failure (network error, non-2xx status, invalid JSON, malformed
 * payload) the alert list is replaced with an error box.
 *
 * @returns {Promise<void>}
 */
async function loadAlerts() {
    try {
        const response = await fetch('/api/0/alerts');
        if (!response.ok) {
            throw new Error(`HTTP ${response.status}`);
        }

        const data = await response.json();
        // Fail with a clear message instead of an incidental TypeError later on.
        if (!data || !Array.isArray(data.alerts)) {
            throw new Error('Malformed response');
        }
        const summary = data.summary || {};
        allAlerts = data.alerts;

        // Update summary cards
        document.getElementById('critical-count').textContent = summary.critical || 0;
        document.getElementById('warning-count').textContent = summary.warning || 0;
        document.getElementById('host-count').textContent = data.host_count || 0;

        // Update last update time
        document.getElementById('last-update-time').textContent = new Date().toLocaleTimeString();

        // Render alerts
        renderAlerts(allAlerts);
    } catch (error) {
        // Build the error box with textContent: error messages (e.g. JSON parse
        // errors) may echo response text and must not be parsed as HTML.
        const box = document.createElement('div');
        box.className = 'error';
        box.textContent = `Failed to load alerts: ${error.message}`;

        const list = document.getElementById('alerts-list');
        list.innerHTML = '';
        list.appendChild(box);
    }
}
|
||||||
|
|
||||||
|
/**
 * Render the given alerts into #alerts-list, applying the active level
 * filter (currentFilter) and the optional host regex (hostFilterRe).
 *
 * Empty states:
 *   - no alerts at all and no level filter -> "All Systems Normal"
 *   - otherwise -> "No [level ]alerts[ matching the host filter]"
 *
 * @param {Array<Object>} alerts Full, unfiltered alert list.
 */
function renderAlerts(alerts) {
    const container = document.getElementById('alerts-list');

    // Filter alerts based on current level filter and host regex
    let filteredAlerts = alerts;
    if (currentFilter !== 'all') {
        filteredAlerts = filteredAlerts.filter(alert =>
            alert.level.toLowerCase() === currentFilter
        );
    }
    if (hostFilterRe) {
        filteredAlerts = filteredAlerts.filter(alert => hostFilterRe.test(alert.hostname));
    }

    if (filteredAlerts.length === 0) {
        if (currentFilter === 'all' && alerts.length === 0) {
            container.innerHTML = `
                <div class="no-alerts">
                    <div class="no-alerts-icon">✓</div>
                    <h2>All Systems Normal</h2>
                    <p>No active alerts at this time</p>
                </div>
            `;
        } else {
            // With the level filter on "all" the list can only be empty because
            // of the host regex; the old text then read "No all alerts".
            const levelPart = currentFilter === 'all' ? '' : `${currentFilter} `;
            const hostPart = hostFilterRe ? ' matching the host filter' : '';
            container.innerHTML = `
                <div class="no-alerts">
                    <p>No ${levelPart}alerts${hostPart}</p>
                </div>
            `;
        }
        return;
    }

    container.innerHTML = filteredAlerts.map(alert => renderAlert(alert)).join('');
}
|
||||||
|
|
||||||
|
/**
 * Build the HTML for a single alert row.
 *
 * Every value taken from the alert object is HTML-escaped before it is
 * interpolated, because the result is assigned to innerHTML by the caller.
 * Arguments of the inline acknowledge handler are encoded as JSON string
 * literals (then attribute-escaped), so quotes in a hostname or metric path
 * cannot break out of the handler.
 *
 * NOTE(review): formatted_message is escaped as plain text; if the server
 * intentionally sends markup in it, that markup will now show literally.
 *
 * @param {Object} alert Alert record; reads level, hostname, metric_path,
 *   since, acknowledged, last_value, formatted_message, threshold_value,
 *   operator and recovery_threshold.
 * @returns {string} HTML fragment for the alert.
 */
function renderAlert(alert) {
    const HTML_ENTITIES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
    // Escape text for HTML text nodes and quoted attribute values.
    const esc = (text) => String(text).replace(/[&<>"']/g, (ch) => HTML_ENTITIES[ch]);
    // Encode text as a JS string literal that is safe inside an HTML attribute.
    const jsArg = (text) => esc(JSON.stringify(String(text)));

    // A missing level falls back to "unknown" rather than throwing.
    const levelLabel = alert.level == null ? 'unknown' : String(alert.level);
    const level = levelLabel.toLowerCase();
    const hostname = alert.hostname == null ? '' : String(alert.hostname);
    const metricPath = alert.metric_path == null ? '' : String(alert.metric_path);
    const duration = getDuration(alert.since);
    const acknowledged = alert.acknowledged || false;

    // Displayed metric name: drop the first dotted component and a trailing
    // "_status_code" suffix.
    const firstDot = metricPath.indexOf('.');
    const metricLabel = (firstDot === -1 ? metricPath : metricPath.slice(firstDot + 1))
        .replace(/_status_code$/, '');

    // Use formatted message if available, otherwise build from individual fields
    let valueText = `Value: <span class="alert-value">${esc(formatValue(alert.last_value))}</span>`;
    if (alert.formatted_message) {
        valueText += ` <span class="threshold-info">${esc(alert.formatted_message)}</span>`;
    } else if (alert.threshold_value !== undefined && alert.threshold_value !== null && alert.operator) {
        valueText += ` <span class="threshold-info">(threshold: ${esc(alert.operator)} ${esc(formatValue(alert.threshold_value))})</span>`;
    }
    if (alert.recovery_threshold !== undefined && alert.recovery_threshold !== null) {
        // Recovery runs in the opposite direction of the triggering comparison.
        const recOp = (alert.operator === '>' || alert.operator === '>=') ? '<' : '>';
        valueText += ` <span class="threshold-info" style="color:#888">(recovers ${esc(recOp)} ${esc(formatValue(alert.recovery_threshold))})</span>`;
    }

    // Build actions section
    let actionsHtml = '';
    if (acknowledged) {
        actionsHtml = `
            <div class="alert-actions">
                <div class="acknowledged-badge">✓ Acknowledged</div>
            </div>
        `;
    } else {
        actionsHtml = `
            <div class="alert-actions">
                <button class="acknowledge-btn" onclick="acknowledgeAlert(${jsArg(hostname)}, ${jsArg(metricPath)}, event)">
                    Acknowledge
                </button>
            </div>
        `;
    }

    return `
        <div class="alert-item ${esc(level)} ${acknowledged ? 'acknowledged' : ''}">
            <div class="alert-main">
                <div class="alert-header">
                    <span class="alert-level ${esc(level)}">${esc(levelLabel)}</span>
                    <a class="alert-hostname" href="/plugins#${esc(hostname)}">${esc(hostname)}</a>
                    <span class="alert-metric">${esc(metricLabel)}</span>
                </div>
                <div class="alert-details">
                    <span>${valueText}</span>
                    <span class="alert-duration">Active for ${esc(duration)}</span>
                </div>
            </div>
            ${actionsHtml}
        </div>
    `;
}
|
||||||
|
|
||||||
|
/**
 * Format a metric value for display.
 *
 * Numbers above 1000 use the locale's grouping; all other numbers are shown
 * with two decimals. Non-numeric values are returned untouched.
 *
 * @param {*} value Raw value from the alert record.
 * @returns {*} Formatted string for numbers, otherwise the input itself.
 */
function formatValue(value) {
    if (typeof value !== 'number') {
        return value;
    }
    return value > 1000 ? value.toLocaleString() : value.toFixed(2);
}
|
||||||
|
|
||||||
|
/**
 * Describe how long ago `timestamp` was, relative to the client clock.
 *
 * @param {number} timestamp Start time in seconds since the Unix epoch.
 * @returns {string} "Ns", "Nm", "Nh Nm" or "Nd Nh"; "unknown" when the
 *   timestamp is missing or not numeric. Timestamps in the future (client
 *   clock behind the server) are reported as "0s" rather than a negative
 *   duration.
 */
function getDuration(timestamp) {
    const since = Number(timestamp);
    if (timestamp === undefined || timestamp === null || !Number.isFinite(since)) {
        return 'unknown';
    }

    const now = Date.now() / 1000;
    // Clamp so clock skew cannot produce output such as "-5s".
    const seconds = Math.max(0, Math.floor(now - since));

    if (seconds < 60) {
        return `${seconds}s`;
    } else if (seconds < 3600) {
        return `${Math.floor(seconds / 60)}m`;
    } else if (seconds < 86400) {
        const hours = Math.floor(seconds / 3600);
        const minutes = Math.floor((seconds % 3600) / 60);
        return `${hours}h ${minutes}m`;
    } else {
        const days = Math.floor(seconds / 86400);
        const hours = Math.floor((seconds % 86400) / 3600);
        return `${days}d ${hours}h`;
    }
}
|
||||||
|
|
||||||
|
/**
 * Switch the active level filter and re-render the alert list.
 *
 * The button to highlight is taken from the triggering event when there is
 * one; otherwise it is looked up by its inline handler, so the function also
 * works when called programmatically.
 *
 * @param {string} filter Level to show; the page's buttons pass 'all',
 *   'critical' or 'warning'.
 * @param {Event} [sourceEvent] Triggering event. Optional: existing inline
 *   handlers do not pass it, in which case window.event is consulted.
 */
function filterAlerts(filter, sourceEvent) {
    currentFilter = filter;

    const buttons = Array.from(document.querySelectorAll('.filter-button'));

    // Previously this relied on the implicit global `event` and threw when
    // there was none; it could also mark an unrelated element as active.
    const evt = sourceEvent || window.event;
    let selected = null;
    if (evt && evt.target && typeof evt.target.closest === 'function') {
        selected = evt.target.closest('.filter-button');
    }
    if (!selected) {
        const needle = `filterAlerts('${filter}')`;
        selected = buttons.find(btn => (btn.getAttribute('onclick') || '').includes(needle)) || null;
    }

    // Update active button
    buttons.forEach(btn => {
        btn.classList.toggle('active', btn === selected);
    });

    // Re-render with new filter
    renderAlerts(allAlerts);
}
|
||||||
|
|
||||||
|
/**
 * Acknowledge an alert via POST /api/0/alerts/acknowledge, then mark the
 * matching entry in allAlerts as acknowledged and re-render the list.
 *
 * @param {string} hostname Host the alert belongs to.
 * @param {string} metricPath Metric path identifying the alert on that host.
 * @param {Event} [event] Click event of the acknowledge button. When given,
 *   propagation is stopped and the button shows progress; when omitted the
 *   request is still sent.
 * @returns {Promise<void>}
 */
async function acknowledgeAlert(hostname, metricPath, event) {
    // The event was guarded for stopPropagation but dereferenced
    // unconditionally afterwards; treat the button as optional throughout.
    let button = null;
    if (event) {
        event.stopPropagation();
        button = event.target || null;
    }

    // Disable the button while the request is in flight
    if (button) {
        button.disabled = true;
        button.textContent = 'Acknowledging...';
    }

    try {
        const response = await fetch('/api/0/alerts/acknowledge', {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
            },
            body: JSON.stringify({
                hostname: hostname,
                metric_path: metricPath,
            }),
        });

        if (!response.ok) {
            throw new Error(`HTTP ${response.status}`);
        }

        const result = await response.json();

        // Update the alert in our local data. Named `acked` so it does not
        // shadow window.alert, which the catch block uses.
        const acked = allAlerts.find(a => a.hostname === hostname && a.metric_path === metricPath);
        if (acked) {
            acked.acknowledged = true;
            acked.acknowledged_at = result.acknowledged_at;
        }

        // Re-render alerts
        renderAlerts(allAlerts);
    } catch (error) {
        window.alert(`Failed to acknowledge alert: ${error.message}`);
        if (button) {
            button.disabled = false;
            button.textContent = 'Acknowledge';
        }
    }
}
|
||||||
|
|
||||||
|
/**
 * Input handler for the host filter box.
 *
 * Compiles the trimmed text as a case-insensitive regular expression and
 * stores it in hostFilterRe. Empty text clears the filter. An invalid
 * pattern also clears it and flags the input with the "invalid" class.
 * The alert list is re-rendered in every case.
 *
 * @param {HTMLInputElement} input The host filter input element.
 */
function onHostFilterInput(input) {
    const pattern = input.value.trim();

    let compiled = null;
    let patternOk = true;
    if (pattern) {
        try {
            compiled = new RegExp(pattern, 'i');
        } catch (_) {
            patternOk = false;
        }
    }

    hostFilterRe = compiled;
    if (patternOk) {
        input.classList.remove('invalid');
    } else {
        input.classList.add('invalid');
    }

    renderAlerts(allAlerts);
}
|
||||||
|
|
||||||
|
// Auto-refresh every 15 seconds
|
||||||
|
setInterval(loadAlerts, 15000);
|
||||||
|
|
||||||
|
// Initialise filter from URL query string (?filter=...)
|
||||||
|
(function () {
    // Pre-fill the host filter box from ?filter=... and apply it right away.
    const initial = new URLSearchParams(window.location.search).get('filter');
    if (!initial) {
        return;
    }
    const hostInput = document.getElementById('host-filter');
    hostInput.value = initial;
    onHostFilterInput(hostInput);
})();
|
||||||
|
|
||||||
|
// Initial load
|
||||||
|
loadAlerts();
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
@@ -1,5 +1,5 @@
|
|||||||
<footer>
|
<footer>
|
||||||
<div id="copyright">
|
<div id="copyright">
|
||||||
©2002-2021 <A HREF="mailto:andreas@wrede.ca">Andreas Wrede</A> All Rights Reserved.</p>
|
©2002-2026 <A HREF="mailto:aew.hbd@wrede.ca">Andreas Wrede</A> All Rights Reserved.
|
||||||
</div>
|
</div>
|
||||||
</footer>
|
</footer>
|
||||||
@@ -0,0 +1,386 @@
|
|||||||
|
<head>
|
||||||
|
<meta http-equiv="content-type" content="text/html; charset=utf-8" />
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||||
|
<link rel="stylesheet" href="/static/style.css" type="text/css" />
|
||||||
|
<link rel="icon" href="/static/images/favicon.ico" sizes="32x32" />
|
||||||
|
<title>{{ title }}</title>
|
||||||
|
{% if extra_scripts %}<script src="{{ extra_scripts }}"></script>{% endif %}
|
||||||
|
<script>
|
||||||
|
/* Apply saved theme before first paint to avoid flash */
|
||||||
|
(function() {
    try {
        // Stored preference is 'dark', 'auto' (the default) or anything else;
        // only 'auto' consults the system colour scheme.
        var pref = localStorage.getItem('hbd_theme') || 'auto';
        var wantsDark = pref === 'dark';
        if (!wantsDark && pref === 'auto') {
            wantsDark = window.matchMedia('(prefers-color-scheme: dark)').matches;
        }
        if (wantsDark) {
            document.documentElement.setAttribute('data-theme', 'dark');
        }
    } catch(e) {}
})();
|
||||||
|
</script>
|
||||||
|
<style>
|
||||||
|
/* ── Theme variables ── */
|
||||||
|
:root {
|
||||||
|
--bg: #f5f5f5;
|
||||||
|
--surface: #ffffff;
|
||||||
|
--surface-2: #f8f8f8;
|
||||||
|
--surface-3: #f5f5f5;
|
||||||
|
--text: #222222;
|
||||||
|
--text-2: #333333;
|
||||||
|
--text-3: #555555;
|
||||||
|
--text-sec: #666666;
|
||||||
|
--text-muted: #888888;
|
||||||
|
--text-dim: #aaaaaa;
|
||||||
|
--text-ghost: #cccccc;
|
||||||
|
--border: #e0e0e0;
|
||||||
|
--border-2: #eeeeee;
|
||||||
|
--border-3: #f0f0f0;
|
||||||
|
--border-4: #f5f5f5;
|
||||||
|
--link: #0066cc;
|
||||||
|
--nav-bg: #ffffff;
|
||||||
|
--input-bg: #ffffff;
|
||||||
|
--input-border: #cccccc;
|
||||||
|
--shadow-sm: rgba(0,0,0,.08);
|
||||||
|
--shadow: rgba(0,0,0,.10);
|
||||||
|
--shadow-nav: rgba(0,0,0,.10);
|
||||||
|
}
|
||||||
|
html[data-theme="dark"] {
|
||||||
|
color-scheme: dark;
|
||||||
|
--bg: #111827;
|
||||||
|
--surface: #1f2937;
|
||||||
|
--surface-2: #283447;
|
||||||
|
--surface-3: #374151;
|
||||||
|
--text: #e5e7eb;
|
||||||
|
--text-2: #d1d5db;
|
||||||
|
--text-3: #9ca3af;
|
||||||
|
--text-sec: #9ca3af;
|
||||||
|
--text-muted: #6b7280;
|
||||||
|
--text-dim: #4b5563;
|
||||||
|
--text-ghost: #374151;
|
||||||
|
--border: #374151;
|
||||||
|
--border-2: #2d3748;
|
||||||
|
--border-3: #253040;
|
||||||
|
--border-4: #1e2a38;
|
||||||
|
--link: #60a5fa;
|
||||||
|
--nav-bg: #1f2937;
|
||||||
|
--input-bg: #283447;
|
||||||
|
--input-border: #4b5563;
|
||||||
|
--shadow-sm: rgba(0,0,0,.30);
|
||||||
|
--shadow: rgba(0,0,0,.40);
|
||||||
|
--shadow-nav: rgba(0,0,0,.40);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ── Reset / shared baseline ── */
|
||||||
|
*, *::before, *::after { box-sizing: border-box; }
|
||||||
|
html {
|
||||||
|
font-family: 'Segoe UI', system-ui, -apple-system, sans-serif;
|
||||||
|
font-size: 14px;
|
||||||
|
}
|
||||||
|
body {
|
||||||
|
margin: 0;
|
||||||
|
padding: 10px;
|
||||||
|
padding-top: 60px;
|
||||||
|
background: var(--bg);
|
||||||
|
color: var(--text);
|
||||||
|
}
|
||||||
|
h1 { font-size: 1.5em; color: var(--text-2); margin: 0 0 5px; }
|
||||||
|
h2 { font-size: 1.1em; color: var(--text-2); margin: 0 0 8px; }
|
||||||
|
p { margin: 0; }
|
||||||
|
|
||||||
|
/* Navigation bar — shared across all pages */
|
||||||
|
.nav {
|
||||||
|
position: fixed;
|
||||||
|
top: 0;
|
||||||
|
left: 0;
|
||||||
|
right: 0;
|
||||||
|
z-index: 200;
|
||||||
|
background: var(--nav-bg);
|
||||||
|
padding: 6px 12px;
|
||||||
|
box-shadow: 0 2px 4px var(--shadow-nav);
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
justify-content: space-between;
|
||||||
|
flex-wrap: wrap;
|
||||||
|
gap: 8px;
|
||||||
|
}
|
||||||
|
.nav-links { display: flex; align-items: center; flex-wrap: wrap; gap: 4px; }
|
||||||
|
.nav a {
|
||||||
|
margin-right: 20px;
|
||||||
|
text-decoration: none;
|
||||||
|
color: var(--link);
|
||||||
|
font-weight: 500;
|
||||||
|
font-size: 0.9em;
|
||||||
|
}
|
||||||
|
.nav a:hover { text-decoration: underline; }
|
||||||
|
.nav a.active { color: var(--text-2); font-weight: bold; }
|
||||||
|
.nav-user {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 8px;
|
||||||
|
text-decoration: none;
|
||||||
|
color: var(--text-2);
|
||||||
|
font-size: 0.9em;
|
||||||
|
font-weight: 500;
|
||||||
|
padding: 4px 8px;
|
||||||
|
border-radius: 20px;
|
||||||
|
transition: background 0.15s;
|
||||||
|
}
|
||||||
|
.nav-user:hover { background: var(--surface-2); text-decoration: none; }
|
||||||
|
.nav-username {
|
||||||
|
max-width: 0;
|
||||||
|
overflow: hidden;
|
||||||
|
white-space: nowrap;
|
||||||
|
opacity: 0;
|
||||||
|
transition: max-width 0.2s ease, opacity 0.2s ease;
|
||||||
|
}
|
||||||
|
.nav-user:hover .nav-username {
|
||||||
|
max-width: 160px;
|
||||||
|
opacity: 1;
|
||||||
|
}
|
||||||
|
.nav-avatar {
|
||||||
|
width: 28px; height: 28px;
|
||||||
|
border-radius: 50%;
|
||||||
|
object-fit: cover;
|
||||||
|
flex-shrink: 0;
|
||||||
|
}
|
||||||
|
.nav-initials {
|
||||||
|
width: 28px; height: 28px;
|
||||||
|
border-radius: 50%;
|
||||||
|
background: var(--link);
|
||||||
|
color: #fff;
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
justify-content: center;
|
||||||
|
font-size: 0.75em;
|
||||||
|
font-weight: 700;
|
||||||
|
flex-shrink: 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ── Mobile nav: hamburger toggle ── */
|
||||||
|
.nav-hamburger {
|
||||||
|
display: none;
|
||||||
|
flex-direction: column;
|
||||||
|
justify-content: space-between;
|
||||||
|
width: 26px; height: 20px;
|
||||||
|
cursor: pointer;
|
||||||
|
flex-shrink: 0;
|
||||||
|
background: none;
|
||||||
|
border: none;
|
||||||
|
padding: 0;
|
||||||
|
}
|
||||||
|
.nav-hamburger span {
|
||||||
|
display: block;
|
||||||
|
height: 3px;
|
||||||
|
background: var(--text-muted);
|
||||||
|
border-radius: 2px;
|
||||||
|
}
|
||||||
|
|
||||||
|
@media (max-width: 640px) {
|
||||||
|
.nav-hamburger { display: flex; }
|
||||||
|
.nav-links {
|
||||||
|
display: none;
|
||||||
|
width: 100%;
|
||||||
|
flex-direction: column;
|
||||||
|
align-items: flex-start;
|
||||||
|
padding-top: 8px;
|
||||||
|
border-top: 1px solid var(--border-2);
|
||||||
|
order: 3;
|
||||||
|
}
|
||||||
|
.nav-links.nav-open { display: flex; }
|
||||||
|
.nav-links a { margin-right: 0; padding: 6px 0; font-size: 1em; }
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ── Global dark-mode: inputs ── */
|
||||||
|
html[data-theme="dark"] input:not([type=checkbox]):not([type=radio]),
|
||||||
|
html[data-theme="dark"] select,
|
||||||
|
html[data-theme="dark"] textarea {
|
||||||
|
background-color: var(--input-bg);
|
||||||
|
border-color: var(--input-border);
|
||||||
|
color: var(--text);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Pending config publish button */
|
||||||
|
.nav-publish-btn {
|
||||||
|
background: #e65100;
|
||||||
|
color: #fff;
|
||||||
|
border: none;
|
||||||
|
border-radius: 4px;
|
||||||
|
padding: 4px 10px;
|
||||||
|
font-size: 0.82em;
|
||||||
|
font-weight: 600;
|
||||||
|
cursor: pointer;
|
||||||
|
flex-shrink: 0;
|
||||||
|
white-space: nowrap;
|
||||||
|
margin-left: auto;
|
||||||
|
}
|
||||||
|
.nav-publish-btn:hover { background: #bf360c; }
|
||||||
|
.nav-publish-btn:disabled { opacity: 0.7; cursor: default; }
|
||||||
|
|
||||||
|
/* Swiss railway clock — nav */
|
||||||
|
.nav-pie {
|
||||||
|
flex-shrink: 0;
|
||||||
|
line-height: 0;
|
||||||
|
margin-left: auto;
|
||||||
|
padding: 4px 4px 4px 0;
|
||||||
|
}
|
||||||
|
#alert-pie { display: block; cursor: default; }
|
||||||
|
.nav-clock {
|
||||||
|
flex-shrink: 0;
|
||||||
|
line-height: 0;
|
||||||
|
padding: 4px 4px 4px 0;
|
||||||
|
cursor: pointer;
|
||||||
|
}
|
||||||
|
#swiss-clock { display: block; }
|
||||||
|
|
||||||
|
/* Swiss railway clock — full-page overlay */
|
||||||
|
#clock-overlay {
|
||||||
|
display: none;
|
||||||
|
position: fixed;
|
||||||
|
inset: 0;
|
||||||
|
z-index: 9999;
|
||||||
|
background: #1a1a1a;
|
||||||
|
align-items: center;
|
||||||
|
justify-content: center;
|
||||||
|
cursor: pointer;
|
||||||
|
}
|
||||||
|
#clock-overlay.visible { display: flex; }
|
||||||
|
#swiss-clock-overlay { display: block; }
|
||||||
|
</style>
|
||||||
|
<script>
|
||||||
|
/* ── Swiss Federal Railway (SBB) clock ── */
|
||||||
|
|
||||||
|
/* Draw one frame of the clock onto any canvas element. */
|
||||||
|
function drawSwissClock(canvas) {
|
||||||
|
var SIZE = canvas.width;
|
||||||
|
var R = SIZE / 2;
|
||||||
|
var ctx = canvas.getContext('2d');
|
||||||
|
var now = new Date();
|
||||||
|
var h = now.getHours() % 12;
|
||||||
|
var m = now.getMinutes();
|
||||||
|
var s = now.getSeconds();
|
||||||
|
var ms = now.getMilliseconds();
|
||||||
|
|
||||||
|
/* Seconds hand idles ~1.5 s at 12 before advancing (SBB behaviour) */
|
||||||
|
var sFrac = s + ms / 1000;
|
||||||
|
var sAngle = sFrac >= 58.5 ? 0 : (sFrac / 58.5) * Math.PI * 2;
|
||||||
|
|
||||||
|
ctx.clearRect(0, 0, SIZE, SIZE);
|
||||||
|
|
||||||
|
/* face */
|
||||||
|
ctx.beginPath();
|
||||||
|
ctx.arc(R, R, R - 1, 0, Math.PI * 2);
|
||||||
|
ctx.fillStyle = '#fff';
|
||||||
|
ctx.fill();
|
||||||
|
ctx.strokeStyle = '#333';
|
||||||
|
ctx.lineWidth = SIZE * 0.018;
|
||||||
|
ctx.stroke();
|
||||||
|
|
||||||
|
/* tick marks */
|
||||||
|
for (var i = 0; i < 60; i++) {
|
||||||
|
var a = (i / 60) * Math.PI * 2 - Math.PI / 2;
|
||||||
|
var isHour = (i % 5 === 0);
|
||||||
|
ctx.beginPath();
|
||||||
|
ctx.moveTo(R + Math.cos(a) * (isHour ? R * 0.72 : R * 0.88),
|
||||||
|
R + Math.sin(a) * (isHour ? R * 0.72 : R * 0.88));
|
||||||
|
ctx.lineTo(R + Math.cos(a) * R * 0.94,
|
||||||
|
R + Math.sin(a) * R * 0.94);
|
||||||
|
ctx.strokeStyle = '#222';
|
||||||
|
ctx.lineWidth = isHour ? SIZE * 0.027 : SIZE * 0.011;
|
||||||
|
ctx.lineCap = 'butt';
|
||||||
|
ctx.stroke();
|
||||||
|
}
|
||||||
|
|
||||||
|
/* hands */
|
||||||
|
function hand(angle, tip, tail, width, color) {
|
||||||
|
ctx.save();
|
||||||
|
ctx.translate(R, R);
|
||||||
|
ctx.rotate(angle);
|
||||||
|
ctx.beginPath();
|
||||||
|
ctx.moveTo(tail, 0);
|
||||||
|
ctx.lineTo(tip, 0);
|
||||||
|
ctx.strokeStyle = color;
|
||||||
|
ctx.lineWidth = width;
|
||||||
|
ctx.lineCap = 'square';
|
||||||
|
ctx.stroke();
|
||||||
|
ctx.restore();
|
||||||
|
}
|
||||||
|
|
||||||
|
hand((sFrac >= 58.5 ? m + 1 : m) / 60 * Math.PI * 2 - Math.PI / 2,
|
||||||
|
R * 0.88, -R * 0.12, SIZE * 0.027, '#222'); /* minute */
|
||||||
|
hand((h + m / 60) / 12 * Math.PI * 2 - Math.PI / 2,
|
||||||
|
R * 0.58, -R * 0.12, SIZE * 0.039, '#222'); /* hour */
|
||||||
|
hand(sAngle - Math.PI / 2, R * 0.78, -R * 0.22,
|
||||||
|
SIZE * 0.013, '#e00'); /* second tail+tip */
|
||||||
|
|
||||||
|
/* round dot at tip of second hand */
|
||||||
|
var dotR = SIZE * 0.028;
|
||||||
|
ctx.save();
|
||||||
|
ctx.translate(R, R);
|
||||||
|
ctx.rotate(sAngle - Math.PI / 2);
|
||||||
|
ctx.beginPath();
|
||||||
|
ctx.arc(R * 0.78, 0, dotR, 0, Math.PI * 2);
|
||||||
|
ctx.fillStyle = '#e00';
|
||||||
|
ctx.fill();
|
||||||
|
ctx.restore();
|
||||||
|
|
||||||
|
/* centre cap */
|
||||||
|
ctx.beginPath();
|
||||||
|
ctx.arc(R, R, R * 0.04, 0, Math.PI * 2);
|
||||||
|
ctx.fillStyle = '#222';
|
||||||
|
ctx.fill();
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Give the full-screen clock canvas a square pixel size that fits the
 * viewport: 88% of the smaller of window.innerWidth / window.innerHeight,
 * rounded down. Does nothing when #swiss-clock-overlay is not in the page.
 */
function resizeOverlayClock() {
    var canvas = document.getElementById('swiss-clock-overlay');
    if (!canvas) {
        return;
    }
    var shortestEdge = Math.min(window.innerWidth, window.innerHeight);
    var side = Math.floor(shortestEdge * 0.88);
    canvas.width = side;
    canvas.height = side;
}
|
||||||
|
|
||||||
|
/*
 * Shared redraw loop. Paints the nav clock (#swiss-clock) when present and,
 * only while #clock-overlay carries the 'visible' class, the overlay clock
 * (#swiss-clock-overlay). Re-arms itself with setTimeout so the next run
 * lands on the next 100 ms wall-clock boundary.
 */
function clockTick() {
    var navCanvas = document.getElementById('swiss-clock');
    if (navCanvas) {
        drawSwissClock(navCanvas);
    }

    var overlayEl = document.getElementById('clock-overlay');
    var overlayShown = Boolean(overlayEl) && overlayEl.classList.contains('visible');
    if (overlayShown) {
        var overlayCanvas = document.getElementById('swiss-clock-overlay');
        if (overlayCanvas) {
            drawSwissClock(overlayCanvas);
        }
    }

    /* Milliseconds left until the next multiple of 100 ms. */
    var untilNextSlot = 100 - (Date.now() % 100);
    setTimeout(clockTick, untilNextSlot);
}
|
||||||
|
|
||||||
|
/*
 * Track the system colour scheme. When the stored 'hbd_theme' preference is
 * 'auto' (or missing), a change of prefers-color-scheme sets or removes the
 * data-theme="dark" attribute on <html>. Errors raised while registering the
 * listener are swallowed on purpose (best effort).
 */
try {
    window.matchMedia('(prefers-color-scheme: dark)').addEventListener('change', function (evt) {
        var stored = localStorage.getItem('hbd_theme') || 'auto';
        if (stored !== 'auto') {
            return;
        }
        if (evt.matches) {
            document.documentElement.setAttribute('data-theme', 'dark');
        } else {
            document.documentElement.removeAttribute('data-theme');
        }
    });
} catch (e) {}
|
||||||
|
|
||||||
|
document.addEventListener('DOMContentLoaded', function () {
    /* Kick off the self-rescheduling redraw loop. */
    clockTick();

    /* The overlay is wired up only when both the nav clock wrapper and the
       overlay element exist. */
    var trigger = document.querySelector('.nav-clock');
    var overlayEl = document.getElementById('clock-overlay');
    if (!trigger || !overlayEl) {
        return;
    }

    /* Clicking the nav clock sizes the big canvas, then reveals the overlay. */
    trigger.addEventListener('click', function () {
        resizeOverlayClock();
        overlayEl.classList.add('visible');
    });

    /* Clicking the overlay hides it again. */
    overlayEl.addEventListener('click', function () {
        overlayEl.classList.remove('visible');
    });

    /* Keep the big canvas fitted to the viewport while it is showing. */
    window.addEventListener('resize', function () {
        var showing = overlayEl.classList.contains('visible');
        if (showing) {
            resizeOverlayClock();
        }
    });
});
|
||||||
|
</script>
|
||||||
|
<script src="static/sorttable.js"></script>
|
||||||
|
</head>
|
||||||
@@ -0,0 +1,693 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
{% include 'head.html' %}
|
||||||
|
|
||||||
|
<style>
|
||||||
|
body {
|
||||||
|
display: flex;
|
||||||
|
flex-direction: column;
|
||||||
|
height: 100vh;
|
||||||
|
overflow: hidden;
|
||||||
|
}
|
||||||
|
|
||||||
|
@media (max-width: 640px) {
|
||||||
|
body {
|
||||||
|
height: auto;
|
||||||
|
min-height: 100vh;
|
||||||
|
overflow: auto;
|
||||||
|
flex-direction: column;
|
||||||
|
}
|
||||||
|
.container {
|
||||||
|
max-height: none;
|
||||||
|
overflow: visible;
|
||||||
|
}
|
||||||
|
.table-section {
|
||||||
|
max-height: 55vh;
|
||||||
|
}
|
||||||
|
.log-section {
|
||||||
|
flex: none;
|
||||||
|
max-height: 40vh;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
.container {
|
||||||
|
flex: 1;
|
||||||
|
min-height: 0;
|
||||||
|
max-width: 1600px;
|
||||||
|
width: 100%;
|
||||||
|
margin: 0 auto;
|
||||||
|
display: flex;
|
||||||
|
flex-direction: column;
|
||||||
|
gap: 15px;
|
||||||
|
overflow: hidden;
|
||||||
|
}
|
||||||
|
|
||||||
|
h1 {
|
||||||
|
color: #333;
|
||||||
|
margin-bottom: 5px;
|
||||||
|
margin-top: 15px;
|
||||||
|
font-size: 1.5em;
|
||||||
|
}
|
||||||
|
|
||||||
|
h2 {
|
||||||
|
color: #333;
|
||||||
|
margin-bottom: 10px;
|
||||||
|
font-size: 1.2em;
|
||||||
|
padding: 10px 15px;
|
||||||
|
background: white;
|
||||||
|
border-radius: 6px;
|
||||||
|
box-shadow: 0 1px 4px rgba(0,0,0,0.1);
|
||||||
|
}
|
||||||
|
|
||||||
|
.subtitle {
|
||||||
|
color: #666;
|
||||||
|
margin-bottom: 15px;
|
||||||
|
font-size: 0.9em;
|
||||||
|
}
|
||||||
|
|
||||||
|
.content {
|
||||||
|
display: flex;
|
||||||
|
flex-direction: column;
|
||||||
|
gap: 15px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.table-section {
|
||||||
|
background: white;
|
||||||
|
border-radius: 6px;
|
||||||
|
padding: 15px;
|
||||||
|
box-shadow: 0 1px 4px rgba(0,0,0,0.1);
|
||||||
|
overflow-x: auto;
|
||||||
|
overflow-y: auto;
|
||||||
|
max-height: 60vh;
|
||||||
|
}
|
||||||
|
|
||||||
|
.log-section {
|
||||||
|
flex: 1;
|
||||||
|
min-height: 0;
|
||||||
|
background: white;
|
||||||
|
border-radius: 6px;
|
||||||
|
padding: 15px;
|
||||||
|
box-shadow: 0 1px 4px rgba(0,0,0,0.1);
|
||||||
|
overflow-y: auto;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ntable {
|
||||||
|
border-collapse: collapse;
|
||||||
|
width: 100%;
|
||||||
|
font-size: 0.9em;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ntable td,
|
||||||
|
#ntable th {
|
||||||
|
border: 1px solid #e0e0e0;
|
||||||
|
text-align: left;
|
||||||
|
padding: 2px 4px;
|
||||||
|
white-space: nowrap;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ntable tr:nth-child(even) {
|
||||||
|
background-color: #fafafa;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ntable tr:hover {
|
||||||
|
background-color: #e3f2fd;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ntable tbody tr.row-warning {
|
||||||
|
background-color: #fff8c5;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ntable tbody tr.row-critical {
|
||||||
|
background-color: #fde8e8;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ntable tbody tr.row-warning:hover {
|
||||||
|
background-color: #fff0a0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ntable tbody tr.row-critical:hover {
|
||||||
|
background-color: #f9c8c8;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ntable th {
|
||||||
|
padding: 6px 8px;
|
||||||
|
background-color: #2196f3;
|
||||||
|
color: white;
|
||||||
|
font-weight: 600;
|
||||||
|
position: sticky;
|
||||||
|
top: 0;
|
||||||
|
z-index: 10;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ntable
|
||||||
|
th:not(.sorttable_sorted):not(.sorttable_sorted_reverse):not(.sorttable_nosort):after {
|
||||||
|
content: " ⇅";
|
||||||
|
opacity: 0.5;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Alert count column styling */
|
||||||
|
#ntable td.alert-warning {
|
||||||
|
color: #ff9800;
|
||||||
|
font-weight: bold;
|
||||||
|
text-align: center;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ntable td.alert-critical {
|
||||||
|
color: #f44336;
|
||||||
|
font-weight: bold;
|
||||||
|
text-align: center;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Scrollbar styling */
|
||||||
|
.log-section::-webkit-scrollbar {
|
||||||
|
width: 8px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.log-section::-webkit-scrollbar-track {
|
||||||
|
background: #f1f1f1;
|
||||||
|
border-radius: 4px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.log-section::-webkit-scrollbar-thumb {
|
||||||
|
background: #888;
|
||||||
|
border-radius: 4px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.log-section::-webkit-scrollbar-thumb:hover {
|
||||||
|
background: #555;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Message styling */
|
||||||
|
#messages {
|
||||||
|
font-size: 1.00em;
|
||||||
|
line-height: 1.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#messages .log-entry {
|
||||||
|
padding: 5px 0;
|
||||||
|
border-bottom: 1px solid #f0f0f0;
|
||||||
|
display: flex;
|
||||||
|
gap: 0.5em;
|
||||||
|
align-items: baseline;
|
||||||
|
}
|
||||||
|
|
||||||
|
.log-ts { color: #888; white-space: nowrap; }
|
||||||
|
.log-level { font-weight: bold; min-width: 6em; }
|
||||||
|
.log-host { font-weight: 600; }
|
||||||
|
.log-service { color: #888; }
|
||||||
|
|
||||||
|
.log-warning .log-level { color: #b8860b; }
|
||||||
|
.log-critical .log-level { color: #c00; }
|
||||||
|
.log-recover .log-level { color: #2a7a2a; }
|
||||||
|
.log-info .log-level { color: #555; }
|
||||||
|
|
||||||
|
.log-section-header {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 12px;
|
||||||
|
flex-wrap: wrap;
|
||||||
|
margin-bottom: 10px;
|
||||||
|
background: white;
|
||||||
|
border-radius: 6px;
|
||||||
|
box-shadow: 0 1px 4px rgba(0,0,0,0.1);
|
||||||
|
padding: 8px 15px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.log-section-title {
|
||||||
|
font-size: 1.2em;
|
||||||
|
font-weight: bold;
|
||||||
|
color: #333;
|
||||||
|
white-space: nowrap;
|
||||||
|
}
|
||||||
|
|
||||||
|
.log-filter-bar {
|
||||||
|
display: flex;
|
||||||
|
gap: 6px;
|
||||||
|
align-items: center;
|
||||||
|
flex-wrap: wrap;
|
||||||
|
}
|
||||||
|
|
||||||
|
.log-filter-bar input[type="text"],
|
||||||
|
.log-filter-bar select {
|
||||||
|
padding: 3px 7px;
|
||||||
|
border: 1px solid #ccc;
|
||||||
|
border-radius: 4px;
|
||||||
|
font-size: 1.00em;
|
||||||
|
color: #333;
|
||||||
|
}
|
||||||
|
|
||||||
|
.log-filter-bar input[type="text"] { width: 110px; }
|
||||||
|
|
||||||
|
/* Modal for connection status messages */
|
||||||
|
.connection-modal {
|
||||||
|
display: none;
|
||||||
|
position: fixed;
|
||||||
|
z-index: 1000;
|
||||||
|
left: 0;
|
||||||
|
top: 0;
|
||||||
|
width: 100%;
|
||||||
|
height: 100%;
|
||||||
|
background-color: rgba(0, 0, 0, 0.5);
|
||||||
|
}
|
||||||
|
|
||||||
|
.connection-modal.show {
|
||||||
|
display: flex;
|
||||||
|
justify-content: center;
|
||||||
|
align-items: center;
|
||||||
|
}
|
||||||
|
|
||||||
|
.connection-modal-content {
|
||||||
|
background-color: white;
|
||||||
|
padding: 30px 40px;
|
||||||
|
border-radius: 8px;
|
||||||
|
text-align: center;
|
||||||
|
box-shadow: 0 4px 20px rgba(0, 0, 0, 0.3);
|
||||||
|
min-width: 300px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.connection-modal-content p {
|
||||||
|
margin: 0;
|
||||||
|
font-size: 16px;
|
||||||
|
color: #333;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* State indicators */
|
||||||
|
.state-up {
|
||||||
|
color: #4caf50;
|
||||||
|
font-weight: 600;
|
||||||
|
}
|
||||||
|
|
||||||
|
.state-down {
|
||||||
|
color: #f44336;
|
||||||
|
font-weight: 700;
|
||||||
|
}
|
||||||
|
|
||||||
|
.state-overdue {
|
||||||
|
color: #ff9800;
|
||||||
|
font-weight: 700;
|
||||||
|
}
|
||||||
|
#ntable a.host-link { color: inherit; text-decoration: none; }
|
||||||
|
#ntable a.host-link:hover { text-decoration: underline; }
|
||||||
|
|
||||||
|
/* ── Dark mode ── */
|
||||||
|
html[data-theme="dark"] h1,
|
||||||
|
html[data-theme="dark"] h2 { color: var(--text); }
|
||||||
|
html[data-theme="dark"] .subtitle { color: var(--text-sec); }
|
||||||
|
html[data-theme="dark"] h2,
|
||||||
|
html[data-theme="dark"] .table-section,
|
||||||
|
html[data-theme="dark"] .log-section,
|
||||||
|
html[data-theme="dark"] .log-section-header { background: var(--surface); }
|
||||||
|
html[data-theme="dark"] .log-section-title { color: var(--text); }
|
||||||
|
html[data-theme="dark"] #ntable td,
|
||||||
|
html[data-theme="dark"] #ntable th { border-color: var(--border); }
|
||||||
|
html[data-theme="dark"] #ntable tr:nth-child(even) { background: var(--surface-2); }
|
||||||
|
html[data-theme="dark"] #ntable tr:hover { background: #1e3a5f; }
|
||||||
|
html[data-theme="dark"] #ntable tbody tr.row-warning { background: #3a2800; }
|
||||||
|
html[data-theme="dark"] #ntable tbody tr.row-critical { background: #3a0a0a; }
|
||||||
|
html[data-theme="dark"] #ntable tbody tr.row-warning:hover { background: #4a3200; }
|
||||||
|
html[data-theme="dark"] #ntable tbody tr.row-critical:hover { background: #4a1010; }
|
||||||
|
html[data-theme="dark"] #messages .log-entry { border-bottom-color: var(--border-3); }
|
||||||
|
html[data-theme="dark"] .log-ts,
|
||||||
|
html[data-theme="dark"] .log-service { color: var(--text-muted); }
|
||||||
|
html[data-theme="dark"] .log-info .log-level { color: var(--text-sec); }
|
||||||
|
html[data-theme="dark"] .log-filter-bar input,
|
||||||
|
html[data-theme="dark"] .log-filter-bar select { color: var(--text); }
|
||||||
|
html[data-theme="dark"] .connection-modal-content { background: var(--surface); color: var(--text); }
|
||||||
|
</style>
|
||||||
|
<script type="text/javascript">
|
||||||
|
/* Page-level state shared by the table and WebSocket helpers in this script. */
var cnt = 0,            /* incremented once per WebSocket message handled */
    nTable = document,  /* placeholder; setup() replaces it with the #ntable element */
    name_idx = {},      /* host name -> table row, rebuilt by setup() */
    c = 0;
/* Version string substituted by the template; hostNameHtml() compares each
   host's hbc_version against it. */
var HBD_VERSION = "{{ hbd_version }}";
|
||||||
|
|
||||||
|
/*
 * Build the markup for a host's name cell: a link to /plugins#<host>.
 *
 * data.raw_name is used for the link fragment when set; otherwise it is
 * derived from data.name by removing tags, the first '*', and surrounding
 * whitespace. A wilted-flower marker is appended when data.hbc_version is
 * missing or differs from HBD_VERSION, and the label is wrapped in <b> when
 * data.dyn is truthy. data.name is inserted as HTML, unescaped.
 */
function hostNameHtml(data) {
    var fragment = data.raw_name;
    if (!fragment) {
        fragment = data.name.replace(/<[^>]+>/g, '').replace('*', '').trim();
    }

    var versionMatches = Boolean(data.hbc_version) && data.hbc_version === HBD_VERSION;
    var label = data.name;
    if (!versionMatches) {
        label += ' 🥀';
    }
    if (data.dyn) {
        label = '<b>' + label + '</b>';
    }

    return [
        '<a class="host-link" href="/plugins#',
        encodeURIComponent(fragment),
        '">',
        label,
        '</a>'
    ].join('');
}
|
||||||
|
|
||||||
|
/*
 * Rebuild the global name_idx lookup (host name -> table row) from the rows
 * currently in #ntable, and store the table element in the global nTable.
 * Row 0 is skipped (header). A row's key is its first cell's data-name
 * attribute, or, failing that, the cell text with a trailing version marker
 * and surrounding whitespace removed.
 */
function setup() {
    name_idx = {};
    nTable = document.getElementById("ntable");

    var rows = nTable.rows;
    var position = 1;
    var tr;
    while ((tr = rows[position])) {
        var firstCell = tr.cells[0];
        var key = firstCell.dataset.name;
        if (!key) {
            key = firstCell.innerText.replace(/\s*🥀\s*$/, '').trim();
        }
        name_idx[key] = tr;
        position += 1;
    }
}
|
||||||
|
|
||||||
|
/*
 * Set the row highlight class from the host's alert counters.
 *
 * Both highlight classes are cleared first. Any critical alert (acked or
 * not) yields 'row-critical'; otherwise any warning alert yields
 * 'row-warning'; otherwise the row is left without a highlight. Missing
 * counters count as 0.
 */
function updateRowAlert(row, data) {
    function isPositive(count) {
        return (count || 0) > 0;
    }

    var hasCritical = isPositive(data.alert_critical_unacked) || isPositive(data.alert_critical_acked);
    var hasWarning = isPositive(data.alert_warning_unacked) || isPositive(data.alert_warning_acked);

    row.classList.remove('row-warning', 'row-critical');

    var highlight = null;
    if (hasCritical) {
        highlight = 'row-critical';
    } else if (hasWarning) {
        highlight = 'row-warning';
    }
    if (highlight !== null) {
        row.classList.add(highlight);
    }
}
|
||||||
|
|
||||||
|
/*
 * Append a new 11-cell row for a host to #ntablebody and register it in the
 * global name_idx under data.name.
 *
 * Columns: name, warning count, critical count, then addr / state / latency /
 * last-state for the first connection and again for the second. Only the
 * addr and state cells are filled here; latency and last-state stay empty.
 *
 * data: host object with name, optional alert_*_unacked / alert_*_acked
 *       counters, and an optional connections array of { addr, state }.
 *       A missing or empty connections array leaves the address cells blank
 *       instead of throwing.
 */
function createRow(data) {
    var row = document.createElement("tr");

    /* Create a <td>, apply the given inline styles, and attach it to the row. */
    function addCell(styles) {
        var td = document.createElement("td");
        styles = styles || {};
        for (var prop in styles) {
            td.style[prop] = styles[prop];
        }
        row.appendChild(td);
        return td;
    }

    /* Cell content for an alert counter pair: "" when both are zero,
       "unacked/acked" when something is acked, otherwise just unacked. */
    function alertCount(unacked, acked) {
        if (!(unacked > 0 || acked > 0)) {
            return "";
        }
        return acked > 0 ? unacked + "/" + acked : unacked;
    }

    var rightAligned = { textAlign: "right" };

    /* Cells are appended in column order. */
    var c_name = addCell();
    var c_warning = addCell({ textAlign: "center", color: "#ff9800", fontWeight: "bold" });
    var c_critical = addCell({ textAlign: "center", color: "#f44336", fontWeight: "bold" });
    var c_ipv4addr = addCell();
    var c_ipv4state = addCell();
    addCell(rightAligned);   /* IPv4 latency */
    addCell(rightAligned);   /* IPv4 last state */
    var c_ipv6addr = addCell();
    var c_ipv6state = addCell();
    addCell(rightAligned);   /* IPv6 latency */
    addCell(rightAligned);   /* IPv6 last state */

    c_name.dataset.name = data.name;
    c_name.innerHTML = hostNameHtml(data);

    c_warning.innerHTML = alertCount(data.alert_warning_unacked || 0,
                                     data.alert_warning_acked || 0);
    c_critical.innerHTML = alertCount(data.alert_critical_unacked || 0,
                                      data.alert_critical_acked || 0);

    /* A host may arrive with no connections yet; leave those cells empty. */
    var connections = data.connections || [];
    if (connections.length > 0) {
        c_ipv4addr.innerHTML = connections[0].addr;
        c_ipv4state.innerHTML = connections[0].state;
    }
    if (connections.length > 1) {
        c_ipv6addr.innerHTML = connections[1].addr;
        c_ipv6state.innerHTML = connections[1].state;
    }

    document.getElementById("ntablebody").appendChild(row);

    /* Index by host name. Using the <td> element as the key would coerce it
       to a string and never match a later lookup by data.name. */
    name_idx[data.name] = row;
    updateRowAlert(row, data);
}
|
||||||
|
|
||||||
|
/*
 * Render a Unix timestamp (seconds) relative to the current local date:
 *   - same calendar day          -> "hh:mm:ss"
 *   - fewer than 8 calendar days -> "-Nd hh:mm:ss"
 *   - anything else              -> "YYYY-MM-DD"
 * The day difference is computed between local midnights and rounded.
 */
function formatTS(ts) {
    const twoDigits = (value) => String(value).padStart(2, '0');
    const localMidnight = (dt) => new Date(dt.getFullYear(), dt.getMonth(), dt.getDate());

    const current = new Date();
    const stamp = new Date(ts * 1000);
    const clock = [stamp.getHours(), stamp.getMinutes(), stamp.getSeconds()]
        .map((part) => twoDigits(part))
        .join(':');

    if (stamp.toDateString() === current.toDateString()) {
        return clock;
    }

    const daysAgo = Math.round((localMidnight(current) - localMidnight(stamp)) / 86400000);
    if (daysAgo < 8) {
        return '-' + daysAgo + 'd ' + clock;
    }

    return [
        stamp.getFullYear(),
        twoDigits(stamp.getMonth() + 1),
        twoDigits(stamp.getDate())
    ].join('-');
}
|
||||||
|
|
||||||
|
/*
 * Apply one host update to the live table.
 *
 * A row is created (createRow + setup) when data.name is not yet indexed in
 * name_idx. The name cell, the warning/critical counters and, per
 * connection, the addr / state / latency / last-state cells are then
 * rewritten, and finally the row highlight is refreshed.
 *
 * Per-connection cells start at column 3 and take four columns each.
 * Connection states:
 *   "up"      -> green label, latency = rounded first rtts entry
 *   "unknown" -> all four cells blank
 *   "down" / "overdue" -> coloured label, latency "-"
 *   other     -> state in bold, latency "-"
 */
function update_table(data) {
    if (!(data.name in name_idx)) {
        createRow(data);
        setup();
    }

    var row = name_idx[data.name];
    var cells = row.cells;

    /* Name cell (carries the version-mismatch marker). */
    cells[0].dataset.name = data.name;
    cells[0].innerHTML = hostNameHtml(data);

    /* Counters are shown as "unacked/acked", or just "unacked" when nothing
       is acked, or left blank when both are zero. */
    var warningUnacked = data.alert_warning_unacked || 0;
    var warningAcked = data.alert_warning_acked || 0;
    var criticalUnacked = data.alert_critical_unacked || 0;
    var criticalAcked = data.alert_critical_acked || 0;

    if (warningUnacked > 0 || warningAcked > 0) {
        cells[1].innerHTML = warningAcked > 0 ? warningUnacked + "/" + warningAcked : warningUnacked;
    } else {
        cells[1].innerHTML = "";
    }

    if (criticalUnacked > 0 || criticalAcked > 0) {
        cells[2].innerHTML = criticalAcked > 0 ? criticalUnacked + "/" + criticalAcked : criticalUnacked;
    } else {
        cells[2].innerHTML = "";
    }

    var connections = data.connections || [];
    for (var i = 0; i < connections.length; i++) {
        var conn = connections[i];
        var base = 3 + i * 4;   /* skip name + two counter columns */

        /* The row only has columns for a fixed number of connections; stop
           at the first one that does not fit instead of throwing midway. */
        if (!cells[base + 3]) {
            break;
        }

        /* Declared locally: assigning these undeclared would create
           implicit globals (and throw in strict mode). */
        var addr = conn.addr;
        var since = formatTS(conn.statetime);
        var state;
        var latency;

        if (conn.state == "up") {
            state = '<span class="state-up">up</span>';
            latency = String(Math.round(Number.parseFloat(conn.rtts[0])));
        } else if (conn.state == "unknown") {
            state = "";
            latency = "";
            addr = "";
            since = "";
        } else if (conn.state == "down") {
            state = '<span class="state-down">down</span>';
            latency = "-";
        } else if (conn.state == "overdue") {
            state = '<span class="state-overdue">overdue</span>';
            latency = "-";
        } else {
            state = "<b>" + conn.state + "</b>";
            latency = "-";
        }

        cells[base].innerHTML = addr;
        cells[base + 1].innerHTML = state;
        cells[base + 2].innerHTML = latency;
        cells[base + 3].innerHTML = since;
    }

    updateRowAlert(row, data);
}
|
||||||
|
|
||||||
|
/*
 * Show or hide every #messages .log-entry according to the three filter
 * controls. Host and message filters are case-insensitive substring
 * matches (host against data-host, message against the .log-msg text);
 * the level filter must equal data-level exactly. Empty filters match
 * everything. Hidden entries get display:none; shown ones have the inline
 * display value cleared.
 */
function applyLogFilters() {
    var wantedHost = document.getElementById('filter-host').value.toLowerCase().trim();
    var wantedLevel = document.getElementById('filter-level').value;
    var wantedText = document.getElementById('filter-msg').value.toLowerCase().trim();

    function passes(entry) {
        if (wantedHost) {
            var host = (entry.dataset.host || '').toLowerCase();
            if (host.indexOf(wantedHost) === -1) {
                return false;
            }
        }
        if (wantedLevel && entry.dataset.level !== wantedLevel) {
            return false;
        }
        if (!wantedText) {
            return true;
        }
        var body = entry.querySelector('.log-msg');
        return Boolean(body) && body.textContent.toLowerCase().indexOf(wantedText) !== -1;
    }

    var entries = document.querySelectorAll('#messages .log-entry');
    entries.forEach(function (entry) {
        entry.style.display = passes(entry) ? '' : 'none';
    });
}
|
||||||
|
|
||||||
|
/*
 * Open the heartbeat WebSocket and wire up its handlers.
 *
 *   open    -> hide the connection modal and send "heartbeat_web"
 *   message -> JSON frame; type "host" goes to update_table(), type
 *              "message" is rendered into #messages (appended when the
 *              frame has a truthy history flag, otherwise prepended) and
 *              the log filters are re-applied; cnt is incremented per frame
 *   error   -> logged to the console
 *   close   -> show the connection modal and reconnect after 3000 ms
 *
 * When the browser has no WebSocket support only a console message is
 * written.
 */
function WS_Connect() {
    if (!("WebSocket" in window)) {
        // The browser doesn't support WebSocket
        console.log("WebSocket NOT supported by your Browser!");
        return;
    }

    /* Escape a value for use inside a double-quoted HTML attribute. */
    function escapeAttr(value) {
        return String(value)
            .replace(/&/g, '&amp;')
            .replace(/"/g, '&quot;')
            .replace(/</g, '&lt;')
            .replace(/>/g, '&gt;');
    }

    /* Zero-pad a number to two digits. */
    function pad2(n) {
        return n < 10 ? '0' + n : '' + n;
    }

    /* Show or hide the "connection closed" modal, if the page has one. */
    function setModalShown(shown) {
        var modal = document.getElementById("connectionModal");
        if (!modal) {
            return;
        }
        if (shown) {
            modal.classList.add("show");
        } else {
            modal.classList.remove("show");
        }
    }

    /* Render one log message and insert it into #messages. */
    function appendLogEntry(msg, isHistory) {
        var when = new Date(msg.ts * 1000);
        var ts_str = when.getFullYear() + '-' + pad2(when.getMonth() + 1) + '-' + pad2(when.getDate())
            + ' ' + pad2(when.getHours()) + ':' + pad2(when.getMinutes()) + ':' + pad2(when.getSeconds());
        var lvl = (msg.level || "INFO").toLowerCase();
        var hostVal = msg.host || '';

        /* Attribute values are escaped so a quote in the level or host
           cannot terminate the attribute early. */
        var html = '<div class="log-entry log-' + escapeAttr(lvl)
            + '" data-level="' + escapeAttr(lvl)
            + '" data-host="' + escapeAttr(hostVal) + '">';
        html += '<span class="log-ts">' + ts_str + '</span>';
        /* NOTE(review): level, host, service and message are inserted as raw
           HTML. If the server does not already sanitise them this is an
           injection vector — confirm before relying on it. */
        html += '<span class="log-level">' + (msg.level || "") + '</span>';
        if (msg.host) html += '<span class="log-host">' + msg.host + '</span>';
        if (msg.service) html += '<span class="log-service">' + msg.service + '</span>';
        html += '<span class="log-msg">' + msg.message + '</span>';
        html += '</div>';

        document.getElementById("messages")
            .insertAdjacentHTML(isHistory ? "beforeend" : "afterbegin", html);
        applyLogFilters();
    }

    //N.B: subprotocol field causes chrome to error 1006
    var ws_hbd = new WebSocket("{{heartbeat_ws_url}}" /* , "hdb" */);

    ws_hbd.onopen = function () {
        // Web Socket is connected, send data using send()
        console.log("ws connect {{heartbeat_ws_url}}");
        setModalShown(false);
        ws_hbd.send("heartbeat_web");
    };

    ws_hbd.onerror = function (event) {
        console.log(event);
    };

    ws_hbd.onmessage = function (event) {
        var state = JSON.parse(event.data);
        if (state.type == "host") {
            update_table(state.data);
        } else if (state.type == "message") {
            appendLogEntry(state.data, state.history);
        }
        cnt++;
    };

    ws_hbd.onclose = function (event) {
        console.log("Connection is closed, reopening");
        setModalShown(true);
        setTimeout(function () {
            WS_Connect();
        }, 3000);
    };
}
|
||||||
|
WS_Connect();
|
||||||
|
</script>
|
||||||
|
<body>
|
||||||
|
{% include 'nav.html' %}
|
||||||
|
|
||||||
|
{% include 'menu.html' %}
|
||||||
|
|
||||||
|
<div class="container">
|
||||||
|
<div>
|
||||||
|
<h1>{{ header }}</h1>
|
||||||
|
<p class="subtitle">Real-time host monitoring and event log</p>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="table-section">
|
||||||
|
<table id="ntable" class="sortable">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>Name</th>
|
||||||
|
<th style="text-align: center" title="Warning Alerts">⚠️</th>
|
||||||
|
<th style="text-align: center" title="Critical Alerts">🔴</th>
|
||||||
|
<th>IPv4 Addr</th>
|
||||||
|
<th>State</th>
|
||||||
|
<th style="text-align: right">Latency</th>
|
||||||
|
<th style="text-align: right">Last State</th>
|
||||||
|
<th>IPv6 Addr</th>
|
||||||
|
<th>State</th>
|
||||||
|
<th style="text-align: right">Latency</th>
|
||||||
|
<th style="text-align: right">Last State</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody id="ntablebody">
|
||||||
|
{% for host in hosts %}
|
||||||
|
<tr class="{% if host.alert_critical_unacked > 0 or host.alert_critical_acked > 0 %}row-critical{% elif host.alert_warning_unacked > 0 or host.alert_warning_acked > 0 %}row-warning{% endif %}">
|
||||||
|
<td data-name="{{ host.name }}"><a class="host-link" href="/plugins#{{ host.raw_name | urlencode }}">{{ host.name }}{% if not host.hbc_version or host.hbc_version != hbd_version %} 🥀{% endif %}</a></td>
|
||||||
|
<td style="text-align: center; color: #ff9800; font-weight: bold;">
|
||||||
|
{%- set warning_unacked = host.alert_warning_unacked -%}
|
||||||
|
{%- set warning_acked = host.alert_warning_acked -%}
|
||||||
|
{%- if warning_unacked > 0 or warning_acked > 0 -%}
|
||||||
|
{{ warning_unacked }}{% if warning_acked > 0 %}/{{ warning_acked }}{% endif %}
|
||||||
|
{%- endif -%}
|
||||||
|
</td>
|
||||||
|
<td style="text-align: center; color: #f44336; font-weight: bold;">
|
||||||
|
{%- set critical_unacked = host.alert_critical_unacked -%}
|
||||||
|
{%- set critical_acked = host.alert_critical_acked -%}
|
||||||
|
{%- if critical_unacked > 0 or critical_acked > 0 -%}
|
||||||
|
{{ critical_unacked }}{% if critical_acked > 0 %}/{{ critical_acked }}{% endif %}
|
||||||
|
{%- endif -%}
|
||||||
|
</td>
|
||||||
|
{% for conn in host.connections %}
|
||||||
|
<td>{{ conn.addr if conn.addr else '' }}</td>
|
||||||
|
<td>{{ conn.state if conn.state else '' }}</td>
|
||||||
|
<td style="text-align: right">{{ conn.latency if conn.latency else '' }}</td>
|
||||||
|
<td style="text-align: right">{{ conn.last_state_ts if conn.last_state_ts else '' }}</td>
|
||||||
|
{% endfor %}
|
||||||
|
{% if host.connections|length == 0 %}
|
||||||
|
<td></td><td></td><td></td><td></td>
|
||||||
|
<td></td><td></td><td></td><td></td>
|
||||||
|
{% elif host.connections|length == 1 %}
|
||||||
|
<td></td><td></td><td></td><td></td>
|
||||||
|
{% endif %}
|
||||||
|
</tr>
|
||||||
|
{% endfor %}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="log-section">
|
||||||
|
<div class="log-section-header">
|
||||||
|
<span class="log-section-title">Log of Events</span>
|
||||||
|
<div class="log-filter-bar">
|
||||||
|
<input type="text" id="filter-host" placeholder="Host…" title="Filter by host" />
|
||||||
|
<select id="filter-level" title="Filter by level">
|
||||||
|
<option value="">All levels</option>
|
||||||
|
<option value="info">INFO</option>
|
||||||
|
<option value="warning">WARNING</option>
|
||||||
|
<option value="critical">CRITICAL</option>
|
||||||
|
<option value="recover">RECOVER</option>
|
||||||
|
<option value="unknown">UNKNOWN</option>
|
||||||
|
</select>
|
||||||
|
<input type="text" id="filter-msg" placeholder="Message…" title="Filter by message text" />
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div id="messages"></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{% include 'foot.html' %}
|
||||||
|
|
||||||
|
<!-- Connection status modal -->
|
||||||
|
<div id="connectionModal" class="connection-modal">
|
||||||
|
<div class="connection-modal-content">
|
||||||
|
<p>⚠️ Connection is closed, reopening...</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
/* Index the server-rendered rows, then re-run the log filters whenever one
   of the three filter controls changes. */
setup();
[
    ['filter-host', 'input'],
    ['filter-level', 'change'],
    ['filter-msg', 'input']
].forEach(function (binding) {
    document.getElementById(binding[0]).addEventListener(binding[1], applyLogFilters);
});
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
@@ -0,0 +1,2 @@
|
|||||||
|
<!-- <label for="drawer-toggle" id="drawer-toggle-label"></label>
|
||||||
|
s<header>{{ header }}</header> -->
|
||||||
@@ -0,0 +1,134 @@
|
|||||||
|
<div class="nav">
|
||||||
|
<button class="nav-hamburger" id="nav-hamburger-btn" aria-label="Menu" aria-expanded="false">
|
||||||
|
<span></span><span></span><span></span>
|
||||||
|
</button>
|
||||||
|
<div class="nav-links" id="nav-links">
|
||||||
|
<a href="/live"{% if active_page == "live" %} class="active"{% endif %}>Live Dashboard</a>
|
||||||
|
<a href="/plugins"{% if active_page == "plugins" %} class="active"{% endif %}>Host Overview</a>
|
||||||
|
<a href="/alerts"{% if active_page == "alerts" %} class="active"{% endif %}>Alerts</a>
|
||||||
|
{% if current_user and current_user.admin %}
|
||||||
|
<a href="/settings"{% if active_page == "settings" %} class="active"{% endif %}>Settings</a>
|
||||||
|
{% endif %}
|
||||||
|
<a href="/about"{% if active_page == "about" %} class="active"{% endif %}>About</a>
|
||||||
|
</div>
|
||||||
|
{% if current_user and current_user.admin %}
|
||||||
|
<button id="nav-publish-btn" class="nav-publish-btn" onclick="navPublishConfig()" style="display:none" title="Publish pending config changes to .hb.yaml">⚠ Publish Config</button>
|
||||||
|
{% endif %}
|
||||||
|
<div class="nav-pie" title="Host alert status">
|
||||||
|
<canvas id="alert-pie" width="44" height="44"></canvas>
|
||||||
|
</div>
|
||||||
|
<div class="nav-clock" title="Click for full-screen clock">
|
||||||
|
<canvas id="swiss-clock" width="44" height="44"></canvas>
|
||||||
|
</div>
|
||||||
|
{% if current_user %}
|
||||||
|
<a href="/profile" class="nav-user{% if active_page == 'profile' %} active{% endif %}" title="{{ current_user.full_name or current_user.username }}">
|
||||||
|
{% if current_user.avatar %}
|
||||||
|
<img class="nav-avatar" src="{{ current_user.avatar_url }}" alt="{{ current_user.full_name or current_user.username }}">
|
||||||
|
{% else %}
|
||||||
|
<span class="nav-initials">{{ (current_user.full_name or current_user.username)[:1] | upper }}</span>
|
||||||
|
{% endif %}
|
||||||
|
<span class="nav-username">{{ current_user.full_name or current_user.username }}</span>
|
||||||
|
</a>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Full-page clock overlay (click anywhere to dismiss) -->
|
||||||
|
<div id="clock-overlay">
|
||||||
|
<canvas id="swiss-clock-overlay" width="400" height="400"></canvas>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
/* Hamburger menu: each click toggles 'nav-open' on the link list and mirrors
   the new state in the button's aria-expanded attribute. */
(function () {
    var toggle = document.getElementById('nav-hamburger-btn');
    var menu = document.getElementById('nav-links');
    if (!toggle || !menu) {
        return;
    }
    toggle.addEventListener('click', function () {
        var nowOpen = menu.classList.toggle('nav-open');
        var expanded = 'false';
        if (nowOpen) {
            expanded = 'true';
        }
        toggle.setAttribute('aria-expanded', expanded);
    });
})();
|
||||||
|
|
||||||
|
/**
 * Draw the host-status pie chart on the #alert-pie canvas.
 *
 * Slices are drawn clockwise from 12 o'clock in the order critical (red),
 * warning (amber), ok (green). When every count is zero a plain grey disc
 * is drawn instead. Does nothing if the canvas is not on the page.
 *
 * @param {number|string} critical  number of hosts in critical state
 * @param {number|string} warning   number of hosts in warning state
 * @param {number|string} ok        number of healthy hosts
 */
function drawAlertPie(critical, warning, ok) {
  var canvas = document.getElementById('alert-pie');
  if (!canvas) return;

  var ctx = canvas.getContext('2d');
  var SIZE = canvas.width;
  var R = SIZE / 2;
  ctx.clearRect(0, 0, SIZE, SIZE);

  // Counts arrive from a JSON API; coerce so that numeric strings are added
  // (not concatenated) and NaN / negative / missing values count as zero
  // instead of producing NaN arc angles.
  function toCount(raw) {
    var n = Number(raw);
    return isFinite(n) && n > 0 ? n : 0;
  }

  var slices = [
    { value: toCount(critical), color: '#e53935' },
    { value: toCount(warning), color: '#ffb300' },
    { value: toCount(ok), color: '#43a047' }
  ];

  var total = slices.reduce(function (sum, s) { return sum + s.value; }, 0);

  if (total === 0) {
    // No data: neutral grey disc.
    ctx.beginPath();
    ctx.arc(R, R, R - 1, 0, Math.PI * 2);
    ctx.fillStyle = '#ccc';
    ctx.fill();
    return;
  }

  // Start at the top of the circle (-90 degrees) and advance per slice.
  var start = -Math.PI / 2;
  slices.forEach(function (s) {
    if (s.value === 0) return;
    var sweep = (s.value / total) * Math.PI * 2;
    ctx.beginPath();
    ctx.moveTo(R, R);
    ctx.arc(R, R, R - 1, start, start + sweep);
    ctx.closePath();
    ctx.fillStyle = s.color;
    ctx.fill();
    start += sweep;
  });
}
|
||||||
|
|
||||||
|
/**
 * Fetch /api/0/alert_summary and redraw the nav pie chart from its
 * critical / warning / ok counts. Non-OK responses and any fetch or parse
 * error are ignored, leaving the chart as it was.
 */
function updateAlertPie() {
  var pending = fetch('/api/0/alert_summary');

  var parsed = pending.then(function (response) {
    // A non-OK response resolves to undefined, which is skipped below.
    return response.ok ? response.json() : undefined;
  });

  parsed
    .then(function (summary) {
      if (!summary) {
        return;
      }
      drawAlertPie(summary.critical || 0, summary.warning || 0, summary.ok || 0);
    })
    .catch(function () {
      // Best-effort refresh: errors are deliberately swallowed.
    });
}
|
||||||
|
|
||||||
|
// Once the DOM is ready: draw the alert pie and refresh it every 30 s, and
// sync the "Publish Config" button with localStorage, re-checking whenever a
// 'storage' event fires.
document.addEventListener('DOMContentLoaded', function () {
  var ALERT_REFRESH_MS = 30000;

  // Alert pie: immediate draw, then periodic refresh.
  updateAlertPie();
  setInterval(updateAlertPie, ALERT_REFRESH_MS);

  // Pending-config button: initial state, then follow storage changes.
  navCheckPendingConfig();
  window.addEventListener('storage', navCheckPendingConfig);
});
|
||||||
|
|
||||||
|
/**
 * Show the nav "Publish Config" button when localStorage holds a
 * non-empty 'hbd_pending_config' entry, and hide it otherwise.
 * Does nothing when the button is not present on the page.
 */
function navCheckPendingConfig() {
  var publishBtn = document.getElementById('nav-publish-btn');
  if (publishBtn) {
    var hasPending = Boolean(localStorage.getItem('hbd_pending_config'));
    // An empty display value removes the inline "display:none".
    publishBtn.style.display = hasPending ? '' : 'none';
  }
}
|
||||||
|
|
||||||
|
/**
 * Publish the staged configuration held in localStorage
 * ('hbd_pending_config') by POSTing it as JSON to /api/0/config.
 *
 * On success the staged entry is removed and the page reloads. On an HTTP
 * error or a thrown exception the user gets an alert() and the button is
 * re-enabled. Returns silently when nothing is staged or the staged value
 * is not valid JSON.
 */
async function navPublishConfig() {
  var publishBtn = document.getElementById('nav-publish-btn');
  var payload = localStorage.getItem('hbd_pending_config');
  if (!payload) return;

  // Validate only; the raw string is what gets sent.
  try {
    JSON.parse(payload);
  } catch (parseErr) {
    return;
  }

  // Toggle the button between its busy and idle appearance.
  function setBusy(busy) {
    if (!publishBtn) return;
    publishBtn.disabled = busy;
    publishBtn.textContent = busy ? 'Saving…' : '⚠ Publish Config';
  }

  setBusy(true);
  try {
    var response = await fetch('/api/0/config', {
      method: 'POST',
      headers: {'Content-Type': 'application/json'},
      body: payload
    });
    if (response.ok) {
      localStorage.removeItem('hbd_pending_config');
      window.location.reload();
      return;
    }
    // Error bodies may not be JSON; fall back to the HTTP status text.
    var details = await response.json().catch(function () { return {}; });
    alert('Error: ' + (details.error || response.statusText));
  } catch (failure) {
    alert('Network error: ' + failure.message);
  }
  setBusy(false);
}
|
||||||
|
</script>
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,842 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
{% include 'head.html' %}
|
||||||
|
|
||||||
|
<style>
|
||||||
|
html, body { overflow: visible; }
|
||||||
|
|
||||||
|
.container {
|
||||||
|
max-width: 900px;
|
||||||
|
margin: 0 auto;
|
||||||
|
}
|
||||||
|
|
||||||
|
h1 {
|
||||||
|
color: #333;
|
||||||
|
margin-bottom: 4px;
|
||||||
|
font-size: 1.5em;
|
||||||
|
}
|
||||||
|
|
||||||
|
.subtitle {
|
||||||
|
color: #666;
|
||||||
|
margin-bottom: 24px;
|
||||||
|
font-size: 0.9em;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ---- Profile card ---- */
|
||||||
|
.profile-card {
|
||||||
|
background: #fff;
|
||||||
|
border-radius: 8px;
|
||||||
|
box-shadow: 0 1px 6px rgba(0,0,0,0.1);
|
||||||
|
padding: 28px 32px;
|
||||||
|
margin-bottom: 24px;
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 28px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.avatar-large {
|
||||||
|
width: 80px;
|
||||||
|
height: 80px;
|
||||||
|
border-radius: 50%;
|
||||||
|
object-fit: cover;
|
||||||
|
flex-shrink: 0;
|
||||||
|
box-shadow: 0 2px 8px rgba(0,0,0,0.15);
|
||||||
|
}
|
||||||
|
|
||||||
|
.avatar-initials-large {
|
||||||
|
width: 80px;
|
||||||
|
height: 80px;
|
||||||
|
border-radius: 50%;
|
||||||
|
background: #0066cc;
|
||||||
|
color: #fff;
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
justify-content: center;
|
||||||
|
font-size: 2em;
|
||||||
|
font-weight: 700;
|
||||||
|
flex-shrink: 0;
|
||||||
|
box-shadow: 0 2px 8px rgba(0,0,0,0.15);
|
||||||
|
}
|
||||||
|
|
||||||
|
.profile-info { flex: 1; }
|
||||||
|
|
||||||
|
.profile-name {
|
||||||
|
font-size: 1.4em;
|
||||||
|
font-weight: 700;
|
||||||
|
color: #222;
|
||||||
|
margin-bottom: 4px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.profile-username {
|
||||||
|
font-size: 0.9em;
|
||||||
|
color: #666;
|
||||||
|
margin-bottom: 10px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.badge {
|
||||||
|
display: inline-block;
|
||||||
|
padding: 2px 10px;
|
||||||
|
border-radius: 12px;
|
||||||
|
font-size: 0.78em;
|
||||||
|
font-weight: 600;
|
||||||
|
text-transform: uppercase;
|
||||||
|
letter-spacing: 0.4px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.badge-admin { background: #e8f0fe; color: #1a73e8; }
|
||||||
|
.badge-user { background: #f1f3f4; color: #555; }
|
||||||
|
|
||||||
|
.profile-logout {
|
||||||
|
margin-top: 14px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.btn-logout {
|
||||||
|
display: inline-block;
|
||||||
|
padding: 6px 16px;
|
||||||
|
border-radius: 4px;
|
||||||
|
background: #f44336;
|
||||||
|
color: #fff;
|
||||||
|
font-size: 1.00em;
|
||||||
|
font-weight: 500;
|
||||||
|
text-decoration: none;
|
||||||
|
transition: background 0.15s;
|
||||||
|
}
|
||||||
|
.btn-logout:hover { background: #d32f2f; text-decoration: none; }
|
||||||
|
|
||||||
|
/* ---- Section cards ---- */
|
||||||
|
.section {
|
||||||
|
background: #fff;
|
||||||
|
border-radius: 8px;
|
||||||
|
box-shadow: 0 1px 6px rgba(0,0,0,0.1);
|
||||||
|
padding: 20px 24px;
|
||||||
|
margin-bottom: 20px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.section h2 {
|
||||||
|
font-size: 1em;
|
||||||
|
font-weight: 700;
|
||||||
|
color: #333;
|
||||||
|
margin: 0 0 16px;
|
||||||
|
padding-bottom: 10px;
|
||||||
|
border-bottom: 1px solid #eee;
|
||||||
|
text-transform: uppercase;
|
||||||
|
letter-spacing: 0.5px;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ---- Settings rows ---- */
|
||||||
|
.settings-row {
|
||||||
|
display: flex;
|
||||||
|
align-items: baseline;
|
||||||
|
padding: 8px 0;
|
||||||
|
border-bottom: 1px solid #f5f5f5;
|
||||||
|
font-size: 0.9em;
|
||||||
|
}
|
||||||
|
.settings-row:last-child { border-bottom: none; }
|
||||||
|
|
||||||
|
.settings-label {
|
||||||
|
width: 180px;
|
||||||
|
flex-shrink: 0;
|
||||||
|
color: #666;
|
||||||
|
font-size: 0.88em;
|
||||||
|
}
|
||||||
|
|
||||||
|
.settings-value { color: #222; }
|
||||||
|
|
||||||
|
.settings-empty { color: #aaa; font-style: italic; }
|
||||||
|
|
||||||
|
/* ---- Host lists ---- */
|
||||||
|
.host-grid {
|
||||||
|
display: flex;
|
||||||
|
flex-wrap: wrap;
|
||||||
|
gap: 8px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.host-chip {
|
||||||
|
display: inline-flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 6px;
|
||||||
|
padding: 4px 12px;
|
||||||
|
border-radius: 16px;
|
||||||
|
font-size: 1.00em;
|
||||||
|
font-weight: 500;
|
||||||
|
text-decoration: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
.host-chip.owner { background: #e8f5e9; color: #2e7d32; }
|
||||||
|
.host-chip.manager { background: #e3f2fd; color: #1565c0; }
|
||||||
|
.host-chip.monitor { background: #f3e5f5; color: #6a1b9a; }
|
||||||
|
|
||||||
|
.host-chip-dot {
|
||||||
|
width: 7px; height: 7px; border-radius: 50%;
|
||||||
|
}
|
||||||
|
.owner .host-chip-dot { background: #2e7d32; }
|
||||||
|
.manager .host-chip-dot { background: #1565c0; }
|
||||||
|
.monitor .host-chip-dot { background: #6a1b9a; }
|
||||||
|
|
||||||
|
.no-hosts {
|
||||||
|
color: #aaa;
|
||||||
|
font-size: 0.9em;
|
||||||
|
font-style: italic;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ---- Notification channels ---- */
|
||||||
|
.channel-row {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 10px;
|
||||||
|
padding: 6px 0;
|
||||||
|
border-bottom: 1px solid #f5f5f5;
|
||||||
|
font-size: 0.9em;
|
||||||
|
}
|
||||||
|
.channel-row:last-child { border-bottom: none; }
|
||||||
|
|
||||||
|
.channel-type {
|
||||||
|
display: inline-block;
|
||||||
|
padding: 2px 8px;
|
||||||
|
border-radius: 10px;
|
||||||
|
font-size: 0.78em;
|
||||||
|
font-weight: 600;
|
||||||
|
text-transform: uppercase;
|
||||||
|
background: #f1f3f4;
|
||||||
|
color: #555;
|
||||||
|
min-width: 70px;
|
||||||
|
text-align: center;
|
||||||
|
}
|
||||||
|
|
||||||
|
.channel-name { color: #333; }
|
||||||
|
|
||||||
|
.edit-section { margin-top: 20px; }
|
||||||
|
.edit-section h4 { font-size: .88em; font-weight: 600; color: #333; margin: 0 0 10px; text-transform: uppercase; letter-spacing: .04em; border-bottom: 1px solid #eee; padding-bottom: 6px; }
|
||||||
|
.edit-field { margin-bottom: 10px; }
|
||||||
|
.edit-field label { display: block; font-size: .82em; color: #666; margin-bottom: 3px; }
|
||||||
|
.edit-input { width: 100%; border: 1px solid #ccc; border-radius: 4px; padding: 5px 8px; font-size: .88em; box-sizing: border-box; }
|
||||||
|
.edit-input:focus { border-color: #0066cc; outline: none; }
|
||||||
|
.status-msg { font-size: .82em; margin-left: 8px; }
|
||||||
|
.save-row { display: flex; align-items: center; margin-top: 8px; }
|
||||||
|
.btn-save { background: #0066cc; color: #fff; border: none; border-radius: 4px; padding: 5px 14px; font-size: .85em; cursor: pointer; }
|
||||||
|
.btn-save:hover { background: #0055aa; }
|
||||||
|
/* ---- Channel chip picker ---- */
|
||||||
|
.ch-picker { }
|
||||||
|
.ch-picker-label { font-size: .8em; font-weight: 600; color: #888; text-transform: uppercase; letter-spacing: .04em; margin-bottom: 6px; }
|
||||||
|
.ch-chips { display: flex; flex-wrap: wrap; gap: 6px; min-height: 32px; margin-bottom: 10px; }
|
||||||
|
.ch-chip {
|
||||||
|
display: inline-flex; align-items: center; gap: 5px;
|
||||||
|
padding: 4px 10px; border-radius: 14px; font-size: .85em; font-weight: 500; cursor: pointer;
|
||||||
|
border: none; font-family: inherit;
|
||||||
|
}
|
||||||
|
.ch-chip.selected { background: #e3f2fd; color: #1565c0; }
|
||||||
|
.ch-chip.selected:hover { background: #bbdefb; }
|
||||||
|
.ch-chip.available { background: #f1f3f4; color: #555; }
|
||||||
|
.ch-chip.available:hover { background: #e8eaf6; color: #283593; }
|
||||||
|
.ch-chip-x { font-size: .9em; line-height: 1; color: inherit; opacity: .7; }
|
||||||
|
|
||||||
|
/* ---- My Channels card list ---- */
|
||||||
|
.my-ch-card {
|
||||||
|
border: 1px solid #e8eaf6; border-radius: 6px; margin-bottom: 8px; overflow: hidden;
|
||||||
|
}
|
||||||
|
.my-ch-header {
|
||||||
|
display: flex; align-items: center; gap: 8px; padding: 8px 12px;
|
||||||
|
background: #f8f9ff; border-bottom: 1px solid #e8eaf6;
|
||||||
|
}
|
||||||
|
.my-ch-name { font-weight: 600; font-size: .9em; color: #222; }
|
||||||
|
.my-ch-type { padding: 2px 7px; border-radius: 8px; font-size: .72em; font-weight: 600; background: #e8eaf6; color: #3949ab; }
|
||||||
|
.my-ch-private { padding: 2px 7px; border-radius: 8px; font-size: .72em; font-weight: 600; background: #fce4ec; color: #c62828; }
|
||||||
|
.my-ch-actions { margin-left: auto; display: flex; gap: 5px; }
|
||||||
|
.btn-sm-edit { background: #888; color: #fff; border: none; border-radius: 4px; padding: 2px 8px; font-size: .78em; cursor: pointer; }
|
||||||
|
.btn-sm-edit:hover { background: #666; }
|
||||||
|
.btn-sm-del { background: transparent; color: #c62828; border: 1px solid #e0e0e0; border-radius: 4px; padding: 2px 7px; font-size: .78em; cursor: pointer; }
|
||||||
|
.btn-sm-del:hover { background: #fce4ec; }
|
||||||
|
|
||||||
|
/* ---- Theme picker ---- */
|
||||||
|
.theme-btns { display: flex; gap: 6px; }
|
||||||
|
.theme-btn {
|
||||||
|
padding: 5px 14px;
|
||||||
|
border: 1px solid var(--border, #e0e0e0);
|
||||||
|
border-radius: 4px;
|
||||||
|
background: var(--surface-3, #f5f5f5);
|
||||||
|
color: var(--text-sec, #666);
|
||||||
|
cursor: pointer;
|
||||||
|
font-size: .88em;
|
||||||
|
font-family: inherit;
|
||||||
|
}
|
||||||
|
.theme-btn:hover { border-color: var(--link, #0066cc); color: var(--link, #0066cc); }
|
||||||
|
.theme-btn.active { background: var(--link, #0066cc); color: #fff; border-color: var(--link, #0066cc); }
|
||||||
|
|
||||||
|
/* ── Dark mode ── */
|
||||||
|
html[data-theme="dark"] h1 { color: var(--text); }
|
||||||
|
html[data-theme="dark"] .subtitle { color: var(--text-sec); }
|
||||||
|
html[data-theme="dark"] .profile-card { background: var(--surface); box-shadow: 0 1px 6px var(--shadow); }
|
||||||
|
html[data-theme="dark"] .profile-name { color: var(--text); }
|
||||||
|
html[data-theme="dark"] .profile-username { color: var(--text-sec); }
|
||||||
|
html[data-theme="dark"] .badge-admin { background: #1a3255; color: #7aa8f0; }
|
||||||
|
html[data-theme="dark"] .badge-user { background: var(--surface-3); color: var(--text-sec); }
|
||||||
|
html[data-theme="dark"] .section { background: var(--surface); box-shadow: 0 1px 6px var(--shadow); }
|
||||||
|
html[data-theme="dark"] .section h2 { color: var(--text); border-bottom-color: var(--border); }
|
||||||
|
html[data-theme="dark"] .settings-row { border-bottom-color: var(--border-4); }
|
||||||
|
html[data-theme="dark"] .settings-label { color: var(--text-sec); }
|
||||||
|
html[data-theme="dark"] .settings-value { color: var(--text); }
|
||||||
|
html[data-theme="dark"] .settings-empty { color: var(--text-dim); }
|
||||||
|
html[data-theme="dark"] .edit-section h4 { color: var(--text); border-bottom-color: var(--border); }
|
||||||
|
html[data-theme="dark"] .edit-field label { color: var(--text-sec); }
|
||||||
|
html[data-theme="dark"] .edit-input { background: var(--input-bg); border-color: var(--input-border); color: var(--text); }
|
||||||
|
html[data-theme="dark"] .channel-row { border-bottom-color: var(--border-4); }
|
||||||
|
html[data-theme="dark"] .channel-name { color: var(--text); }
|
||||||
|
html[data-theme="dark"] .ch-picker-label { color: var(--text-sec); }
|
||||||
|
html[data-theme="dark"] .ch-chip.selected { background: #1a3255; color: #60a5fa; }
|
||||||
|
html[data-theme="dark"] .ch-chip.available { background: var(--surface-3); color: var(--text-sec); }
|
||||||
|
html[data-theme="dark"] .ch-chip.available:hover { background: var(--border); color: var(--link); }
|
||||||
|
html[data-theme="dark"] .my-ch-card { border-color: var(--border); }
|
||||||
|
html[data-theme="dark"] .my-ch-header { background: var(--surface-2); border-bottom-color: var(--border); }
|
||||||
|
html[data-theme="dark"] .my-ch-name { color: var(--text); }
|
||||||
|
html[data-theme="dark"] .host-chip.owner { background: #0d2e17; color: #66bb6a; }
|
||||||
|
html[data-theme="dark"] .host-chip.manager { background: #0d1f40; color: #64b5f6; }
|
||||||
|
html[data-theme="dark"] .host-chip.monitor { background: #1e0d30; color: #ba68c8; }
|
||||||
|
html[data-theme="dark"] .no-hosts { color: var(--text-dim); }
|
||||||
|
html[data-theme="dark"] .ch-modal-box { background: var(--surface); color: var(--text); }
|
||||||
|
html[data-theme="dark"] .ch-modal-box h3 { color: var(--text); }
|
||||||
|
html[data-theme="dark"] .ch-form-row label { color: var(--text-sec); }
|
||||||
|
html[data-theme="dark"] .ch-form-divider { color: var(--text-muted); border-top-color: var(--border); }
|
||||||
|
|
||||||
|
/* ---- Channel modal (for My Channels CRUD) ---- */
|
||||||
|
.ch-modal-overlay {
|
||||||
|
position: fixed; inset: 0; background: rgba(0,0,0,.4);
|
||||||
|
display: flex; align-items: center; justify-content: center; z-index: 1001;
|
||||||
|
}
|
||||||
|
.ch-modal-box {
|
||||||
|
background: #fff; border-radius: 8px; padding: 24px;
|
||||||
|
min-width: 360px; max-width: 520px; width: 95%;
|
||||||
|
box-shadow: 0 8px 32px rgba(0,0,0,.2);
|
||||||
|
}
|
||||||
|
.ch-modal-box h3 { margin: 0 0 16px; font-size: 1em; }
|
||||||
|
.ch-form-row { margin-bottom: 12px; }
|
||||||
|
.ch-form-row label { display: block; font-size: .83em; font-weight: 600; color: #555; margin-bottom: 3px; }
|
||||||
|
.ch-form-row input[type=text], .ch-form-row input[type=password], .ch-form-row select {
|
||||||
|
width: 100%; border: 1px solid #ccc; border-radius: 4px; padding: 5px 8px;
|
||||||
|
font-size: .88em; box-sizing: border-box; font-family: inherit;
|
||||||
|
}
|
||||||
|
.ch-form-row input:focus, .ch-form-row select:focus { border-color: #0066cc; outline: none; }
|
||||||
|
.ch-form-divider { font-size: .78em; font-weight: 700; text-transform: uppercase; letter-spacing: .05em; color: #888; margin: 14px 0 8px; border-top: 1px solid #eee; padding-top: 10px; }
|
||||||
|
.ch-modal-footer { display: flex; justify-content: flex-end; gap: 8px; margin-top: 18px; }
|
||||||
|
.ch-modal-status { font-size: .83em; margin-top: 8px; }
|
||||||
|
</style>
|
||||||
|
|
||||||
|
<body>
|
||||||
|
{% include 'nav.html' %}
|
||||||
|
|
||||||
|
<div class="container">
|
||||||
|
<h1>{{ header }}</h1>
|
||||||
|
<p class="subtitle">Your account settings and host access</p>
|
||||||
|
|
||||||
|
<!-- Profile card -->
|
||||||
|
<div class="profile-card">
|
||||||
|
{% if current_user and current_user.avatar %}
|
||||||
|
<img class="avatar-large" src="{{ current_user.avatar_url }}" alt="">
|
||||||
|
{% else %}
|
||||||
|
<div class="avatar-initials-large">
|
||||||
|
{{ ((current_user.full_name if current_user else '') or (current_user.username if current_user else '?'))[:1] | upper }}
|
||||||
|
</div>
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
<div class="profile-info">
|
||||||
|
<div class="profile-name">{{ current_user.full_name if current_user and current_user.full_name else (current_user.username if current_user else '—') }}</div>
|
||||||
|
<div class="profile-username">@{{ current_user.username if current_user else '—' }}</div>
|
||||||
|
{% if current_user and current_user.admin %}
|
||||||
|
<span class="badge badge-admin">Admin</span>
|
||||||
|
{% else %}
|
||||||
|
<span class="badge badge-user">User</span>
|
||||||
|
{% endif %}
|
||||||
|
<div class="profile-logout">
|
||||||
|
<a href="/logout" class="btn-logout">Sign out</a>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Account settings -->
|
||||||
|
<div class="section">
|
||||||
|
<h2>Account</h2>
|
||||||
|
<div class="settings-row">
|
||||||
|
<span class="settings-label">Username</span>
|
||||||
|
<span class="settings-value">{{ current_user.username if current_user else '—' }}</span>
|
||||||
|
</div>
|
||||||
|
<div class="settings-row">
|
||||||
|
<span class="settings-label">Full name</span>
|
||||||
|
{% if current_user and current_user.full_name %}
|
||||||
|
<span class="settings-value">{{ current_user.full_name }}</span>
|
||||||
|
{% else %}
|
||||||
|
<span class="settings-empty">Not set</span>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
<div class="settings-row">
|
||||||
|
<span class="settings-label">Role</span>
|
||||||
|
<span class="settings-value">{{ 'Administrator' if current_user and current_user.admin else 'User' }}</span>
|
||||||
|
</div>
|
||||||
|
<div class="settings-row">
|
||||||
|
<span class="settings-label">Avatar</span>
|
||||||
|
{% if current_user and current_user.avatar %}
|
||||||
|
<span class="settings-value" style="word-break:break-all;">{{ current_user.avatar }}</span>
|
||||||
|
{% else %}
|
||||||
|
<span class="settings-empty">Not set (initials used)</span>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{% if current_user %}
|
||||||
|
<!-- ---- Editable identity ---- -->
|
||||||
|
<div class="section edit-section">
  <h4>Identity</h4>
  <div class="edit-field">
    <label for="profile-fullname">Display name</label>
    {# "or ''" keeps an unset (None) value from rendering as the literal text "None" #}
    <input id="profile-fullname" class="edit-input" type="text" value="{{ (current_user.full_name or '') | e }}" placeholder="Full name">
  </div>
  <div class="edit-field">
    <label for="profile-avatar">Avatar URL or path</label>
    {# same guard as above for an unset avatar #}
    <input id="profile-avatar" class="edit-input" type="text" value="{{ (current_user.avatar or '') | e }}" placeholder="/path/to/avatar.png or https://…">
  </div>
  <div class="save-row">
    <button class="btn-save" onclick="saveIdentity()">Save</button>
    <span id="identity-status" class="status-msg"></span>
  </div>
</div>
|
||||||
|
|
||||||
|
<!-- ---- Change password ---- -->
|
||||||
|
<div class="section edit-section">
|
||||||
|
<h4>Change password</h4>
|
||||||
|
<div class="edit-field">
|
||||||
|
<label for="profile-current-pw">Current password</label>
|
||||||
|
<input id="profile-current-pw" class="edit-input" type="password" autocomplete="current-password">
|
||||||
|
</div>
|
||||||
|
<div class="edit-field">
|
||||||
|
<label for="profile-new-pw">New password</label>
|
||||||
|
<input id="profile-new-pw" class="edit-input" type="password" autocomplete="new-password">
|
||||||
|
</div>
|
||||||
|
<div class="save-row">
|
||||||
|
<button class="btn-save" onclick="changePassword()">Change password</button>
|
||||||
|
<span id="password-status" class="status-msg"></span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
<!-- Notification channels — chip picker -->
|
||||||
|
<div class="section">
|
||||||
|
<h2>Notification Channels</h2>
|
||||||
|
{% if current_user %}
|
||||||
|
<p style="font-size:.82em;color:#888;margin:0 0 12px">Click a channel to add or remove it from your alert list.</p>
|
||||||
|
{% if all_channels %}
|
||||||
|
<div class="ch-picker">
|
||||||
|
<div class="ch-picker-label">Selected</div>
|
||||||
|
<div id="selected-chips" class="ch-chips">
|
||||||
|
{% for ch in all_channels %}
|
||||||
|
{% if ch.name in (current_user.notification_channels or []) %}
|
||||||
|
<button class="ch-chip selected" data-ch="{{ ch.name | e }}" onclick="toggleChip(this)">
|
||||||
|
{{ ch.name | e }} <span class="ch-chip-x">×</span>
|
||||||
|
</button>
|
||||||
|
{% endif %}
|
||||||
|
{% endfor %}
|
||||||
|
{% set selected_set = current_user.notification_channels or [] %}
|
||||||
|
{% set has_selected = selected_set | length > 0 %}
|
||||||
|
{% if not has_selected %}
|
||||||
|
<span style="font-size:.83em;color:#bbb;font-style:italic;align-self:center">None selected</span>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
<div class="ch-picker-label">Available</div>
|
||||||
|
<div id="available-chips" class="ch-chips">
|
||||||
|
{% for ch in all_channels %}
|
||||||
|
{% if ch.name not in (current_user.notification_channels or []) %}
|
||||||
|
<button class="ch-chip available" data-ch="{{ ch.name | e }}" onclick="toggleChip(this)">
|
||||||
|
+ {{ ch.name | e }}
|
||||||
|
</button>
|
||||||
|
{% endif %}
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
{% else %}
|
||||||
|
<p style="font-size:.83em;color:#bbb;font-style:italic">No notification channels available. You can create your own below.</p>
|
||||||
|
{% endif %}
|
||||||
|
<div class="save-row">
|
||||||
|
<button class="btn-save" onclick="saveChannels()">Save channels</button>
|
||||||
|
<span id="channels-status" class="status-msg"></span>
|
||||||
|
</div>
|
||||||
|
{% else %}
|
||||||
|
<span class="no-hosts">Log in to manage notification channels.</span>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- My Channels — create/edit/delete own channels -->
|
||||||
|
{% if current_user %}
|
||||||
|
<div class="section">
|
||||||
|
<h2>My Channels</h2>
|
||||||
|
<p style="font-size:.82em;color:#888;margin:0 0 12px">Channels you own. Public channels are available to all users; private channels are visible only to you.</p>
|
||||||
|
<div id="my-channels-list">
|
||||||
|
{% set my_channels = all_channels | selectattr('owner', 'equalto', current_user.username) | list %}
|
||||||
|
{# One card per channel owned by the current user. #}
{% for ch in my_channels %}
<div class="my-ch-card" id="mychcard-{{ ch.name | e }}">
  <div class="my-ch-header">
    <span class="my-ch-name">{{ ch.name | e }}</span>
    <span class="my-ch-type">{{ ch.type | e }}</span>
    {% if ch.private %}<span class="my-ch-private">private</span>{% endif %}
    <span class="my-ch-actions">
      {# The channel name is passed through data-ch rather than spliced into
         the onclick JavaScript: HTML-escaping does not protect a JS string
         literal inside an attribute, because the browser decodes &#39; back
         to a quote before the handler is compiled. #}
      <button class="btn-sm-edit" data-ch="{{ ch.name | e }}" onclick="openMyChModal(this.dataset.ch)">Edit</button>
      <button class="btn-sm-del" data-ch="{{ ch.name | e }}" onclick="deleteMyChannel(this.dataset.ch)">✕</button>
    </span>
  </div>
</div>
{% endfor %}
|
||||||
|
{% if not my_channels %}
|
||||||
|
<p id="my-channels-empty" style="font-size:.83em;color:#bbb;font-style:italic">No channels yet.</p>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
<div class="save-row" style="margin-top:8px">
|
||||||
|
<button class="btn-save" onclick="openMyChModal()">+ New channel</button>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- My Channels modal -->
|
||||||
|
<div id="my-ch-modal" class="ch-modal-overlay" style="display:none" onclick="if(event.target===this)closeMyChModal()">
|
||||||
|
<div class="ch-modal-box">
|
||||||
|
<h3 id="my-ch-modal-title">New Channel</h3>
|
||||||
|
<div class="ch-form-row">
|
||||||
|
<label>Channel name</label>
|
||||||
|
<input type="text" id="my-ch-name" placeholder="e.g. my_pushover" autocomplete="off">
|
||||||
|
</div>
|
||||||
|
<div class="ch-form-row">
|
||||||
|
<label>Type</label>
|
||||||
|
<select id="my-ch-type" onchange="onMyChTypeChange()">
|
||||||
|
<option value="">— select —</option>
|
||||||
|
</select>
|
||||||
|
</div>
|
||||||
|
<div id="my-ch-type-fields"></div>
|
||||||
|
<div class="ch-form-divider">Options</div>
|
||||||
|
<div class="ch-form-row">
|
||||||
|
<label>Minimum alert level</label>
|
||||||
|
<select id="my-ch-min-level">
|
||||||
|
<option value="WARNING">WARNING (and above)</option>
|
||||||
|
<option value="CRITICAL">CRITICAL only</option>
|
||||||
|
</select>
|
||||||
|
</div>
|
||||||
|
<div class="ch-form-row">
|
||||||
|
<label style="display:flex;align-items:center;gap:6px;cursor:pointer">
|
||||||
|
<input type="checkbox" id="my-ch-private"> Private — visible only to you
|
||||||
|
</label>
|
||||||
|
</div>
|
||||||
|
<div id="my-ch-modal-status" class="ch-modal-status"></div>
|
||||||
|
<div class="ch-modal-footer">
|
||||||
|
<button class="btn-save" style="background:#888" onclick="closeMyChModal()">Cancel</button>
|
||||||
|
<button class="btn-save" onclick="saveMyChannel()">Save</button>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
<!-- Appearance -->
|
||||||
|
<div class="section">
|
||||||
|
<h2>Appearance</h2>
|
||||||
|
<div class="settings-row">
|
||||||
|
<span class="settings-label">Theme</span>
|
||||||
|
<div class="theme-btns">
|
||||||
|
<button class="theme-btn" data-theme-val="auto" onclick="setTheme('auto')">Auto</button>
|
||||||
|
<button class="theme-btn" data-theme-val="light" onclick="setTheme('light')">Light</button>
|
||||||
|
<button class="theme-btn" data-theme-val="dark" onclick="setTheme('dark')">Dark</button>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Host access -->
|
||||||
|
<div class="section">
|
||||||
|
<h2>Host Access</h2>
|
||||||
|
|
||||||
|
<div class="settings-row" style="align-items: flex-start; padding-bottom: 14px;">
|
||||||
|
<span class="settings-label" style="padding-top: 2px;">Owner</span>
|
||||||
|
<div class="host-grid">
|
||||||
|
{% if owned_hosts %}
|
||||||
|
{% for h in owned_hosts %}
|
||||||
|
<span class="host-chip owner"><span class="host-chip-dot"></span>{{ h }}</span>
|
||||||
|
{% endfor %}
|
||||||
|
{% else %}
|
||||||
|
<span class="no-hosts">None</span>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="settings-row" style="align-items: flex-start; padding-bottom: 14px;">
|
||||||
|
<span class="settings-label" style="padding-top: 2px;">Manager</span>
|
||||||
|
<div class="host-grid">
|
||||||
|
{% if managed_hosts %}
|
||||||
|
{% for h in managed_hosts %}
|
||||||
|
<span class="host-chip manager"><span class="host-chip-dot"></span>{{ h }}</span>
|
||||||
|
{% endfor %}
|
||||||
|
{% else %}
|
||||||
|
<span class="no-hosts">None</span>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="settings-row" style="align-items: flex-start; padding-bottom: 4px;">
|
||||||
|
<span class="settings-label" style="padding-top: 2px;">Monitor</span>
|
||||||
|
<div class="host-grid">
|
||||||
|
{% if monitored_hosts %}
|
||||||
|
{% for h in monitored_hosts %}
|
||||||
|
<span class="host-chip monitor"><span class="host-chip-dot"></span>{{ h }}</span>
|
||||||
|
{% endfor %}
|
||||||
|
{% else %}
|
||||||
|
<span class="no-hosts">None</span>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
<script>
|
||||||
|
// ---- Theme ----
|
||||||
|
/**
 * Apply a theme preference to the document root.
 *
 * 'dark' always selects dark mode; 'auto' follows the
 * prefers-color-scheme media query; any other value selects light mode.
 * Dark mode is expressed as data-theme="dark" on <html>; light mode
 * removes the attribute.
 *
 * @param {string} pref  theme preference
 */
function applyTheme(pref) {
  var root = document.documentElement;
  var useDark;

  if (pref === 'dark') {
    useDark = true;
  } else if (pref === 'auto') {
    useDark = window.matchMedia('(prefers-color-scheme: dark)').matches;
  } else {
    useDark = false;
  }

  if (useDark) {
    root.setAttribute('data-theme', 'dark');
  } else {
    root.removeAttribute('data-theme');
  }
}
|
||||||
|
/**
 * Persist a theme preference under the 'hbd_theme' localStorage key, apply
 * it, and mark the matching .theme-btn as active.
 *
 * @param {string} pref  the preference to store and apply
 */
function setTheme(pref) {
  try {
    localStorage.setItem('hbd_theme', pref);
  } catch (storageErr) {
    // Storage failures are ignored; the theme is still applied below.
  }

  applyTheme(pref);

  var buttons = document.querySelectorAll('.theme-btn');
  for (var i = 0; i < buttons.length; i++) {
    var button = buttons[i];
    button.classList.toggle('active', button.dataset.themeVal === pref);
  }
}
|
||||||
|
// On load, highlight the theme button matching the stored preference
// ('hbd_theme' in localStorage), defaulting to 'auto' when nothing is
// stored or storage cannot be read.
(function () {
  var stored = null;
  try {
    stored = localStorage.getItem('hbd_theme');
  } catch (storageErr) {
    // Unreadable storage falls through to the default.
  }
  var pref = stored || 'auto';

  var buttons = document.querySelectorAll('.theme-btn');
  Array.prototype.forEach.call(buttons, function (button) {
    button.classList.toggle('active', button.dataset.themeVal === pref);
  });
})();
|
||||||
|
|
||||||
|
// ---- Identity ----
|
||||||
|
/**
 * Save the display name and avatar from the Identity form.
 *
 * Sends PUT /api/0/users/me with {full_name, avatar} and reports the
 * outcome in #identity-status through showStatus(): "Saved" on success,
 * the server's `error` field (or a generic message) on an HTTP error, and
 * a network-error message when the request itself fails.
 */
async function saveIdentity() {
  const full_name = document.getElementById('profile-fullname').value;
  const avatar = document.getElementById('profile-avatar').value;

  let resp;
  try {
    resp = await fetch('/api/0/users/me', {
      method: 'PUT',
      headers: {'Content-Type': 'application/json'},
      body: JSON.stringify({full_name, avatar}),
    });
  } catch (e) {
    // fetch() rejects when the request cannot be completed; surface that
    // instead of leaving an unhandled rejection and a silent Save button.
    showStatus('identity-status', 'Network error: ' + e.message, '#c62828');
    return;
  }

  if (resp.ok) {
    showStatus('identity-status', 'Saved', '#2e7d32');
    return;
  }

  // Error bodies may be non-JSON or a JSON null; both fall back to the
  // generic message.
  const err = await resp.json().catch(() => ({}));
  showStatus('identity-status', (err && err.error) || 'Error saving', '#c62828');
}
|
||||||
|
|
||||||
|
// ---- Password ----
|
||||||
|
/**
 * Change the current user's password.
 *
 * Requires both the current and the new password, then sends
 * PUT /api/0/users/me with {password: {current, new}}. On success both
 * inputs are cleared. The outcome is reported in #password-status through
 * showStatus(), including a network-error message when the request itself
 * fails.
 */
async function changePassword() {
  const currentInput = document.getElementById('profile-current-pw');
  const newInput = document.getElementById('profile-new-pw');
  const current = currentInput.value;
  const newpw = newInput.value;

  if (!current || !newpw) {
    showStatus('password-status', 'Both fields are required', '#c62828');
    return;
  }

  let resp;
  try {
    resp = await fetch('/api/0/users/me', {
      method: 'PUT',
      headers: {'Content-Type': 'application/json'},
      body: JSON.stringify({password: {current, new: newpw}}),
    });
  } catch (e) {
    // fetch() rejects when the request cannot be completed; report it
    // rather than failing silently with an unhandled rejection.
    showStatus('password-status', 'Network error: ' + e.message, '#c62828');
    return;
  }

  if (resp.ok) {
    // Do not leave the passwords sitting in the form after a success.
    currentInput.value = '';
    newInput.value = '';
    showStatus('password-status', 'Password changed', '#2e7d32');
    return;
  }

  // Error bodies may be non-JSON or a JSON null; both fall back to the
  // generic message.
  const err = await resp.json().catch(() => ({}));
  showStatus('password-status', (err && err.error) || 'Error', '#c62828');
}
|
||||||
|
|
||||||
|
// ---- Channel chip picker ----
|
||||||
|
/**
 * Move a channel chip between the "selected" and "available" containers.
 * @param {HTMLElement} btn  The chip button; its channel name is in data-ch.
 */
function toggleChip(btn) {
  const name = btn.dataset.ch;
  const wasSelected = btn.classList.contains('selected');

  // Swap the state class and the label, then re-home the chip.
  btn.classList.remove(wasSelected ? 'selected' : 'available');
  btn.classList.add(wasSelected ? 'available' : 'selected');
  btn.innerHTML = wasSelected
    ? '+ ' + escHtml(name)
    : escHtml(name) + ' <span class="ch-chip-x">×</span>';
  btn.onclick = function() { toggleChip(this); };
  const targetId = wasSelected ? 'available-chips' : 'selected-chips';
  document.getElementById(targetId).appendChild(btn);

  // Show the placeholder only while no chip is selected.
  const selectedBox = document.getElementById('selected-chips');
  const placeholder = selectedBox.querySelector('span[style]');
  if (placeholder) {
    const anySelected = selectedBox.querySelectorAll('.ch-chip.selected').length > 0;
    placeholder.style.display = anySelected ? 'none' : '';
  }
}
|
||||||
|
|
||||||
|
/**
 * Save the names of the currently selected channel chips as the user's
 * notification_channels via PUT /api/0/users/me.
 *
 * The outcome is shown in the 'channels-status' element; a rejected fetch
 * (network failure) is reported there instead of becoming an unhandled
 * promise rejection.
 */
async function saveChannels() {
  const notification_channels = [
    ...document.querySelectorAll('#selected-chips .ch-chip.selected')
  ].map(b => b.dataset.ch);
  try {
    const resp = await fetch('/api/0/users/me', {
      method: 'PUT',
      headers: {'Content-Type': 'application/json'},
      body: JSON.stringify({notification_channels}),
    });
    if (resp.ok) {
      showStatus('channels-status', 'Saved', '#2e7d32');
      return;
    }
    // The error body may not be JSON; fall back to a generic message.
    const err = await resp.json().catch(() => ({}));
    showStatus('channels-status', err.error || 'Error saving', '#c62828');
  } catch (e) {
    showStatus('channels-status', 'Network error: ' + e.message, '#c62828');
  }
}
|
||||||
|
|
||||||
|
// ---- My Channels CRUD ----
|
||||||
|
// Channel-type schemas keyed by type id; filled in by _loadMyChSchemas().
let _myChSchemas = {};
// Name of the channel being edited in the modal; null when creating a new one.
let _myChEditName = null;
|
||||||
|
|
||||||
|
/**
 * Fetch the channel-type schemas and add one <option> per type to the
 * 'my-ch-type' select (when that element exists on the page).
 *
 * A non-2xx response is treated as a failure, so an error payload is never
 * stored as schemas or rendered as options. Failures are only logged.
 */
async function _loadMyChSchemas() {
  try {
    const r = await fetch('/api/0/notification_channel_types');
    if (!r.ok) throw new Error('HTTP ' + r.status);
    _myChSchemas = await r.json();
    const sel = document.getElementById('my-ch-type');
    if (!sel) return;
    for (const [key, schema] of Object.entries(_myChSchemas)) {
      const opt = document.createElement('option');
      opt.value = key;
      opt.textContent = schema.label;
      sel.appendChild(opt);
    }
  } catch(e) { console.warn('Could not load channel schemas', e); }
}
|
||||||
|
|
||||||
|
/**
 * Rebuild the type-specific inputs in the channel modal for the type that is
 * currently selected in 'my-ch-type'. Leaves the container empty when no
 * type is selected or the type has no known schema.
 */
function onMyChTypeChange() {
  const selectedType = document.getElementById('my-ch-type').value;
  const fieldsBox = document.getElementById('my-ch-type-fields');
  fieldsBox.innerHTML = '';
  const schema = selectedType ? _myChSchemas[selectedType] : undefined;
  if (!schema) return;

  const heading = document.createElement('div');
  heading.className = 'ch-form-divider';
  heading.textContent = schema.label + ' settings';
  fieldsBox.appendChild(heading);

  for (const field of schema.fields || []) {
    const label = document.createElement('label');
    label.textContent = field.label + (field.required ? ' *' : '');

    const input = document.createElement('input');
    // Secret fields are masked; everything else is plain text.
    input.type = field.type === 'secret' ? 'password' : 'text';
    input.id = 'mychf-' + field.key;
    input.placeholder = field.required ? '(required)' : '(optional)';
    input.autocomplete = 'off';

    const row = document.createElement('div');
    row.className = 'ch-form-row';
    row.appendChild(label);
    row.appendChild(input);
    fieldsBox.appendChild(row);
  }
}
|
||||||
|
|
||||||
|
/**
 * Open the channel modal. Without a name the form is reset for a new
 * channel; with a name the form is pre-filled from the channel list and the
 * name input is disabled.
 * @param {string} [name]  Name of the channel to edit.
 */
async function openMyChModal(name) {
  const el = id => document.getElementById(id);

  // Reset the form to its defaults first.
  _myChEditName = name || null;
  el('my-ch-modal-status').textContent = '';
  el('my-ch-modal-title').textContent = name ? 'Edit Channel' : 'New Channel';
  el('my-ch-name').value = name || '';
  el('my-ch-name').disabled = !!name;
  el('my-ch-type').value = '';
  el('my-ch-type-fields').innerHTML = '';
  el('my-ch-min-level').value = 'WARNING';
  el('my-ch-private').checked = false;

  if (name) {
    try {
      const resp = await fetch('/api/0/notification_channels');
      const allChannels = await resp.json();
      const existing = allChannels.find(c => c.name === name);
      if (existing) {
        el('my-ch-type').value = existing.type;
        // Build the type-specific inputs before filling them in.
        onMyChTypeChange();
        el('my-ch-min-level').value = existing.min_level || 'WARNING';
        el('my-ch-private').checked = existing.private || false;
        for (const field of existing.fields || []) {
          const input = el('mychf-' + field.key);
          if (input) input.value = field.value || '';
        }
      }
    } catch(e) { console.warn('Failed to load channel', e); }
  }
  el('my-ch-modal').style.display = 'flex';
}
|
||||||
|
|
||||||
|
/** Hide the channel modal. */
function closeMyChModal() {
  const modal = document.getElementById('my-ch-modal');
  modal.style.display = 'none';
}
|
||||||
|
|
||||||
|
/**
 * Validate the channel modal and create (POST) or update (PUT) the channel.
 * On success the modal is closed and the page reloaded; errors are shown in
 * the modal's status element.
 */
async function saveMyChannel() {
  const name = document.getElementById('my-ch-name').value.trim();
  const type = document.getElementById('my-ch-type').value;
  const minLevel = document.getElementById('my-ch-min-level').value;
  const isPrivate = document.getElementById('my-ch-private').checked;
  const statusEl = document.getElementById('my-ch-modal-status');
  const showError = text => {
    statusEl.textContent = text;
    statusEl.style.color = '#c62828';
  };
  statusEl.textContent = '';

  if (!name) {
    showError('Name is required.');
    return;
  }
  if (!type) {
    showError('Please select a type.');
    return;
  }

  // Common settings plus whatever type-specific inputs are on the form.
  const payload = { name, type, min_level: minLevel, private: isPrivate };
  const schema = _myChSchemas[type];
  if (schema) {
    for (const field of schema.fields || []) {
      const input = document.getElementById('mychf-' + field.key);
      if (input) payload[field.key] = input.value;
    }
  }

  const editing = !!_myChEditName;
  let url = '/api/0/notification_channels';
  if (editing) url += '/' + encodeURIComponent(_myChEditName);
  const method = editing ? 'PUT' : 'POST';

  try {
    const resp = await fetch(url, {
      method,
      headers: {'Content-Type': 'application/json'},
      body: JSON.stringify(payload),
    });
    if (!resp.ok) {
      const err = await resp.json().catch(() => ({}));
      showError(err.error || 'Error saving.');
      return;
    }
    closeMyChModal();
    window.location.reload();
  } catch(e) {
    showError('Network error: ' + e.message);
  }
}
|
||||||
|
|
||||||
|
/**
 * Ask for confirmation, then DELETE the named channel and reload the page.
 * Failures are reported with alert().
 * @param {string} name  Channel name.
 */
async function deleteMyChannel(name) {
  const confirmed = confirm('Delete channel "' + name + '"?');
  if (!confirmed) return;
  try {
    const url = '/api/0/notification_channels/' + encodeURIComponent(name);
    const resp = await fetch(url, { method: 'DELETE' });
    if (resp.ok) {
      window.location.reload();
      return;
    }
    const err = await resp.json().catch(() => ({}));
    alert('Error: ' + (err.error || 'Could not delete.'));
  } catch(e) {
    alert('Network error: ' + e.message);
  }
}
|
||||||
|
|
||||||
|
// ---- Utilities ----
|
||||||
|
/**
 * Show a transient status message in the element with the given id and
 * clear it again after three seconds. Does nothing if the element is absent.
 *
 * The pending clear timer is kept on the element so that a second call
 * within three seconds restarts the countdown instead of letting the older
 * timer wipe the newer message early.
 *
 * @param {string} id     Id of the status element.
 * @param {string} msg    Text to display.
 * @param {string} color  CSS colour for the text.
 */
function showStatus(id, msg, color) {
  const el = document.getElementById(id);
  if (!el) return;
  if (el._statusTimer) clearTimeout(el._statusTimer);
  el.textContent = msg;
  el.style.color = color;
  el._statusTimer = setTimeout(() => {
    el.textContent = '';
    el._statusTimer = null;
  }, 3000);
}
|
||||||
|
|
||||||
|
/**
 * Escape a value for safe insertion into HTML markup.
 *
 * '&' is replaced first so the entities produced by the later replacements
 * are not escaped a second time.
 *
 * @param {*} s  Value to escape; it is converted with String().
 * @returns {string} The escaped text.
 */
function escHtml(s) {
  return String(s)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}
|
||||||
|
|
||||||
|
// Load the channel-type schemas once the DOM is ready.
document.addEventListener('DOMContentLoaded', _loadMyChSchemas);
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,529 @@
|
|||||||
|
"""UDP listener and datagram processing."""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import socket
|
||||||
|
import struct
|
||||||
|
import time
|
||||||
|
import zlib
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from platform import system as platform_system
|
||||||
|
|
||||||
|
from ..common.proto import stodict, oldmtodict
|
||||||
|
from ..common.utils import dur
|
||||||
|
from . import notify as notify_mod
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
eventlog = notify_mod.eventlog
|
||||||
|
|
||||||
|
# SO_TIMESTAMP: the kernel attaches a struct timeval to each received
# datagram. Python's socket module does not expose the constant on all
# platforms, so the option number is looked up per operating system here.
# It is None on any other platform (a warning is logged in that case).
platform = platform_system()
_SO_TIMESTAMP = {
    "Darwin": 1024,   # macOS value
    "Linux": 29,      # Linux value
    "FreeBSD": 32,    # FreeBSD value
}.get(platform)
if _SO_TIMESTAMP is None:
    logger.warning("SO_TIMESTAMP may not be supported on this platform (%s)", platform)
|
||||||
|
|
||||||
|
# struct timeval uses two native C longs: tv_sec and tv_usec
# NOTE(review): on platforms where tv_usec is narrower than a C long the
# second field may include padding bytes — TODO confirm per platform.
_TIMEVAL = struct.Struct('@ll')
|
||||||
|
|
||||||
|
|
||||||
|
def enable_kernel_timestamps(sock) -> bool:
    """Try to enable SO_TIMESTAMP on *sock*.

    Returns True if the option was set, False otherwise: no option number is
    known for this platform, or setsockopt() failed with OSError.
    """
    if _SO_TIMESTAMP is None:
        # No option number for this platform. Passing None to setsockopt()
        # would raise TypeError, which the OSError handler does not catch.
        return False
    try:
        sock.setsockopt(socket.SOL_SOCKET, _SO_TIMESTAMP, 1)
    except OSError:
        return False
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_kernel_ts(ancdata) -> float | None:
    """Return the kernel receive time found in recvmsg ancillary data.

    Scans *ancdata* for the first SOL_SOCKET / SO_TIMESTAMP control message
    whose payload is large enough to hold a timeval and returns it as seconds
    (float). Returns None when no such message is present.
    """
    for level, kind, payload in ancdata:
        if level != socket.SOL_SOCKET or kind != _SO_TIMESTAMP:
            continue
        if len(payload) < _TIMEVAL.size:
            continue
        seconds, micros = _TIMEVAL.unpack_from(payload)
        return seconds + micros * 1e-6
    return None
|
||||||
|
|
||||||
|
|
||||||
|
class RecvmsgTransport:
    """Minimal transport for the add_reader/recvmsg receive path.

    Offers the sendto() and close() calls that the rest of the code uses on
    asyncio's DatagramTransport, so callers need not know which receive path
    is active.
    """

    def __init__(self, loop, sock):
        self._sock = sock
        self._loop = loop

    def sendto(self, data, addr):
        """Send *data* to *addr*; failures are logged at debug level only."""
        try:
            self._sock.sendto(data, addr)
        except Exception as exc:
            logger.debug("sendto failed: %s", exc)

    def close(self):
        """Unregister the socket from the loop and close it (best effort)."""
        cleanup_steps = (
            lambda: self._loop.remove_reader(self._sock.fileno()),
            lambda: self._sock.close(),
        )
        for step in cleanup_steps:
            # Each step is attempted even if the previous one failed.
            try:
                step()
            except Exception:
                pass
|
||||||
|
|
||||||
|
|
||||||
|
def make_recvmsg_reader(sock, handler, transport):
    """Return a callback suitable for loop.add_reader().

    Each call of the returned function reads one datagram with recvmsg() so
    the ancillary data (kernel timestamp) is available, parses it and, when
    the parsed message is non-empty, calls

        handler(msg, addr, transport, kernel_ts)

    where kernel_ts is the result of _extract_kernel_ts() for the datagram
    (None when no timestamp control message was delivered).

    A BlockingIOError is ignored, other OSErrors are logged as warnings, and
    any exception raised while parsing or handling is logged with traceback.
    """
    max_datagram = 65536
    anc_capacity = 128  # ancillary buffer size passed to recvmsg()

    def _read():
        try:
            payload, ancillary, _flags, sender = sock.recvmsg(max_datagram, anc_capacity)
        except BlockingIOError:
            return
        except OSError as exc:
            logger.warning("recvmsg error: %s", exc)
            return

        try:
            kernel_ts = _extract_kernel_ts(ancillary)
            parsed = parse_message(payload)
            if parsed:
                handler(parsed, sender, transport, kernel_ts)
        except Exception:
            logger.exception("Error processing datagram from %s", sender)

    return _read
|
||||||
|
|
||||||
|
|
||||||
|
class EchoServerProtocol(asyncio.DatagramProtocol):
    """Datagram protocol that parses each datagram and hands it to a handler."""

    def __init__(self, config=None, handler=None):
        super().__init__()
        self.handler = handler
        self.config = config if config else {}

    def connection_made(self, transport):
        """Keep the transport so handlers can send replies through it."""
        self.transport = transport
        logger.info("UDP Server listening...")

    def datagram_received(self, data, addr):
        """Parse *data* and call handler(msg, addr, transport) if one is set.

        Exceptions from parsing or from the handler are logged, not raised.
        """
        logger.debug("Received from %s", addr)
        try:
            parsed = parse_message(data)
            if not self.handler:
                return
            # The transport is passed along so the application-provided
            # handler can send replies (ACKs/commands).
            self.handler(parsed, addr, self.transport)
        except Exception:
            logger.exception("Error while processing datagram from %s", addr)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_message(data: bytes):
    """Decode a raw datagram into a message dict.

    The current protocol decoder (stodict) is tried first; when it yields an
    empty/falsy result the old-format decoder (oldmtodict) is used instead,
    for compatibility with older clients.
    """
    return stodict(data) or oldmtodict(data)
|
||||||
|
|
||||||
|
|
||||||
|
def dicttos(ID, d):
    """Encode dict *d* as a packet: b"!<ID>:" followed by a zlib payload.

    The payload is the ';'-joined list of "key=value" pairs (floats rendered
    with five decimals), compressed with zlib level 6.
    """
    def _render(key):
        value = d[key]
        template = "%s=%0.5f" if isinstance(value, float) else "%s=%s"
        return template % (key, value)

    payload = ";".join(_render(key) for key in d)
    header = ("!" + ID + ":").encode()
    return header + zlib.compress(payload.encode(), 6)
|
||||||
|
|
||||||
|
|
||||||
|
# Seven days, expressed in seconds.
DROPOVERDUE = 7 * 24 * 3600  # seconds before an overdue host becomes UNKNOWN
|
||||||
|
|
||||||
|
|
||||||
|
def _set_connectivity_alert(host, afam, level_name):
    """Set or clear the connectivity alert of *host* for one address family.

    The alert lives in host.alert_states under "connectivity.<afam>".
    *level_name* is looked up on AlertLevel; a name that is not an AlertLevel
    attribute is treated as OK. OK removes the entry so that recovered hosts
    don't clutter the Alerts Dashboard; any other level creates the entry if
    needed and updates it.
    """
    from .threshold import AlertState, AlertLevel

    key = f"connectivity.{afam}"
    resolved = getattr(AlertLevel, level_name, AlertLevel.OK)
    states = host.alert_states

    if resolved == AlertLevel.OK:
        states.pop(key, None)
        return

    if key not in states:
        states[key] = AlertState(key)
    states[key].update(resolved, level_name)
|
||||||
|
|
||||||
|
|
||||||
|
def _make_timer_callbacks(uname, host, ctx):
    """Build the (on_overdue, on_unknown) coroutine callbacks for one host.

    *uname*, *host* and the values read from *ctx* are captured when this
    function is called, so the returned callbacks are safe to create in loops.
    """
    notify_clients = ctx.get("msg_to_websockets")
    checker = ctx.get("threshold_checker")
    settings = ctx.get("config", {})

    def _broadcast_host_state():
        if notify_clients:
            notify_clients("host", host.stateinfo())

    async def on_unknown(connection):
        states = connection.__class__
        connection.newstate(states.UNKNOWN, connection.lastbeat)
        # The connectivity alert is intentionally left active here.
        _broadcast_host_state()

    async def on_overdue(connection):
        states = connection.__class__
        # Only a connection that is still UP can become overdue.
        if connection.getstate() != states.UP:
            return

        overdue_since = time.time()
        connection.newstate(states.OVERDUE, overdue_since, settings.get("grace", 2))

        text = f"{connection.afam} overdue"
        eventlog(uname, "CRITICAL", text)
        if host.watched:
            alert = notify_mod.Notification(
                title=f"[CRITICAL] {uname}", body=text, level="CRITICAL"
            )
            asyncio.create_task(notify_mod.send_notification(uname, alert))

        # Record it in alert_states so the Alerts Dashboard shows it.
        _set_connectivity_alert(host, connection.afam, "CRITICAL")
        if checker:
            checker.check_value(
                host_name=uname,
                metric_path="rtt",
                value=float("inf"),
                alert_states=host.alert_states,
            )
        _broadcast_host_state()

        # Re-arm the timer: after DROPOVERDUE seconds the connection
        # is moved to UNKNOWN.
        connection.reset_overdue_timer(DROPOVERDUE, on_unknown)

    return on_overdue, on_unknown
|
||||||
|
|
||||||
|
|
||||||
|
def restore_connection_timers(hbdclass, ctx):
    """Re-arm the timers of all loaded connections after a pickle restore.

    DOWN connections are skipped. For UP connections (of hosts with a
    positive interval) the overdue timer is set to
    max(interval + grace, 2 * (interval + grace) - elapsed), where elapsed is
    the time since lastbeat, so every host gets at least one full
    interval + grace to check in after startup. For OVERDUE connections the
    UNKNOWN drop timer is restored, or the connection is marked UNKNOWN right
    away when the drop window has (almost) passed.
    """
    started = time.time()
    grace = ctx.get("config", {}).get("grace", 2)

    count = 0
    for uname, host in list(hbdclass.Host.hosts.items()):
        beat_interval = host.interval
        for afam, conn in list(host.connections.items()):
            current = conn.getstate()
            if current == hbdclass.Connection.DOWN:
                continue

            on_overdue, on_unknown = _make_timer_callbacks(uname, host, ctx)

            if current == hbdclass.Connection.UP and beat_interval > 0:
                silent_for = started - conn.lastbeat
                # One extra (interval + grace) on startup, so hosts that were
                # silent while hbd was down are not flagged immediately.
                window = beat_interval + grace
                delay = max(window, 2 * window - silent_for)
                conn.reset_overdue_timer(delay, on_overdue)
                logger.debug(
                    "Restored UP timer %s/%s: %.0fs remaining (elapsed %.0fs, startup grace %.0fs)",
                    uname, afam, delay, silent_for, window,
                )
                count += 1
                continue

            if current != hbdclass.Connection.OVERDUE:
                continue

            overdue_for = started - conn.statetime
            left = DROPOVERDUE - overdue_for
            if left <= 1:
                # Already past the drop window — mark UNKNOWN immediately
                conn.newstate(hbdclass.Connection.UNKNOWN, conn.lastbeat)
                logger.info(
                    "Marking %s/%s UNKNOWN (overdue %.1f days)",
                    uname, afam, overdue_for / 86400,
                )
            else:
                conn.reset_overdue_timer(left, on_unknown)
                logger.debug(
                    "Restored OVERDUE timer %s/%s: %.0fs remaining",
                    uname, afam, left,
                )
                count += 1

    logger.info("Restored timers for %d connection(s)", count)
|
||||||
|
|
||||||
|
|
||||||
|
def handle_datagram(msg: dict, addr, transport, ctx: dict):
    """Handle a parsed datagram message.

    Looks up (or creates) the host the message belongs to, answers heartbeats
    with an ACK, stores plugin data and runs threshold checks on it, updates
    the connection state, re-arms the overdue timer, sends queued commands
    and finally pushes the host state to websocket clients. An empty message
    is ignored.

    ctx is a dictionary with runtime dependencies; the keys read here are:
      - recv_ts: receive timestamp; time.time() is used when absent/falsy
      - config: dict of configuration
      - hbdclass: module providing Host/Connection classes
      - msg_to_websockets: callable(typ, data)
      - msg_journal: MessageJournal instance for logging all messages
      - threshold_checker: optional checker for plugin data and RTT
      - DEBUG
    """
    if not msg:
        return
    now = ctx.get("recv_ts") or time.time()

    # Log message to journal
    msg_journal = ctx.get("msg_journal")
    if msg_journal:
        # Create async task to log message (non-blocking).
        # asyncio must NOT be imported inside this function: a local import
        # makes the name function-local, and the asyncio.create_task() calls
        # below would then raise UnboundLocalError whenever no journal is
        # configured and the import statement is skipped.
        try:
            loop = asyncio.get_event_loop()
            loop.create_task(msg_journal.log_message(msg, addr, now))
        except Exception as e:
            logger.debug(f"Failed to log message to journal: {e}")

    cfg = ctx.get("config", {})
    hbdcls = ctx.get("hbdclass")
    msg_to_websockets = ctx.get("msg_to_websockets")
    DEBUG = ctx.get("DEBUG", 0)

    # normalize addr (ip, port)
    ip = addr[0] if isinstance(addr, (list, tuple)) else addr
    name = msg.get("name", "unknown")
    from ..common.utils import shortname
    from . import config as config_mod

    uname = shortname(name)

    if uname not in hbdcls.Host.hosts:
        host = hbdcls.Host(uname)
        # Use new config function to check dyndns
        dyndnshosts = config_mod.get_dyndnshosts(cfg)
        host.dyn = uname in dyndnshosts
        watchhosts = config_mod.get_watchhosts(cfg)
        host.watched = uname in watchhosts
        # Apply user-access settings from config
        access = config_mod.get_host_access(cfg, uname)
        host.apply_access(access["owner"], access["managers"], access["monitors"])
        logger.info("New host signed on: %s (dyn=%s, access=%s)", uname, host.dyn, access)
        newh = True
    else:
        host = hbdcls.Host.hosts[uname]
        newh = False

    cid = msg.get("id", 0)
    try:
        rtt = float(msg.get("rtt"))
    except (TypeError, ValueError):
        # Missing (None) or non-numeric rtt from the client: treat as unknown.
        rtt = None

    if msg.get("ID") == "HTB":
        host.doesack = msg.get("acks", -1)
        # send ACK back; ask client to resend plugin info when we have none yet
        rmsg = {"time": time.time()}
        if not host.plugin_data:
            rmsg["request_update"] = 1
        opkt = dicttos("ACK", rmsg)
        try:
            transport.sendto(opkt, addr)
        except Exception as e:
            if DEBUG > 0:
                print(("cannot send ack: %s" % e))

    elif msg.get("ID") == "PLG":
        # Handle plugin data message
        plugin_name = msg.get("plugin")
        if plugin_name:
            # Extract plugin fields, dropping protocol metadata fields
            plugin_data = {k: v for k, v in msg.items()
                           if k not in ("ID", "plugin", "id", "name")}
            # Store plugin data with timestamp
            host.add_plugin_data(plugin_name, plugin_data, timestamp=now)

            # Owner reported by os_info, falling back to the configured owner
            # and then the default owner.
            # NOTE(review): a client-reported owner takes precedence over the
            # server-side configuration here — confirm this is intended.
            if plugin_name == "os_info":
                config_owner = config_mod.get_host_access(cfg, uname).get("owner")
                default_owner = config_mod.get_default_owner(cfg)
                inferred_owner = plugin_data.get("owner", config_owner or default_owner)
                host.owner = inferred_owner
                logger.info(f"owner for {uname} is {host.owner}")
            if DEBUG > 1:
                print(f"Stored plugin data for {uname}: {plugin_name}")

            # Check thresholds if checker is available
            threshold_checker = ctx.get("threshold_checker")
            if threshold_checker:
                try:
                    state_changes = threshold_checker.check_plugin_data(
                        host_name=uname,
                        plugin_name=plugin_name,
                        data=plugin_data,
                        alert_states=host.alert_states,
                    )
                    if DEBUG > 1 and state_changes:
                        print(f"Threshold state changes for {uname}: {state_changes}")
                except Exception as e:
                    logger.error(f"Error checking thresholds for {uname}.{plugin_name}: {e}")

            # Notify websockets of plugin update
            if msg_to_websockets:
                try:
                    msg_to_websockets("plugin", {
                        "host": uname,
                        "plugin": plugin_name,
                        "data": plugin_data,
                        "timestamp": now
                    })
                except Exception:
                    pass

    try:
        conn, res = host.conndata(cid, ip, rtt, now)
    except Exception as e:
        if DEBUG > 0:
            print("conndata failed: %s" % e)
        return

    if res:
        eventlog(uname, "WARNING", res)
        if host.watched:
            asyncio.create_task(notify_mod.send_notification(
                uname,
                notify_mod.Notification(title=f"[WARNING] {uname}", body=res, level="WARNING"),
            ))

    try:
        interval = int(msg.get("interval", 0) or 0)
    except (TypeError, ValueError):
        # Garbage interval from the client: behave as if none was sent.
        interval = 0
    shutdown = msg.get("shutdown", 0)
    service = msg.get("service", "unknown")
    message = msg.get("msg", None)
    boot = msg.get("boot", 0)

    if boot:
        eventlog(uname, "INFO", "booted")
        if host.watched:
            asyncio.create_task(notify_mod.send_notification(
                uname,
                notify_mod.Notification(title=f"[INFO] {uname}", body=f"{host.name} booted", level="INFO"),
            ))
    if message:
        eventlog(uname, "INFO", "msg: %s" % message, service=service)

    if conn.getstate() != hbdcls.Connection.UP:
        lasts = conn.state
        d = conn.newstate(hbdcls.Connection.UP, now)
        # Clear connectivity alert now that the host is back up
        _set_connectivity_alert(host, conn.afam, "OK")
        # Don't log/notify RECOVER for a brand-new host seen for the first time —
        # it was never down, it just hasn't been seen before.
        if not newh:
            if d == 0 or lasts == "unknown":
                m = "%s is up" % (conn.afam)
            elif d < 4:
                # Transient blip (likely client restart) — skip log and notification
                m = None
            else:
                m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d))
            if m:
                eventlog(uname, "RECOVER", m)
                if host.watched:
                    asyncio.create_task(notify_mod.send_notification(
                        uname,
                        notify_mod.Notification(title=f"[RECOVER] {uname}", body=m, level="RECOVER"),
                    ))

    if boot or newh:
        host.upcount = host.doesack
    else:
        host.upcount += 1

    if shutdown:
        m = "%s shutdown" % conn.afam
        eventlog(uname, "INFO", m)
        if host.watched:
            asyncio.create_task(notify_mod.send_notification(
                uname,
                notify_mod.Notification(title=f"[INFO] {uname}", body=m, level="INFO"),
            ))
        conn.newstate(hbdcls.Connection.DOWN, now)
        _set_connectivity_alert(host, conn.afam, "CRITICAL")

    if interval > 0:
        host.interval = interval

    # Timer-based reachability monitoring
    # Reset overdue timer on every heartbeat
    if interval > 0 and conn.getstate() != hbdcls.Connection.DOWN:
        grace = cfg.get("grace", 2)
        timeout_seconds = interval + grace
        on_overdue, _ = _make_timer_callbacks(uname, host, ctx)
        conn.reset_overdue_timer(timeout_seconds, on_overdue)

    # Check RTT thresholds using the threshold checker
    threshold_checker = ctx.get("threshold_checker")
    if threshold_checker and rtt and rtt > 0:
        # Check against configured thresholds (handles alerts, notifications, etc.)
        threshold_checker.check_value(
            host_name=uname,
            metric_path="rtt",
            value=rtt,
            alert_states=host.alert_states
        )

    # send any commands we have queued
    while host.cmds:
        op, rmsg = host.cmds[0]
        # Always dequeue: an entry with an unrecognised op used to stay at the
        # head of the queue and made this loop spin forever.
        del host.cmds[0]
        if op == "CMD":
            eventlog(uname, "INFO", "command sent")
        elif op == "UPD":
            eventlog(uname, "INFO", "update initiated")
        opkt = dicttos(op, rmsg)
        try:
            transport.sendto(opkt, addr)
        except Exception as e:
            if DEBUG > 0:
                print(("cannot send cmd/update: %s" % e))

    if msg_to_websockets:
        try:
            msg_to_websockets("host", host.stateinfo())
        except Exception as e:
            if DEBUG > 0:
                print(("cannot send websocket message: %s" % e))
||||||
@@ -0,0 +1,271 @@
|
|||||||
|
"""User management: loading, authentication, and session tracking.
|
||||||
|
|
||||||
|
Users are defined in the config file under the ``users`` key:
|
||||||
|
|
||||||
|
users:
|
||||||
|
alice:
|
||||||
|
full_name: Alice Smith
|
||||||
|
avatar: /path/to/avatar.png # file path, URL, or base64 data URI
|
||||||
|
password: pbkdf2:sha256:... # generated with: hbd passwd
|
||||||
|
admin: true # optional server-level admin
|
||||||
|
notification_channels: [pushover_standard]
|
||||||
|
|
||||||
|
Roles are assigned per-host:
|
||||||
|
|
||||||
|
hosts:
|
||||||
|
webserver01:
|
||||||
|
owner: alice
|
||||||
|
managers: [bob]
|
||||||
|
monitors: [carol]
|
||||||
|
|
||||||
|
If no users are defined the server runs in unauthenticated mode (backwards
|
||||||
|
compatible). When users are defined every API call must carry a valid session
|
||||||
|
token in an ``Authorization: Bearer <token>`` or ``X-Auth-Token`` header,
|
||||||
|
obtained via ``POST /api/0/auth/login``.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import hmac
|
||||||
|
import logging
|
||||||
|
import secrets
|
||||||
|
import time
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Session lifetime in seconds (24 hours = 24 * 60 * 60).
SESSION_TTL: int = 86400

# Global session store: token -> {"username": str, "expires": float, "created": float}
_sessions: dict = {}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# User class
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class User:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
username: str,
|
||||||
|
full_name: str = "",
|
||||||
|
avatar: str = "",
|
||||||
|
password_hash: str = "",
|
||||||
|
admin: bool = False,
|
||||||
|
notification_channels: list | None = None,
|
||||||
|
):
|
||||||
|
self.username = username
|
||||||
|
self.full_name = full_name
|
||||||
|
self.avatar = avatar
|
||||||
|
self.password_hash = password_hash
|
||||||
|
self.admin = admin
|
||||||
|
self.notification_channels: list = notification_channels or []
|
||||||
|
|
||||||
|
def check_password(self, password: str) -> bool:
|
||||||
|
if not self.password_hash:
|
||||||
|
return False
|
||||||
|
return _verify_password(password, self.password_hash)
|
||||||
|
|
||||||
|
def avatar_is_local(self) -> bool:
|
||||||
|
"""Return True when the avatar is a local filesystem path (starts with '/')."""
|
||||||
|
return bool(self.avatar and self.avatar.startswith("/"))
|
||||||
|
|
||||||
|
def avatar_url(self) -> str:
|
||||||
|
"""Return the URL to use as an <img src>.
|
||||||
|
|
||||||
|
Local file paths are served via the /api/0/users/{username}/avatar
|
||||||
|
endpoint. External URLs and data URIs are returned as-is.
|
||||||
|
"""
|
||||||
|
if self.avatar_is_local():
|
||||||
|
return f"/api/0/users/{self.username}/avatar"
|
||||||
|
return self.avatar
|
||||||
|
|
||||||
|
def to_dict(self) -> dict:
|
||||||
|
return {
|
||||||
|
"username": self.username,
|
||||||
|
"full_name": self.full_name,
|
||||||
|
"avatar": self.avatar,
|
||||||
|
"avatar_url": self.avatar_url(),
|
||||||
|
"admin": self.admin,
|
||||||
|
"notification_channels": self.notification_channels,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Password hashing (PBKDF2-HMAC-SHA256, stdlib only)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def hash_password(password: str, iterations: int = 260_000) -> str:
    """Return a storable hash for *password*.

    Format: ``pbkdf2:sha256:<iterations>:<salt>:<hex-digest>``

    Use this to generate the ``password`` value in the config file::

        python -c "from hbd.server.users import hash_password; print(hash_password('secret'))"

    Or via the CLI::

        hbd passwd

    Args:
        password: Clear-text password; encoded as UTF-8 before hashing.
        iterations: PBKDF2 round count. It is written into the returned
            string, so hashes created with different counts stay verifiable.

    Returns:
        The colon-separated hash string described above.

    Raises:
        ValueError: Propagated from ``hashlib.pbkdf2_hmac`` when *iterations*
            is smaller than 1.
    """
    # A fresh random salt per call: 16 random bytes rendered as 32 hex chars.
    salt = secrets.token_hex(16)
    digest = hashlib.pbkdf2_hmac(
        "sha256", password.encode("utf-8"), salt.encode("utf-8"), iterations
    )
    return f"pbkdf2:sha256:{iterations}:{salt}:{digest.hex()}"
|
||||||
|
|
||||||
|
|
||||||
|
def _verify_password(password: str, stored_hash: str) -> bool:
|
||||||
|
"""Return True if *password* matches *stored_hash*."""
|
||||||
|
try:
|
||||||
|
parts = stored_hash.split(":")
|
||||||
|
if len(parts) != 5 or parts[0] != "pbkdf2" or parts[1] != "sha256":
|
||||||
|
return False
|
||||||
|
_, _, iterations_str, salt, expected_hex = parts
|
||||||
|
iterations = int(iterations_str)
|
||||||
|
dk = hashlib.pbkdf2_hmac(
|
||||||
|
"sha256", password.encode("utf-8"), salt.encode("utf-8"), iterations
|
||||||
|
)
|
||||||
|
return hmac.compare_digest(dk.hex(), expected_hex)
|
||||||
|
except Exception:
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Global user registry
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# username -> User
|
||||||
|
users: dict = {}
|
||||||
|
|
||||||
|
|
||||||
|
def load_users(config: dict) -> dict:
    """Populate the global user registry from *config*.

    Called once at startup and again on SIGHUP config reload.

    Users already in the registry that have an empty ``password_hash`` (this
    is how OAuth-provisioned users are stored) and that are not defined in the
    new config are carried over, so a reload does not drop them.

    Args:
        config: Parsed configuration. Its ``users`` entry maps a username to a
            mapping of attributes (``full_name``, ``avatar``, ``password``,
            ``admin``, ``notification_channels``).

    Returns:
        The new ``users`` dict (also bound to the module-level ``users``).
    """
    global users
    previous = dict(users)  # snapshot before rebuild

    users_cfg = config.get("users") or {}
    if not isinstance(users_cfg, dict):
        # A malformed section is treated as "no users configured", but it is
        # reported instead of being ignored silently.
        logger.warning(
            "Ignoring 'users' config: expected a mapping, got %s",
            type(users_cfg).__name__,
        )
        users_cfg = {}

    loaded: dict = {}
    for username, attrs in users_cfg.items():
        if not isinstance(attrs, dict):
            logger.warning("Skipping user %r: expected a mapping", username)
            continue
        channels = attrs.get("notification_channels", [])
        if isinstance(channels, str):
            # A scalar "notification_channels: name" means a single channel.
            channels = [channels]
        loaded[username] = User(
            username=username,
            full_name=attrs.get("full_name", ""),
            avatar=attrs.get("avatar", ""),
            password_hash=attrs.get("password", ""),
            admin=bool(attrs.get("admin", False)),
            notification_channels=channels,
        )
    configured = len(loaded)

    # Preserve OAuth-provisioned users (password_hash == "") that aren't in config.
    preserved = 0
    for username, existing_user in previous.items():
        if not existing_user.password_hash and username not in loaded:
            loaded[username] = existing_user
            preserved += 1

    # Publish the registry only once it is complete.
    users = loaded
    logger.info(
        "Loaded %d user(s) from config, preserved %d OAuth user(s)",
        configured,
        preserved,
    )
    return users
|
||||||
|
|
||||||
|
|
||||||
|
def users_enabled() -> bool:
    """Tell whether the registry holds at least one user (auth-required mode)."""
    return len(users) > 0
|
||||||
|
|
||||||
|
|
||||||
|
def get_user(username: str) -> "User | None":
    """Look up *username* in the registry; return None when it is unknown."""
    try:
        return users[username]
    except KeyError:
        return None
|
||||||
|
|
||||||
|
|
||||||
|
# Well-formed hash that is only used to burn PBKDF2 time for logins that
# cannot succeed; the result of verifying against it is always discarded.
_DUMMY_PASSWORD_HASH = "pbkdf2:sha256:260000:" + "0" * 32 + ":" + "0" * 64


def authenticate(username: str, password: str) -> "User | None":
    """Return the User if credentials are valid, else None.

    Unknown usernames and users without a password hash cannot log in with a
    password. For those, a verification against a dummy hash is still run so
    that the response time does not reveal whether the username exists.
    """
    user = users.get(username)
    if user is None or not user.password_hash:
        # Same PBKDF2 cost as a real check; outcome intentionally ignored.
        _verify_password(password, _DUMMY_PASSWORD_HASH)
        return None
    return user if user.check_password(password) else None
|
||||||
|
|
||||||
|
|
||||||
|
def provision_oauth_user(username: str, full_name: str, avatar: str) -> "User":
    """Create or refresh the registry entry for an OAuth2-sourced user.

    An unknown *username* gets a new ``User`` without a password hash, so a
    password login is not possible for it. For a username that already exists,
    only ``full_name`` and ``avatar`` are overwritten, and only when the given
    value is non-empty; every other attribute is left alone.

    Returns the new or updated ``User``.
    """
    existing = users.get(username)
    if existing is not None:
        # Empty values from the provider never erase what is already stored.
        if full_name:
            existing.full_name = full_name
        if avatar:
            existing.avatar = avatar
        return existing

    created = User(username=username, full_name=full_name, avatar=avatar)
    users[username] = created
    logger.info("Provisioned OAuth user %r", username)
    return created
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Session management
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def create_session(username: str) -> str:
    """Create a new session for *username* and return the opaque token.

    Expired sessions are purged first. The token is 32 random bytes rendered
    as hex. ``created`` and ``expires`` come from a single clock reading, so
    ``expires - created`` is exactly ``SESSION_TTL``.
    """
    _purge_expired_sessions()
    token = secrets.token_hex(32)
    now = time.time()
    _sessions[token] = {
        "username": username,
        "expires": now + SESSION_TTL,
        "created": now,
    }
    return token
|
||||||
|
|
||||||
|
|
||||||
|
def get_session_user(token: str) -> "User | None":
    """Resolve *token* to its User.

    Returns None for an empty token, an unknown token, or an expired session.
    An expired session is removed from the store as a side effect. The result
    is whatever ``get_user`` returns for the session's username.
    """
    session = _sessions.get(token) if token else None
    if not session:
        return None
    if time.time() > session["expires"]:
        # Lazily evict the stale entry.
        del _sessions[token]
        return None
    return get_user(session["username"])
|
||||||
|
|
||||||
|
|
||||||
|
def delete_session(token: str) -> None:
    """Drop the session stored under *token* (logout); unknown tokens are ignored."""
    if token in _sessions:
        del _sessions[token]
|
||||||
|
|
||||||
|
|
||||||
|
def _purge_expired_sessions() -> None:
    """Remove every session whose ``expires`` timestamp lies in the past."""
    cutoff = time.time()
    # Collect first, delete afterwards, so the store is not resized mid-scan.
    stale = []
    for token, session in list(_sessions.items()):
        if session["expires"] < cutoff:
            stale.append(token)
    for token in stale:
        del _sessions[token]
|
||||||
|
|
||||||
|
|
||||||
|
def save_sessions() -> dict:
    """Purge expired sessions, then return a shallow copy of the store for pickling."""
    _purge_expired_sessions()
    return _sessions.copy()
|
||||||
|
|
||||||
|
|
||||||
|
def load_sessions(snapshot: dict) -> None:
    """Restore sessions from a pickled snapshot, dropping any that have expired.

    The snapshot comes from disk, so it is validated instead of trusted. A
    missing or non-dict snapshot restores nothing. Entries that are not dicts,
    lack a ``username``, or carry a non-numeric ``expires`` are skipped.
    """
    global _sessions
    now = time.time()
    restored: dict = {}
    if isinstance(snapshot, dict):
        for token, session in snapshot.items():
            if not isinstance(session, dict) or "username" not in session:
                continue
            expires = session.get("expires", 0)
            # bool is an int subclass but is never a valid timestamp here.
            if isinstance(expires, bool) or not isinstance(expires, (int, float)):
                continue
            if expires > now:
                restored[token] = session
    _sessions = restored
    logger.debug("Restored %d session(s) from pickle", len(_sessions))
|
||||||
@@ -0,0 +1,160 @@
|
|||||||
|
"""WebSocket handler and broadcast helpers for hbd.
|
||||||
|
|
||||||
|
WebSocket connections are served through the regular HTTP port via the
|
||||||
|
/ws route registered in http.py (aiohttp WebSocketResponse upgrade).
|
||||||
|
The separate standalone WebSocket server on ws_port is no longer used.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from typing import Callable, Iterable, Optional
|
||||||
|
from . import data
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Map of WebSocket → User object (or None when auth is disabled)
|
||||||
|
_connections: dict = {}
|
||||||
|
_loop: Optional[asyncio.AbstractEventLoop] = None
|
||||||
|
_get_hosts: Optional[Callable[[], Iterable]] = None
|
||||||
|
_verbose: bool = False
|
||||||
|
|
||||||
|
|
||||||
|
def setup(
|
||||||
|
loop: asyncio.AbstractEventLoop,
|
||||||
|
get_hosts: Optional[Callable[[], Iterable]] = None,
|
||||||
|
verbose: bool = False,
|
||||||
|
):
|
||||||
|
"""Register the running loop and initial-state callback.
|
||||||
|
|
||||||
|
Call this once from _run_async before starting the HTTP server.
|
||||||
|
"""
|
||||||
|
global _loop, _get_hosts, _verbose
|
||||||
|
_loop = loop
|
||||||
|
_get_hosts = get_hosts
|
||||||
|
_verbose = verbose
|
||||||
|
|
||||||
|
|
||||||
|
def _user_can_see_host(user, host_name: str) -> bool:
    """Return True if *user* may see updates for *host_name* (manager or higher).

    Rules, in order:

    * No users are configured (auth disabled): everything is visible.
    * Auth is enabled but *user* is None: nothing is visible. Without this,
      an unauthenticated connection would see every host.
    * Admins see every host.
    * Otherwise the host must exist and ``host.is_manager(username)`` decides.
    """
    from . import hbdclass, users as users_mod

    if not users_mod.users_enabled():
        return True
    if user is None:
        # Fail closed: an anonymous connection has no rights once auth is on.
        return False
    if user.admin:
        return True
    host = hbdclass.Host.hosts.get(host_name)
    if host is None:
        return False
    return host.is_manager(user.username)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_token(request) -> str:
|
||||||
|
"""Extract session token from request (mirrors logic in http.py)."""
|
||||||
|
auth = request.headers.get("Authorization", "")
|
||||||
|
if auth.startswith("Bearer "):
|
||||||
|
return auth[7:].strip()
|
||||||
|
token = request.headers.get("X-Auth-Token", "")
|
||||||
|
if token:
|
||||||
|
return token
|
||||||
|
return request.cookies.get("hbd_session", "")
|
||||||
|
|
||||||
|
|
||||||
|
async def handler(request):
    """aiohttp WebSocket upgrade handler — register as GET /ws.

    Flow:

    1. Resolve the session token to a user. When users are configured and the
       request carries no valid session, answer with HTTP 401 *before* the
       upgrade; such a client never receives host or message data.
    2. Upgrade the connection and register it in ``_connections``.
    3. Send the current state of every host the user may see, then the
       recent messages (newest first, tagged ``history=True``).
    4. Read frames until the client disconnects; incoming frames are ignored.

    Returns the ``WebSocketResponse``, or a 401 ``Response`` when rejected.
    """
    from aiohttp import WSMsgType, web
    from . import users as users_mod

    token = _get_token(request)
    user = users_mod.get_session_user(token) if token else None
    remote = request.remote

    # user=None is only legitimate while authentication is disabled.
    if user is None and users_mod.users_enabled():
        logger.warning("Rejected unauthenticated WebSocket from %s", remote)
        return web.Response(status=401, text="authentication required")

    ws = web.WebSocketResponse()
    await ws.prepare(request)

    _connections[ws] = user
    logger.info("WebSocket connected from %s", remote)

    try:
        # Send current host state, filtered to hosts this user may see
        if _get_hosts:
            try:
                for h in list(_get_hosts()):
                    host_name = h.get("raw_name") or h.get("name", "")
                    if _user_can_see_host(user, host_name):
                        await ws.send_str(json.dumps({"type": "host", "data": h}))
            except Exception as e:
                logger.error("Error sending initial hosts: %s", e)

        # Send recent messages newest-first so the client can append them in
        # display order without reordering on arrival (tagged history=True so
        # the client knows to append rather than prepend).
        if data.msgs:
            try:
                # Snapshot first: the log may grow while we await sends.
                for m in tuple(reversed(data.msgs)):
                    host_name = m.get("host") if isinstance(m, dict) else None
                    if not host_name or _user_can_see_host(user, host_name):
                        await ws.send_str(
                            json.dumps({"type": "message", "data": m, "history": True})
                        )
            except Exception as e:
                logger.error("Error sending initial messages: %s", e)

        # Keep connection open, ignore incoming frames
        async for msg in ws:
            if msg.type == WSMsgType.TEXT:
                if _verbose:
                    logger.debug("ws recv from %s: %s", remote, msg.data)
            elif msg.type in (WSMsgType.ERROR, WSMsgType.CLOSE):
                break

    except Exception as e:
        logger.exception("WebSocket handler error from %s: %s", remote, e)
    finally:
        _connections.pop(ws, None)
        logger.info("WebSocket disconnected from %s", remote)

    return ws
|
||||||
|
|
||||||
|
|
||||||
|
def broadcast(typ: str, payload) -> bool:
    """Thread-safe broadcast to all connected WebSocket clients.

    For host and plugin updates, only sends to clients whose user has
    manager-or-higher access to that host. Messages that name a host are
    filtered the same way. Other message types are broadcast to all clients.

    Can be called from any thread; schedules sends on the event loop.

    Returns:
        True when the send was scheduled. False when no loop is registered,
        the loop is closed, or a host/plugin payload is not a dict and so
        cannot be access-filtered.
    """
    if not _loop or _loop.is_closed():
        return False

    # Determine the host name for access-filtered message types
    host_name: Optional[str] = None
    if typ in ("host", "plugin"):
        if not isinstance(payload, dict):
            # Without a host name the update cannot be filtered; refuse it
            # instead of either crashing or leaking it to every client.
            logger.error("Dropping %s broadcast: payload is not a dict", typ)
            return False
        host_name = payload.get("raw_name") or payload.get("host") or payload.get("name")
    elif typ == "message" and isinstance(payload, dict):
        host_name = payload.get("host")

    jmsg = json.dumps({"type": typ, "data": payload})

    async def _send_all():
        dead = set()
        for ws, user in list(_connections.items()):
            if ws.closed:
                dead.add(ws)
                continue
            try:
                allowed = host_name is None or _user_can_see_host(user, host_name)
            except Exception:
                # A failing access check says nothing about the socket:
                # skip this client (fail closed) but keep the connection.
                logger.exception("Access check failed for host %r", host_name)
                continue
            if not allowed:
                continue
            try:
                await ws.send_str(jmsg)
            except Exception:
                dead.add(ws)
        for ws in dead:
            _connections.pop(ws, None)

    asyncio.run_coroutine_threadsafe(_send_all(), _loop)
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def connection_count() -> int:
    """Return the number of WebSocket connections currently registered."""
    registered = _connections
    return len(registered)
|
||||||
Binary file not shown.
|
Before Width: | Height: | Size: 5.3 KiB |
@@ -1,7 +0,0 @@
|
|||||||
<head>
|
|
||||||
<meta http-equiv="content-type" content="text/html; charset=utf-8" />
|
|
||||||
<link rel="stylesheet" href="/static/style.css" type="text/css" />
|
|
||||||
<link rel="icon" href="/static/images/favicon.ico" sizes="32x32" />
|
|
||||||
<title>{{ title }}</title>
|
|
||||||
<script src="{{ extra_scripts }}"></script>
|
|
||||||
</head>
|
|
||||||
@@ -1,281 +0,0 @@
|
|||||||
<!DOCTYPE html>
|
|
||||||
<html>
|
|
||||||
{% include 'head.html' %}
|
|
||||||
|
|
||||||
<style>
|
|
||||||
.content {
|
|
||||||
display: flex;
|
|
||||||
flex-direction: column;
|
|
||||||
}
|
|
||||||
|
|
||||||
.table {
|
|
||||||
/* flex: 1; */
|
|
||||||
flex-grow: none;
|
|
||||||
}
|
|
||||||
|
|
||||||
.log {
|
|
||||||
flex: 2;
|
|
||||||
flex-grow: 1;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
#ntable {
|
|
||||||
border-collapse: collapse;
|
|
||||||
font-size: 95%;
|
|
||||||
/* width: 100%; */
|
|
||||||
}
|
|
||||||
|
|
||||||
#ntable td,
|
|
||||||
#ntable th {
|
|
||||||
border: 1px solid #ddd;
|
|
||||||
text-align: left;
|
|
||||||
padding: 0px;
|
|
||||||
}
|
|
||||||
|
|
||||||
#ntable tr:nth-child(even) {
|
|
||||||
background-color: #f2f2f2;
|
|
||||||
}
|
|
||||||
|
|
||||||
#ntable tr:hover {
|
|
||||||
background-color: #ddd;
|
|
||||||
}
|
|
||||||
|
|
||||||
#ntable th {
|
|
||||||
padding-top: 12px;
|
|
||||||
padding-bottom: 12px;
|
|
||||||
background-color: #9d9d9d;
|
|
||||||
color: white;
|
|
||||||
}
|
|
||||||
|
|
||||||
#ntable
|
|
||||||
th:not(.sorttable_sorted):not(.sorttable_sorted_reverse):not(.sorttable_nosort):after {
|
|
||||||
content: " \2195";
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Modal for connection status messages */
|
|
||||||
.connection-modal {
|
|
||||||
display: none;
|
|
||||||
position: fixed;
|
|
||||||
z-index: 1000;
|
|
||||||
left: 0;
|
|
||||||
top: 0;
|
|
||||||
width: 100%;
|
|
||||||
height: 100%;
|
|
||||||
background-color: rgba(0, 0, 0, 0.4);
|
|
||||||
}
|
|
||||||
|
|
||||||
.connection-modal.show {
|
|
||||||
display: flex;
|
|
||||||
justify-content: center;
|
|
||||||
align-items: center;
|
|
||||||
}
|
|
||||||
|
|
||||||
.connection-modal-content {
|
|
||||||
background-color: #f9f9f9;
|
|
||||||
padding: 20px;
|
|
||||||
border: 1px solid #888;
|
|
||||||
border-radius: 5px;
|
|
||||||
text-align: center;
|
|
||||||
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
|
|
||||||
min-width: 300px;
|
|
||||||
}
|
|
||||||
|
|
||||||
.connection-modal-content p {
|
|
||||||
margin: 10px 0;
|
|
||||||
font-size: 16px;
|
|
||||||
color: #333;
|
|
||||||
}
|
|
||||||
</style>
|
|
||||||
<script type="text/javascript">
|
|
||||||
var cnt = 0;
|
|
||||||
var nTable = document;
|
|
||||||
var name_idx = {};
|
|
||||||
var c = 0;
|
|
||||||
|
|
||||||
// Rebuild name_idx, the lookup from host name (text of the first cell) to its
// table row, from the rows currently present in #ntable.
function setup() {
  name_idx = {};
  nTable = document.getElementById("ntable");
  if (!nTable) {
    // Table not in the DOM (yet): leave the index empty.
    return;
  }
  // Row 0 is the header row, so indexing starts at 1.
  for (var i = 1; i < nTable.rows.length; i++) {
    var row = nTable.rows[i];
    // A local variable: a bare "name = ..." would overwrite window.name.
    var hostName = row.cells[0].innerText;
    name_idx[hostName] = row;
  }
}
|
|
||||||
|
|
||||||
// Append a table row for a host that has no row yet and register it in
// name_idx under the host name.
//
// Column layout (10 cells): name, version, then addr / state / latency /
// last-state for the first connection and the same four for the second one.
// Only name, version, addr and state are filled in here.
//
// All values are written with textContent: they originate from hosts on the
// network and must not be interpreted as HTML.
function createRow(data) {
  var row = document.createElement("tr");
  var cells = [];
  for (var i = 0; i < 10; i++) {
    var cell = document.createElement("td");
    cells.push(cell);
    row.appendChild(cell);
  }
  // Latency and timestamp columns are right-aligned.
  [4, 5, 8, 9].forEach(function (idx) {
    cells[idx].style.textAlign = "right";
  });

  if (data.dyn) {
    // Hosts flagged "dyn" are shown in bold.
    var bold = document.createElement("b");
    bold.textContent = data.name;
    cells[0].appendChild(bold);
  } else {
    cells[0].textContent = data.name;
  }
  cells[1].textContent = data.cver;

  // Tolerate a missing or empty connections list; at most two are shown.
  var conns = data.connections || [];
  for (var k = 0; k < conns.length && k < 2; k++) {
    cells[2 + k * 4].textContent = conns[k].addr;
    cells[3 + k * 4].textContent = conns[k].state;
  }

  document.getElementById("ntablebody").appendChild(row);
  // Key by host name; the cell element itself is not a usable key.
  name_idx[data.name] = row;
}
|
|
||||||
|
|
||||||
// Format a timestamp given in seconds as a "de-DE" locale date/time string.
function formatTS(ts) {
  return new Date(ts * 1000).toLocaleString("de-DE");
}
|
|
||||||
|
|
||||||
// Refresh the table row of one host from a "host" state message, creating
// the row first when the host is not known yet.
//
// Connection i occupies cells 2+4i .. 5+4i: addr, state, latency, last-state.
//   state "up"      -> plain "up" and the first rtt with two decimals
//   state "unknown" -> all four cells are cleared
//   anything else   -> state in bold, latency "-"
//
// Values are written with textContent so host-supplied strings are never
// parsed as HTML.
function update_table(data) {
  if (!(data.name in name_idx)) {
    createRow(data);
    setup();
  }

  var cells = name_idx[data.name].cells;
  for (var i = 0; i < data.connections.length; i++) {
    var base = 2 + i * 4;
    if (base + 3 >= cells.length) {
      // The table only has columns for a fixed number of connections.
      break;
    }
    var conn = data.connections[i];
    var addrCell = cells[base];
    var stateCell = cells[base + 1];
    var latencyCell = cells[base + 2];
    var tsCell = cells[base + 3];

    if (conn.state == "unknown") {
      addrCell.textContent = "";
      stateCell.textContent = "";
      latencyCell.textContent = "";
      tsCell.textContent = "";
      continue;
    }

    addrCell.textContent = conn.addr;
    tsCell.textContent = formatTS(conn.statetime);

    if (conn.state == "up") {
      stateCell.textContent = "up";
      latencyCell.textContent = Number.parseFloat(conn.rtts[0]).toFixed(2);
    } else {
      // Any other state is highlighted in bold.
      stateCell.textContent = "";
      var bold = document.createElement("b");
      bold.textContent = conn.state;
      stateCell.appendChild(bold);
      latencyCell.textContent = "-";
    }
  }
}
|
|
||||||
|
|
||||||
// Open the WebSocket to the heartbeat server and wire up its handlers.
//
// - "host" frames update the host table, "message" frames are prepended to
//   the event log.
// - When the socket closes, an overlay is shown and a reconnect is attempted
//   after 3 seconds; the overlay is hidden again once a connection opens.
function WS_Connect() {
  if (!("WebSocket" in window)) {
    // The browser doesn't support WebSocket
    console.log("WebSocket NOT supported by your Browser!");
    return;
  }

  // Show or hide the connection-status overlay, if the page contains one.
  function setModalVisible(visible) {
    var modal = document.getElementById("connectionModal");
    if (modal) {
      modal.classList.toggle("show", visible);
    }
  }

  //N.B: subprotocol field causes chrome to error 1006
  var ws_hbd = new WebSocket("{{heartbeat_ws_url}}" /*, "hdb" */);

  ws_hbd.onopen = function () {
    // Web Socket is connected, send data using send()
    console.log("ws connect");
    setModalVisible(false);
    ws_hbd.send("heartbeat_web");
  };

  ws_hbd.onerror = function (event) {
    console.log(event);
  };

  ws_hbd.onmessage = function (event) {
    var state;
    try {
      state = JSON.parse(event.data);
    } catch (err) {
      // One malformed frame must not break the handler.
      console.log("ignoring malformed ws frame", err);
      return;
    }
    if (!state) {
      return;
    }
    if (state.type == "host") {
      update_table(state.data);
    } else if (state.type == "message") {
      // Newest entry goes to the top. The text is inserted as a text node,
      // never as HTML, because it can contain host-supplied content.
      var msgs = document.getElementById("messages");
      msgs.insertBefore(document.createElement("br"), msgs.firstChild);
      msgs.insertBefore(document.createTextNode(String(state.data)), msgs.firstChild);
    }
    cnt++;
  };

  ws_hbd.onclose = function () {
    console.log("Connection is closed, reopening");
    setModalVisible(true);
    setTimeout(WS_Connect, 3000);
  };
}
|
|
||||||
WS_Connect();
|
|
||||||
</script>
|
|
||||||
<body>
|
|
||||||
{% include 'menu.html' %}
|
|
||||||
|
|
||||||
<div id="content" class="content" style="overflow: hidden">
|
|
||||||
<div id="table" class="table" style="overflow: hidden">
|
|
||||||
<!-- <h2>{{title}}</h2> -->
|
|
||||||
<table id="ntable" class="sortable">
|
|
||||||
<thead>
|
|
||||||
<tr>
|
|
||||||
<th>Name</th>
|
|
||||||
<th>Ver</th>
|
|
||||||
<th>IPv4 Addr</th>
|
|
||||||
<th>State</th>
|
|
||||||
<th style="text-align: right">Latency</th>
|
|
||||||
<th style="text-align: right">Last State</th>
|
|
||||||
<th>IPv6 Addr</th>
|
|
||||||
<th>State</th>
|
|
||||||
<th style="text-align: right">Latency</th>
|
|
||||||
<th style="text-align: right">Last State</th>
|
|
||||||
</tr>
|
|
||||||
</thead>
|
|
||||||
<tbody id="ntablebody"></tbody>
|
|
||||||
</table>
|
|
||||||
</div>
|
|
||||||
<div id="log" class="log" style="overflow: auto;">
|
|
||||||
<h2>Log of Events</h2>
|
|
||||||
<div id="messages">
|
|
||||||
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
{% include 'foot.html' %}
|
|
||||||
|
|
||||||
<!-- Connection status modal -->
|
|
||||||
<div id="connectionModal" class="connection-modal">
|
|
||||||
<div class="connection-modal-content">
|
|
||||||
<p>⚠️ Connection is closed, reopening...</p>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<script>
|
|
||||||
setup();
|
|
||||||
</script>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
@@ -1,3 +0,0 @@
|
|||||||
<label for="drawer-toggle" id="drawer-toggle-label"></label>
|
|
||||||
<header>{{ header }}</header>
|
|
||||||
|
|
||||||
-233
@@ -1,233 +0,0 @@
|
|||||||
"""UDP listener and datagram processing."""
|
|
||||||
import asyncio
|
|
||||||
import zlib
|
|
||||||
import logging
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
from .proto import stodict, oldmtodict
|
|
||||||
from hbd.utils import dur
|
|
||||||
|
|
||||||
|
|
||||||
class EchoServerProtocol(asyncio.DatagramProtocol):
    """Datagram protocol that parses each packet and hands it to a callback."""

    def __init__(self, config=None, handler=None):
        """Store the configuration (empty dict when falsy) and the callback."""
        super().__init__()
        self.config = config if config else {}
        self.handler = handler

    def connection_made(self, transport):
        """Remember the transport so the callback can send replies through it."""
        self.transport = transport
        logger.info("UDP Server listening...")

    def datagram_received(self, data, addr):
        """Parse one datagram and pass it to the handler, if one is set.

        Any exception raised while parsing or handling is logged with its
        traceback and not propagated.
        """
        logger.debug("Received from %s", addr)
        try:
            parsed = parse_message(data)
            callback = self.handler
            if not callback:
                return
            # The transport is passed along so the callback can send
            # replies (ACKs/commands).
            callback(parsed, addr, self.transport)
        except Exception:
            logger.exception("Error while processing datagram from %s", addr)
|
|
||||||
|
|
||||||
|
|
||||||
def parse_message(data: bytes):
    """Decode a raw datagram into a message dict.

    ``stodict`` is tried first. When it yields a falsy result, the datagram
    is decoded with ``oldmtodict`` instead (compat with older clients).
    """
    decoded = stodict(data)
    return decoded if decoded else oldmtodict(data)
|
|
||||||
|
|
||||||
def dicttos(ID, d, compress=False):
    """Serialise *d* into a ``key=value;key=value`` packet prefixed with *ID*.

    Float values are written with five decimals; everything else uses ``%s``.

    Returns ``"<ID>:<payload>"`` as ``str`` when *compress* is false, and
    ``b"!<ID>:"`` followed by the zlib-compressed (level 6) payload as
    ``bytes`` when it is true.
    """
    fields = [
        "%s=%0.5f" % (key, value)
        if isinstance(value, float)
        else "%s=%s" % (key, value)
        for key, value in d.items()
    ]
    payload = ";".join(fields)
    if not compress:
        return ID + ":" + payload
    # The leading "!" marks a compressed packet.
    header = ("!" + ID + ":").encode()
    return header + zlib.compress(payload.encode(), 6)
|
|
||||||
|
|
||||||
def handle_datagram(msg: dict, addr, transport, ctx: dict):
    """Handle a parsed datagram message.

    Steps: look up (or create) the host, record the connection data, log and
    notify about address changes, boots, messages, recoveries and shutdowns,
    send an ACK, flush the host's queued commands, and finally publish the
    host state to the websocket clients.

    ctx is a dictionary with runtime dependencies:
     - config: dict of configuration
     - hbdclass: module providing Host/Connection classes
     - log: callable(loghost, message)
     - email: callable(subject, message)
     - pushmsg: callable(message)
     - msg_to_websockets: callable(typ, data)
     - DEBUG, verbose

    Email and push notifications are only sent for hosts listed in the
    ``watchhosts`` config entry (the "command sent" email is the exception,
    it is sent for every host).
    """
    if not msg:
        return

    import time

    from hbd.utils import shortname

    now = time.time()
    cfg = ctx.get("config", {})
    hbdcls = ctx.get("hbdclass")
    log = ctx.get("log")
    email = ctx.get("email")
    pushmsg = ctx.get("pushmsg")
    msg_to_websockets = ctx.get("msg_to_websockets")
    DEBUG = ctx.get("DEBUG", 0)
    verbose = ctx.get("verbose", False)

    # normalize addr (ip, port)
    ip = addr[0] if isinstance(addr, (list, tuple)) else addr
    name = msg.get("name", "unknown")
    uname = shortname(name)
    watched = uname in cfg.get("watchhosts", [])

    def notify(subject, email_body, push_body):
        # Email + push notification, for watched hosts only.
        if not watched:
            return
        if email:
            email(subject, email_body)
        if pushmsg:
            pushmsg(push_body)

    if uname not in hbdcls.Host.hosts:
        host = hbdcls.Host(uname)
        host.dyn = uname in cfg.get("dyndnshosts", [])
        if verbose:
            print(("XX: New host, num now %s" % (len(hbdcls.Host.hosts))))
        newh = True
    else:
        host = hbdcls.Host.hosts[uname]
        newh = False

    cid = msg.get("id", 0)
    try:
        rtt = float(msg.get("rtt", None))
    except (TypeError, ValueError, OverflowError):
        # Missing or unparsable rtt.
        rtt = None

    if msg.get("ID") == "HTB":
        host.doesack = msg.get("acks", -1)
        host.setcver(msg.get("ver", 0))

    try:
        conn, res = host.conndata(cid, ip, rtt, now)
    except Exception as e:
        if DEBUG > 0:
            print("conndata failed: %s" % e)
        return

    if res:
        if log:
            log(uname, res)
        address_change = "%s %s" % (host.name, res)
        notify("address change", address_change, address_change)

    interval = int(msg.get("interval", 0) or 0)
    shutdown = msg.get("shutdown", 0)
    service = msg.get("service", "unknown")
    message = msg.get("msg", None)
    boot = msg.get("boot", 0)

    if boot:
        if log:
            log(uname, "booted")
        m = "%s booted" % (host.name)
        notify("booted", m, m)
    if message:
        if log:
            log(uname, "msg: %s" % message, service=service)
        notify("msg", message, message)

    if conn.getstate() != hbdcls.Connection.UP:
        lasts = conn.state
        d = conn.newstate(hbdcls.Connection.UP, now)
        m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d))
        if log:
            log(uname, m)
        notify("%s back" % conn.afam, uname, "%s %s is back" % (uname, conn.afam))

    if boot or newh:
        host.upcount = host.doesack
    else:
        host.upcount += 1

    if shutdown:
        if log:
            log(uname, "%s shutdown" % conn.afam)
        m = "%s %s shutdown" % (uname, conn.afam)
        notify("shutdown", m, m)
        conn.newstate(hbdcls.Connection.DOWN, now)

    if interval > 0:
        host.interval = interval

    # send ACK back
    rmsg = {"time": time.time()}
    if host.cver < 1:
        opkt = b"ACK"
    else:
        opkt = dicttos("ACK", rmsg, host.cver > 1)
        if isinstance(opkt, str):
            # dicttos returns str when not compressing, but sendto() only
            # accepts bytes-like data.
            opkt = opkt.encode()
    try:
        transport.sendto(opkt, addr)
    except Exception as e:
        if DEBUG > 0:
            print(("cannot send ack: %s" % e))

    # send any commands we have queued
    while len(host.cmds):
        op, rmsg = host.cmds[0]
        if op == "CMD":
            if email:
                email("%s cmd exec" % uname, "command '%s' sent" % rmsg)
            del host.cmds[0]
            if log:
                log(uname, "command sent")
            if host.cver < 1:
                rmsg = rmsg["cmd"]
        elif op == "UPD":
            del host.cmds[0]
            if log:
                log(uname, "update initiated")
            if host.cver < 1:
                if log:
                    log(uname, " ver 0 does not support UPD")
                continue
        else:
            # An unknown op must still be dequeued, otherwise this loop
            # would send the same entry forever.
            del host.cmds[0]
            logger.warning("Dropping unknown queued op %r for %s", op, uname)
            continue
        if host.cver < 1:
            opkt = rmsg if isinstance(rmsg, (bytes, str)) else str(rmsg)
            if isinstance(opkt, str):
                opkt = opkt.encode()
        else:
            opkt = dicttos(op, rmsg, True)
        try:
            transport.sendto(opkt, addr)
        except Exception as e:
            if DEBUG > 0:
                print(("cannot send cmd/update: %s" % e))

    if msg_to_websockets:
        try:
            msg_to_websockets("host", host.stateinfo())
        except Exception as e:
            # Best effort: a websocket problem must not break UDP handling,
            # but it is reported in debug mode like the other send failures.
            if DEBUG > 0:
                print(("cannot send websocket message: %s" % e))
|
|
||||||
|
|
||||||
|
|
||||||
@@ -1,143 +0,0 @@
|
|||||||
"""WebSocket server and broadcast helpers for hbd.
|
|
||||||
|
|
||||||
Provides an asyncio-based WebSocket server and a thread-safe broadcast
|
|
||||||
function that other threads or synchronous code can call.
|
|
||||||
"""
|
|
||||||
import asyncio
|
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
from typing import Callable, Iterable, Optional
|
|
||||||
|
|
||||||
import websockets
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
# Connections currently being served: _handler() adds a websocket on entry and
# removes it on exit; broadcast() iterates over a snapshot of this set.
_connections = set()
|
|
||||||
# Event loop captured by start(); broadcast() schedules sends on it and
# returns False while it is still unset.
_loop: Optional[asyncio.AbstractEventLoop] = None
|
|
||||||
_get_hosts: Optional[Callable[[], Iterable]] = None
|
|
||||||
_get_msgs: Optional[Callable[[], Iterable]] = None
|
|
||||||
_verbose = False
|
|
||||||
|
|
||||||
|
|
||||||
async def _handler(websocket, path=None):
    """Serve one WebSocket client: replay current state, then idle until it leaves.

    Args:
        websocket: connection object handed over by the ``websockets`` server.
        path: request path. Some versions of the websockets library call the
            handler with the connection only, so this is optional and falls
            back to ``websocket.path`` when that attribute exists.

    The connection is registered in ``_connections`` for the lifetime of the
    handler so that ``broadcast()`` can reach it.
    """
    _connections.add(websocket)
    remote_address = getattr(websocket, "remote_address", None)
    if path is None:
        path = getattr(websocket, "path", None)
    if _verbose:
        logger.info("DBG ws_serve: %s: %s", remote_address, path)
    try:
        # send initial hosts
        if _get_hosts:
            for h in _get_hosts():
                await websocket.send(json.dumps({"type": "host", "data": h}))
        # send recent messages (at most the last 100)
        if _get_msgs:
            for m in list(_get_msgs())[-100:]:
                await websocket.send(json.dumps({"type": "message", "data": m}))

        # Keep the connection open until the client disconnects. We don't
        # expect meaningful incoming messages besides the initial client
        # 'hello' that some clients send; ignore them for now.
        async for incoming in websocket:
            if _verbose:
                logger.debug("received ws data: %s", incoming)

    except websockets.exceptions.ConnectionClosed as e:
        # Base class of ConnectionClosedOK / ConnectionClosedError: a client
        # going away is routine, not an error.
        if _verbose:
            logger.info("ws closed: %r", e)
    except Exception as e:
        logger.exception("ws handler exception: %s", e)
    finally:
        # discard() tolerates broadcast() having already dropped this entry.
        _connections.discard(websocket)
        await websocket.wait_closed()
|
|
||||||
|
|
||||||
|
|
||||||
async def start(host: str, ws_port: int, wss_port: Optional[int] = None, ssl_context=None, get_hosts: Optional[Callable] = None, get_msgs: Optional[Callable] = None, verbose: bool = False):
    """Start WebSocket servers and block until cancelled.

    This is intended to be awaited inside the main asyncio event loop.
    If `wss_port` and `ssl_context` are provided, a WSS server will also be
    started.

    Args:
        host: address to bind the listener(s) to.
        ws_port: port of the plain WebSocket listener.
        wss_port: port of the optional TLS listener.
        ssl_context: SSL context for the TLS listener.
        get_hosts: callable returning the host entries replayed to new clients.
        get_msgs: callable returning the messages replayed to new clients.
        verbose: enable extra logging in this module.

    Raises:
        asyncio.CancelledError: re-raised after active connections were closed.

    On exit (cancellation or start-up failure) every listener that was
    started is closed again and the module forgets the event loop, so
    ``broadcast()`` reports the server as not running.
    """
    global _loop, _get_hosts, _get_msgs, _verbose
    _loop = asyncio.get_running_loop()
    _get_hosts = get_hosts
    _get_msgs = get_msgs
    _verbose = verbose

    logging.getLogger("websockets.server").setLevel(logging.INFO)

    # (port, extra serve() keyword arguments) for each listener to open
    listeners = [(ws_port, {})]  # plain WebSocket
    if wss_port and ssl_context:
        listeners.append((wss_port, {"ssl": ssl_context}))  # secure WebSocket (optional)

    servers = []
    try:
        # Keep the started server objects so they can be closed on the way out.
        for port, extra in listeners:
            servers.append(await websockets.serve(_handler, host, port, **extra))

        if _verbose:
            logger.info("WebSocket server started on port %s (wss %s)", ws_port, wss_port)

        # block forever (until loop is stopped or cancelled)
        await asyncio.Future()
    except asyncio.CancelledError:
        logger.info("WebSocket server shutting down...")
        # Close all active connections
        for conn in list(_connections):
            try:
                await conn.close()
            except Exception:
                pass
        _connections.clear()
        raise
    finally:
        # Release the listening sockets; best effort, shutdown must not fail here.
        for srv in servers:
            try:
                srv.close()
                await srv.wait_closed()
            except Exception as e:
                logger.debug("error closing ws server: %r", e)
        _loop = None
|
|
||||||
|
|
||||||
|
|
||||||
def _log_send_failure(fut) -> None:
    """Done-callback for a scheduled send: log the failure instead of losing it."""
    if fut.cancelled():
        return
    exc = fut.exception()
    if exc is not None:
        logger.debug("ws.send failed: %r", exc)


def broadcast(typ: str, data) -> bool:
    """Thread-safe broadcast helper.

    Schedules coroutine(s) on the running loop to send message to all
    connected websockets. Returns False if server was not running.

    Args:
        typ: value of the ``"type"`` field of the JSON envelope.
        data: JSON-serialisable payload for the ``"data"`` field.

    Connections that are no longer open, or on which the send could not be
    scheduled, are dropped from the connection set. Nothing is awaited on
    them here; the handler of each connection does its own ``wait_closed()``.
    """
    loop = _loop
    if loop is None:
        return False

    jmsg = json.dumps({"type": typ, "data": data})
    for ws in list(_connections):
        if ws.state != websockets.protocol.State.OPEN:
            _connections.discard(ws)
            continue
        send_coro = None
        try:
            send_coro = ws.send(jmsg)
            fut = asyncio.run_coroutine_threadsafe(send_coro, loop)
        except Exception:
            # Scheduling failed (e.g. the loop is closed): close the coroutine
            # so it is not left un-awaited, and forget the connection.
            if send_coro is not None:
                send_coro.close()
            _connections.discard(ws)
            logger.debug("ws.send exception: closed")
            continue
        # The send itself runs later on the loop; surface its errors in the log.
        fut.add_done_callback(_log_send_failure)
    return True
|
|
||||||
|
|
||||||
|
|
||||||
def connection_count() -> int:
    """Return the number of WebSocket connections currently tracked."""
    tracked = _connections
    return len(tracked)
|
|
||||||
-380
@@ -1,380 +0,0 @@
|
|||||||
"""
|
|
||||||
host and connection class shared between hbd and
|
|
||||||
the website's heartbeat.py
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
import time
|
|
||||||
import json
|
|
||||||
import copy
|
|
||||||
import queue
|
|
||||||
|
|
||||||
# Running count of named hosts; Host.__init__ increments it for every host
# created with a name and stores the current value as the host's `num`.
num = 0
|
|
||||||
|
|
||||||
# Maximum number of round-trip samples kept per connection; Connection.newaddr
# drops the oldest sample once the list grows beyond this.
MAXRTTS = 10
|
|
||||||
|
|
||||||
# Debug verbosity: any truthy value makes log() print; values above 1 also
# make Host.buildhosttable print its progress and result.
DEBUG = 2
|
|
||||||
|
|
||||||
|
|
||||||
def log(host, m):
    """Print a debug line for `host` with message `m` when DEBUG is enabled."""
    if not DEBUG:
        return
    print("class log: %s %s" % (host, m))
|
|
||||||
|
|
||||||
|
|
||||||
class Connection:
    """One address-family connection (e.g. "IPv4" or "IPv6") of a host.

    Keeps the peer address, the most recent round-trip times, the time of
    the last heartbeat and the current state together with the time that
    state was entered.
    """

    # map of addrs to names
    htab = {}

    UNKNOWN = "unknown"
    UP = "up"
    DOWN = "down"
    OVERDUE = "overdue"

    def __init__(self, host, cid, addr, afam):
        """Create a connection for `host` (may be None for a placeholder).

        Args:
            host: owning Host object, or a falsy value for an unbound dummy.
            cid: connection id as supplied by the caller.
            addr: peer address; an IPv4-mapped "::ffff:" prefix is stripped.
            afam: address family label.
        """
        self.host = host
        self.cid = cid
        if addr[0:7] == "::ffff:":
            addr = addr[7:]
        self.addr = addr
        self.afam = afam
        self.rtts = [0]
        self.lastbeat = time.time()
        self.statetime = self.lastbeat
        self.deltastatetime = "computed"
        self.state = Connection.UNKNOWN

        if host:
            Connection.htab[addr] = self.host.name
            if self.host.isDynDns():
                log(self.host.name, "dns update %s" % self.addr)
                Host.dnsQ.put((self.host.name, self.addr))

    def registerDns(self):
        """Queue a DNS update for this connection's current address."""
        Host.dnsQ.put((self.host.name, self.addr))

    def clearstate(self):
        """Return a state dict with every display field set to ""."""
        return {
            "addr": "",
            "rtt": "",
            "lastbeat": "",
            "state": "",
            "statetime": "",
            "deltastatetime": "",
            "rttstate": "",
        }

    def statedict(self, Null=False):
        """Return the display fields of this connection as a dict.

        Args:
            Null: when true, return the all-empty dict from clearstate().
        """
        d = self.clearstate()
        if Null:
            return d

        now = time.time()
        d["addr"] = self.addr
        if self.rtts[-1]:
            d["rtt"] = "%0.1f" % self.rtts[-1]
        elif self.state == Connection.UNKNOWN:
            d["rtt"] = ""
        else:
            d["rtt"] = "?"
        d["lastbeat"] = self.lastbeat

        if self.state == Connection.OVERDUE:
            d["state"] = "<b>%s</b>" % self.state
        else:
            d["state"] = self.state

        if self.state == Connection.UP:
            d["rttstate"] = d["rtt"]
        elif self.state == Connection.OVERDUE:
            d["rttstate"] = ""
        else:
            d["rttstate"] = d["state"]

        d["statetime"] = time.strftime(
            "%Y-%m-%d %H:%M:%S", time.localtime(self.statetime)
        )
        delta = now - self.statetime

        if self.state == Connection.UNKNOWN:
            d["deltastatetime"] = ""
        elif delta > 86400:
            d["deltastatetime"] = "%0.1f days" % (delta / 86400.0)
        elif delta > 3600:
            # Formatted by hand: strftime's "%k" (space-padded hour) is a
            # platform extension and not available everywhere.
            hours, rest = divmod(int(delta), 3600)
            d["deltastatetime"] = "%2d:%02d hrs" % (hours, rest // 60)
        elif delta > 60:
            d["deltastatetime"] = time.strftime("%M:%S mins", time.gmtime(delta))
        else:
            d["deltastatetime"] = "%i secs" % (delta)

        # A connection never heard from in the last ten days is shown blank.
        if self.state == Connection.UNKNOWN and now - self.lastbeat > 86400 * 10:
            d = self.clearstate()

        return d

    def headerdict(self, afam):
        """Return the column headings matching the keys of statedict()."""
        return {
            "addr": "%s Addr" % afam,
            "rtt": "Latency",
            "lastbeat": "Last Contact",
            "state": "State",
            "statetime": "Last State",
            "rttstate": "Reach",
            "deltastatetime": "Last State",
        }

    def jsons(self):
        """Return the instance attributes as a JSON string.

        The `host` back-pointer is replaced by the host's name (None for an
        unbound connection) because a Host object is not JSON-serialisable.
        """
        state = dict(self.__dict__)
        state["host"] = getattr(self.host, "name", None)
        return json.dumps(state)

    # set new state, return number of secs in previous state
    def newstate(self, state, now, when=0):
        self.state = state
        entered = now - when
        previous = entered - self.statetime
        self.statetime = entered
        return previous

    def getstate(self):
        """Return the current state."""
        return self.state

    def newaddr(self, addr, rtt, now):
        """Record a heartbeat from `addr` with round-trip time `rtt` at `now`.

        Returns:
            None when the address is unchanged, otherwise a
            "changed from A to B" description of the change.
        """
        self.lastbeat = now
        self.rtts.append(rtt)
        if len(self.rtts) > MAXRTTS:
            del self.rtts[0]

        if self.addr == addr:
            return None

        r = "changed from %s to %s" % (self.addr, addr)
        # The old address may already be gone from the table; that is fine.
        Connection.htab.pop(self.addr, None)
        self.addr = addr
        Connection.htab[addr] = self.host.name
        if self.host.isDynDns():
            Host.dnsQ.put((self.host.name, self.addr))
        return r
|
|
||||||
|
|
||||||
|
|
||||||
#
|
|
||||||
class Host:
    """A monitored host together with its per-address-family connections.

    Also carries the helpers that render the host and message tables as HTML.
    """

    # Table of Hosts
    hosts = {}
    # (host name, address) pairs waiting for a DNS update
    dnsQ = queue.Queue()

    def __init__(self, name):
        """Create a host; a host with a truthy `name` is entered in Host.hosts."""
        global num
        self.name = name
        if name:
            num += 1
            Host.hosts[name] = self
        self.num = num
        self.dyn = False
        self.watched = False
        self.upcount = 0
        self.interval = 0
        self.doesack = -1
        self.cmds = []
        self.cver = 0
        self.connections = {}
        self.hdwcounts = [[0, 0], [0, 0], [0, 0]]

    def statedict(self):
        """Return the display fields of the host and both address families."""
        d = {}
        d["name"] = self.name
        if self.dyn:
            d["name"] += "*"
        if self.watched:
            d["name"] = "<b>%s</b>" % d["name"]
        d["dyn"] = str(self.dyn)
        d["ver"] = str(self.cver)
        d["num"] = self.num
        for c in ["IPv4", "IPv6"]:
            if c in self.connections:
                cs = self.connections[c].statedict()
            else:
                # no connection for this family: use the empty placeholder
                cs = ubConnection.statedict(True)
            for csv in cs:
                d["%s.%s" % (c, csv)] = cs[csv]

        return d

    def headerdict(self):
        """Return the column headings matching the keys of statedict()."""
        d = {}
        d["name"] = "Name"
        d["dyn"] = "Dyn"
        d["ver"] = "Ver"
        d["num"] = "??"
        for c in ["IPv4", "IPv6"]:
            cs = ubConnection.headerdict(c)
            for csv in cs:
                d["%s.%s" % (c, csv)] = cs[csv]
        return d

    def registerDns(self):
        """Queue a DNS update for every connection of this host."""
        for conn in self.connections.values():
            conn.registerDns()

    def stateinfo(self):
        """Return the instance attributes as a plain dict.

        `connections` becomes a list of per-connection dicts in which the
        `host` back-pointer is replaced by the host name. Every other
        connection value is copied, so the result does not alias the live
        connection objects.
        """
        info = {}
        for key, value in self.__dict__.items():
            if key != "connections":
                info[key] = value
                continue
            conns = []
            for conn in self.connections.values():
                # Copy field by field: deep-copying the whole __dict__ would
                # drag the Host (and all its connections) along through the
                # back-pointer only to throw that copy away again.
                conns.append(
                    {
                        k: (v.name if k == "host" else copy.deepcopy(v))
                        for k, v in conn.__dict__.items()
                    }
                )
            info[key] = conns
        return info

    def jsons(self):
        """Return stateinfo() as a JSON string."""
        return json.dumps(self.stateinfo())

    def setcver(self, cver):
        """Set the client protocol version."""
        self.cver = cver

    def isDynDns(self):
        """Return the host's `dyn` flag."""
        return self.dyn

    def isIPv4(self, addr):
        """Return True if `addr` (a string or an (address, ...) tuple) contains a dot."""
        if isinstance(addr, tuple):
            addr = addr[0]
        return addr.find(".") > 0

    def conndata(self, cid, addr, rtt, now):
        """Record a heartbeat and return ``(connection, change)``.

        The connection for the address family of `addr` is created on first
        use; `change` is the result of Connection.newaddr().
        """
        if addr[0:7] == "::ffff:":
            addr = addr[7:]
        afam = "IPv4" if self.isIPv4(addr) else "IPv6"

        if afam not in self.connections:
            self.connections[afam] = Connection(self, cid, addr, afam)

        conn = self.connections[afam]
        res = conn.newaddr(addr, rtt, now)
        return conn, res

    # called when reloading class from pickle, add new fields here
    def fixup(self):
        for c in ["IPv4", "IPv6"]:
            if c in self.connections:
                addr = self.connections[c].addr
                if addr[0:7] == "::ffff:":
                    addr = addr[7:]
                self.connections[c].addr = addr

    def dispstats(self):
        """Return three HTML table cells with statistics from the ack counters."""
        if self.doesack == -1:
            return '<td align="right">N/A</td><td></td><td></td>'
        if self.upcount <= 0:
            return "<td>(%s)</td><td></td><td></td>" % (self.doesack)

        r = ""
        for v in range(3):
            a, u = self.hdwcounts[v]
            if (self.upcount - u) != 0:
                vs = "%0.0f" % (
                    100.0 - (((self.doesack - a) * 100.0) / (self.upcount - u))
                )
                if vs == "0":
                    vs = ""
            else:
                vs = "-"
            r += '<td align="right">%s</td>' % vs
        return r

    # Columns of the long table; a tuple is (field, extra HTML attributes).
    hostfields_long = [
        "name",
        "IPv4.addr",
        "IPv4.state",
        ("IPv4.rtt", 'style="text-align: right;"'),
        ("IPv4.statetime", 'style="text-align: right;"'),
        "IPv6.addr",
        "IPv6.state",
        ("IPv6.rtt", 'style="text-align: right;"'),
        ("IPv6.statetime", 'style="text-align: right;"'),
        "ver",
    ]

    # Columns of the short table.
    hostfields_short = [
        "name",
        ("IPv4.rttstate", 'style="text-align: right;"'),
        ("IPv4.deltastatetime", 'style="text-align: right;"'),
        ("IPv6.rttstate", 'style="text-align: right;"'),
        ("IPv6.deltastatetime", 'style="text-align: right;"'),
    ]

    def gene(self, tag, v, attrib=None):
        """Return `v` wrapped in an HTML element `tag` with optional attributes."""
        a = " %s" % attrib if attrib else ""
        return "<%s%s>%s</%s>" % (tag, a, v, tag)

    def htmltable(self, tag, hd, short):
        """Return one ``<tr>`` row whose cells use `tag` and take values from `hd`."""
        hostfields = Host.hostfields_short if short else Host.hostfields_long
        cells = []
        for f in hostfields:
            if isinstance(f, tuple):
                cells.append(self.gene(tag, hd[f[0]], f[1]))
            else:
                cells.append(self.gene(tag, hd[f]))
        return self.gene("tr", "\n".join(cells))

    def buildhosttable(self, short=False):
        """Return the HTML lines of the table of all hosts, sorted by name."""
        if DEBUG > 1:
            print("DBG buildhosttable: start")
        res = []
        res.append('<table id="ntable" class="sortable">')
        # Row rendering uses no per-host state, so `self` serves for every row.
        res.append(self.htmltable("th", self.headerdict(), short))
        for h in sorted(Host.hosts):
            res.append(self.htmltable("td", Host.hosts[h].statedict(), short))
        res.append("</table>")
        if DEBUG > 1:
            print("DBG buildhosttable: %s" % res)
        return res

    def buildmsgtable(self, msgs):
        """Return the HTML lines of the event log.

        The more hosts there are, the fewer messages are shown: 40 minus the
        number of hosts, but never fewer than 3.
        """
        le = max(40 - len(Host.hosts), 3)
        res = ["<h4>Log of Events</h4>"]
        for m in msgs[-le:]:
            res.append("%s<BR>" % m)
        return res
|
|
||||||
|
|
||||||
|
|
||||||
# create fake "unbound objects", remove in Python 3.0
|
|
||||||
ubHost = Host(None)
|
|
||||||
ubConnection = Connection(None, "", "", "")
|
|
||||||
@@ -1,194 +0,0 @@
|
|||||||
Metadata-Version: 2.4
|
|
||||||
Name: heartbeat
|
|
||||||
Version: 0.1.0
|
|
||||||
Summary: Heartbeat daemon (hbd) — receive heartbeats and act on them
|
|
||||||
Author: heartbeat contributors
|
|
||||||
License: MIT
|
|
||||||
Keywords: heartbeat,monitoring,dns,websocket
|
|
||||||
Requires-Python: >=3.10
|
|
||||||
Description-Content-Type: text/markdown
|
|
||||||
Requires-Dist: websockets>=13.2
|
|
||||||
Requires-Dist: mattermostdriver>=7.3.0
|
|
||||||
Requires-Dist: PyYAML>=6.0
|
|
||||||
Requires-Dist: aiohttp>=3.8
|
|
||||||
Requires-Dist: Jinja2>=3.1.0
|
|
||||||
Provides-Extra: dev
|
|
||||||
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
||||||
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
||||||
Requires-Dist: flake8>=5.0; extra == "dev"
|
|
||||||
Requires-Dist: mypy>=1.10; extra == "dev"
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Heartbeat Daemon (hbd) ✅
|
|
||||||
|
|
||||||
A lightweight daemon that listens for UDP heartbeat messages and acts on them: keeps host state, optionally updates DNS records via `nsupdate`, forwards messages to WebSocket clients, and sends notifications (email, Pushover, Mattermost, Signal). It is a refactor of a previously monolithic script into a modular Python package (`hbd`).
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📌 Features
|
|
||||||
|
|
||||||
- Receive and parse heartbeat datagrams (text or zlib-compressed) ✅
|
|
||||||
- Maintain host state and detect up/down transitions ✅
|
|
||||||
- Queue DNS updates via `nsupdate` and run them in a background thread ✅
|
|
||||||
- WebSocket API for live updates (hosts & messages) ✅
|
|
||||||
- Notification pipeline (email, Pushover, Mattermost, Signal) ✅
|
|
||||||
- Modular codebase suitable for unit testing and CI ✅
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## ⚙️ Quickstart
|
|
||||||
|
|
||||||
Prerequisites:
|
|
||||||
- Python 3.10+ (project uses language features from recent Python)
|
|
||||||
- `nsupdate` (for DNS updates) if using dynamic DNS
|
|
||||||
|
|
||||||
Install dependencies (recommended into a venv):
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python3 -m venv .venv
|
|
||||||
source .venv/bin/activate
|
|
||||||
python -m pip install --upgrade pip
|
|
||||||
python -m pip install -r requirements.txt
|
|
||||||
# for development/testing tools
|
|
||||||
python -m pip install -r requirements-dev.txt
|
|
||||||
```
|
|
||||||
|
|
||||||
Run the daemon (example):
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# run with default config lookup (~/.hb.yaml)
|
|
||||||
PYTHONPATH=. hbd -c .hb.yaml -f -v
|
|
||||||
```
|
|
||||||
|
|
||||||
You can also run it directly via the package entrypoint after installation:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python -m hbd.cli -c /path/to/config.yaml
|
|
||||||
```
|
|
||||||
|
|
||||||
## 🐞 Debugging in VS Code
|
|
||||||
|
|
||||||
This repository includes a ready-to-use `.vscode/launch.json` with configurations to run or attach the VS Code debugger to `hbd`.
|
|
||||||
|
|
||||||
- Ensure the **Python** extension is installed and select the project `.venv` as the interpreter (bottom-left of VS Code).
|
|
||||||
- Use **F5** and pick one of these configurations from the Run view:
|
|
||||||
- **Python: Run hbd (module)** — runs `hbd.cli` as a module and sets `PYTHONPATH` to the workspace root (recommended).
|
|
||||||
- **Python: Run hbd with debugpy (listen)** — launches `debugpy` and `hbd` together; useful when you want the process to listen for a debugger.
|
|
||||||
- **Python: Attach (localhost:5678)** — attach the debugger to a running process started with `debugpy`.
|
|
||||||
|
|
||||||
To start `hbd` manually and wait for the debugger to attach, run:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
PYTHONPATH=. python -m debugpy --listen 5678 --wait-for-client -m hbd.cli -c .hb.yaml -f -v
|
|
||||||
```
|
|
||||||
|
|
||||||
Set breakpoints in modules such as `hbd/udp.py`, `hbd/dns.py`, or `hbd/server.py`, and use the **Attach** configuration to connect. Use `justMyCode: false` if you need to step into third-party code.
|
|
||||||
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🛠 Configuration
|
|
||||||
|
|
||||||
`hbd` reads YAML configuration (optional). If `PyYAML` is not installed, built-in defaults are used. Example configuration keys (see `hbd/config.py`):
|
|
||||||
|
|
||||||
- `hb_port`: UDP port to listen for heartbeats (default: 50003)
|
|
||||||
- `hbd_port`: internal control port (default: 50004)
|
|
||||||
- `hbd_host`: bind address for HTTP/WSS
|
|
||||||
- `pickfile`: path for persisted state
|
|
||||||
- `logfile`: path to log file
|
|
||||||
- `logfmt`: `text` or `msg`
|
|
||||||
- `pushsrv`: push service (`pushover`|`mattermost`|`all`)
|
|
||||||
- `interval` / `grace`: heartbeat timing configuration
|
|
||||||
- `dyndomains`: list of dyndomains to update via `nsupdate`
|
|
||||||
- `nsupdate_bin`: path to nsupdate binary
|
|
||||||
|
|
||||||
Example `.hb.yaml` (minimal):
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
hbd_host: 0.0.0.0
|
|
||||||
hbd_port: 50004
|
|
||||||
dyndomains:
|
|
||||||
- example.com
|
|
||||||
nsupdate_bin: /usr/bin/nsupdate
|
|
||||||
pushsrv: pushover
|
|
||||||
```
|
|
||||||
|
|
||||||
> Tip: `config.DEFAULTS` in `hbd/config.py` contains the canonical defaults and accepted configuration keys.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🔧 Architecture & Modules
|
|
||||||
|
|
||||||
- `hbd.proto` — serialization/deserialization of heartbeat messages (supports compressed payloads)
|
|
||||||
- `hbd.udp` — UDP parsing and `handle_datagram` implementation (main state machine)
|
|
||||||
- `hbd.dns` — `create_nsupdate_payload`, `nsupdate`, and a background DNS thread (`start_dns_thread`)
|
|
||||||
- `hbd.notify` — email and push notification helpers
|
|
||||||
- `hbd.ws` — WebSocket server and thread-safe broadcast helpers
|
|
||||||
- `hbd.http` — HTTP handler factory for the status UI/API
|
|
||||||
- `hbd.utils` — small utility helpers (`shortname`, `dur`, `initlog`)
|
|
||||||
- `hbd.cli` — CLI entrypoint and argument parsing
|
|
||||||
- `hbd.server` — async orchestration to run UDP/HTTP/WSS components
|
|
||||||
|
|
||||||
This modular layout makes the code easier to test and maintain.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🧪 Testing & Dev
|
|
||||||
|
|
||||||
Tests are implemented using `unittest`; some additional tests rely on `pytest`. To run the tests locally with nothing installed beyond the dev requirements:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# with project root on PYTHONPATH
|
|
||||||
PYTHONPATH=. python -m unittest discover -v
|
|
||||||
# or with pytest if installed
|
|
||||||
pytest -q
|
|
||||||
```
|
|
||||||
|
|
||||||
Developer tooling included:
|
|
||||||
- `pyproject.toml` — project metadata and dependencies
|
|
||||||
- `requirements-dev.txt` — dev/test dependencies
|
|
||||||
- `tox.ini` — convenience wrappers for running tests, lint, and mypy
|
|
||||||
|
|
||||||
To run linters and type checks locally:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# after installing dev deps
|
|
||||||
tox -e lint
|
|
||||||
tox -e mypy
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🚀 Running in production
|
|
||||||
|
|
||||||
- Use your system service manager (systemd, launchd, etc.) to run `hbd` in the background.
|
|
||||||
- Ensure `nsupdate` and necessary credentials are available for dynamic DNS updates.
|
|
||||||
- Configure TLS for WSS if you enable secure websockets.
|
|
||||||
|
|
||||||
> Note: An earlier revision of the project contained a small example for obtaining DNS-verified certificates (certbot with RFC 2136). It is not documented in this README; see the commit history if you need it.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🤝 Contributing
|
|
||||||
|
|
||||||
Contributions welcome! Please:
|
|
||||||
1. Open an issue to discuss larger changes.
|
|
||||||
2. Create a topic branch and a clear PR.
|
|
||||||
3. Add tests for new features and run linters.
|
|
||||||
4. Keep changes focused and documented.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📜 License
|
|
||||||
|
|
||||||
This repository is licensed under the MIT license. See `LICENSE` for details.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
Possible follow-ups (not done yet):

- add a **GitHub Actions** workflow that runs tests and lint on push/PR 🔁
- add a `CONTRIBUTING.md` template for PRs and code style 💬
|
|
||||||
|
|
||||||
@@ -1,25 +0,0 @@
|
|||||||
README.md
|
|
||||||
pyproject.toml
|
|
||||||
hbd/__init__.py
|
|
||||||
hbd/cli.py
|
|
||||||
hbd/config.py
|
|
||||||
hbd/dns.py
|
|
||||||
hbd/hbdclass.py
|
|
||||||
hbd/http.py
|
|
||||||
hbd/monitor.py
|
|
||||||
hbd/notify.py
|
|
||||||
hbd/proto.py
|
|
||||||
hbd/server.py
|
|
||||||
hbd/udp.py
|
|
||||||
hbd/utils.py
|
|
||||||
hbd/ws.py
|
|
||||||
heartbeat.egg-info/PKG-INFO
|
|
||||||
heartbeat.egg-info/SOURCES.txt
|
|
||||||
heartbeat.egg-info/dependency_links.txt
|
|
||||||
heartbeat.egg-info/entry_points.txt
|
|
||||||
heartbeat.egg-info/requires.txt
|
|
||||||
heartbeat.egg-info/top_level.txt
|
|
||||||
tests/test_dns.py
|
|
||||||
tests/test_handle_datagram.py
|
|
||||||
tests/test_proto.py
|
|
||||||
tests/test_udp.py
|
|
||||||
@@ -1 +0,0 @@
|
|||||||
|
|
||||||
@@ -1,2 +0,0 @@
|
|||||||
[console_scripts]
|
|
||||||
hbd = hbd.cli:main
|
|
||||||
@@ -1,11 +0,0 @@
|
|||||||
websockets>=13.2
|
|
||||||
mattermostdriver>=7.3.0
|
|
||||||
PyYAML>=6.0
|
|
||||||
aiohttp>=3.8
|
|
||||||
Jinja2>=3.1.0
|
|
||||||
|
|
||||||
[dev]
|
|
||||||
pytest>=7.0
|
|
||||||
pytest-cov>=4.0
|
|
||||||
flake8>=5.0
|
|
||||||
mypy>=1.10
|
|
||||||
@@ -1 +0,0 @@
|
|||||||
hbd
|
|
||||||
-15
@@ -1,15 +0,0 @@
|
|||||||
#!/bin/sh

# Install hbd/hbc into a dedicated virtualenv (~/venvs/hbd) and create
# symlinks for hbd and hbc in ~/bin.

set -e

if [ ! -d "$HOME/venvs/hbd" ]; then
    mkdir -p "$HOME/venvs"
    python3 -m venv "$HOME/venvs/hbd"
fi
. "$HOME/venvs/hbd/bin/activate"
pip install 'git+ssh://git@git.wrede.ca/andreas/heartbeat.git'

# ~/bin may not exist yet on a fresh account.
mkdir -p "$HOME/bin"

for tool in hbd hbc; do
    # `command -v` is the POSIX way to resolve a command; with `set -e` the
    # script stops here if the tool was not installed, instead of creating a
    # link with an empty target.
    target=$(command -v "$tool")
    rm -f "$HOME/bin/$tool"
    ln -sf "$target" "$HOME/bin/$tool"
done
|
|
||||||
+63
-18
@@ -4,26 +4,58 @@ build-backend = "setuptools.build_meta"
|
|||||||
|
|
||||||
[project]
|
[project]
|
||||||
name = "hbd"
|
name = "hbd"
|
||||||
version = "5.0"
|
version = "5.3.8"
|
||||||
description = "Heartbeat daemon (hbd) — receive heartbeats and act on them"
|
description = "Heartbeat monitoring system — client (hbc) and server (hbd)"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
requires-python = ">=3.10"
|
requires-python = ">=3.11"
|
||||||
license = { text = "MIT" }
|
dependencies = [
|
||||||
keywords = ["heartbeat", "monitoring", "dns", "websocket"]
|
"PyYAML>=6.0",
|
||||||
|
]
|
||||||
|
license = "MIT"
|
||||||
|
license-files = ["LICENSE.md"]
|
||||||
|
keywords = ["heartbeat", "monitoring", "dns", "websocket", "system-monitoring"]
|
||||||
authors = [
|
authors = [
|
||||||
{ name = "heartbeat contributors" }
|
{ name = "Andreas Wrede" }
|
||||||
|
]
|
||||||
|
classifiers = [
|
||||||
|
"Programming Language :: Python :: 3",
|
||||||
|
"Programming Language :: Python :: 3.11",
|
||||||
|
"Programming Language :: Python :: 3.12",
|
||||||
|
"Programming Language :: Python :: 3.13",
|
||||||
|
"Operating System :: POSIX :: Linux",
|
||||||
|
"Operating System :: POSIX :: BSD",
|
||||||
|
"Topic :: System :: Monitoring",
|
||||||
|
"Topic :: System :: Networking :: Monitoring",
|
||||||
]
|
]
|
||||||
|
|
||||||
dependencies = [
|
[project.urls]
|
||||||
"websockets>=13.2",
|
Repository = "https://git.wrede.ca/andreas/heartbeat"
|
||||||
"mattermostdriver>=7.3.0",
|
|
||||||
"PyYAML>=6.0",
|
|
||||||
"aiohttp>=3.8",
|
|
||||||
"Jinja2>=3.1.0",
|
|
||||||
"fastapi>=0.95.0",
|
|
||||||
]
|
|
||||||
|
|
||||||
[project.optional-dependencies]
|
[project.optional-dependencies]
|
||||||
|
# Client-only dependencies (hbc - system monitoring client)
|
||||||
|
client = [
|
||||||
|
"psutil>=5.9.0",
|
||||||
|
]
|
||||||
|
|
||||||
|
# Server-only dependencies (hbd - heartbeat daemon/server)
|
||||||
|
server = [
|
||||||
|
"websockets>=13.2",
|
||||||
|
"mattermostdriver>=7.3.0",
|
||||||
|
"aiohttp>=3.11",
|
||||||
|
"Jinja2>=3.1.6",
|
||||||
|
"matrix-nio>=0.24",
|
||||||
|
"ruamel.yaml>=0.18",
|
||||||
|
]
|
||||||
|
|
||||||
|
# Minimal client — hbc_mini only, no external dependencies
|
||||||
|
mini = []
|
||||||
|
|
||||||
|
# Install both client and server
|
||||||
|
all = [
|
||||||
|
"hbd[client,server]",
|
||||||
|
]
|
||||||
|
|
||||||
|
# Development dependencies
|
||||||
dev = [
|
dev = [
|
||||||
"pytest>=7.0",
|
"pytest>=7.0",
|
||||||
"pytest-cov>=4.0",
|
"pytest-cov>=4.0",
|
||||||
@@ -31,17 +63,30 @@ dev = [
|
|||||||
"mypy>=1.10",
|
"mypy>=1.10",
|
||||||
"black>=23.0",
|
"black>=23.0",
|
||||||
"isort>=5.0",
|
"isort>=5.0",
|
||||||
"re-commit>=3.0",
|
|
||||||
"tox>=4.0",
|
"tox>=4.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.scripts]
|
[project.scripts]
|
||||||
hbd = "hbd.cli:main"
|
hbd = "hbd.server.cli:main"
|
||||||
hbc = "hbd.hbc:main"
|
hbc = "hbd.client.main:main"
|
||||||
|
|
||||||
|
[tool.setuptools]
|
||||||
|
script-files = ["scripts/hb_install.sh", "scripts/hbc_mini.py"]
|
||||||
|
|
||||||
[tool.setuptools.packages.find]
|
[tool.setuptools.packages.find]
|
||||||
where = ["."]
|
where = ["."]
|
||||||
include = ["hbd*"]
|
include = ["hbd*"]
|
||||||
|
|
||||||
[tool.setuptools.package-data]
|
[tool.setuptools.package-data]
|
||||||
"hbd" = ["*.yaml", "static/*", "static/*/*", "templates/*"]
|
"hbd.server" = ["*.yaml", "static/*", "static/*/*", "templates/*"]
|
||||||
|
"hbd.client" = ["*.yaml"]
|
||||||
|
|
||||||
|
|
||||||
|
[tool.black]
|
||||||
|
line-length = 111
|
||||||
|
|
||||||
|
[tool.flake8]
|
||||||
|
max-line-length = 111
|
||||||
|
|
||||||
|
[tool.pylint.format]
|
||||||
|
max-line-length = 111
|
||||||
|
|||||||
@@ -1,4 +0,0 @@
|
|||||||
key "rndc-key" {
|
|
||||||
algorithm hmac-md5;
|
|
||||||
secret "qlGa+AYKtyOgWNuozqECMw==";
|
|
||||||
};
|
|
||||||
@@ -0,0 +1,40 @@
|
|||||||
|
async def send_sms(hass, user, password, sender_did, call):
    """Send SMS message using multipart form-data like MMS.

    Args:
        hass: not used by this function.
        user: API user name, sent as ``api_username``.
        password: API password, sent as ``api_password``.
        sender_did: sending number, sent as ``did``.
        call: service call whose ``data`` mapping supplies ``recipient`` and
            ``message``.

    Returns:
        None. Success and failure are reported through the module logger;
        an unusable reply (non-200 status, invalid JSON, or a status other
        than "success") is logged as an error rather than raised.
    """
    _LOGGER = logging.getLogger(__name__)
    recipient = call.data.get("recipient")
    message = call.data.get("message")

    if not recipient or not message:
        _LOGGER.error("Recipient or message missing.")
        return

    # Build form data dictionary
    form_data = {
        'api_username': str(user),
        'api_password': str(password),
        'did': str(sender_did),
        'dst': str(recipient),
        'message': str(message),
        'method': 'sendSMS'
    }

    async with aiohttp.ClientSession() as session:
        with aiohttp.MultipartWriter("form-data") as mp:
            for key, value in form_data.items():
                part = mp.append(value)
                part.set_content_disposition('form-data', name=key)

            # Routine trace, so debug level rather than error.
            _LOGGER.debug("voipms_sms: sending SMS: %s", mp)
            async with session.post(REST_ENDPOINT, data=mp) as response:
                response_text = await response.text()
                if response.status != 200:
                    _LOGGER.error("voipms_sms: Failed to send SMS. Status: %s, Response: %s", response.status, response_text)
                    return

                try:
                    response_json = json.loads(response_text)
                except ValueError:
                    # A 200 reply with a non-JSON body is still a failed send.
                    _LOGGER.error("voipms_sms: SMS not sent: %s", response_text)
                    return

                if isinstance(response_json, dict) and response_json.get('status') == "success":
                    _LOGGER.info("voipms_sms: SMS sent successfully: %s", response_text)
                else:
                    _LOGGER.error("voipms_sms: SMS not sent: %s", response_text)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user