Compare commits
144 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| ab37ac7194 | |||
| f811a19d80 | |||
| 6239825f43 | |||
| b56245bb23 | |||
| 331c4e804d | |||
| 9fd945a481 | |||
| 26df08eeff | |||
| 5819dd6b25 | |||
| 6fb67f8615 | |||
| e70ae6f176 | |||
| a77f6d380c | |||
| 6aae2a1dab | |||
| 85ee0e1040 | |||
| c4f09e9ced | |||
| 64710fd4cd | |||
| 1f5e7465a3 | |||
| b290b21e23 | |||
| 65c4267847 | |||
| 462a445235 | |||
| 368e178f93 | |||
| 6905bf266a | |||
| b6dcce4f35 | |||
| e6436fc236 | |||
| c5ce41762e | |||
| 26ca0c095f | |||
| 1eecd67594 | |||
| caf3c2c0ac | |||
| 9af4006097 | |||
| ddf7067d13 | |||
| 505353a8a8 | |||
| 0402d33c71 | |||
| 7d8ca5d8db | |||
| 56037a036d | |||
| 65ceb31d8d | |||
| 1c9b6c1ca9 | |||
| d7e6b478e1 | |||
| 535dbda47d | |||
| c9567dddae | |||
| b5963badd6 | |||
| a76a39b4a0 | |||
| 94e1597978 | |||
| c9c2ed772f | |||
| aeb78dcb8e | |||
| 77b337e4dd | |||
| 293461f3f6 | |||
| c70a4807dc | |||
| 1a470e7cfa | |||
| 990c658e65 | |||
| b78d6ac0fe | |||
| afd5060f59 | |||
| f61f7aebc2 | |||
| 5c382d2b8d | |||
| 35bba451f5 | |||
| 80edfba0c0 | |||
| 6bc8de192e | |||
| 2d8166d04a | |||
| ab33d81b30 | |||
| 2c0328f36d | |||
| fb8e27825d | |||
| 1366c69cdc | |||
| d0c8c186f4 | |||
| 19f7c8312e | |||
| 24b0e362fb | |||
| 3a030548c0 | |||
| 094cb7ed9d | |||
| 0199ca4693 | |||
| 75344ebbbd | |||
| 7f049a4e26 | |||
| 6559f5462c | |||
| 6556d35f97 | |||
| dec96a0da6 | |||
| 8d3de01117 | |||
| 5bedf026b1 | |||
| daf5277507 | |||
| ee3b72878f | |||
| 6217f7a124 | |||
| 2468386f24 | |||
| 2015195112 | |||
| 3426185383 | |||
| 9eedbafe97 | |||
| a5f31c5cb5 | |||
| 2f72cf0118 | |||
| c56e77c2c1 | |||
| e9aa7a6f8b | |||
| a75a8a4087 | |||
| ba27d2e300 | |||
| 381e37efce | |||
| 97dfc08f4d | |||
| d281ac5a70 | |||
| 812bbf8555 | |||
| e6b7a1aa27 | |||
| 90f47ad018 | |||
| cc458e8972 | |||
| 79bf00abfd | |||
| d77277857f | |||
| 3232239a85 | |||
| 014781de5e | |||
| 68b1c65384 | |||
| e8bb553349 | |||
| e4ecb8723f | |||
| 5edbaacf81 | |||
| 8421f472f2 | |||
| 51f9bdc2b5 | |||
| 02bc42fbf0 | |||
| 832a8b0bda | |||
| 57c4b86430 | |||
| 43fad7beed | |||
| 8dd002d159 | |||
| 2373b55d8b | |||
| 81530636ec | |||
| 190199b36d | |||
| 73aa89f8f4 | |||
| 941f3ea4b0 | |||
| c5770006f7 | |||
| 84c1aef51f | |||
| 460d2be9e9 | |||
| 090d341244 | |||
| 079e84f729 | |||
| dd23d9d163 | |||
| ad7178ebcb | |||
| 0543266c92 | |||
| 7e2038ecac | |||
| 75e41eafc4 | |||
| 73b9d05357 | |||
| 9d81f96f31 | |||
| d2e1c7a629 | |||
| 83d5ead471 | |||
| d339133981 | |||
| 7be129ad40 | |||
| 179048e565 | |||
| 8fe64ae8c5 | |||
| b6574872cc | |||
| 5e6dfc75ad | |||
| 087a264e97 | |||
| d9ca0b74e2 | |||
| 999740bc99 | |||
| 4c53b7cec9 | |||
| 535b839bfc | |||
| e3dd461d04 | |||
| e55a81568f | |||
| 83fbba433e | |||
| a494b162cd | |||
| 83b7139643 | |||
| 5dca9369dd |
@@ -0,0 +1,51 @@
|
|||||||
|
name: Release
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
tags:
|
||||||
|
- 'v*'
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
release:
|
||||||
|
runs-on: FreeBSD
|
||||||
|
steps:
|
||||||
|
- name: Checkout code
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
# - name: Set up Python
|
||||||
|
# uses: actions/setup-python@v5
|
||||||
|
# with:
|
||||||
|
# python-version: '3.11'
|
||||||
|
- name: Set up Python
|
||||||
|
# Use a generic run step for FreeBSD if actions/setup-python
|
||||||
|
# fails in restricted environments.
|
||||||
|
run: |
|
||||||
|
python3 --version
|
||||||
|
python3 -m ensurepip --upgrade
|
||||||
|
|
||||||
|
- name: Install build tools
|
||||||
|
run: |
|
||||||
|
python3 -m pip install --upgrade pip
|
||||||
|
python3 -m pip install build twine
|
||||||
|
|
||||||
|
- name: Build package
|
||||||
|
run: python3 -m build
|
||||||
|
|
||||||
|
- name: Extract version from tag
|
||||||
|
id: get_version
|
||||||
|
run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_OUTPUT
|
||||||
|
|
||||||
|
- name: Upload to Gitea PyPI registry
|
||||||
|
env:
|
||||||
|
TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
|
||||||
|
TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
|
||||||
|
run: |
|
||||||
|
python3 -m twine upload --repository-url https://git.wrede.ca/api/packages/andreas/pypi dist/*
|
||||||
|
|
||||||
|
- name: Create release
|
||||||
|
uses: actions/gitea-release-action@v1
|
||||||
|
with:
|
||||||
|
files: |
|
||||||
|
dist/*.whl
|
||||||
|
dist/*.tar.gz
|
||||||
|
title: "Release ${{ steps.get_version.outputs.VERSION }}"
|
||||||
|
body: "Release version ${{ steps.get_version.outputs.VERSION }}"
|
||||||
+5
-1
@@ -7,4 +7,8 @@ __pycache__/
|
|||||||
.venv/
|
.venv/
|
||||||
test/
|
test/
|
||||||
build/
|
build/
|
||||||
*.egg-info/
|
dist/
|
||||||
|
*.egg-info/
|
||||||
|
ssl/
|
||||||
|
uv.lock
|
||||||
|
.hb.yaml
|
||||||
|
|||||||
@@ -1,27 +0,0 @@
|
|||||||
#name: "w02"
|
|
||||||
hb_port: 50003
|
|
||||||
hbd_host: ''
|
|
||||||
#logfile: "/home/andreas/public_html/messages/andreas"
|
|
||||||
logfile: "/Users/andreas/public_html/messages/andreas"
|
|
||||||
logfmt: "msg"
|
|
||||||
grace: 40
|
|
||||||
interval: 10
|
|
||||||
watchhosts:
|
|
||||||
# "localhost":
|
|
||||||
# "haschloss" :
|
|
||||||
# "cotgate":
|
|
||||||
# "wentworth":
|
|
||||||
"y":
|
|
||||||
notify: +4915123456789
|
|
||||||
src: "signal"
|
|
||||||
"winter":
|
|
||||||
notify: +14168226179
|
|
||||||
src: "signal"
|
|
||||||
dyndnshosts: {"haschloss", "wayback", "wertvoll", "weekend", "cotgate", "rvgate", "draper", "eris"}
|
|
||||||
drophosts: {"unknown", "wookie15", "wort"}
|
|
||||||
nsupdate_bin: "/usr/local/bin/nsupdate"
|
|
||||||
pushover_token: "ac7NLX2rPjXFareeDgLpXNoDf4iFmf"
|
|
||||||
pushover_user: "uDhH33UjQQDYtNzJb1ThRiWb9ingGK"
|
|
||||||
pushsrv: "pushover"
|
|
||||||
|
|
||||||
dyndomains: {"wrede.org"}
|
|
||||||
Vendored
+7
-6
@@ -4,12 +4,13 @@
|
|||||||
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
||||||
"version": "0.2.0",
|
"version": "0.2.0",
|
||||||
"configurations": [
|
"configurations": [
|
||||||
|
|
||||||
{
|
{
|
||||||
"name": "Python: Run hbd (module)",
|
"name": "Python: Run hbd (module)",
|
||||||
"type": "debugpy",
|
"type": "debugpy",
|
||||||
"request": "launch",
|
"request": "launch",
|
||||||
"module": "hbd.cli",
|
"module": "hbd.server.cli",
|
||||||
"args": ["-c", ".hb.yaml", "-f", "-v", "-x", "-x", "-x"],
|
"args": ["-c", "~/.hb.yaml", "-f", "-v"],
|
||||||
"cwd": "${workspaceFolder}",
|
"cwd": "${workspaceFolder}",
|
||||||
"env": {
|
"env": {
|
||||||
"PYTHONPATH": "${workspaceFolder}"
|
"PYTHONPATH": "${workspaceFolder}"
|
||||||
@@ -28,14 +29,14 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "Python: Run hbd with debugpy (listen)",
|
"name": "Python: Run hbc (module)",
|
||||||
"type": "debugpy",
|
"type": "debugpy",
|
||||||
"request": "launch",
|
"request": "launch",
|
||||||
"module": "debugpy",
|
"module": "hbd.client.main",
|
||||||
"args": ["--listen", "5678", "--wait-for-client", "-m", "hbd.cli", "-c", ".hb.yaml", "-f", "-v"],
|
"args": ["-c", "~/.hbc.yaml", "-v", "winter"],
|
||||||
|
"cwd": "${workspaceFolder}",
|
||||||
"env": { "PYTHONPATH": "${workspaceFolder}" },
|
"env": { "PYTHONPATH": "${workspaceFolder}" },
|
||||||
"console": "integratedTerminal",
|
"console": "integratedTerminal",
|
||||||
"justMyCode": false
|
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|||||||
Vendored
+4
-1
@@ -2,5 +2,8 @@
|
|||||||
"python.pythonPath": "/usr/bin/python3",
|
"python.pythonPath": "/usr/bin/python3",
|
||||||
"python.linting.enabled": true,
|
"python.linting.enabled": true,
|
||||||
"python.formatting.provider": "black",
|
"python.formatting.provider": "black",
|
||||||
"python.linting.flake8Enabled": true
|
"python.linting.flake8Enabled": true,
|
||||||
|
"chat.tools.terminal.autoApprove": {
|
||||||
|
"mv": true
|
||||||
|
}
|
||||||
}
|
}
|
||||||
@@ -0,0 +1,4 @@
|
|||||||
|
1. Don't assume. Don't hide confusion. Surface tradeoffs.
|
||||||
|
2. Minimum code that solves the problem. Nothing speculative.
|
||||||
|
3. Touch only what you must. Clean up only your own mess.
|
||||||
|
4. Define success criteria. Loop until verified.
|
||||||
@@ -1,5 +1,3 @@
|
|||||||
|
|
||||||
|
|
||||||
# Heartbeat Daemon (hbd) ✅
|
# Heartbeat Daemon (hbd) ✅
|
||||||
|
|
||||||
A lightweight daemon that listens for UDP heartbeat messages and acts on them: keeps host state, optionally updates DNS records via `nsupdate`, forwards messages to WebSocket clients, and sends notifications (email, Pushover, Mattermost, Signal). It is a refactor of a previously monolithic script into a modular Python package (`hbd`).
|
A lightweight daemon that listens for UDP heartbeat messages and acts on them: keeps host state, optionally updates DNS records via `nsupdate`, forwards messages to WebSocket clients, and sends notifications (email, Pushover, Mattermost, Signal). It is a refactor of a previously monolithic script into a modular Python package (`hbd`).
|
||||||
@@ -13,75 +11,540 @@ A lightweight daemon that listens for UDP heartbeat messages and acts on them: k
|
|||||||
- Queue DNS updates via `nsupdate` and run them in a background thread ✅
|
- Queue DNS updates via `nsupdate` and run them in a background thread ✅
|
||||||
- WebSocket API for live updates (hosts & messages) ✅
|
- WebSocket API for live updates (hosts & messages) ✅
|
||||||
- Notification pipeline (email, Pushover, Mattermost, Signal) ✅
|
- Notification pipeline (email, Pushover, Mattermost, Signal) ✅
|
||||||
|
- **User management & access control** ✅
|
||||||
|
- Optional user accounts with PBKDF2-SHA256 password hashing (stdlib only)
|
||||||
|
- Per-host roles: owner, manager, monitor
|
||||||
|
- Session-based auth with cookie support (browser login page included)
|
||||||
|
- Backwards compatible: no auth required when no users are configured
|
||||||
|
- **HTTP API & Web UI** ✅
|
||||||
|
- REST API for plugin data, alerts, host information, and user management
|
||||||
|
- Live dashboard with WebSocket updates
|
||||||
|
- Interactive plugin metrics visualization
|
||||||
|
- Alerts dashboard with filtering and summaries
|
||||||
|
- **Message journal with automatic log rotation** ✅
|
||||||
|
- Logs all received messages in JSON format
|
||||||
|
- Size-based automatic rotation
|
||||||
|
- Configurable retention and backup management
|
||||||
|
- **Plugin system for extensible monitoring** ✅
|
||||||
|
- Collect system metrics (CPU, memory, disk, network)
|
||||||
|
- Execute existing Nagios monitoring plugins
|
||||||
|
- Create custom plugins with simple Python classes
|
||||||
|
- **Threshold alerting system** ✅
|
||||||
|
- Monitor metrics against configurable WARNING/CRITICAL thresholds
|
||||||
|
- Hysteresis to prevent alert flapping
|
||||||
|
- Automatic notifications on state changes
|
||||||
|
- Re-notification for ongoing alerts
|
||||||
- Modular codebase suitable for unit testing and CI ✅
|
- Modular codebase suitable for unit testing and CI ✅
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
## 🔌 Plugin System
|
||||||
|
|
||||||
|
Heartbeat includes a comprehensive plugin architecture that extends monitoring beyond simple heartbeats. The plugin system allows you to:
|
||||||
|
|
||||||
|
- **Collect system information**: OS details, hardware info, system configuration
|
||||||
|
- **Monitor resources**: CPU usage, memory, disk space, network statistics
|
||||||
|
- **Run Nagios plugins**: Execute thousands of existing Nagios monitoring plugins without modification
|
||||||
|
- **Create custom plugins**: Build your own monitoring logic with simple Python classes
|
||||||
|
|
||||||
|
### Plugin Types
|
||||||
|
|
||||||
|
- **InfoPlugin**: Collects static information once (e.g., OS version, hardware specs)
|
||||||
|
- **MonitorPlugin**: Collects metrics periodically (e.g., CPU usage every 30 seconds)
|
||||||
|
|
||||||
|
### Built-in Plugins
|
||||||
|
|
||||||
|
- `os_info`: Collects OS, kernel, distribution, and architecture information
|
||||||
|
- `cpu_monitor`: Monitors CPU usage, load average, frequency, and process counts
|
||||||
|
- `memory_monitor`: Monitors RAM and swap usage, available memory
|
||||||
|
- `disk_monitor`: Monitors disk usage, I/O statistics, and filesystem metrics
|
||||||
|
- `network_monitor`: Monitors network interface statistics, bandwidth, and connections
|
||||||
|
- `filesystem_info`: Collects mounted filesystem information (physical filesystems only by default)
|
||||||
|
- `nagios_runner`: Executes Nagios monitoring plugins (check_disk, check_load, check_http, etc.)
|
||||||
|
|
||||||
|
### Nagios Integration
|
||||||
|
|
||||||
|
The `nagios_runner` plugin provides seamless integration with the vast Nagios plugin ecosystem. You can run any Nagios-compatible plugin and have the results automatically parsed and stored:
|
||||||
|
|
||||||
|
- Executes plugins via subprocess with timeout protection
|
||||||
|
- Parses exit codes (OK/WARNING/CRITICAL/UNKNOWN)
|
||||||
|
- Extracts performance data with thresholds
|
||||||
|
- Reports aggregated status across all configured checks
|
||||||
|
|
||||||
|
See [docs/NAGIOS_INTEGRATION.md](docs/NAGIOS_INTEGRATION.md) for complete integration guide including configuration examples and custom plugin development.
|
||||||
|
|
||||||
|
### Creating Custom Plugins
|
||||||
|
|
||||||
|
```python
|
||||||
|
from hbd.client.plugin import MonitorPlugin
|
||||||
|
|
||||||
|
class DiskMonitorPlugin(MonitorPlugin):
|
||||||
|
name = "disk_monitor"
|
||||||
|
interval = 60 # Run every 60 seconds
|
||||||
|
|
||||||
|
async def collect(self):
|
||||||
|
return {
|
||||||
|
"disk_usage": get_disk_usage(),
|
||||||
|
"timestamp": time.time()
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Place plugins in `hbd/client/plugins/` and they'll be automatically discovered and loaded by the client.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 📝 Message Journal
|
||||||
|
|
||||||
|
Heartbeat includes a message journal that logs all received messages with automatic rotation.
|
||||||
|
|
||||||
|
### Features
|
||||||
|
|
||||||
|
- **JSON Format**: All messages logged in JSONL (JSON Lines) format for easy parsing
|
||||||
|
- **Automatic Rotation**: Size-based rotation with configurable thresholds
|
||||||
|
- **Backup Management**: Keeps configurable number of rotated log files
|
||||||
|
- **Non-blocking**: Async logging with minimal performance impact
|
||||||
|
|
||||||
|
### Configuration
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# Message journal settings
|
||||||
|
journal_enabled: true # Enable/disable journaling
|
||||||
|
journal_dir: /var/log/heartbeat # Journal directory
|
||||||
|
journal_file: messages.journal # Base filename
|
||||||
|
journal_max_size: 104857600 # Max size (100MB default)
|
||||||
|
journal_max_backups: 10 # Number of backups to keep
|
||||||
|
```
|
||||||
|
|
||||||
|
### Example Journal Entry
|
||||||
|
|
||||||
|
```json
|
||||||
|
{"timestamp":1711234567.123,"datetime":"2026-03-28T12:34:56","source_ip":"192.168.1.100","source_port":50003,"message":{"ID":"HTB","name":"webserver1","interval":30}}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Analyzing Journal Files
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# View recent messages
|
||||||
|
tail -100 /var/log/heartbeat/messages.journal | jq .
|
||||||
|
|
||||||
|
# Count messages by type
|
||||||
|
cat /var/log/heartbeat/messages.journal | jq -r '.message.ID' | sort | uniq -c
|
||||||
|
|
||||||
|
# Filter by hostname
|
||||||
|
cat /var/log/heartbeat/messages.journal | jq 'select(.message.name == "webserver1")'
|
||||||
|
```
|
||||||
|
|
||||||
|
See [docs/MESSAGE_JOURNAL.md](docs/MESSAGE_JOURNAL.md) for complete documentation including rotation behavior, integration with log management systems, and analysis examples.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🚨 Threshold Alerting
|
||||||
|
|
||||||
|
Heartbeat includes a sophisticated threshold alerting system that monitors plugin metrics and triggers notifications when values exceed configured limits.
|
||||||
|
|
||||||
|
### Features
|
||||||
|
|
||||||
|
- **Multi-level alerts**: WARNING and CRITICAL severity levels
|
||||||
|
- **Flexible operators**: Support for >, >=, <, <=, ==, != comparisons
|
||||||
|
- **Hysteresis**: Prevents alert flapping with configurable recovery thresholds
|
||||||
|
- **Smart notifications**: Alerts only on state changes, not every check
|
||||||
|
- **Re-notifications**: Periodic reminders for ongoing alerts
|
||||||
|
- **Journal integration**: All threshold events logged for audit trail
|
||||||
|
|
||||||
|
### Configuration
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
thresholds:
|
||||||
|
# RTT (Round-Trip Time) thresholds for heartbeat monitoring
|
||||||
|
# These are checked on every HTB message arrival
|
||||||
|
rtt:
|
||||||
|
webserver01:
|
||||||
|
warning: 100.0 # Warn when RTT > 100ms
|
||||||
|
critical: 500.0 # Critical when RTT > 500ms
|
||||||
|
|
||||||
|
database01:
|
||||||
|
warning: 50.0
|
||||||
|
critical: 200.0
|
||||||
|
|
||||||
|
# Plugin metric thresholds
|
||||||
|
cpu_monitor:
|
||||||
|
cpu_percent:
|
||||||
|
warning: 80.0 # Warn when CPU > 80%
|
||||||
|
critical: 90.0 # Critical when CPU > 90%
|
||||||
|
operator: ">"
|
||||||
|
hysteresis: 0.1 # 10% hysteresis to prevent flapping
|
||||||
|
|
||||||
|
memory_monitor:
|
||||||
|
percent:
|
||||||
|
warning: 85.0
|
||||||
|
critical: 95.0
|
||||||
|
|
||||||
|
disk_monitor:
|
||||||
|
partitions:
|
||||||
|
/:
|
||||||
|
percent:
|
||||||
|
warning: 80.0
|
||||||
|
critical: 90.0
|
||||||
|
free_gb:
|
||||||
|
warning: 10.0 # Alert when < 10GB free
|
||||||
|
critical: 5.0
|
||||||
|
operator: "<" # Inverse threshold
|
||||||
|
|
||||||
|
# Global settings
|
||||||
|
threshold_renotify_interval: 3600 # Re-notify every hour for ongoing alerts
|
||||||
|
```
|
||||||
|
|
||||||
|
### RTT Monitoring
|
||||||
|
|
||||||
|
Heartbeat monitors network latency (Round-Trip Time) for each host's heartbeat messages. RTT thresholds are **fully integrated with the threshold alerting system**:
|
||||||
|
|
||||||
|
- **Per-host configuration**: Set different thresholds for each monitored host
|
||||||
|
- **Real-time checking**: Thresholds evaluated on every HTB message arrival
|
||||||
|
- **Alert state tracking**: RTT alerts use the same state management as plugin metrics
|
||||||
|
- **Hysteresis support**: Configurable hysteresis prevents rapid state transitions
|
||||||
|
- **Alerts dashboard**: RTT alerts visible on the `/alerts` web page alongside plugin alerts
|
||||||
|
- **Smart notifications**: Only triggers on state changes (OK → WARNING → CRITICAL)
|
||||||
|
- **Re-notification**: Periodic reminders for ongoing RTT issues
|
||||||
|
- **Event & journal logging**: All RTT events logged for audit trail
|
||||||
|
|
||||||
|
**Configuration format:**
|
||||||
|
```yaml
|
||||||
|
thresholds:
|
||||||
|
rtt:
|
||||||
|
<hostname>:
|
||||||
|
warning: <milliseconds> # Warn when RTT > this value
|
||||||
|
critical: <milliseconds> # Critical when RTT > this value
|
||||||
|
hysteresis: 0.1 # Optional: 10% hysteresis (default)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Example alerts:**
|
||||||
|
```
|
||||||
|
WARNING: webserver01 - rtt.webserver01 = 125.3
|
||||||
|
CRITICAL: database01 - rtt.database01 = 520.1
|
||||||
|
RECOVERED: webserver01 - rtt.webserver01 = 45.2 (WARNING -> OK)
|
||||||
|
```
|
||||||
|
|
||||||
|
RTT alerts appear on the Alerts dashboard and can be filtered by severity level. The `metric_path` format is `rtt.<hostname>`, making it easy to distinguish from plugin metrics.
|
||||||
|
|
||||||
|
### Alert Behavior
|
||||||
|
|
||||||
|
1. **State Changes**: Notifications sent when crossing thresholds
|
||||||
|
- OK → WARNING: Early notification
|
||||||
|
- WARNING → CRITICAL: Escalation
|
||||||
|
- CRITICAL → OK: Recovery
|
||||||
|
|
||||||
|
2. **Hysteresis**: Prevents rapid state transitions
|
||||||
|
```
|
||||||
|
Critical threshold: 90%
|
||||||
|
Hysteresis: 10%
|
||||||
|
Recovery threshold: 81% (90 - 10% of 90)
|
||||||
|
|
||||||
|
Value 91% → CRITICAL (threshold crossed)
|
||||||
|
Value 85% → CRITICAL (still above 81%)
|
||||||
|
Value 79% → OK (below recovery threshold)
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Re-notifications**: Periodic reminders for ongoing alerts
|
||||||
|
- Default: Every 60 minutes
|
||||||
|
- Configurable via `threshold_renotify_interval`
|
||||||
|
|
||||||
|
### Example Notifications
|
||||||
|
|
||||||
|
```
|
||||||
|
WARNING: webserver01 - cpu_monitor.cpu_percent = 85.0
|
||||||
|
CRITICAL: webserver01 - memory_monitor.percent = 96.0
|
||||||
|
RECOVERED: database01 - disk_monitor./.percent = 75.0 (WARNING -> OK)
|
||||||
|
REMINDER (CRITICAL): mailserver - cpu_monitor.load_1min = 12.5 (ongoing for 3600s)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Supported Metrics
|
||||||
|
|
||||||
|
All plugin metrics can be thresholded:
|
||||||
|
|
||||||
|
- **CPU**: cpu_percent, load_1min, load_5min, load_15min
|
||||||
|
- **Memory**: percent, available_mb, swap_percent
|
||||||
|
- **Disk**: Per-partition percent, free_gb, free_mb
|
||||||
|
- **Network**: errors_total, dropped packets, connection counts
|
||||||
|
- **Nagios**: exit_code mapping (0=OK, 1=WARNING, 2=CRITICAL)
|
||||||
|
|
||||||
|
See [docs/THRESHOLD_ALERTING.md](docs/THRESHOLD_ALERTING.md) for comprehensive documentation including best practices, troubleshooting, and advanced configuration.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 👥 User Management
|
||||||
|
|
||||||
|
Heartbeat supports optional user accounts with role-based access control per host.
|
||||||
|
|
||||||
|
### Roles
|
||||||
|
|
||||||
|
- **monitor** — view status, plugin data, alerts
|
||||||
|
- **manager** — monitor + queue commands, trigger DNS, queue upgrades
|
||||||
|
- **owner** — manager + drop host, transfer ownership, update access
|
||||||
|
- **admin** (user flag) — owner-level access on every host
|
||||||
|
|
||||||
|
When no users are configured the server runs in **unauthenticated mode** — all existing behaviour is unchanged.
|
||||||
|
|
||||||
|
### Quick setup
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
users:
|
||||||
|
alice:
|
||||||
|
full_name: Alice Smith
|
||||||
|
password: pbkdf2:sha256:... # hbd passwd alice
|
||||||
|
admin: true
|
||||||
|
|
||||||
|
default_owner: alice
|
||||||
|
|
||||||
|
hosts:
|
||||||
|
webserver01:
|
||||||
|
owner: alice
|
||||||
|
managers: [bob]
|
||||||
|
monitors: [carol]
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Generate a password hash
|
||||||
|
hbd passwd alice
|
||||||
|
```
|
||||||
|
|
||||||
|
Browser users are redirected to `/login` automatically. The session cookie is set on login, so `fetch()` calls from dashboards work without any JavaScript changes.
|
||||||
|
|
||||||
|
See [docs/USERS.md](docs/USERS.md) for complete user management documentation.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🌐 HTTP API & Web UI
|
||||||
|
|
||||||
|
Heartbeat includes a built-in HTTP/WebSocket server that provides both a REST API and web-based dashboards for monitoring and visualization.
|
||||||
|
|
||||||
|
### Features
|
||||||
|
|
||||||
|
- **User auth**: Optional session-based authentication with per-host role enforcement
|
||||||
|
- **REST API**: JSON endpoints for accessing plugin data, alerts, host information, and user management
|
||||||
|
- **Live Dashboard**: Real-time WebSocket-powered host status view
|
||||||
|
- **Plugin Metrics**: Interactive visualization of all plugin data with auto-refresh
|
||||||
|
- **Alerts Dashboard**: Comprehensive alert monitoring with filtering and summaries
|
||||||
|
|
||||||
|
### Web Dashboards
|
||||||
|
|
||||||
|
- **Login** (`/login`): Browser login form (shown automatically when auth is configured)
|
||||||
|
- **Live View** (`/live`): Real-time host connectivity, latency, and messages
|
||||||
|
- **Plugin Metrics** (`/plugins`): Browse and visualize metrics from all plugins
|
||||||
|
- **Alerts Dashboard** (`/alerts`): Monitor active alerts with severity filtering
|
||||||
|
|
||||||
|
### API Endpoints
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Log in (when auth is configured)
|
||||||
|
TOKEN=$(curl -s -X POST http://localhost:50004/api/0/auth/login \
|
||||||
|
-H 'Content-Type: application/json' \
|
||||||
|
-d '{"username":"alice","password":"secret"}' | jq -r .token)
|
||||||
|
AUTH=(-H "Authorization: Bearer $TOKEN")
|
||||||
|
|
||||||
|
# List all monitored hosts
|
||||||
|
curl "${AUTH[@]}" http://localhost:50004/api/0/hosts
|
||||||
|
|
||||||
|
# Get all plugin data for a host
|
||||||
|
curl "${AUTH[@]}" http://localhost:50004/api/0/hosts/webserver01/plugins
|
||||||
|
|
||||||
|
# Get detailed plugin history (last 50 samples)
|
||||||
|
curl "${AUTH[@]}" "http://localhost:50004/api/0/hosts/webserver01/plugins/cpu_monitor?limit=50"
|
||||||
|
|
||||||
|
# Get alert states for a specific host
|
||||||
|
curl "${AUTH[@]}" http://localhost:50004/api/0/hosts/webserver01/alerts
|
||||||
|
|
||||||
|
# Get all active alerts across all hosts
|
||||||
|
curl "${AUTH[@]}" http://localhost:50004/api/0/alerts
|
||||||
|
|
||||||
|
# View/update host access roles
|
||||||
|
curl "${AUTH[@]}" http://localhost:50004/api/0/hosts/webserver01/access
|
||||||
|
```
|
||||||
|
|
||||||
|
See [docs/HTTP_API.md](docs/HTTP_API.md) for complete API documentation including response formats, error handling, and integration examples.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
## ⚙️ Quickstart
|
## ⚙️ Quickstart
|
||||||
|
|
||||||
Prerequisites:
|
Prerequisites:
|
||||||
- Python 3.10+ (project uses language features from recent Python)
|
|
||||||
|
- Python 3.11+ (project uses language features from recent Python)
|
||||||
- `nsupdate` (for DNS updates) if using dynamic DNS
|
- `nsupdate` (for DNS updates) if using dynamic DNS
|
||||||
|
|
||||||
Install dependencies (recommended into a venv):
|
Install dependencies (recommended into a venv):
|
||||||
|
|
||||||
```bash
|
This project now declares its dependencies in `pyproject.toml`. Instead
|
||||||
python3 -m venv .venv
|
of the old `requirements.txt` flow, install the package into a virtualenv
|
||||||
source .venv/bin/activate
|
using `pip` (for example, `python -m pip install .`).
|
||||||
python -m pip install --upgrade pip
|
|
||||||
python -m pip install -r requirements.txt
|
See `scripts/hb_install.sh` for a way to install.
|
||||||
# for development/testing tools
|
|
||||||
python -m pip install -r requirements-dev.txt
|
|
||||||
```
|
|
||||||
|
|
||||||
Run the daemon (example):
|
Run the daemon (example):
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# run with default config lookup (~/.hb.yaml)
|
# run with an explicit config file (without -c, hbd looks for ~/.hb.yaml)
|
||||||
PYTHONPATH=. hbd -c .hb.yaml -f -v
|
hbd -c .hb.yaml -f -v
|
||||||
```
|
```
|
||||||
|
|
||||||
You can also run it directly via the package entrypoint after installation:
|
You can also run it directly via the package entrypoint after installation:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python -m hbd.cli -c /path/to/config.yaml
|
python -m hbd.server.cli -c /path/to/config.yaml
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Running the Client
|
||||||
|
|
||||||
|
The heartbeat client (`hbc`) sends periodic heartbeats and plugin data to the server:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Basic usage pointing to server (host is a positional argument)
|
||||||
|
hbc your-server.example.com
|
||||||
|
|
||||||
|
# Run as daemon with a config file
|
||||||
|
hbc -d -c /etc/hbc.yaml your-server.example.com
|
||||||
|
|
||||||
|
# Send a one-off boot message
|
||||||
|
hbc --boot your-server.example.com
|
||||||
|
|
||||||
|
# Verbose output
|
||||||
|
hbc -v your-server.example.com
|
||||||
|
```
|
||||||
|
|
||||||
|
You can also run it via the module entrypoint:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m hbd.client.main your-server.example.com
|
||||||
|
```
|
||||||
|
|
||||||
|
Client configuration can also be specified in YAML:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
server: hbd.example.com
|
||||||
|
port: 50003
|
||||||
|
interval: 30
|
||||||
|
plugins:
|
||||||
|
cpu_monitor:
|
||||||
|
interval: 300 # Check every 5 minutes (default)
|
||||||
|
per_core: true
|
||||||
|
memory_monitor:
|
||||||
|
interval: 300 # Check every 5 minutes (default)
|
||||||
|
disk_monitor:
|
||||||
|
interval: 300 # Check every 5 minutes (default)
|
||||||
|
network_monitor:
|
||||||
|
interval: 300 # Check every 5 minutes (default)
|
||||||
|
nagios_runner:
|
||||||
|
interval: 300 # Check every 5 minutes (default)
|
||||||
|
commands:
|
||||||
|
- /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
|
||||||
|
- /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
|
||||||
|
```
|
||||||
|
|
||||||
|
All monitoring plugins default to 5-minute (300 second) intervals, but can be customized as needed.
|
||||||
|
|
||||||
|
### hbc_mini — single-file client (no external dependencies)
|
||||||
|
|
||||||
|
`scripts/hbc_mini.py` is a self-contained version of the heartbeat client that requires only Python 3.8+ and no external packages. Copy it to any host and run it directly — no virtualenv, no `pip install`.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Basic usage
|
||||||
|
python3 hbc_mini.py your-server.example.com
|
||||||
|
|
||||||
|
# Run as daemon
|
||||||
|
python3 hbc_mini.py -d your-server.example.com
|
||||||
|
|
||||||
|
# Send a boot message
|
||||||
|
python3 hbc_mini.py -b your-server.example.com
|
||||||
|
|
||||||
|
# Send a one-off message
|
||||||
|
python3 hbc_mini.py -m "maintenance starting" your-server.example.com
|
||||||
|
```
|
||||||
|
|
||||||
|
**Config:** `~/.hbc.json` (same keys as `~/.hbc.yaml`, JSON format). Example:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"hb_port": 50003,
|
||||||
|
"interval": 30,
|
||||||
|
"plugins": {
|
||||||
|
"ping_monitor": {
|
||||||
|
"interval": 60,
|
||||||
|
"hosts": ["8.8.8.8", "192.168.1.1"]
|
||||||
|
},
|
||||||
|
"nagios_runner": {
|
||||||
|
"interval": 300,
|
||||||
|
"commands": [
|
||||||
|
{"name": "check_load", "command": "/usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6"}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Plugin availability:**
|
||||||
|
|
||||||
|
| Plugin | Platform | Data source |
|
||||||
|
|---|---|---|
|
||||||
|
| `os_info` | all | `platform` stdlib |
|
||||||
|
| `ping_monitor` | all | `ping` subprocess |
|
||||||
|
| `nagios_runner` | all (not Windows) | subprocess |
|
||||||
|
| `cpu_monitor` | Linux | `/proc/stat` |
|
||||||
|
| `memory_monitor` | Linux | `/proc/meminfo` |
|
||||||
|
| `disk_monitor` | Linux, macOS, BSD | `df -P` subprocess |
|
||||||
|
| `network_monitor` | Linux | `/proc/net/dev` |
|
||||||
|
|
||||||
|
**What is not available compared to the full `hbc`:**
|
||||||
|
|
||||||
|
- No YAML config (use JSON instead)
|
||||||
|
- No `filesystem_info` plugin
|
||||||
|
- `cpu_monitor` does not report per-core usage or CPU frequency (no psutil)
|
||||||
|
- Plugins cannot be loaded from external `.py` files — all plugins are compiled in
|
||||||
|
|
||||||
|
Everything else — heartbeat protocol, ACK/CMD/UPD handling, `hb_install.sh`-based self-update, daemonize, syslog — is identical to the full client.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
## 🐞 Debugging in VS Code
|
## 🐞 Debugging in VS Code
|
||||||
|
|
||||||
This repository includes a ready-to-use `.vscode/launch.json` with configurations to run or attach the VS Code debugger to `hbd`.
|
This repository includes a ready-to-use `.vscode/launch.json` with configurations to run or attach the VS Code debugger to `hbd`.
|
||||||
|
|
||||||
- Ensure the **Python** extension is installed and select the project `.venv` as the interpreter (bottom-left of VS Code).
|
- Ensure the **Python** extension is installed and select the project `.venv` as the interpreter (bottom-left of VS Code).
|
||||||
- Use **F5** and pick one of these configurations from the Run view:
|
- Use **F5** and pick one of these configurations from the Run view:
|
||||||
- **Python: Run hbd (module)** — runs `hbd.cli` as a module and sets `PYTHONPATH` to the workspace root (recommended).
|
- **Python: Run hbd (module)** — runs `hbd.server.cli` as a module and sets `PYTHONPATH` to the workspace root (recommended).
|
||||||
- **Python: Run hbd with debugpy (listen)** — launches `debugpy` and `hbd` together; useful when you want the process to listen for a debugger.
|
- **Python: Run hbd with debugpy (listen)** — launches `debugpy` and `hbd` together; useful when you want the process to listen for a debugger.
|
||||||
- **Python: Attach (localhost:5678)** — attach the debugger to a running process started with `debugpy`.
|
- **Python: Attach (localhost:5678)** — attach the debugger to a running process started with `debugpy`.
|
||||||
|
|
||||||
To start `hbd` manually and wait for the debugger to attach, run:
|
To start `hbd` manually and wait for the debugger to attach, run:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
PYTHONPATH=. python -m debugpy --listen 5678 --wait-for-client -m hbd.cli -c .hb.yaml -f -v
|
PYTHONPATH=. python -m debugpy --listen 5678 --wait-for-client -m hbd.server.cli -c .hb.yaml -f -v
|
||||||
```
|
```
|
||||||
|
|
||||||
Set breakpoints in modules such as `hbd/udp.py`, `hbd/dns.py`, or `hbd/server.py`, and use the **Attach** configuration to connect. Use `justMyCode: false` if you need to step into third-party code.
|
Set breakpoints in modules such as `hbd/server/udp.py`, `hbd/server/dns.py`, or `hbd/server/main.py`, and use the **Attach** configuration to connect. Use `justMyCode: false` if you need to step into third-party code.
|
||||||
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 🛠 Configuration
|
## 🛠 Configuration
|
||||||
|
|
||||||
`hbd` reads YAML configuration (optional). If `PyYAML` is not installed, built-in defaults are used. Example configuration keys (see `hbd/config.py`):
|
`hbd` reads YAML configuration (optional). If `PyYAML` is not installed, built-in defaults are used. Example configuration keys (see `hbd/server/config.py`):
|
||||||
|
|
||||||
- `hb_port`: UDP port to listen for heartbeats (default: 50003)
|
- `hb_port`: UDP port to listen for heartbeats (default: 50003)
|
||||||
- `hbd_port`: internal control port (default: 50004)
|
- `hbd_port`: port for the HTTP API and web UI (default: 50004)
|
||||||
- `hbd_host`: bind address for HTTP/WSS
|
- `hbd_host`: bind address for HTTP/WSS
|
||||||
- `pickfile`: path for persisted state
|
- `pickfile`: path for persisted state
|
||||||
- `logfile`: path to log file
|
- `logfile`: path to log file
|
||||||
- `logfmt`: `text` or `msg`
|
|
||||||
- `pushsrv`: push service (`pushover`|`mattermost`|`all`)
|
- `pushsrv`: push service (`pushover`|`mattermost`|`all`)
|
||||||
- `interval` / `grace`: heartbeat timing configuration
|
- `interval` / `grace`: heartbeat timing configuration
|
||||||
- `dyndomains`: list of dyndomains to update via `nsupdate`
|
- `dyndomains`: list of dyndomains to update via `nsupdate`
|
||||||
- `nsupdate_bin`: path to nsupdate binary
|
- `nsupdate_bin`: path to nsupdate binary
|
||||||
|
- `ws_port`: port for plain WebSocket connections (default: 50005)
|
||||||
|
- `wss_port`: port for secure WebSocket (WSS) connections (default: none).
|
||||||
|
If set, `hbd` will attempt to serve WSS on this port when `wss_pem` and
|
||||||
|
`wss_key` SSL files are available under `cert_path` (see below).
|
||||||
|
- `cert_path`: directory where TLS certificate and key are looked up (default: /usr/local/etc/ssl/)
|
||||||
|
- `wss_pem`: filename for the certificate chain (default: fullchain.pem)
|
||||||
|
- `wss_key`: filename for the private key (default: privkey.pem)
|
||||||
|
- `users`: mapping of username → user attributes (full_name, avatar, password, admin, notification_channels)
|
||||||
|
- `default_owner`: username that owns hosts with no explicit owner (falls back to first admin user)
|
||||||
|
|
||||||
Example `.hb.yaml` (minimal):
|
Example `.hb.yaml` (minimal):
|
||||||
|
|
||||||
@@ -94,24 +557,53 @@ nsupdate_bin: /usr/bin/nsupdate
|
|||||||
pushsrv: pushover
|
pushsrv: pushover
|
||||||
```
|
```
|
||||||
|
|
||||||
> Tip: `config.DEFAULTS` in `hbd/config.py` contains the canonical defaults and accepted configuration keys.
|
> Tip: `SERVER_DEFAULTS` in `hbd/server/config.py` contains the canonical defaults and accepted configuration keys.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 🔧 Architecture & Modules
|
## 🔧 Architecture & Modules
|
||||||
|
|
||||||
- `hbd.proto` — serialization/deserialization of heartbeat messages (supports compressed payloads)
|
The package is organized into three subpackages:
|
||||||
- `hbd.udp` — UDP parsing and `handle_datagram` implementation (main state machine)
|
|
||||||
- `hbd.dns` — `create_nsupdate_payload`, `nsupdate`, and a background DNS thread (`start_dns_thread`)
|
**`hbd.common`** — shared code used by both client and server:
|
||||||
- `hbd.notify` — email and push notification helpers
|
- `hbd.common.proto` — serialization/deserialization of heartbeat messages (supports compressed payloads and plugin data)
|
||||||
- `hbd.ws` — WebSocket server and thread-safe broadcast helpers
|
- `hbd.common.utils` — small utility helpers (`shortname`, `dur`, `initlog`)
|
||||||
- `hbd.http` — HTTP handler factory for the status UI/API
|
|
||||||
- `hbd.utils` — small utility helpers (`shortname`, `dur`, `initlog`)
|
**`hbd.server`** — the heartbeat daemon (`hbd`):
|
||||||
- `hbd.cli` — CLI entrypoint and argument parsing
|
- `hbd.server.cli` — CLI entrypoint and argument parsing
|
||||||
- `hbd.server` — async orchestration to run UDP/HTTP/WSS components
|
- `hbd.server.main` — async orchestration to run UDP/HTTP/WSS components
|
||||||
|
- `hbd.server.udp` — UDP parsing and `handle_datagram` implementation (main state machine)
|
||||||
|
- `hbd.server.dns` — `create_nsupdate_payload`, `nsupdate`, and an asyncio DNS worker (`start_dns_worker`).
|
||||||
|
The DNS worker runs as an `asyncio` task and the package exposes a small thread-safe bridge
|
||||||
|
so legacy synchronous code can `put()` updates into the queue.
|
||||||
|
- `hbd.server.notify` — email and push notification helpers
|
||||||
|
- `hbd.server.ws` — WebSocket server and thread-safe broadcast helpers
|
||||||
|
- `hbd.server.http` — HTTP handler factory for the status UI/API
|
||||||
|
- `hbd.server.journal` — message journal with size-based log rotation and backup management
|
||||||
|
- `hbd.server.threshold` — threshold alerting engine
|
||||||
|
- `hbd.server.monitor` — host state monitoring
|
||||||
|
- `hbd.server.hbdclass` — `Host` class and shared server state
|
||||||
|
- `hbd.server.config` — configuration loader and defaults
|
||||||
|
|
||||||
|
**`hbd.client`** — the heartbeat client (`hbc`):
|
||||||
|
- `hbd.client.main` — client entrypoint; sends heartbeats and plugin data to the server
|
||||||
|
- `hbd.client.plugin` — plugin framework with base classes, registry, and dynamic loader
|
||||||
|
- `hbd.client.plugins/` — built-in plugins (os_info, cpu_monitor, memory_monitor, disk_monitor, network_monitor, filesystem_info, nagios_runner)
|
||||||
|
- `hbd.client.config` — client configuration loader
|
||||||
|
|
||||||
This modular layout makes the code easier to test and maintain.
|
This modular layout makes the code easier to test and maintain.
|
||||||
|
|
||||||
|
**Runtime & Shutdown**
|
||||||
|
|
||||||
|
- The main runtime is asyncio-based. Services (UDP listener, HTTP server, WebSocket server, monitor, and DNS worker) run as asyncio tasks.
|
||||||
|
- On SIGINT/SIGTERM the server triggers a graceful shutdown: it cancels active tasks, signals the DNS worker via a sentinel, and cleans up resources before exit.
|
||||||
|
- The DNS update worker is implemented as an `asyncio` task; synchronous producers can still enqueue DNS updates via a small thread-safe bridge available at `hbd.server.hbdclass.Host.dnsQ`.
|
||||||
|
|
||||||
|
**Templates & Static Files**
|
||||||
|
|
||||||
|
- Template files are located under `hbd/server/templates`. By default the HTTP server resolves templates relative to the `hbd.server` package; set the `templates_dir` config key to override the path.
|
||||||
|
- Static assets (CSS/JS/images) are served from `hbd/server/static` via the `/static/<path>` HTTP route.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 🧪 Testing & Dev
|
## 🧪 Testing & Dev
|
||||||
@@ -126,8 +618,8 @@ pytest -q
|
|||||||
```
|
```
|
||||||
|
|
||||||
Developer tooling included:
|
Developer tooling included:
|
||||||
|
|
||||||
- `pyproject.toml` — project metadata and dependencies
|
- `pyproject.toml` — project metadata and dependencies
|
||||||
- `requirements-dev.txt` — dev/test dependencies
|
|
||||||
- `tox.ini` — convenience wrappers for running tests, lint, and mypy
|
- `tox.ini` — convenience wrappers for running tests, lint, and mypy
|
||||||
|
|
||||||
To run linters and type checks locally:
|
To run linters and type checks locally:
|
||||||
@@ -153,6 +645,7 @@ tox -e mypy
|
|||||||
## 🤝 Contributing
|
## 🤝 Contributing
|
||||||
|
|
||||||
Contributions welcome! Please:
|
Contributions welcome! Please:
|
||||||
|
|
||||||
1. Open an issue to discuss larger changes.
|
1. Open an issue to discuss larger changes.
|
||||||
2. Create a topic branch and a clear PR.
|
2. Create a topic branch and a clear PR.
|
||||||
3. Add tests for new features and run linters.
|
3. Add tests for new features and run linters.
|
||||||
@@ -167,8 +660,8 @@ This repository is licensed under the MIT license. See `LICENSE` for details.
|
|||||||
---
|
---
|
||||||
|
|
||||||
If you'd like, I can also:
|
If you'd like, I can also:
|
||||||
|
|
||||||
- add a **GitHub Actions** workflow that runs tests and lint on push/PR 🔁
|
- add a **GitHub Actions** workflow that runs tests and lint on push/PR 🔁
|
||||||
- add a `CONTRIBUTING.md` template for PRs and code style 💬
|
- add a `CONTRIBUTING.md` template for PRs and code style 💬
|
||||||
|
|
||||||
Which one should I do next? ✨
|
Which one should I do next? ✨
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,40 @@
|
|||||||
|
async def send_sms(hass, user, password, sender_did, call):
    """Send SMS message using multipart form-data like MMS.

    Builds a ``multipart/form-data`` body with the ``sendSMS`` method and
    POSTs it to ``REST_ENDPOINT``. The outcome is reported through the
    module logger only; nothing is returned.

    Args:
        hass: Not used by this function; kept so the signature is unchanged.
        user: Value sent as the ``api_username`` form field.
        password: Value sent as the ``api_password`` form field.
        sender_did: Value sent as the ``did`` form field.
        call: Object whose ``data`` mapping supplies ``"recipient"`` and
            ``"message"``.

    Returns:
        None. A missing recipient/message, a non-200 status, an unparsable
        body, or a body whose ``status`` is not ``"success"`` is logged as an
        error. Connection-level exceptions raised by aiohttp are not caught
        here and propagate to the caller.
    """
    _LOGGER = logging.getLogger(__name__)
    recipient = call.data.get("recipient")
    message = call.data.get("message")

    if not recipient or not message:
        _LOGGER.error("Recipient or message missing.")
        return

    # Build form data dictionary; every value is coerced to str so it can be
    # appended to the multipart writer as a text part.
    form_data = {
        'api_username': str(user),
        'api_password': str(password),
        'did': str(sender_did),
        'dst': str(recipient),
        'message': str(message),
        'method': 'sendSMS'
    }

    async with aiohttp.ClientSession() as session:
        with aiohttp.MultipartWriter("form-data") as mp:
            for key, value in form_data.items():
                part = mp.append(value)
                part.set_content_disposition('form-data', name=key)

            # Routine trace of an outgoing request: debug level, not error.
            _LOGGER.debug("voipms_sms: sending SMS: %s", mp)
            async with session.post(REST_ENDPOINT, data=mp) as response:
                response_text = await response.text()

                if response.status != 200:
                    _LOGGER.error(
                        "voipms_sms: Failed to send SMS. Status: %s, Response: %s",
                        response.status,
                        response_text,
                    )
                    return

                # A 200 reply is not guaranteed to carry a JSON object with a
                # "status" key; treat anything else as "not sent" instead of
                # raising out of the handler.
                try:
                    response_json = json.loads(response_text)
                except ValueError:
                    response_json = None

                if isinstance(response_json, dict) and response_json.get('status') == "success":
                    _LOGGER.info("voipms_sms: SMS sent successfully: %s", response_text)
                else:
                    _LOGGER.error("voipms_sms: SMS not sent: %s", response_text)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -0,0 +1,320 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Demonstration of the threshold alerting system.
|
||||||
|
|
||||||
|
This script shows how thresholds work by simulating plugin data
|
||||||
|
with values that cross various threshold boundaries.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from hbd.threshold import ThresholdChecker, AlertLevel
|
||||||
|
|
||||||
|
|
||||||
|
def demo_basic_thresholds():
    """Run demo 1: feed a CPU series through one warning/critical threshold.

    Prints each simulated sample, the alert level recorded for
    ``cpu_monitor.cpu_percent`` (when one exists), every state change returned
    by the checker, and finally how many notifications the callback received.
    """
    rule = "=" * 70
    print(rule)
    print("DEMO 1: Basic Threshold Checking")
    print(rule)

    cpu_rule = {
        "warning": 80.0,
        "critical": 90.0,
        "operator": ">",
        "hysteresis": 0.1,
    }
    config = {"thresholds": {"cpu_monitor": {"cpu_percent": cpu_rule}}}

    notifications = []

    def notifier(msg):
        # Record the message and echo it so the demo output shows each alert.
        notifications.append(msg)
        print(f" 📧 NOTIFICATION: {msg}")

    checker = ThresholdChecker(config, notification_callback=notifier)
    states = {}

    # (cpu value, human-readable expectation) pairs, in playback order.
    samples = [
        (50.0, "Normal operation"),
        (85.0, "Crosses WARNING threshold"),
        (87.0, "Still in WARNING"),
        (95.0, "Escalates to CRITICAL"),
        (92.0, "Still CRITICAL (in hysteresis)"),
        (85.0, "Still CRITICAL (above recovery threshold of 81)"),
        (79.0, "Recovers to OK"),
        (50.0, "Back to normal"),
    ]

    print("\nSimulating CPU usage over time:")
    print("-" * 70)

    for cpu, note in samples:
        print(f"\n📊 CPU: {cpu}% - {note}")

        changes = checker.check_plugin_data(
            host_name="testhost",
            plugin_name="cpu_monitor",
            data={"cpu_percent": cpu},
            alert_states=states,
        )

        recorded = states.get("cpu_monitor.cpu_percent")
        if recorded:
            print(f" Current state: {recorded.level.name}")

        # Each change is a 4-tuple; only the two levels are displayed.
        for _metric, before, after, _value in changes or ():
            print(f" ⚠️ State change: {before.name} → {after.name}")

    print(f"\n📈 Summary: {len(notifications)} notifications sent")
    print(rule)
|
||||||
|
|
||||||
|
|
||||||
|
def demo_multiple_metrics():
    """Run demo 2: track CPU and memory metrics together.

    For each scenario step, both plugins' data are checked (CPU first, then
    memory), after which the checker's alert summary and any active alerts
    are printed. Ends with the total number of notifications collected.
    """
    rule = "=" * 70
    print("\n\n" + rule)
    print("DEMO 2: Multiple Metrics and Alert Summary")
    print(rule)

    cpu_rules = {
        "cpu_percent": {"warning": 80.0, "critical": 90.0},
        "load_1min": {"warning": 4.0, "critical": 8.0},
    }
    memory_rules = {
        "percent": {"warning": 85.0, "critical": 95.0},
        "available_mb": {
            "warning": 1000,
            "critical": 500,
            "operator": "<",
        },
    }
    config = {
        "thresholds": {
            "cpu_monitor": cpu_rules,
            "memory_monitor": memory_rules,
        }
    }

    notifications = []

    def record(m):
        notifications.append(m)

    checker = ThresholdChecker(config, notification_callback=record)
    states = {}

    # Simulate problematic system state
    print("\nSimulating a system under load:")
    print("-" * 70)

    # (title, (cpu_percent, load_1min), (memory percent, available_mb))
    timeline = (
        ("Initial state - all OK", (50.0, 2.0), (60.0, 2000)),
        ("CPU spikes to WARNING", (85.0, 2.0), (60.0, 2000)),
        ("Memory also reaches WARNING", (85.0, 2.0), (88.0, 800)),
        ("CPU escalates to CRITICAL", (95.0, 5.0), (88.0, 800)),
        ("System recovering", (70.0, 2.0), (65.0, 1500)),
    )

    for title, (cpu_pct, load_1), (mem_pct, free_mb) in timeline:
        print(f"\n📍 {title}")

        per_plugin = (
            ("cpu_monitor", {"cpu_percent": cpu_pct, "load_1min": load_1}),
            ("memory_monitor", {"percent": mem_pct, "available_mb": free_mb}),
        )
        for plugin, payload in per_plugin:
            checker.check_plugin_data("testhost", plugin, payload, states)

        # Show alert summary
        counts = checker.get_alert_summary(states)
        print(f" Alerts: OK={counts['ok']}, WARNING={counts['warning']}, CRITICAL={counts['critical']}")

        # Show active alerts
        firing = checker.get_active_alerts(states)
        if firing:
            print(" Active alerts:")
            for item in firing:
                print(f" - {item.metric_path}: {item.level.name} (value={item.last_value})")

    print(f"\n📈 Total notifications sent: {len(notifications)}")
    print(rule)
|
||||||
|
|
||||||
|
|
||||||
|
def demo_hysteresis():
    """Demonstrate hysteresis effect.

    Feeds a ``cpu_percent`` series that hovers around the configured critical
    limit into ``ThresholdChecker.check_plugin_data`` and prints, per sample,
    the recorded alert level and whether a state change was reported.

    Takes no arguments and returns ``None``; all output goes to stdout.
    """
    print("\n\n" + "=" * 70)
    print("DEMO 3: Hysteresis Prevents Flapping")
    print("=" * 70)

    config = {
        "thresholds": {
            "cpu_monitor": {
                "cpu_percent": {
                    "warning": 80.0,
                    "critical": 90.0,
                    "hysteresis": 0.1,  # 10% hysteresis
                }
            }
        }
    }

    notifications = []
    checker = ThresholdChecker(config, notification_callback=lambda m: notifications.append(m))
    alert_states = {}

    print("\nCritical threshold: 90%")
    print("Hysteresis: 10%")
    print("Recovery threshold: 81% (90 - 10% of 90)")
    print("\nSimulating CPU fluctuating near CRITICAL threshold:")
    print("-" * 70)

    # Simulate fluctuating values
    test_values = [
        (75.0, "Normal"),
        (92.0, "Crosses CRITICAL"),
        (88.0, "Drops but still above 81% (stays CRITICAL)"),
        (86.0, "Still above 81% (stays CRITICAL)"),
        (83.0, "Still above 81% (stays CRITICAL)"),
        (80.0, "Below 81% - recovers to OK"),
        (88.0, "Rises again but below 90% (stays OK)"),
        (91.0, "Crosses CRITICAL again"),
    ]

    for value, description in test_values:
        print(f"\n📊 CPU: {value:5.1f}% - {description}")

        plugin_data = {"cpu_percent": value}
        state_changes = checker.check_plugin_data(
            "testhost",
            "cpu_monitor",
            plugin_data,
            alert_states,
        )

        # dict.get() yields None when the checker has not stored a state for
        # this metric; guard the attribute access the same way
        # demo_basic_thresholds does instead of raising AttributeError.
        current_state = alert_states.get("cpu_monitor.cpu_percent")
        if current_state:
            print(f" State: {current_state.level.name}")

        if state_changes:
            print(" 📧 Notification sent (state changed)")
        else:
            print(" ✓ No notification (state unchanged - hysteresis working)")

    print(f"\n📈 Notifications sent: {len(notifications)} (without hysteresis would be ≥6)")
    print("=" * 70)
|
||||||
|
|
||||||
|
|
||||||
|
def demo_inverse_threshold():
    """Demonstrate inverse thresholds (less than).

    Configures ``memory_monitor.available_mb`` with the ``"<"`` operator and
    plays a falling-then-recovering series through
    ``ThresholdChecker.check_plugin_data``, printing the recorded alert level
    and each reported state change.

    Takes no arguments and returns ``None``; all output goes to stdout.
    """
    print("\n\n" + "=" * 70)
    print("DEMO 4: Inverse Thresholds (Alert When Low)")
    print("=" * 70)

    config = {
        "thresholds": {
            "memory_monitor": {
                "available_mb": {
                    "warning": 1000,  # Warn when < 1000 MB
                    "critical": 500,  # Critical when < 500 MB
                    "operator": "<",
                    "hysteresis": 0.1,
                }
            }
        }
    }

    notifications = []
    checker = ThresholdChecker(config, notification_callback=lambda m: notifications.append(m))
    alert_states = {}

    print("\nMonitoring available memory (alert when LOW):")
    print("WARNING when < 1000 MB, CRITICAL when < 500 MB")
    print("-" * 70)

    test_values = [
        (2000, "Plenty of memory"),
        (800, "Drops below 1000 MB - WARNING"),
        (450, "Drops below 500 MB - CRITICAL"),
        (520, "Rises but still in hysteresis zone - stays CRITICAL"),
        (600, "Enough recovery - back to WARNING"),
        (1200, "Fully recovered - OK"),
    ]

    for value, description in test_values:
        print(f"\n💾 Available: {value} MB - {description}")

        plugin_data = {"available_mb": value}
        state_changes = checker.check_plugin_data(
            "testhost",
            "memory_monitor",
            plugin_data,
            alert_states,
        )

        # dict.get() yields None when no state has been stored for this
        # metric; skip the state line rather than raising AttributeError,
        # matching the guard used in demo_basic_thresholds.
        current_state = alert_states.get("memory_monitor.available_mb")
        if current_state:
            print(f" State: {current_state.level.name}")

        if state_changes:
            # Each change is a 4-tuple; only the two levels are displayed.
            for _metric, old_level, new_level, _val in state_changes:
                print(f" 📧 {old_level.name} → {new_level.name}")

    print(f"\n📈 Notifications sent: {len(notifications)}")
    print("=" * 70)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Title box, then the four demos in order, then a closing summary.
    box_edge = "═" * 68
    print("\n")
    print("╔" + box_edge + "╗")
    print("║" + " " * 15 + "THRESHOLD ALERTING DEMONSTRATION" + " " * 21 + "║")
    print("╚" + box_edge + "╝")

    for run_demo in (
        demo_basic_thresholds,
        demo_multiple_metrics,
        demo_hysteresis,
        demo_inverse_threshold,
    ):
        run_demo()

    closing_rule = "=" * 70
    closing_lines = (
        "\n\n" + closing_rule,
        "DEMONSTRATION COMPLETE",
        closing_rule,
        "\nKey takeaways:",
        " • Thresholds detect when metrics exceed configured limits",
        " • Notifications sent only on state changes, not every check",
        " • Hysteresis prevents alert flapping",
        " • Supports both 'greater than' and 'less than' thresholds",
        " • Multiple metrics can be monitored simultaneously",
        "\nFor full documentation, see docs/THRESHOLD_ALERTING.md",
        closing_rule,
    )
    for text in closing_lines:
        print(text)
    print()
|
||||||
Vendored
BIN
Binary file not shown.
Vendored
BIN
Binary file not shown.
@@ -0,0 +1,291 @@
|
|||||||
|
# Configuration Reload
|
||||||
|
|
||||||
|
The heartbeat daemon (hbd) supports runtime configuration reloading without requiring a full restart. This allows you to update certain configuration settings while the service continues running.
|
||||||
|
|
||||||
|
## How to Reload Configuration
|
||||||
|
|
||||||
|
Send a SIGHUP signal to the running hbd process:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Find the process ID
|
||||||
|
ps aux | grep hbd
|
||||||
|
|
||||||
|
# Or use pidof/pgrep
|
||||||
|
pidof hbd
|
||||||
|
pgrep -f hbd
|
||||||
|
|
||||||
|
# Send SIGHUP signal
|
||||||
|
kill -HUP <pid>
|
||||||
|
|
||||||
|
# Or if using systemd
|
||||||
|
systemctl reload heartbeat
|
||||||
|
```
|
||||||
|
|
||||||
|
## What Can Be Reloaded
|
||||||
|
|
||||||
|
The following configuration sections can be reloaded without restarting:
|
||||||
|
|
||||||
|
### ✅ Fully Reloadable
|
||||||
|
|
||||||
|
- **Notification Channels** (`notification_channels`)
|
||||||
|
- Add, remove, or modify notification channel definitions
|
||||||
|
- Update tokens, API keys, SMTP credentials
|
||||||
|
- Change recipient lists
|
||||||
|
|
||||||
|
- **Threshold Configurations** (`threshold_configs`)
|
||||||
|
- Modify warning and critical thresholds
|
||||||
|
- Add or remove threshold rules
|
||||||
|
- Change operators and hysteresis values
|
||||||
|
- Update display formats
|
||||||
|
|
||||||
|
- **Host Configuration** (`hosts`)
|
||||||
|
- Change watch status
|
||||||
|
- Update notification channel assignments
|
||||||
|
- Modify threshold config assignments
|
||||||
|
- Change dyndns status
|
||||||
|
|
||||||
|
- **Host Lists**
|
||||||
|
- `watchhosts` - hosts to monitor
|
||||||
|
- `dyndnshosts` - hosts with dynamic DNS
|
||||||
|
- `drophosts` - hosts to ignore
|
||||||
|
|
||||||
|
- **Runtime Settings**
|
||||||
|
- `grace` - grace period multiplier
|
||||||
|
- `interval` - expected heartbeat interval
|
||||||
|
- `threshold_renotify_interval` - re-notification interval
|
||||||
|
- `debug` - debug level
|
||||||
|
- `verbose` - verbose output
|
||||||
|
|
||||||
|
- **DNS Settings**
|
||||||
|
- `dyndomains` - dynamic DNS domains
|
||||||
|
- `nsupdate_bin` - nsupdate binary path
|
||||||
|
- `rndc_key` - RNDC key path
|
||||||
|
|
||||||
|
### ⚠️ Requires Restart
|
||||||
|
|
||||||
|
The following settings **cannot** be reloaded and require a service restart:
|
||||||
|
|
||||||
|
- **Network Ports**
|
||||||
|
- `hb_port` - UDP heartbeat port
|
||||||
|
- `hbd_port` - HTTP API port
|
||||||
|
- `ws_port` - WebSocket port
|
||||||
|
- `wss_port` - Secure WebSocket port
|
||||||
|
|
||||||
|
- **SSL/TLS Settings**
|
||||||
|
- `cert_path` - SSL certificate path
|
||||||
|
- `wss_pem` - SSL certificate file
|
||||||
|
- `wss_key` - SSL key file
|
||||||
|
|
||||||
|
- **Persistence**
|
||||||
|
- `pickfile` - Pickle file path
|
||||||
|
|
||||||
|
- **Logging**
|
||||||
|
- `logfile` - Log file path
|
||||||
|
|
||||||
|
- **Journal Settings**
|
||||||
|
- `journal_enabled` - Enable/disable journaling
|
||||||
|
- `journal_dir` - Journal directory
|
||||||
|
- `journal_file` - Journal filename
|
||||||
|
- `journal_max_size` - Maximum journal size
|
||||||
|
- `journal_max_backups` - Number of backup files
|
||||||
|
|
||||||
|
## Reload Process
|
||||||
|
|
||||||
|
When a SIGHUP signal is received:
|
||||||
|
|
||||||
|
1. **Configuration File Loading**
|
||||||
|
- The config file is re-read from disk
|
||||||
|
- YAML parsing is performed
|
||||||
|
- Validation checks are run
|
||||||
|
|
||||||
|
2. **Component Updates**
|
||||||
|
- Notification system is updated with new channel definitions
|
||||||
|
- Threshold checker reloads all threshold configurations
|
||||||
|
- Alert states are preserved to maintain hysteresis
|
||||||
|
|
||||||
|
3. **Error Handling**
|
||||||
|
- If reload fails, the previous configuration is kept
|
||||||
|
- Error messages are logged
|
||||||
|
- Service continues running with old configuration
|
||||||
|
|
||||||
|
4. **Logging**
|
||||||
|
- Reload start and completion are logged
|
||||||
|
- Each component reports its reload status
|
||||||
|
- Total number of thresholds is reported
|
||||||
|
|
||||||
|
## Example Reload Session
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Terminal 1: Watch the logs
|
||||||
|
tail -f /var/log/heartbeat.log
|
||||||
|
|
||||||
|
# Terminal 2: Edit configuration
|
||||||
|
vim /path/to/.hb.yaml
|
||||||
|
|
||||||
|
# Make changes to notification channels or thresholds
|
||||||
|
# Save the file
|
||||||
|
|
||||||
|
# Terminal 3: Trigger reload
|
||||||
|
kill -HUP $(pgrep -f hbd)
|
||||||
|
|
||||||
|
# Terminal 1: See reload messages
|
||||||
|
2026-04-01 12:34:56 INFO: Received SIGHUP, initiating config reload...
|
||||||
|
2026-04-01 12:34:56 INFO: ============================================================
|
||||||
|
2026-04-01 12:34:56 INFO: Starting configuration reload...
|
||||||
|
2026-04-01 12:34:56 INFO: ============================================================
|
||||||
|
2026-04-01 12:34:56 INFO: Configuration reloaded from /path/to/.hb.yaml
|
||||||
|
2026-04-01 12:34:56 INFO: Notification configuration reloaded
|
||||||
|
2026-04-01 12:34:56 INFO: Reloading threshold configuration...
|
||||||
|
2026-04-01 12:34:56 INFO: Threshold configuration reloaded: 42 total thresholds
|
||||||
|
2026-04-01 12:34:56 INFO: ============================================================
|
||||||
|
2026-04-01 12:34:56 INFO: Configuration reload completed successfully
|
||||||
|
2026-04-01 12:34:56 INFO: ============================================================
|
||||||
|
```
|
||||||
|
|
||||||
|
## Common Use Cases
|
||||||
|
|
||||||
|
### 1. Update Notification Credentials
|
||||||
|
|
||||||
|
If you need to rotate API keys or update SMTP passwords:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
notification_channels:
|
||||||
|
pushover_standard:
|
||||||
|
type: pushover
|
||||||
|
token: new-token-here # Updated
|
||||||
|
user: new-user-key-here # Updated
|
||||||
|
```
|
||||||
|
|
||||||
|
Just edit the config file and send SIGHUP - no restart needed.
|
||||||
|
|
||||||
|
### 2. Adjust Threshold Values
|
||||||
|
|
||||||
|
Fine-tune alerting thresholds based on observed behavior:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
threshold_configs:
|
||||||
|
default:
|
||||||
|
thresholds:
|
||||||
|
cpu_monitor:
|
||||||
|
cpu_percent:
|
||||||
|
warning: 85.0 # Increased from 80.0
|
||||||
|
critical: 95.0 # Increased from 90.0
|
||||||
|
```
|
||||||
|
|
||||||
|
Send SIGHUP to apply the new thresholds immediately.
|
||||||
|
|
||||||
|
### 3. Add New Notification Channels
|
||||||
|
|
||||||
|
Add a new notification destination:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
notification_channels:
|
||||||
|
email_oncall:
|
||||||
|
type: email
|
||||||
|
recipients: [oncall@example.com]
|
||||||
|
sender: alerts@example.com
|
||||||
|
smtp_server: smtp.example.com
|
||||||
|
|
||||||
|
hosts:
|
||||||
|
critical_server:
|
||||||
|
threshold_config: default
|
||||||
|
watch: true
|
||||||
|
notification_channels: [pushover_standard, email_oncall] # Added
|
||||||
|
```
|
||||||
|
|
||||||
|
The new channel becomes active immediately after SIGHUP.
|
||||||
|
|
||||||
|
### 4. Update Watch List
|
||||||
|
|
||||||
|
Start or stop monitoring hosts without restart:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
hosts:
|
||||||
|
new_server:
|
||||||
|
threshold_config: default
|
||||||
|
watch: true # Start watching
|
||||||
|
notification_channels: [pushover_standard]
|
||||||
|
```
|
||||||
|
|
||||||
|
## Best Practices
|
||||||
|
|
||||||
|
1. **Test Configuration Before Reload**
|
||||||
|
- Validate YAML syntax before sending SIGHUP
|
||||||
|
- Check for typos in channel names
|
||||||
|
- Verify threshold values are reasonable
|
||||||
|
|
||||||
|
2. **Monitor Reload Logs**
|
||||||
|
- Always check logs after reload to confirm success
|
||||||
|
- Look for error messages if reload fails
|
||||||
|
- Verify expected number of thresholds loaded
|
||||||
|
|
||||||
|
3. **Backup Before Changes**
|
||||||
|
- Keep a backup of working configuration
|
||||||
|
- Use version control (git) for config files
|
||||||
|
- Document why changes were made
|
||||||
|
|
||||||
|
4. **Gradual Rollout**
|
||||||
|
- Test changes on development server first
|
||||||
|
- Apply to one production server at a time
|
||||||
|
- Verify behavior before applying everywhere
|
||||||
|
|
||||||
|
5. **Plan for Restart-Required Changes**
|
||||||
|
- Schedule downtime for port or SSL changes
|
||||||
|
- Use blue-green deployment if possible
|
||||||
|
- Keep service downtime minimal
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Reload Doesn't Apply Changes
|
||||||
|
|
||||||
|
**Check:**
|
||||||
|
- Is the config file path correct?
|
||||||
|
- Did you save the file after editing?
|
||||||
|
- Are there YAML syntax errors?
|
||||||
|
- Check the logs for error messages
|
||||||
|
|
||||||
|
**Solution:**
|
||||||
|
```bash
|
||||||
|
# Validate YAML syntax
|
||||||
|
python -c "import yaml; yaml.safe_load(open('.hb.yaml'))"
|
||||||
|
|
||||||
|
# Check file modification time
|
||||||
|
ls -l .hb.yaml
|
||||||
|
|
||||||
|
# View logs
|
||||||
|
journalctl -u heartbeat -f
|
||||||
|
```
|
||||||
|
|
||||||
|
### Partial Configuration Applied
|
||||||
|
|
||||||
|
**Cause:** Some sections reloaded, others didn't.
|
||||||
|
|
||||||
|
**Solution:** Check logs to see which components failed. Common issues:
|
||||||
|
- Invalid channel type
|
||||||
|
- Missing required threshold fields
|
||||||
|
- Invalid host references
|
||||||
|
|
||||||
|
### Service Becomes Unresponsive
|
||||||
|
|
||||||
|
**Cause:** Malformed configuration caused an exception.
|
||||||
|
|
||||||
|
**Solution:**
|
||||||
|
1. Revert to backup configuration
|
||||||
|
2. Send SIGHUP again to reload the good config
|
||||||
|
3. If service is completely stuck, restart it
|
||||||
|
|
||||||
|
## Implementation Details
|
||||||
|
|
||||||
|
The reload mechanism uses:
|
||||||
|
|
||||||
|
- **Signal Handling**: SIGHUP triggers reload event
|
||||||
|
- **Async-Safe Reloading**: Configuration is loaded asynchronously
|
||||||
|
- **Component Coordination**: All affected components are updated atomically
|
||||||
|
- **State Preservation**: Alert states and hysteresis information are maintained
|
||||||
|
- **Error Recovery**: Failed reloads don't affect running configuration
|
||||||
|
|
||||||
|
## See Also
|
||||||
|
|
||||||
|
- [NOTIFICATIONS.md](NOTIFICATIONS.md) - Notification channel configuration
|
||||||
|
- [THRESHOLD_ALERTING.md](THRESHOLD_ALERTING.md) - Threshold configuration details
|
||||||
|
- Configuration examples in `hbd/config_*.yaml`
|
||||||
@@ -0,0 +1,632 @@
|
|||||||
|
# HTTP API and Web UI Documentation
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
The Heartbeat Daemon provides a comprehensive HTTP API and web-based UI for monitoring plugin data and alert states. The API follows RESTful conventions and returns JSON responses.
|
||||||
|
|
||||||
|
## Base URL
|
||||||
|
|
||||||
|
All API endpoints are relative to the server base URL:
|
||||||
|
```
|
||||||
|
http://your-server:50004
|
||||||
|
```
|
||||||
|
|
||||||
|
Default port is `50004` (configurable via `hbd_port` in configuration).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Authentication
|
||||||
|
|
||||||
|
When [user accounts are configured](USERS.md), every request must be authenticated.
|
||||||
|
|
||||||
|
- **Browser requests** to HTML pages are redirected to `/login` automatically. JavaScript `fetch()` calls on the dashboards send the session cookie automatically — no JS changes are needed.
|
||||||
|
- **API / programmatic requests** must include the token in an `Authorization: Bearer <token>` header or an `X-Auth-Token` header.
|
||||||
|
|
||||||
|
Unauthenticated API requests receive `401 Unauthorized`. When no users are configured the server runs in unauthenticated mode and all endpoints are open.
|
||||||
|
|
||||||
|
### Login
|
||||||
|
|
||||||
|
```bash
|
||||||
|
TOKEN=$(curl -s -X POST http://localhost:50004/api/0/auth/login \
|
||||||
|
-H 'Content-Type: application/json' \
|
||||||
|
-d '{"username":"alice","password":"secret"}' | jq -r .token)
|
||||||
|
|
||||||
|
curl -H "Authorization: Bearer $TOKEN" http://localhost:50004/api/0/hosts
|
||||||
|
```
|
||||||
|
|
||||||
|
See [User Management](USERS.md) for full authentication documentation.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## API Endpoints
|
||||||
|
|
||||||
|
### Authentication
|
||||||
|
|
||||||
|
| Method | Path | Description | Auth required |
|
||||||
|
|--------|------|-------------|---------------|
|
||||||
|
| `POST` | `/api/0/auth/login` | Obtain session token | No |
|
||||||
|
| `POST` | `/api/0/auth/logout` | Invalidate session | Token |
|
||||||
|
|
||||||
|
### Users
|
||||||
|
|
||||||
|
| Method | Path | Description | Role |
|
||||||
|
|--------|------|-------------|------|
|
||||||
|
| `GET` | `/api/0/users` | List all users | Admin |
|
||||||
|
| `GET` | `/api/0/users/me` | Own profile | Authenticated |
|
||||||
|
|
||||||
|
### Host Management
|
||||||
|
|
||||||
|
#### GET /api/0/hosts
|
||||||
|
Get list of all monitored hosts with their state information. When auth is enabled, only hosts the caller has at least **monitor** access to are returned.
|
||||||
|
|
||||||
|
**Response:**
|
||||||
|
```json
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"name": "webserver01",
|
||||||
|
"dyn": false,
|
||||||
|
"owner": "alice",
|
||||||
|
"managers": ["bob"],
|
||||||
|
"monitors": ["carol"],
|
||||||
|
"connections": [...]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
#### GET /api/0/messages
|
||||||
|
Get recent heartbeat messages (last 30).
|
||||||
|
|
||||||
|
**Response:**
|
||||||
|
```json
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"time": 1711234567.123,
|
||||||
|
"host": "webserver01",
|
||||||
|
"msg": "heartbeat received"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Plugin Data Endpoints
|
||||||
|
|
||||||
|
#### GET /api/0/hosts/{hostname}/plugins
|
||||||
|
Get all plugin data for a specific host.
|
||||||
|
|
||||||
|
**Parameters:**
|
||||||
|
- `hostname` (path): Name of the host
|
||||||
|
|
||||||
|
**Response:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"hostname": "webserver01",
|
||||||
|
"plugins": {
|
||||||
|
"cpu_monitor": {
|
||||||
|
"timestamp": 1711234567.123,
|
||||||
|
"data": {
|
||||||
|
"cpu_percent": 45.2,
|
||||||
|
"load_1min": 2.5,
|
||||||
|
"load_5min": 2.1,
|
||||||
|
"load_15min": 1.8
|
||||||
|
},
|
||||||
|
"sample_count": 100
|
||||||
|
},
|
||||||
|
"memory_monitor": {
|
||||||
|
"timestamp": 1711234568.456,
|
||||||
|
"data": {
|
||||||
|
"percent": 65.4,
|
||||||
|
"available_mb": 4096,
|
||||||
|
"total_mb": 16384
|
||||||
|
},
|
||||||
|
"sample_count": 100
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```bash
|
||||||
|
curl http://localhost:50004/api/0/hosts/webserver01/plugins
|
||||||
|
```
|
||||||
|
|
||||||
|
#### GET /api/0/hosts/{hostname}/plugins/{plugin_name}
|
||||||
|
Get detailed historical data for a specific plugin.
|
||||||
|
|
||||||
|
**Parameters:**
|
||||||
|
- `hostname` (path): Name of the host
|
||||||
|
- `plugin_name` (path): Name of the plugin
|
||||||
|
- `limit` (query, optional): Number of recent samples to return (default: 10)
|
||||||
|
|
||||||
|
**Response:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"hostname": "webserver01",
|
||||||
|
"plugin": "cpu_monitor",
|
||||||
|
"samples": [
|
||||||
|
{
|
||||||
|
"timestamp": 1711234567.123,
|
||||||
|
"data": {
|
||||||
|
"cpu_percent": 45.2,
|
||||||
|
"load_1min": 2.5
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"timestamp": 1711234267.123,
|
||||||
|
"data": {
|
||||||
|
"cpu_percent": 42.1,
|
||||||
|
"load_1min": 2.3
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"sample_count": 2
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Examples:**
|
||||||
|
```bash
|
||||||
|
# Get last 1 sample (most recent)
|
||||||
|
curl 'http://localhost:50004/api/0/hosts/webserver01/plugins/cpu_monitor?limit=1'
|
||||||
|
|
||||||
|
# Get last 50 samples
|
||||||
|
curl 'http://localhost:50004/api/0/hosts/webserver01/plugins/memory_monitor?limit=50'
|
||||||
|
|
||||||
|
# Get disk monitor data
|
||||||
|
curl http://localhost:50004/api/0/hosts/database01/plugins/disk_monitor
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Host Access
|
||||||
|
|
||||||
|
#### GET /api/0/hosts/{hostname}/access
|
||||||
|
Get owner/managers/monitors for a host. Requires **monitor** role or higher.
|
||||||
|
|
||||||
|
**Response:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"owner": "alice",
|
||||||
|
"managers": ["bob"],
|
||||||
|
"monitors": ["carol"]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### PUT /api/0/hosts/{hostname}/access
|
||||||
|
Update owner/managers/monitors. Requires **owner** role or admin.
|
||||||
|
|
||||||
|
**Request body** (all fields optional):
|
||||||
|
```json
|
||||||
|
{ "owner": "bob", "managers": ["carol"], "monitors": [] }
|
||||||
|
```
|
||||||
|
|
||||||
|
Changes take effect immediately but are not written back to the config file. Update the config file and send `SIGHUP` to make them permanent.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Alert Endpoints
|
||||||
|
|
||||||
|
#### GET /api/0/hosts/{hostname}/alerts
|
||||||
|
Get alert states for a specific host.
|
||||||
|
|
||||||
|
**Parameters:**
|
||||||
|
- `hostname` (path): Name of the host
|
||||||
|
|
||||||
|
**Response:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"hostname": "webserver01",
|
||||||
|
"alerts": [
|
||||||
|
{
|
||||||
|
"metric_path": "cpu_monitor.cpu_percent",
|
||||||
|
"level": "WARNING",
|
||||||
|
"since": 1711234000.0,
|
||||||
|
"last_value": 85.5,
|
||||||
|
"last_check": 1711234567.123,
|
||||||
|
"notification_count": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"metric_path": "disk_monitor./.percent",
|
||||||
|
"level": "OK",
|
||||||
|
"since": 1711230000.0,
|
||||||
|
"last_value": 65.0,
|
||||||
|
"last_check": 1711234567.123,
|
||||||
|
"notification_count": 0
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"summary": {
|
||||||
|
"ok": 15,
|
||||||
|
"warning": 1,
|
||||||
|
"critical": 0,
|
||||||
|
"unknown": 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```bash
|
||||||
|
curl http://localhost:50004/api/0/hosts/webserver01/alerts
|
||||||
|
```
|
||||||
|
|
||||||
|
#### GET /api/0/alerts
|
||||||
|
Get all active alerts across all monitored hosts.
|
||||||
|
|
||||||
|
**Response:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"alerts": [
|
||||||
|
{
|
||||||
|
"hostname": "webserver01",
|
||||||
|
"metric_path": "cpu_monitor.cpu_percent",
|
||||||
|
"level": "CRITICAL",
|
||||||
|
"since": 1711234000.0,
|
||||||
|
"last_value": 95.5,
|
||||||
|
"last_check": 1711234567.123,
|
||||||
|
"notification_count": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"hostname": "database01",
|
||||||
|
"metric_path": "memory_monitor.percent",
|
||||||
|
"level": "WARNING",
|
||||||
|
"since": 1711233000.0,
|
||||||
|
"last_value": 88.2,
|
||||||
|
"last_check": 1711234567.123,
|
||||||
|
"notification_count": 1
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"summary": {
|
||||||
|
"critical": 1,
|
||||||
|
"warning": 1,
|
||||||
|
"unknown": 0,
|
||||||
|
"total": 2
|
||||||
|
},
|
||||||
|
"host_count": 5
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```bash
|
||||||
|
curl http://localhost:50004/api/0/alerts | jq .
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Web UI Pages
|
||||||
|
|
||||||
|
### Login
|
||||||
|
**URL:** `/login`
|
||||||
|
|
||||||
|
Shown automatically when a browser request is made without a valid session (when users are configured). After successful login the browser is redirected to the originally requested page.
|
||||||
|
|
||||||
|
### Logout
|
||||||
|
**URL:** `/logout`
|
||||||
|
|
||||||
|
Clears the session cookie and redirects to `/login`.
|
||||||
|
|
||||||
|
### Live Dashboard
|
||||||
|
**URL:** `/live`
|
||||||
|
|
||||||
|
Real-time dashboard showing:
|
||||||
|
- Host connection states
|
||||||
|
- IPv4/IPv6 connectivity
|
||||||
|
- Latency metrics
|
||||||
|
- Recent messages
|
||||||
|
|
||||||
|
**Features:**
|
||||||
|
- WebSocket-powered live updates
|
||||||
|
- Sortable columns
|
||||||
|
- Color-coded status indicators
|
||||||
|
|
||||||
|
### Plugin Metrics
|
||||||
|
**URL:** `/plugins`
|
||||||
|
|
||||||
|
Interactive visualization of plugin metrics:
|
||||||
|
- Select host and plugin from dropdown
|
||||||
|
- View current metric values
|
||||||
|
- Automatic refresh every 30 seconds
|
||||||
|
- Support for nested metrics (e.g., per-partition disk stats)
|
||||||
|
|
||||||
|
**Features:**
|
||||||
|
- Card-based metric display
|
||||||
|
- Unit formatting (%, MB, GB)
|
||||||
|
- Nested object visualization
|
||||||
|
- Auto-refresh
|
||||||
|
|
||||||
|
**Examples of available data:**
|
||||||
|
- CPU usage, load average, frequency
|
||||||
|
- Memory usage, available memory, swap
|
||||||
|
- Disk usage per partition, I/O statistics
|
||||||
|
- Network interface statistics, connection counts
|
||||||
|
- Custom plugin data
|
||||||
|
|
||||||
|
### Alerts Dashboard
|
||||||
|
**URL:** `/alerts`
|
||||||
|
|
||||||
|
Comprehensive alert monitoring:
|
||||||
|
- Summary cards (Critical, Warning, Total Hosts)
|
||||||
|
- Filter by severity (All, Critical, Warning)
|
||||||
|
- Alert details with duration
|
||||||
|
- Auto-refresh every 15 seconds
|
||||||
|
|
||||||
|
**Features:**
|
||||||
|
- Color-coded alert levels
|
||||||
|
- Duration tracking
|
||||||
|
- Filterable list
|
||||||
|
- Real-time updates
|
||||||
|
- Summary statistics
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Integration Examples
|
||||||
|
|
||||||
|
### Monitoring Script
|
||||||
|
|
||||||
|
```bash
|
||||||
|
#!/bin/bash
|
||||||
|
# Check for critical alerts and send notification
|
||||||
|
|
||||||
|
# Log in first (when auth is configured)
|
||||||
|
TOKEN=$(curl -s -X POST http://localhost:50004/api/0/auth/login \
|
||||||
|
-H 'Content-Type: application/json' \
|
||||||
|
-d '{"username":"monitor","password":"secret"}' | jq -r .token)
|
||||||
|
AUTH=(-H "Authorization: Bearer $TOKEN")
|
||||||
|
|
||||||
|
RESPONSE=$(curl -s "${AUTH[@]}" http://localhost:50004/api/0/alerts)
|
||||||
|
CRITICAL_COUNT=$(echo "$RESPONSE" | jq '.summary.critical')
|
||||||
|
|
||||||
|
if [ "$CRITICAL_COUNT" -gt 0 ]; then
|
||||||
|
echo "CRITICAL: $CRITICAL_COUNT critical alerts detected!"
|
||||||
|
echo "$RESPONSE" | jq '.alerts[] | select(.level=="CRITICAL")'
|
||||||
|
# Send notification
|
||||||
|
# mail -s "Critical Alerts" admin@example.com < alert_details.txt
|
||||||
|
fi
|
||||||
|
```
|
||||||
|
|
||||||
|
### Python Client
|
||||||
|
|
||||||
|
```python
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
|
||||||
|
BASE = 'http://localhost:50004'
|
||||||
|
|
||||||
|
# Log in (skip if auth not configured)
|
||||||
|
resp = requests.post(f'{BASE}/api/0/auth/login',
|
||||||
|
json={"username": "alice", "password": "secret"})
|
||||||
|
token = resp.json().get("token")
|
||||||
|
headers = {"Authorization": f"Bearer {token}"} if token else {}
|
||||||
|
|
||||||
|
# Get all plugin data for a host
|
||||||
|
response = requests.get(f'{BASE}/api/0/hosts/webserver01/plugins', headers=headers)
|
||||||
|
data = response.json()
|
||||||
|
|
||||||
|
print(f"Host: {data['hostname']}")
|
||||||
|
print(f"Plugins: {', '.join(data['plugins'].keys())}")
|
||||||
|
|
||||||
|
for plugin, info in data['plugins'].items():
|
||||||
|
print(f"\n{plugin}:")
|
||||||
|
for metric, value in info['data'].items():
|
||||||
|
print(f" {metric}: {value}")
|
||||||
|
|
||||||
|
# Check for alerts
|
||||||
|
response = requests.get(f'{BASE}/api/0/alerts', headers=headers)
|
||||||
|
alerts = response.json()
|
||||||
|
|
||||||
|
if alerts['summary']['critical'] > 0:
|
||||||
|
print(f"\n⚠️ {alerts['summary']['critical']} CRITICAL ALERTS!")
|
||||||
|
for alert in alerts['alerts']:
|
||||||
|
if alert['level'] == 'CRITICAL':
|
||||||
|
print(f" - {alert['hostname']}: {alert['metric_path']} = {alert['last_value']}")
|
||||||
|
```
|
||||||
|
|
||||||
|
### Grafana Integration
|
||||||
|
|
||||||
|
The API endpoints can be used with Grafana's JSON datasource plugin:
|
||||||
|
|
||||||
|
1. Install the SimpleJSON datasource plugin
|
||||||
|
2. Configure datasource URL: `http://your-server:50004`
|
||||||
|
3. Create queries:
|
||||||
|
- Metrics: `/api/0/hosts/webserver01/plugins/cpu_monitor?limit=100`
|
||||||
|
- Alerts: `/api/0/alerts`
|
||||||
|
|
||||||
|
### Prometheus Integration
|
||||||
|
|
||||||
|
Native export in Prometheus format is a future enhancement. Until then, an external exporter can poll the API and publish the metrics:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Example prometheus exporter
|
||||||
|
from prometheus_client import Gauge, generate_latest
|
||||||
|
import requests
|
||||||
|
|
||||||
|
cpu_usage = Gauge('heartbeat_cpu_percent', 'CPU usage percentage', ['hostname'])
|
||||||
|
memory_usage = Gauge('heartbeat_memory_percent', 'Memory usage percentage', ['hostname'])
|
||||||
|
|
||||||
|
def collect_metrics():
|
||||||
|
hosts = requests.get('http://localhost:50004/api/0/hosts').json()
|
||||||
|
for host in hosts:
|
||||||
|
hostname = host['name']
|
||||||
|
plugins = requests.get(f'http://localhost:50004/api/0/hosts/{hostname}/plugins').json()
|
||||||
|
|
||||||
|
if 'cpu_monitor' in plugins['plugins']:
|
||||||
|
cpu_data = plugins['plugins']['cpu_monitor']['data']
|
||||||
|
cpu_usage.labels(hostname=hostname).set(cpu_data.get('cpu_percent', 0))
|
||||||
|
|
||||||
|
if 'memory_monitor' in plugins['plugins']:
|
||||||
|
mem_data = plugins['plugins']['memory_monitor']['data']
|
||||||
|
memory_usage.labels(hostname=hostname).set(mem_data.get('percent', 0))
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Response Formats
|
||||||
|
|
||||||
|
### Success Response
|
||||||
|
All successful API calls return HTTP 200 with JSON body:
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"field": "value",
|
||||||
|
...
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Error Response
|
||||||
|
API errors return appropriate HTTP status codes with JSON:
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"error": "Host 'unknown-host' not found"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Common Status Codes:**
|
||||||
|
- `200 OK` - Success
|
||||||
|
- `400 Bad Request` - Invalid parameters
|
||||||
|
- `401 Unauthorized` - Missing or invalid session token
|
||||||
|
- `403 Forbidden` - Authenticated but insufficient role
|
||||||
|
- `404 Not Found` - Resource not found
|
||||||
|
- `500 Internal Server Error` - Server error
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## WebSocket API
|
||||||
|
|
||||||
|
For real-time updates, connect to the WebSocket endpoint:
|
||||||
|
|
||||||
|
**URL:** `ws://your-server:50005/hbd` (or `wss://` for secure)
|
||||||
|
|
||||||
|
**Messages:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"type": "host",
|
||||||
|
"data": {
|
||||||
|
"name": "webserver01",
|
||||||
|
"state": "UP"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"type": "plugin",
|
||||||
|
"data": {
|
||||||
|
"host": "webserver01",
|
||||||
|
"plugin": "cpu_monitor",
|
||||||
|
"data": {...},
|
||||||
|
"timestamp": 1711234567.123
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
### Enable HTTP Server
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# In your hbd configuration file
|
||||||
|
hbd_host: "" # Listen on all interfaces
|
||||||
|
hbd_port: 50004 # HTTP port
|
||||||
|
ws_port: 50005 # WebSocket port (optional)
|
||||||
|
# wss_port: 50006 # Secure WebSocket (requires SSL)
|
||||||
|
```
|
||||||
|
|
||||||
|
### SSL/TLS Configuration
|
||||||
|
|
||||||
|
For secure WebSocket connections:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
wss_port: 50006
|
||||||
|
cert_path: /etc/heartbeat/certs/
|
||||||
|
wss_pem: server.pem
|
||||||
|
wss_key: server.key
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Rate Limiting
|
||||||
|
|
||||||
|
The API currently does not implement rate limiting. For production use, consider:
|
||||||
|
|
||||||
|
- Placing behind a reverse proxy (nginx, Apache)
|
||||||
|
- Using API gateway for rate limiting
|
||||||
|
- Implementing caching for frequently accessed endpoints
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## CORS Support
|
||||||
|
|
||||||
|
By default, CORS is not enabled. To enable for web applications:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# In http.py, add CORS middleware
|
||||||
|
import aiohttp_cors
|
||||||
|
|
||||||
|
app = web.Application()
|
||||||
|
cors = aiohttp_cors.setup(app)
|
||||||
|
|
||||||
|
# Configure CORS for all routes
|
||||||
|
for route in list(app.router.routes()):
|
||||||
|
cors.add(route, {
|
||||||
|
"*": aiohttp_cors.ResourceOptions(
|
||||||
|
allow_credentials=True,
|
||||||
|
expose_headers="*",
|
||||||
|
allow_headers="*",
|
||||||
|
)
|
||||||
|
})
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Performance Considerations
|
||||||
|
|
||||||
|
### Caching
|
||||||
|
- Plugin data is cached in memory (last 100 samples per plugin)
|
||||||
|
- No database queries required
|
||||||
|
- Responses are fast (<10ms typical)
|
||||||
|
|
||||||
|
### Scalability
|
||||||
|
- Each host stores its own data independently
|
||||||
|
- Memory usage: ~1KB per host + ~1KB per plugin sample
|
||||||
|
- For 100 hosts with 5 plugins and 100 cached samples per plugin: ~50MB memory (100 × 5 × 100 × ~1KB)
|
||||||
|
|
||||||
|
### Best Practices
|
||||||
|
1. Use `limit` parameter to control response size
|
||||||
|
2. Cache responses on client side when appropriate
|
||||||
|
3. Use WebSocket for real-time updates instead of polling
|
||||||
|
4. Consider pagination for large deployments (future enhancement)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### API Returns 401
|
||||||
|
- Auth is configured — include `Authorization: Bearer <token>` header
|
||||||
|
- Token may have expired (24 h TTL) — log in again
|
||||||
|
|
||||||
|
### API Returns 403
|
||||||
|
- Authenticated user lacks the required role for this host/action
|
||||||
|
- Check host's `owner`, `managers`, `monitors` config
|
||||||
|
|
||||||
|
### API Returns 404
|
||||||
|
- Verify hostname in URL matches actual host name
|
||||||
|
- Check host is sending heartbeats: `curl http://localhost:50004/api/0/hosts`
|
||||||
|
|
||||||
|
### No Plugin Data
|
||||||
|
- Verify client is configured with plugins
|
||||||
|
- Check client logs for plugin errors
|
||||||
|
- Ensure plugins are sending data (check journal logs)
|
||||||
|
|
||||||
|
### Empty Alerts
|
||||||
|
- Verify thresholds are configured
|
||||||
|
- Check host is in `watchhosts` list
|
||||||
|
- Ensure plugins are collecting metrics
|
||||||
|
- Review server logs for threshold checker errors
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## See Also
|
||||||
|
|
||||||
|
- [User Management](USERS.md)
|
||||||
|
- [Plugin Development Guide](PLUGIN_DEVELOPMENT.md)
|
||||||
|
- [Threshold Alerting Documentation](THRESHOLD_ALERTING.md)
|
||||||
|
- [Message Journal Documentation](MESSAGE_JOURNAL.md)
|
||||||
|
- Configuration examples: `hbd/config_example.yaml`
|
||||||
@@ -0,0 +1,413 @@
|
|||||||
|
# Message Journal
|
||||||
|
|
||||||
|
The message journal provides persistent logging of all received heartbeat messages with automatic size-based log rotation.
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
The journal logs every message received by the heartbeat daemon (hbd) in JSON format, making it easy to:
|
||||||
|
- Audit message history
|
||||||
|
- Debug connection issues
|
||||||
|
- Analyze traffic patterns
|
||||||
|
- Replay messages for testing
|
||||||
|
- Create historical reports
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
- **JSON Format**: Each message is logged as a single JSON line for easy parsing
|
||||||
|
- **Size-Based Rotation**: Automatically rotates logs when size threshold is reached
|
||||||
|
- **Automatic Cleanup**: Keeps only a configurable number of backup files
|
||||||
|
- **Thread-Safe**: Safe for concurrent access from multiple async tasks
|
||||||
|
- **Configurable**: All settings controllable via configuration file
|
||||||
|
- **Performance**: Non-blocking async operation with minimal overhead
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
Add these settings to your hbd configuration file (e.g., `.hb.yaml`):
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# Message journal configuration
|
||||||
|
journal_enabled: true # Enable/disable journaling
|
||||||
|
journal_dir: /var/log/heartbeat # Directory for journal files
|
||||||
|
journal_file: messages.journal # Base filename
|
||||||
|
journal_max_size: 104857600 # Max size in bytes (100MB default)
|
||||||
|
journal_max_backups: 10 # Number of backup files to keep
|
||||||
|
```
|
||||||
|
|
||||||
|
### Configuration Options
|
||||||
|
|
||||||
|
| Option | Default | Description |
|
||||||
|
|--------|---------|-------------|
|
||||||
|
| `journal_enabled` | `true` | Enable or disable message journaling |
|
||||||
|
| `journal_dir` | `/var/log/heartbeat` | Directory where journal files are stored |
|
||||||
|
| `journal_file` | `messages.journal` | Base filename for the journal |
|
||||||
|
| `journal_max_size` | `104857600` (100MB) | Maximum file size before rotation |
|
||||||
|
| `journal_max_backups` | `10` | Number of rotated backup files to keep |
|
||||||
|
|
||||||
|
## File Format
|
||||||
|
|
||||||
|
Messages are logged in JSONL (JSON Lines) format - one JSON object per line:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{"timestamp":1711234567.123,"datetime":"2026-03-28T12:34:56","source_ip":"192.168.1.100","source_port":50003,"message":{"ID":"HTB","name":"webserver1","interval":30}}
|
||||||
|
{"timestamp":1711234597.456,"datetime":"2026-03-28T12:35:37","source_ip":"192.168.1.101","source_port":50003,"message":{"ID":"PLG","plugin":"cpu_monitor","cpu_percent":45.2,"load_1min":1.5}}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Entry Structure
|
||||||
|
|
||||||
|
Each journal entry contains:
|
||||||
|
|
||||||
|
| Field | Type | Description |
|
||||||
|
|-------|------|-------------|
|
||||||
|
| `timestamp` | float | Unix timestamp (seconds since epoch) |
|
||||||
|
| `datetime` | string | ISO 8601 formatted datetime |
|
||||||
|
| `source_ip` | string | Source IP address |
|
||||||
|
| `source_port` | integer | Source UDP port |
|
||||||
|
| `message` | object | Complete parsed message dictionary |
|
||||||
|
|
||||||
|
## Log Rotation
|
||||||
|
|
||||||
|
### How Rotation Works
|
||||||
|
|
||||||
|
1. Journal writes messages to the current file
|
||||||
|
2. When file size exceeds `journal_max_size`, rotation is triggered
|
||||||
|
3. Current file is renamed with timestamp: `messages.journal.YYYYMMDD-HHMMSS`
|
||||||
|
4. New empty file is created as the current journal
|
||||||
|
5. Old backup files exceeding `journal_max_backups` are deleted
|
||||||
|
|
||||||
|
### Example File Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
/var/log/heartbeat/
|
||||||
|
├── messages.journal # Current active journal
|
||||||
|
├── messages.journal.20260328-120000 # Rotated backup
|
||||||
|
├── messages.journal.20260328-140000 # Rotated backup
|
||||||
|
└── messages.journal.20260328-160000 # Rotated backup (newest)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Rotation Behavior
|
||||||
|
|
||||||
|
- Rotation is triggered when the next message would exceed the size limit
|
||||||
|
- Rotation is automatic and requires no manual intervention
|
||||||
|
- Old backups are deleted in FIFO order (oldest first)
|
||||||
|
- Rotation is thread-safe and won't lose messages
|
||||||
|
|
||||||
|
## Usage Examples
|
||||||
|
|
||||||
|
### Reading Journal Files
|
||||||
|
|
||||||
|
#### Using Python
|
||||||
|
|
||||||
|
```python
|
||||||
|
import json
|
||||||
|
|
||||||
|
# Read all entries from current journal
|
||||||
|
with open('/var/log/heartbeat/messages.journal', 'r') as f:
|
||||||
|
for line in f:
|
||||||
|
entry = json.loads(line)
|
||||||
|
print(f"{entry['datetime']} - {entry['source_ip']} - {entry['message']['ID']}")
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Using jq (command line)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# View all messages
|
||||||
|
cat /var/log/heartbeat/messages.journal | jq .
|
||||||
|
|
||||||
|
# Filter by message type
|
||||||
|
cat /var/log/heartbeat/messages.journal | jq 'select(.message.ID == "HTB")'
|
||||||
|
|
||||||
|
# Filter by hostname
|
||||||
|
cat /var/log/heartbeat/messages.journal | jq 'select(.message.name == "webserver1")'
|
||||||
|
|
||||||
|
# Count messages by type
|
||||||
|
cat /var/log/heartbeat/messages.journal | jq -r '.message.ID' | sort | uniq -c
|
||||||
|
|
||||||
|
# Extract timestamps and source IPs
|
||||||
|
cat /var/log/heartbeat/messages.journal | jq -r '[.datetime, .source_ip, .message.ID] | @tsv'
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Using shell tools
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Count total messages
|
||||||
|
wc -l /var/log/heartbeat/messages.journal
|
||||||
|
|
||||||
|
# View recent messages
|
||||||
|
tail -n 100 /var/log/heartbeat/messages.journal | jq .
|
||||||
|
|
||||||
|
# Search for specific host
|
||||||
|
grep -F '"name":"webserver1"' /var/log/heartbeat/messages.journal
|
||||||
|
|
||||||
|
# Check journal file size
|
||||||
|
du -h /var/log/heartbeat/messages.journal
|
||||||
|
```
|
||||||
|
|
||||||
|
### Analyzing Historical Data
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Combine all journal files (current + backups)
|
||||||
|
cat /var/log/heartbeat/messages.journal* | jq . > all_messages.json
|
||||||
|
|
||||||
|
# Count messages per host
|
||||||
|
cat /var/log/heartbeat/messages.journal* | jq -r '.message.name // "unknown"' | sort | uniq -c
|
||||||
|
|
||||||
|
# Find all plugin messages
|
||||||
|
cat /var/log/heartbeat/messages.journal* | jq 'select(.message.ID == "PLG")'
|
||||||
|
|
||||||
|
# Extract CPU metrics from plugin messages
|
||||||
|
cat /var/log/heartbeat/messages.journal* | \
|
||||||
|
jq 'select(.message.plugin == "cpu_monitor") | {time: .datetime, host: .message.name, cpu: .message.cpu_percent}'
|
||||||
|
```
|
||||||
|
|
||||||
|
## Integration with Log Management
|
||||||
|
|
||||||
|
### Logrotate
|
||||||
|
|
||||||
|
While the journal has built-in rotation, you can also use logrotate for additional management:
|
||||||
|
|
||||||
|
```
|
||||||
|
/var/log/heartbeat/messages.journal.* {
|
||||||
|
daily
|
||||||
|
rotate 30
|
||||||
|
compress
|
||||||
|
delaycompress
|
||||||
|
missingok
|
||||||
|
notifempty
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Elasticsearch/OpenSearch
|
||||||
|
|
||||||
|
Import journal data into Elasticsearch for advanced analysis:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from elasticsearch import Elasticsearch
|
||||||
|
import json
|
||||||
|
|
||||||
|
es = Elasticsearch(['http://localhost:9200'])
|
||||||
|
|
||||||
|
with open('/var/log/heartbeat/messages.journal', 'r') as f:
|
||||||
|
for line in f:
|
||||||
|
entry = json.loads(line)
|
||||||
|
es.index(index='heartbeat-messages', body=entry)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Splunk
|
||||||
|
|
||||||
|
Create a Splunk input for the journal:
|
||||||
|
|
||||||
|
```ini
|
||||||
|
[monitor:///var/log/heartbeat/messages.journal*]
|
||||||
|
sourcetype = heartbeat_json
|
||||||
|
index = heartbeat
|
||||||
|
```
|
||||||
|
|
||||||
|
## Performance Considerations
|
||||||
|
|
||||||
|
### Overhead
|
||||||
|
|
||||||
|
- Journal writing is async and non-blocking
|
||||||
|
- Typical overhead: < 1ms per message
|
||||||
|
- Minimal impact on heartbeat processing
|
||||||
|
|
||||||
|
### Disk Usage
|
||||||
|
|
||||||
|
Calculate expected disk usage:
|
||||||
|
|
||||||
|
```
|
||||||
|
Messages per day = (86400 seconds / interval) * number_of_hosts
|
||||||
|
Average message size ≈ 200-500 bytes
|
||||||
|
Daily disk usage = Messages per day * Average message size
|
||||||
|
|
||||||
|
Example:
|
||||||
|
- 100 hosts
|
||||||
|
- 30 second interval
|
||||||
|
- 2880 messages/day per host
|
||||||
|
- 288,000 messages/day total
|
||||||
|
- ~60-140 MB/day
|
||||||
|
```
|
||||||
|
|
||||||
|
### Recommendations
|
||||||
|
|
||||||
|
- **Small deployments** (< 50 hosts): Default settings work well
|
||||||
|
- **Medium deployments** (50-500 hosts): Increase `journal_max_size` to 500MB, `journal_max_backups` to 20
|
||||||
|
- **Large deployments** (> 500 hosts): Consider 1GB+ journal files, 30+ backups, or external log aggregation
|
||||||
|
|
||||||
|
## Monitoring
|
||||||
|
|
||||||
|
### Check Journal Status
|
||||||
|
|
||||||
|
The journal exposes statistics that can be queried:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from hbd.journal import get_journal
|
||||||
|
|
||||||
|
journal = get_journal()
|
||||||
|
stats = journal.get_stats()
|
||||||
|
print(f"Current size: {stats['current_size']:,} bytes")
|
||||||
|
print(f"Rotation threshold: {stats['rotation_threshold']}")
|
||||||
|
```
|
||||||
|
|
||||||
|
### Log Messages
|
||||||
|
|
||||||
|
Journal operations are logged at appropriate levels:
|
||||||
|
|
||||||
|
- `INFO`: Initialization, rotation events, cleanup
|
||||||
|
- `DEBUG`: Individual message logging
|
||||||
|
- `WARNING`: Non-critical issues
|
||||||
|
- `ERROR`: Critical failures
|
||||||
|
|
||||||
|
Check hbd logs for journal-related messages:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
grep journal /var/log/heartbeat.log
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Journal Files Not Created
|
||||||
|
|
||||||
|
**Problem**: No journal files appear in the configured directory.
|
||||||
|
|
||||||
|
**Solutions**:
|
||||||
|
- Check `journal_enabled: true` in configuration
|
||||||
|
- Verify directory exists and hbd has write permissions
|
||||||
|
- Check hbd logs for initialization errors
|
||||||
|
- Verify disk space is available
|
||||||
|
|
||||||
|
### Rotation Not Working
|
||||||
|
|
||||||
|
**Problem**: Journal file grows beyond `journal_max_size`.
|
||||||
|
|
||||||
|
**Solutions**:
|
||||||
|
- Check that `journal_max_size` is properly configured
|
||||||
|
- Verify hbd has permission to rename/create files
|
||||||
|
- Check for filesystem issues
|
||||||
|
- Review hbd logs for rotation errors
|
||||||
|
|
||||||
|
### Missing Messages
|
||||||
|
|
||||||
|
**Problem**: Some messages don't appear in the journal.
|
||||||
|
|
||||||
|
**Solutions**:
|
||||||
|
- Verify `journal_enabled: true`
|
||||||
|
- Check for write errors in hbd logs
|
||||||
|
- Verify sufficient disk space
|
||||||
|
- Check if filesystem is read-only
|
||||||
|
|
||||||
|
### Performance Issues
|
||||||
|
|
||||||
|
**Problem**: The journal is causing slow message processing.
|
||||||
|
|
||||||
|
**Solutions**:
|
||||||
|
- Use faster storage (SSD) for journal directory
|
||||||
|
- Increase `journal_max_size` to reduce rotation frequency
|
||||||
|
- Disable journal if not needed: `journal_enabled: false`
|
||||||
|
- Consider async syslog forwarding instead
|
||||||
|
|
||||||
|
## Security Considerations
|
||||||
|
|
||||||
|
### File Permissions
|
||||||
|
|
||||||
|
Ensure proper permissions on journal files:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Journal directory
|
||||||
|
chmod 750 /var/log/heartbeat
|
||||||
|
chown hbd:hbd /var/log/heartbeat
|
||||||
|
|
||||||
|
# Journal files
|
||||||
|
chmod 640 /var/log/heartbeat/messages.journal*
|
||||||
|
```
|
||||||
|
|
||||||
|
### Sensitive Data
|
||||||
|
|
||||||
|
Journal files may contain:
|
||||||
|
- Hostnames and IP addresses
|
||||||
|
- System metrics
|
||||||
|
- Custom message content
|
||||||
|
|
||||||
|
**Recommendations**:
|
||||||
|
- Restrict read access to authorized users only
|
||||||
|
- Consider encryption for archived journals
|
||||||
|
- Implement log retention policies
|
||||||
|
- Sanitize data if sharing for debugging
|
||||||
|
|
||||||
|
## API Reference
|
||||||
|
|
||||||
|
### MessageJournal Class
|
||||||
|
|
||||||
|
```python
|
||||||
|
class MessageJournal:
|
||||||
|
def __init__(self, config: Dict[str, Any])
|
||||||
|
async def initialize(self) -> bool
|
||||||
|
async def log_message(self, msg: Dict, addr: tuple, timestamp: float)
|
||||||
|
async def close(self)
|
||||||
|
def get_stats(self) -> Dict[str, Any]
|
||||||
|
```
|
||||||
|
|
||||||
|
### Module Functions
|
||||||
|
|
||||||
|
```python
|
||||||
|
def get_journal(config: Dict = None) -> MessageJournal
|
||||||
|
async def log_message(msg: Dict, addr: tuple, timestamp: float = None)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Example: Custom Message Processing
|
||||||
|
|
||||||
|
Process journal messages in real-time:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
async def tail_journal(journal_path):
|
||||||
|
"""Follow journal file and process new messages."""
|
||||||
|
path = Path(journal_path)
|
||||||
|
|
||||||
|
with open(path, 'r') as f:
|
||||||
|
# Jump to end
|
||||||
|
f.seek(0, 2)
|
||||||
|
|
||||||
|
while True:
|
||||||
|
line = f.readline()
|
||||||
|
if line:
|
||||||
|
entry = json.loads(line)
|
||||||
|
await process_message(entry)
|
||||||
|
else:
|
||||||
|
await asyncio.sleep(0.1)
|
||||||
|
|
||||||
|
async def process_message(entry):
|
||||||
|
"""Process a journal entry."""
|
||||||
|
msg = entry['message']
|
||||||
|
|
||||||
|
# Alert on boot messages
|
||||||
|
if msg.get('boot'):
|
||||||
|
print(f"ALERT: {msg['name']} rebooted at {entry['datetime']}")
|
||||||
|
|
||||||
|
# Track CPU usage
|
||||||
|
if msg.get('ID') == 'PLG' and msg.get('plugin') == 'cpu_monitor':
|
||||||
|
cpu = msg.get('cpu_percent', 0)
|
||||||
|
if cpu > 90:
|
||||||
|
print(f"WARNING: {entry['source_ip']} CPU usage: {cpu}%")
|
||||||
|
```
|
||||||
|
|
||||||
|
## Future Enhancements
|
||||||
|
|
||||||
|
Potential improvements for future versions:
|
||||||
|
|
||||||
|
- Compression of rotated logs (gzip)
|
||||||
|
- Time-based rotation in addition to size-based
|
||||||
|
- Filtering to exclude certain message types
|
||||||
|
- Structured logging output formats (CEF, GELF)
|
||||||
|
- Remote syslog forwarding
|
||||||
|
- Message deduplication
|
||||||
|
- Journal file encryption
|
||||||
|
- Signed journal entries
|
||||||
|
|
||||||
|
## See Also
|
||||||
|
|
||||||
|
- [Configuration Guide](../hbd/config.py) - Full configuration options
|
||||||
|
- [UDP Protocol](../hbd/udp.py) - Message handling
|
||||||
|
- [Server Architecture](../hbd/server.py) - Server initialization
|
||||||
@@ -0,0 +1,331 @@
|
|||||||
|
# Nagios Plugin Integration Guide
|
||||||
|
|
||||||
|
The Heartbeat monitoring system now supports running existing Nagios-compatible monitoring plugins through the `nagios_runner` plugin. This allows you to leverage the thousands of existing Nagios plugins without modification.
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
### 1. Install Nagios Plugins
|
||||||
|
|
||||||
|
**Debian/Ubuntu:**
|
||||||
|
```bash
|
||||||
|
sudo apt-get install nagios-plugins
|
||||||
|
```
|
||||||
|
|
||||||
|
**RHEL/CentOS/Fedora:**
|
||||||
|
```bash
|
||||||
|
sudo yum install nagios-plugins-all
|
||||||
|
# or
|
||||||
|
sudo dnf install nagios-plugins-all
|
||||||
|
```
|
||||||
|
|
||||||
|
**Arch Linux:**
|
||||||
|
```bash
|
||||||
|
sudo pacman -S monitoring-plugins
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Configure Heartbeat
|
||||||
|
|
||||||
|
Add the `nagios_runner` section to your `~/.hb.yaml` config:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
nagios_runner:
|
||||||
|
interval: 60 # Run plugins every 60 seconds
|
||||||
|
timeout: 30 # Command timeout in seconds
|
||||||
|
commands:
|
||||||
|
- name: check_disk_root
|
||||||
|
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
|
||||||
|
|
||||||
|
- name: check_load
|
||||||
|
command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
|
||||||
|
|
||||||
|
- name: check_procs
|
||||||
|
command: /usr/lib/nagios/plugins/check_procs -w 250 -c 400
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Start Heartbeat Client
|
||||||
|
|
||||||
|
```bash
|
||||||
|
hbc -v localhost
|
||||||
|
```
|
||||||
|
|
||||||
|
The client will now execute the configured Nagios plugins and send their results to the server.
|
||||||
|
|
||||||
|
## How It Works
|
||||||
|
|
||||||
|
### Nagios Plugin Standard
|
||||||
|
|
||||||
|
Nagios plugins follow a simple interface:
|
||||||
|
|
||||||
|
1. **Exit Codes:**
|
||||||
|
- `0` = OK
|
||||||
|
- `1` = WARNING
|
||||||
|
- `2` = CRITICAL
|
||||||
|
- `3` = UNKNOWN
|
||||||
|
|
||||||
|
2. **Output Format:**
|
||||||
|
```
|
||||||
|
STATUS - Message | performance_data
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Performance Data Format:**
|
||||||
|
```
|
||||||
|
'label'=value[UOM];[warn];[crit];[min];[max]
|
||||||
|
```
|
||||||
|
|
||||||
|
### Example Plugin Output
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
|
||||||
|
DISK OK - free space: / 156 GB (78%); | /=44GB;127;142;0;159
|
||||||
|
```
|
||||||
|
|
||||||
|
This output includes:
|
||||||
|
- **Status:** `DISK OK`
|
||||||
|
- **Message:** `free space: / 156 GB (78%)`
|
||||||
|
- **Performance Data:** `/=44GB;127;142;0;159`
|
||||||
|
- Current value: 44GB
|
||||||
|
- Warning threshold: 127GB
|
||||||
|
- Critical threshold: 142GB
|
||||||
|
- Min: 0GB
|
||||||
|
- Max: 159GB
|
||||||
|
|
||||||
|
### Data Collected
|
||||||
|
|
||||||
|
The `nagios_runner` plugin collects:
|
||||||
|
|
||||||
|
**For each configured command:**
|
||||||
|
- `{name}_status` - Status string (OK, WARNING, CRITICAL, UNKNOWN)
|
||||||
|
- `{name}_status_code` - Numeric exit code (0-3)
|
||||||
|
- `{name}_output` - Status message
|
||||||
|
- `{name}_{metric}` - Each performance metric value
|
||||||
|
- `{name}_{metric}_uom` - Unit of measurement (if present)
|
||||||
|
- `{name}_{metric}_warn` - Warning threshold (if present)
|
||||||
|
- `{name}_{metric}_crit` - Critical threshold (if present)
|
||||||
|
- `{name}_{metric}_min` - Minimum value (if present)
|
||||||
|
- `{name}_{metric}_max` - Maximum value (if present)
|
||||||
|
|
||||||
|
**Overall:**
|
||||||
|
- `overall_status` - Worst status from all commands
|
||||||
|
- `overall_status_code` - Worst status code
|
||||||
|
- `plugin_count` - Number of Nagios plugins executed
|
||||||
|
|
||||||
|
## Configuration Options
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
nagios_runner:
|
||||||
|
# Collection interval in seconds (default: 60)
|
||||||
|
interval: 60
|
||||||
|
|
||||||
|
# Command execution timeout in seconds (default: 30)
|
||||||
|
timeout: 30
|
||||||
|
|
||||||
|
# Execute commands via shell (default: true)
|
||||||
|
# Set to false for direct execution (more secure but less flexible)
|
||||||
|
shell: true
|
||||||
|
|
||||||
|
# List of Nagios plugins to run
|
||||||
|
commands:
|
||||||
|
- name: unique_name # Required: unique identifier
|
||||||
|
command: /path/to/plugin [args] # Required: full command to execute
|
||||||
|
```
|
||||||
|
|
||||||
|
## Common Nagios Plugins
|
||||||
|
|
||||||
|
### System Resources
|
||||||
|
|
||||||
|
**Disk Space:**
|
||||||
|
```yaml
|
||||||
|
- name: check_disk_root
|
||||||
|
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
|
||||||
|
```
|
||||||
|
|
||||||
|
**Load Average:**
|
||||||
|
```yaml
|
||||||
|
- name: check_load
|
||||||
|
command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
|
||||||
|
```
|
||||||
|
|
||||||
|
**Swap Usage:**
|
||||||
|
```yaml
|
||||||
|
- name: check_swap
|
||||||
|
command: /usr/lib/nagios/plugins/check_swap -w 20% -c 10%
|
||||||
|
```
|
||||||
|
|
||||||
|
**Process Count:**
|
||||||
|
```yaml
|
||||||
|
- name: check_procs
|
||||||
|
command: /usr/lib/nagios/plugins/check_procs -w 250 -c 400
|
||||||
|
```
|
||||||
|
|
||||||
|
**Users Logged In:**
|
||||||
|
```yaml
|
||||||
|
- name: check_users
|
||||||
|
command: /usr/lib/nagios/plugins/check_users -w 5 -c 10
|
||||||
|
```
|
||||||
|
|
||||||
|
### Network Services
|
||||||
|
|
||||||
|
**SSH:**
|
||||||
|
```yaml
|
||||||
|
- name: check_ssh
|
||||||
|
command: /usr/lib/nagios/plugins/check_ssh localhost
|
||||||
|
```
|
||||||
|
|
||||||
|
**HTTP:**
|
||||||
|
```yaml
|
||||||
|
- name: check_http_local
|
||||||
|
command: /usr/lib/nagios/plugins/check_http -H localhost
|
||||||
|
|
||||||
|
- name: check_http_ssl
|
||||||
|
command: /usr/lib/nagios/plugins/check_http -H example.com --ssl
|
||||||
|
```
|
||||||
|
|
||||||
|
**DNS:**
|
||||||
|
```yaml
|
||||||
|
- name: check_dns
|
||||||
|
command: /usr/lib/nagios/plugins/check_dns -H google.com
|
||||||
|
```
|
||||||
|
|
||||||
|
**Ping:**
|
||||||
|
```yaml
|
||||||
|
- name: check_ping_gateway
|
||||||
|
command: /usr/lib/nagios/plugins/check_ping -H 192.168.1.1 -w 100,20% -c 500,60%
|
||||||
|
```
|
||||||
|
|
||||||
|
### Databases
|
||||||
|
|
||||||
|
**MySQL:**
|
||||||
|
```yaml
|
||||||
|
- name: check_mysql
|
||||||
|
command: /usr/lib/nagios/plugins/check_mysql -H localhost -u user -p password
|
||||||
|
```
|
||||||
|
|
||||||
|
**PostgreSQL:**
|
||||||
|
```yaml
|
||||||
|
- name: check_pgsql
|
||||||
|
command: /usr/lib/nagios/plugins/check_pgsql -H localhost -d database
|
||||||
|
```
|
||||||
|
|
||||||
|
## Writing Custom Nagios Plugins
|
||||||
|
|
||||||
|
You can write your own Nagios-compatible plugins in any language. Here are simple examples in Bash and Python:
|
||||||
|
|
||||||
|
**Bash:**
|
||||||
|
```bash
|
||||||
|
#!/bin/bash
|
||||||
|
# /usr/local/bin/check_example.sh
|
||||||
|
|
||||||
|
# Get the value to check
|
||||||
|
value=$(some_command)
|
||||||
|
|
||||||
|
# Define thresholds
|
||||||
|
warn=80
|
||||||
|
crit=90
|
||||||
|
|
||||||
|
# Check and output result
|
||||||
|
if [ "$value" -ge "$crit" ]; then
|
||||||
|
echo "CRITICAL - Value is $value | value=${value};${warn};${crit};0;100"
|
||||||
|
exit 2
|
||||||
|
elif [ "$value" -ge "$warn" ]; then
|
||||||
|
echo "WARNING - Value is $value | value=${value};${warn};${crit};0;100"
|
||||||
|
exit 1
|
||||||
|
else
|
||||||
|
echo "OK - Value is $value | value=${value};${warn};${crit};0;100"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
```
|
||||||
|
|
||||||
|
**Python:**
|
||||||
|
```python
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# /usr/local/bin/check_example.py
|
||||||
|
|
||||||
|
import sys
|
||||||
|
|
||||||
|
def check_something():
|
||||||
|
value = get_value() # Your check logic here
|
||||||
|
warn = 80
|
||||||
|
crit = 90
|
||||||
|
|
||||||
|
perfdata = f"value={value};{warn};{crit};0;100"
|
||||||
|
|
||||||
|
if value >= crit:
|
||||||
|
print(f"CRITICAL - Value is {value} | {perfdata}")
|
||||||
|
sys.exit(2)
|
||||||
|
elif value >= warn:
|
||||||
|
print(f"WARNING - Value is {value} | {perfdata}")
|
||||||
|
sys.exit(1)
|
||||||
|
else:
|
||||||
|
print(f"OK - Value is {value} | {perfdata}")
|
||||||
|
sys.exit(0)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
check_something()
|
||||||
|
```
|
||||||
|
|
||||||
|
Then configure in Heartbeat:
|
||||||
|
```yaml
|
||||||
|
nagios_runner:
|
||||||
|
commands:
|
||||||
|
- name: my_custom_check
|
||||||
|
command: /usr/local/bin/check_example.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Plugin not found
|
||||||
|
```
|
||||||
|
Error: Command not found
|
||||||
|
```
|
||||||
|
**Solution:** Use the full path to the plugin. Common locations:
|
||||||
|
- `/usr/lib/nagios/plugins/`
|
||||||
|
- `/usr/lib64/nagios/plugins/`
|
||||||
|
- `/usr/local/nagios/libexec/`
|
||||||
|
|
||||||
|
### Permission denied
|
||||||
|
```
|
||||||
|
Error: Permission denied
|
||||||
|
```
|
||||||
|
**Solution:** Ensure the plugin is executable:
|
||||||
|
```bash
|
||||||
|
chmod +x /path/to/plugin
|
||||||
|
```
|
||||||
|
|
||||||
|
### Timeout errors
|
||||||
|
```
|
||||||
|
Command timed out after 30s
|
||||||
|
```
|
||||||
|
**Solution:** Increase the timeout in config:
|
||||||
|
```yaml
|
||||||
|
nagios_runner:
|
||||||
|
timeout: 60 # Increase timeout
|
||||||
|
```
|
||||||
|
|
||||||
|
### No performance data
|
||||||
|
If performance data is not being parsed:
|
||||||
|
1. Check plugin output includes `|` separator
|
||||||
|
2. Verify performance data format: `'label'=value[UOM];...`
|
||||||
|
3. Enable debug logging: `hbc -v -x localhost`
|
||||||
|
|
||||||
|
## Benefits
|
||||||
|
|
||||||
|
1. **Massive Plugin Library:** Thousands of existing Nagios plugins available
|
||||||
|
2. **No Rewriting:** Use plugins as-is without modification
|
||||||
|
3. **Community Support:** Well-documented and maintained plugins
|
||||||
|
4. **Flexibility:** Mix Nagios plugins with native Heartbeat plugins
|
||||||
|
5. **Standard Interface:** Consistent exit codes and output format
|
||||||
|
6. **Performance Data:** Automatic extraction of metrics
|
||||||
|
|
||||||
|
## Resources
|
||||||
|
|
||||||
|
- [Nagios Plugin Development Guidelines](https://nagios-plugins.org/doc/guidelines.html)
|
||||||
|
- [Monitoring Plugins Project](https://www.monitoring-plugins.org/)
|
||||||
|
- [Nagios Exchange](https://exchange.nagios.org/) - Plugin repository
|
||||||
|
- [Check_MK Local Checks](https://docs.checkmk.com/latest/en/localchecks.html) - Compatible format
|
||||||
|
|
||||||
|
## Next Steps
|
||||||
|
|
||||||
|
- Configure threshold alerts based on Nagios plugin status codes
|
||||||
|
- View plugin data in the Heartbeat web UI
|
||||||
|
- Create custom plugins for your specific monitoring needs
|
||||||
|
- Integrate with existing Nagios/Icinga configurations
|
||||||
@@ -0,0 +1,295 @@
|
|||||||
|
# Notification System
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Notifications are dispatched to the **owner and managers** of a host, each via their own configured notification channels. Channel definitions are global; users reference them by name. No users configured → no notifications sent.
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
Alert event (udp.py / threshold.py)
|
||||||
|
└─ notify.send_notification(host_name, Notification)
|
||||||
|
├─ look up host.owner + host.managers
|
||||||
|
├─ for each user → user.notification_channels
|
||||||
|
└─ for each channel → _dispatch_to_channel (filtered by min_level)
|
||||||
|
```
|
||||||
|
|
||||||
|
Every notification carries:
|
||||||
|
- **title** — `[LEVEL] hostname` (e.g. `[CRITICAL] webserver01`)
|
||||||
|
- **body** — detail message (metric value, threshold, duration)
|
||||||
|
- **url** — link to the plugin metrics page (`{base_url}/plugins#{hostname}`)
|
||||||
|
- **level** — `RECOVER | WARNING | CRITICAL | INFO`
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
### Base URL
|
||||||
|
|
||||||
|
Set `base_url` so notification links point to your hbd instance:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
base_url: https://hbd.example.com
|
||||||
|
```
|
||||||
|
|
||||||
|
### Global channel definitions
|
||||||
|
|
||||||
|
Define channels once; reference them by name from user configs:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
notification_channels:
|
||||||
|
|
||||||
|
pushover_ops:
|
||||||
|
type: pushover
|
||||||
|
token: your-app-token
|
||||||
|
user: your-user-key
|
||||||
|
min_level: WARNING # optional, default: WARNING
|
||||||
|
|
||||||
|
email_ops:
|
||||||
|
type: email
|
||||||
|
recipients: [ops@example.com]
|
||||||
|
sender: hbd@example.com
|
||||||
|
smtp_server: smtp.example.com
|
||||||
|
smtp_port: 587
|
||||||
|
smtp_user: hbd@example.com
|
||||||
|
smtp_password: secret
|
||||||
|
min_level: WARNING
|
||||||
|
|
||||||
|
matrix_oncall:
|
||||||
|
type: matrix
|
||||||
|
homeserver: https://matrix.example.org
|
||||||
|
access_token: syt_xxx
|
||||||
|
room_id: "!abc:matrix.example.org"
|
||||||
|
min_level: CRITICAL # only send critical alerts to this room
|
||||||
|
|
||||||
|
sms_oncall:
|
||||||
|
type: sms_voipms
|
||||||
|
api_user: me@example.com
|
||||||
|
api_password: secret
|
||||||
|
did: "5551234567" # your voip.ms DID number
|
||||||
|
dst: "5559876543" # destination number
|
||||||
|
min_level: CRITICAL
|
||||||
|
|
||||||
|
signal_ops:
|
||||||
|
type: signal
|
||||||
|
cli_path: /usr/local/bin/signal-cli
|
||||||
|
user: "+12025551234"
|
||||||
|
recipient: "+12025559999"
|
||||||
|
|
||||||
|
mattermost_devops:
|
||||||
|
type: mattermost
|
||||||
|
host: mattermost.example.com
|
||||||
|
token: webhook-token
|
||||||
|
channel: devops-alerts
|
||||||
|
username: heartbeat-bot
|
||||||
|
```
|
||||||
|
|
||||||
|
### Users with notification channels
|
||||||
|
|
||||||
|
Each user lists which global channels they receive notifications on:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
users:
|
||||||
|
alice:
|
||||||
|
full_name: Alice Smith
|
||||||
|
password: pbkdf2:sha256:...
|
||||||
|
admin: true
|
||||||
|
notification_channels: [pushover_ops, email_ops]
|
||||||
|
|
||||||
|
bob:
|
||||||
|
full_name: Bob Jones
|
||||||
|
password: pbkdf2:sha256:...
|
||||||
|
notification_channels: [sms_oncall, matrix_oncall]
|
||||||
|
```
|
||||||
|
|
||||||
|
### Host access — owner and managers
|
||||||
|
|
||||||
|
Notifications for a host go to its owner and all managers:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
hosts:
|
||||||
|
webserver01:
|
||||||
|
owner: alice # receives all notifications for this host
|
||||||
|
managers: [bob] # also receives notifications
|
||||||
|
threshold_config: default
|
||||||
|
watch: true # bold in dashboard (cosmetic only)
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
dbserver01:
|
||||||
|
owner: alice
|
||||||
|
managers: [bob]
|
||||||
|
threshold_config: database
|
||||||
|
dyndns: false
|
||||||
|
```
|
||||||
|
|
||||||
|
`watch: true` only affects display (bold name in the live dashboard). Notifications are now controlled entirely by owner/managers.
|
||||||
|
|
||||||
|
## Channel Types
|
||||||
|
|
||||||
|
### `min_level` filtering
|
||||||
|
|
||||||
|
Every channel accepts an optional `min_level` field:
|
||||||
|
|
||||||
|
| Value | Channels receive |
|
||||||
|
|---|---|
|
||||||
|
| `WARNING` (default) | WARNING, CRITICAL, RECOVER |
|
||||||
|
| `CRITICAL` | CRITICAL, RECOVER |
|
||||||
|
|
||||||
|
`RECOVER` is always passed through — you don't want to miss a recovery.
|
||||||
|
|
||||||
|
### pushover
|
||||||
|
|
||||||
|
Sends push notifications via [Pushover](https://pushover.net). Includes title, body, and a clickable URL.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
type: pushover
|
||||||
|
token: your-app-token # Required: Pushover application token
|
||||||
|
user: your-user-key # Required: Recipient's user key
|
||||||
|
min_level: WARNING
|
||||||
|
```
|
||||||
|
|
||||||
|
### email
|
||||||
|
|
||||||
|
Sends via SMTP. Subject = title, body = message + URL on final line.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
type: email
|
||||||
|
recipients: [ops@example.com, oncall@example.com]
|
||||||
|
sender: hbd@example.com
|
||||||
|
smtp_server: smtp.example.com
|
||||||
|
smtp_port: 587 # 587 = STARTTLS (default), 465 = SSL
|
||||||
|
smtp_user: hbd@example.com
|
||||||
|
smtp_password: secret
|
||||||
|
min_level: WARNING
|
||||||
|
```
|
||||||
|
|
||||||
|
### matrix
|
||||||
|
|
||||||
|
Sends a formatted HTML message to a Matrix room via [matrix-nio](https://github.com/poljar/matrix-nio).
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
type: matrix
|
||||||
|
homeserver: https://matrix.example.org
|
||||||
|
access_token: syt_xxx # Bot account access token
|
||||||
|
room_id: "!abc:matrix.example.org"
|
||||||
|
min_level: WARNING
|
||||||
|
```
|
||||||
|
|
||||||
|
**Setup:**
|
||||||
|
1. Create a bot Matrix account
|
||||||
|
2. Obtain its access token (Element → Settings → Help & About → Access Token)
|
||||||
|
3. Invite the bot to the target room and note the room ID
|
||||||
|
|
||||||
|
### sms_voipms
|
||||||
|
|
||||||
|
Sends SMS via the [voip.ms REST API](https://voip.ms/api/v1/rest.php). Message is truncated to 160 characters.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
type: sms_voipms
|
||||||
|
api_user: me@example.com # voip.ms account email
|
||||||
|
api_password: secret # voip.ms API password
|
||||||
|
did: "5551234567" # Your voip.ms DID (sending number)
|
||||||
|
dst: "5559876543" # Destination number
|
||||||
|
min_level: CRITICAL
|
||||||
|
```
|
||||||
|
|
||||||
|
### signal
|
||||||
|
|
||||||
|
Sends via [signal-cli](https://github.com/AsamK/signal-cli).
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
type: signal
|
||||||
|
cli_path: /usr/local/bin/signal-cli
|
||||||
|
user: "+12025551234" # Your registered Signal number
|
||||||
|
recipient: "+12025559999" # Recipient number
|
||||||
|
min_level: WARNING
|
||||||
|
```
|
||||||
|
|
||||||
|
**Setup:**
|
||||||
|
```bash
|
||||||
|
signal-cli -u +12025551234 register
|
||||||
|
signal-cli -u +12025551234 verify CODE
|
||||||
|
```
|
||||||
|
|
||||||
|
### mattermost
|
||||||
|
|
||||||
|
Sends via Mattermost incoming webhook. Message is formatted as Markdown.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
type: mattermost
|
||||||
|
host: mattermost.example.com
|
||||||
|
token: your-webhook-token
|
||||||
|
channel: devops-alerts
|
||||||
|
username: heartbeat-bot # Optional: display name
|
||||||
|
icon: https://…/icon.png # Optional: bot icon URL
|
||||||
|
min_level: WARNING
|
||||||
|
```
|
||||||
|
|
||||||
|
## Notification events
|
||||||
|
|
||||||
|
| Source | Level | Title example | Body example |
|
||||||
|
|---|---|---|---|
|
||||||
|
| Host overdue | CRITICAL | `[CRITICAL] webserver01` | `IPv4 overdue` |
|
||||||
|
| Host recover | RECOVER | `[RECOVER] webserver01` | `IPv4 back after being overdue for 5:23` |
|
||||||
|
| Host boot | INFO | `[INFO] webserver01` | `webserver01 booted` |
|
||||||
|
| Host shutdown | INFO | `[INFO] webserver01` | `IPv4 shutdown` |
|
||||||
|
| Threshold breach | WARNING/CRITICAL | `[CRITICAL] webserver01` | `cpu_percent = 95.2 (threshold: > 90.0)` |
|
||||||
|
| Threshold reminder | CRITICAL | `[REMINDER/CRITICAL] webserver01` | `REMINDER (CRITICAL): … ongoing for 3600s` |
|
||||||
|
| Connection issue | WARNING | `[WARNING] webserver01` | `new address detected …` |
|
||||||
|
|
||||||
|
Reminder notifications (re-notify) are sent only for CRITICAL level alerts.
|
||||||
|
|
||||||
|
## API reference
|
||||||
|
|
||||||
|
### `send_notification(host_name, notif) -> dict`
|
||||||
|
|
||||||
|
Main entry point. Dispatches to owner + managers.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from hbd.server.notify import send_notification, Notification
|
||||||
|
|
||||||
|
send_notification(
|
||||||
|
"webserver01",
|
||||||
|
Notification(
|
||||||
|
title="[CRITICAL] webserver01",
|
||||||
|
body="cpu_percent = 95.2 (threshold: > 90.0)",
|
||||||
|
level="CRITICAL",
|
||||||
|
url="https://hbd.example.com/plugins#webserver01",
|
||||||
|
),
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
Returns `{channel_name: bool}` for each channel dispatched.
|
||||||
|
|
||||||
|
### `setup(cfg, loop=None)`
|
||||||
|
|
||||||
|
Called once at startup from `main.py`. Pass the running asyncio event loop so Matrix sends work correctly.
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
**No notifications sent:**
|
||||||
|
- Check that users are configured (`users:` section in yaml)
|
||||||
|
- Check that the host has an `owner` or `managers` set
|
||||||
|
- Check that users have `notification_channels` listed
|
||||||
|
- Check that the channel names in user config match keys under `notification_channels:`
|
||||||
|
|
||||||
|
**min_level filtering too aggressive:**
|
||||||
|
- Default is `WARNING` — both WARNING and CRITICAL are sent
|
||||||
|
- If you expected WARNING notifications but the channel is set to `min_level: CRITICAL`, change it to `min_level: WARNING`
|
||||||
|
|
||||||
|
**Matrix sends time out:**
|
||||||
|
- Verify the access token is valid and the bot is in the room
|
||||||
|
- `matrix-nio` must be installed: `pip install matrix-nio`
|
||||||
|
|
||||||
|
**voip.ms SMS fails:**
|
||||||
|
- Enable the API in your voip.ms account (Account → API)
|
||||||
|
- Verify the DID is SMS-capable in your voip.ms account
|
||||||
|
|
||||||
|
**Signal not found:**
|
||||||
|
- Specify full `cli_path`
|
||||||
|
- Run `signal-cli -u +NUMBER receive` to sync trust store
|
||||||
|
|
||||||
|
**Email authentication failed:**
|
||||||
|
- Use app-specific passwords for Gmail/Fastmail
|
||||||
|
- Verify port: 587 for STARTTLS, 465 for SSL
|
||||||
|
|
||||||
|
**Pushover `400` errors:**
|
||||||
|
- Double-check `token` (app) and `user` (user key) — they are different values
|
||||||
@@ -0,0 +1,544 @@
|
|||||||
|
# Plugin Development Guide
|
||||||
|
|
||||||
|
This guide explains how to create custom plugins for the Heartbeat monitoring system.
|
||||||
|
|
||||||
|
## Table of Contents
|
||||||
|
|
||||||
|
- [Plugin Architecture](#plugin-architecture)
|
||||||
|
- [Plugin Types](#plugin-types)
|
||||||
|
- [Creating a Plugin](#creating-a-plugin)
|
||||||
|
- [Plugin Lifecycle](#plugin-lifecycle)
|
||||||
|
- [Configuration](#configuration)
|
||||||
|
- [Best Practices](#best-practices)
|
||||||
|
- [Examples](#examples)
|
||||||
|
- [Testing](#testing)
|
||||||
|
|
||||||
|
## Plugin Architecture
|
||||||
|
|
||||||
|
Heartbeat's plugin system is designed to be simple yet powerful. Plugins are Python classes that inherit from one of the base plugin types and implement a few key methods.
|
||||||
|
|
||||||
|
### Key Concepts
|
||||||
|
|
||||||
|
- **Plugin Registry**: Central registry that manages all loaded plugins
|
||||||
|
- **Plugin Loader**: Automatically discovers and loads plugins from the `hbd/plugins/` directory
|
||||||
|
- **Plugin Types**: InfoPlugin (static data) and MonitorPlugin (periodic metrics)
|
||||||
|
- **Async/Await**: All plugin methods are async for non-blocking operation
|
||||||
|
|
||||||
|
## Plugin Types
|
||||||
|
|
||||||
|
### InfoPlugin
|
||||||
|
|
||||||
|
InfoPlugins collect static information that doesn't change frequently (OS version, hardware specs, etc.).
|
||||||
|
|
||||||
|
- **Runs once** at startup (interval = 0)
|
||||||
|
- **Cached** - data is collected once and reused
|
||||||
|
- **Lightweight** - no periodic overhead
|
||||||
|
|
||||||
|
**Use InfoPlugin for:**
|
||||||
|
- Operating system details
|
||||||
|
- Hardware information
|
||||||
|
- Software versions
|
||||||
|
- Configuration data
|
||||||
|
- Static inventory
|
||||||
|
|
||||||
|
### MonitorPlugin
|
||||||
|
|
||||||
|
MonitorPlugins collect metrics that change over time (CPU usage, memory, network traffic).
|
||||||
|
|
||||||
|
- **Runs periodically** based on configured interval
|
||||||
|
- **Scheduled** - collected at regular intervals
|
||||||
|
- **Dynamic** - captures changing system state
|
||||||
|
|
||||||
|
**Use MonitorPlugin for:**
|
||||||
|
- Resource usage (CPU, memory, disk, network)
|
||||||
|
- Performance metrics
|
||||||
|
- Counters and gauges
|
||||||
|
- Time-series data
|
||||||
|
|
||||||
|
## Creating a Plugin
|
||||||
|
|
||||||
|
### Step 1: Choose Plugin Type
|
||||||
|
|
||||||
|
Decide whether your plugin collects static information (InfoPlugin) or dynamic metrics (MonitorPlugin).
|
||||||
|
|
||||||
|
### Step 2: Create Plugin File
|
||||||
|
|
||||||
|
Create a new Python file in `hbd/plugins/` directory:
|
||||||
|
|
||||||
|
```python
|
||||||
|
"""
|
||||||
|
My awesome plugin for Heartbeat.
|
||||||
|
|
||||||
|
Brief description of what this plugin does.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import Dict, Any, Optional
|
||||||
|
|
||||||
|
# Import psutil or other dependencies if needed
|
||||||
|
try:
|
||||||
|
import psutil
|
||||||
|
except ImportError:
|
||||||
|
psutil = None
|
||||||
|
|
||||||
|
from hbd.plugin import MonitorPlugin # or InfoPlugin
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class MyAwesomePlugin(MonitorPlugin): # or InfoPlugin
|
||||||
|
"""
|
||||||
|
One-line description of the plugin.
|
||||||
|
|
||||||
|
Collects:
|
||||||
|
- List of metrics/data collected
|
||||||
|
- Another metric
|
||||||
|
|
||||||
|
Configuration:
|
||||||
|
interval: Collection interval in seconds (default: 60)
|
||||||
|
option1: Description of option1 (default: value)
|
||||||
|
option2: Description of option2 (default: value)
|
||||||
|
"""
|
||||||
|
|
||||||
|
name = "my_awesome_plugin" # Unique plugin name
|
||||||
|
interval = 60 # Collection interval in seconds for a MonitorPlugin; use 0 for an InfoPlugin
|
||||||
|
|
||||||
|
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
||||||
|
"""Initialize the plugin with optional configuration."""
|
||||||
|
super().__init__(config)
|
||||||
|
|
||||||
|
# Extract configuration options
|
||||||
|
self.option1 = self.config.get('option1', 'default_value')
|
||||||
|
self.option2 = self.config.get('option2', True)
|
||||||
|
|
||||||
|
# Check dependencies
|
||||||
|
if psutil is None:
|
||||||
|
raise ImportError("psutil is required for my_awesome_plugin")
|
||||||
|
|
||||||
|
async def initialize(self):
|
||||||
|
"""
|
||||||
|
Initialize the plugin.
|
||||||
|
|
||||||
|
This is called once when the plugin is loaded.
|
||||||
|
Use this to verify dependencies, establish connections, etc.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if initialization is successful, False otherwise
|
||||||
|
"""
|
||||||
|
logger.info(f"My awesome plugin initialized (option1: {self.option1})")
|
||||||
|
return True
|
||||||
|
|
||||||
|
async def collect(self) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Collect data.
|
||||||
|
|
||||||
|
This is called periodically (MonitorPlugin) or once (InfoPlugin).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dictionary of collected data (will be sent to server)
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
data = await self._collect_metrics()
|
||||||
|
logger.debug(f"Collected {len(data)} metrics")
|
||||||
|
return data
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error collecting data: {e}")
|
||||||
|
return {"error": str(e)}
|
||||||
|
|
||||||
|
async def _collect_metrics(self) -> Dict[str, Any]:
|
||||||
|
"""Internal method to collect actual metrics."""
|
||||||
|
metrics = {}
|
||||||
|
|
||||||
|
# Collect your data here
|
||||||
|
metrics['metric1'] = self._get_metric1()
|
||||||
|
metrics['metric2'] = self._get_metric2()
|
||||||
|
|
||||||
|
return metrics
|
||||||
|
|
||||||
|
def _get_metric1(self):
|
||||||
|
"""Helper method for metric collection."""
|
||||||
|
# Implementation here
|
||||||
|
return 42
|
||||||
|
|
||||||
|
def _get_metric2(self):
|
||||||
|
"""Helper method for metric collection."""
|
||||||
|
# Implementation here
|
||||||
|
return "hello"
|
||||||
|
|
||||||
|
async def cleanup(self):
|
||||||
|
"""
|
||||||
|
Cleanup resources.
|
||||||
|
|
||||||
|
This is called when the plugin is unloaded or the client shuts down.
|
||||||
|
Use this to close connections, release resources, etc.
|
||||||
|
"""
|
||||||
|
logger.info("My awesome plugin cleanup")
|
||||||
|
|
||||||
|
|
||||||
|
# Plugin class exposed for automatic discovery (the loader instantiates it)
|
||||||
|
plugin = MyAwesomePlugin
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 3: Test Your Plugin
|
||||||
|
|
||||||
|
Create a test script to verify your plugin works:
|
||||||
|
|
||||||
|
```python
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
import asyncio
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Add parent directory to path
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent))
|
||||||
|
|
||||||
|
from hbd.plugins.my_awesome_plugin import MyAwesomePlugin
|
||||||
|
|
||||||
|
async def test():
|
||||||
|
# Create plugin instance
|
||||||
|
plugin = MyAwesomePlugin({'option1': 'test_value'})
|
||||||
|
|
||||||
|
# Initialize
|
||||||
|
if not await plugin.initialize():
|
||||||
|
print("Failed to initialize")
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Collect data
|
||||||
|
data = await plugin.collect()
|
||||||
|
print(f"Collected data: {data}")
|
||||||
|
|
||||||
|
# Cleanup
|
||||||
|
await plugin.cleanup()
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
success = asyncio.run(test())
|
||||||
|
sys.exit(0 if success else 1)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Plugin Lifecycle
|
||||||
|
|
||||||
|
Understanding the plugin lifecycle helps you implement plugins correctly:
|
||||||
|
|
||||||
|
```
|
||||||
|
1. Plugin Discovery
|
||||||
|
└─> Loader scans hbd/plugins/ directory
|
||||||
|
└─> Finds Python files (except those starting with _)
|
||||||
|
└─> Imports modules
|
||||||
|
|
||||||
|
2. Plugin Instantiation
|
||||||
|
└─> Creates instance with configuration
|
||||||
|
└─> __init__() is called
|
||||||
|
|
||||||
|
3. Plugin Initialization
|
||||||
|
└─> initialize() is called
|
||||||
|
└─> Plugin verifies dependencies, establishes connections
|
||||||
|
└─> Returns True/False for success/failure
|
||||||
|
|
||||||
|
4. Plugin Registration
|
||||||
|
└─> If initialization succeeds, plugin is registered
|
||||||
|
└─> Plugin becomes active
|
||||||
|
|
||||||
|
5. Data Collection
|
||||||
|
└─> For InfoPlugin: collect() called once after initialization
|
||||||
|
└─> For MonitorPlugin: collect() called periodically based on interval
|
||||||
|
└─> Data is sent to server via PLG message
|
||||||
|
|
||||||
|
6. Plugin Shutdown
|
||||||
|
└─> cleanup() is called
|
||||||
|
└─> Plugin releases resources, closes connections
|
||||||
|
```
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
### Plugin-Specific Configuration
|
||||||
|
|
||||||
|
Plugins receive configuration through the `config` parameter in `__init__`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
||||||
|
super().__init__(config)
|
||||||
|
|
||||||
|
# Access configuration with defaults
|
||||||
|
self.interval = self.config.get('interval', 60)
|
||||||
|
self.threshold = self.config.get('threshold', 80)
|
||||||
|
self.enabled_features = self.config.get('features', ['feature1', 'feature2'])
|
||||||
|
```
|
||||||
|
|
||||||
|
### Client Configuration File
|
||||||
|
|
||||||
|
Users configure plugins in the client configuration YAML:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
plugins:
|
||||||
|
my_awesome_plugin:
|
||||||
|
enabled: true
|
||||||
|
interval: 120
|
||||||
|
option1: custom_value
|
||||||
|
option2: false
|
||||||
|
```
|
||||||
|
|
||||||
|
## Best Practices
|
||||||
|
|
||||||
|
### 1. Error Handling
|
||||||
|
|
||||||
|
Always handle errors gracefully:
|
||||||
|
|
||||||
|
```python
|
||||||
|
async def collect(self) -> Dict[str, Any]:
|
||||||
|
try:
|
||||||
|
return await self._collect_metrics()
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error collecting metrics: {e}")
|
||||||
|
return {"error": str(e)}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Logging
|
||||||
|
|
||||||
|
Use appropriate log levels:
|
||||||
|
|
||||||
|
```python
|
||||||
|
logger.debug("Detailed information for debugging")
|
||||||
|
logger.info("Normal operation messages")
|
||||||
|
logger.warning("Warning messages for unusual but handled situations")
|
||||||
|
logger.error("Error messages for failures")
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Dependencies
|
||||||
|
|
||||||
|
Check for optional dependencies:
|
||||||
|
|
||||||
|
```python
|
||||||
|
try:
|
||||||
|
import some_optional_library
|
||||||
|
except ImportError:
|
||||||
|
some_optional_library = None
|
||||||
|
|
||||||
|
# Later in __init__:
|
||||||
|
if some_optional_library is None:
|
||||||
|
raise ImportError("some_optional_library is required")
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Performance
|
||||||
|
|
||||||
|
- Keep collection methods fast (< 1 second)
|
||||||
|
- Use async/await for I/O operations
|
||||||
|
- Cache expensive computations
|
||||||
|
- Don't block the event loop
|
||||||
|
|
||||||
|
### 5. Data Structure
|
||||||
|
|
||||||
|
Return clean, structured data:
|
||||||
|
|
||||||
|
```python
|
||||||
|
{
|
||||||
|
'metric_name': value,
|
||||||
|
'nested_data': {
|
||||||
|
'sub_metric': value
|
||||||
|
},
|
||||||
|
'list_data': [item1, item2],
|
||||||
|
'timestamp': time.time() # Optional timestamp
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 6. Documentation
|
||||||
|
|
||||||
|
Document your plugin thoroughly:
|
||||||
|
|
||||||
|
- Class docstring with description and configuration
|
||||||
|
- Method docstrings explaining purpose and return values
|
||||||
|
- Inline comments for complex logic
|
||||||
|
|
||||||
|
## Examples
|
||||||
|
|
||||||
|
### Example 1: Simple InfoPlugin
|
||||||
|
|
||||||
|
```python
|
||||||
|
from hbd.plugin import InfoPlugin
|
||||||
|
import platform
|
||||||
|
|
||||||
|
class SimpleInfoPlugin(InfoPlugin):
|
||||||
|
"""Collect basic system information."""
|
||||||
|
|
||||||
|
name = "simple_info"
|
||||||
|
interval = 0 # InfoPlugin
|
||||||
|
|
||||||
|
async def initialize(self):
|
||||||
|
return True
|
||||||
|
|
||||||
|
async def collect(self) -> Dict[str, Any]:
|
||||||
|
return {
|
||||||
|
'hostname': platform.node(),
|
||||||
|
'system': platform.system(),
|
||||||
|
'python_version': platform.python_version()
|
||||||
|
}
|
||||||
|
|
||||||
|
async def cleanup(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
plugin = SimpleInfoPlugin
|
||||||
|
```
|
||||||
|
|
||||||
|
### Example 2: MonitorPlugin with State
|
||||||
|
|
||||||
|
```python
|
||||||
|
from hbd.plugin import MonitorPlugin
|
||||||
|
import time
|
||||||
|
|
||||||
|
class CounterPlugin(MonitorPlugin):
|
||||||
|
"""Track a counter over time."""
|
||||||
|
|
||||||
|
name = "counter"
|
||||||
|
interval = 30
|
||||||
|
|
||||||
|
def __init__(self, config=None):
|
||||||
|
super().__init__(config)
|
||||||
|
self._counter = 0
|
||||||
|
self._start_time = time.time()
|
||||||
|
|
||||||
|
async def initialize(self):
|
||||||
|
return True
|
||||||
|
|
||||||
|
async def collect(self) -> Dict[str, Any]:
|
||||||
|
self._counter += 1
|
||||||
|
uptime = time.time() - self._start_time
|
||||||
|
|
||||||
|
return {
|
||||||
|
'count': self._counter,
|
||||||
|
'uptime': uptime,
|
||||||
|
'rate': self._counter / uptime
|
||||||
|
}
|
||||||
|
|
||||||
|
async def cleanup(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
plugin = CounterPlugin
|
||||||
|
```
|
||||||
|
|
||||||
|
### Example 3: Plugin with External Command
|
||||||
|
|
||||||
|
```python
|
||||||
|
from hbd.plugin import MonitorPlugin
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
class CommandPlugin(MonitorPlugin):
|
||||||
|
"""Execute external command and capture output."""
|
||||||
|
|
||||||
|
name = "command_executor"
|
||||||
|
interval = 60
|
||||||
|
|
||||||
|
def __init__(self, config=None):
|
||||||
|
super().__init__(config)
|
||||||
|
self.command = self.config.get('command', 'echo "no command"')
|
||||||
|
|
||||||
|
async def initialize(self):
|
||||||
|
return True
|
||||||
|
|
||||||
|
async def collect(self) -> Dict[str, Any]:
|
||||||
|
try:
|
||||||
|
process = await asyncio.create_subprocess_shell(
|
||||||
|
self.command,
|
||||||
|
stdout=asyncio.subprocess.PIPE,
|
||||||
|
stderr=asyncio.subprocess.PIPE
|
||||||
|
)
|
||||||
|
stdout, stderr = await asyncio.wait_for(
|
||||||
|
process.communicate(),
|
||||||
|
timeout=30
|
||||||
|
)
|
||||||
|
|
||||||
|
return {
|
||||||
|
'exit_code': process.returncode,
|
||||||
|
'stdout': stdout.decode('utf-8'),
|
||||||
|
'stderr': stderr.decode('utf-8')
|
||||||
|
}
|
||||||
|
except Exception as e:
|
||||||
|
return {'error': str(e)}
|
||||||
|
|
||||||
|
async def cleanup(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
plugin = CommandPlugin
|
||||||
|
```
|
||||||
|
|
||||||
|
## Testing
|
||||||
|
|
||||||
|
### Unit Testing
|
||||||
|
|
||||||
|
Create unit tests for your plugins:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import unittest
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
class TestMyPlugin(unittest.TestCase):
|
||||||
|
def setUp(self):
|
||||||
|
self.plugin = MyAwesomePlugin({'option1': 'test'})
|
||||||
|
|
||||||
|
def test_initialization(self):
|
||||||
|
result = asyncio.run(self.plugin.initialize())
|
||||||
|
self.assertTrue(result)
|
||||||
|
|
||||||
|
def test_collection(self):
|
||||||
|
asyncio.run(self.plugin.initialize())
|
||||||
|
data = asyncio.run(self.plugin.collect())
|
||||||
|
|
||||||
|
self.assertIsInstance(data, dict)
|
||||||
|
self.assertIn('metric1', data)
|
||||||
|
self.assertGreater(data['metric1'], 0)
|
||||||
|
|
||||||
|
def tearDown(self):
|
||||||
|
asyncio.run(self.plugin.cleanup())
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
unittest.main()
|
||||||
|
```
|
||||||
|
|
||||||
|
### Integration Testing
|
||||||
|
|
||||||
|
Test your plugin with the actual client:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Create test configuration
|
||||||
|
cat > test_config.yaml <<EOF
|
||||||
|
server: localhost
|
||||||
|
plugins:
|
||||||
|
my_awesome_plugin:
|
||||||
|
enabled: true
|
||||||
|
interval: 10
|
||||||
|
option1: test_value
|
||||||
|
EOF
|
||||||
|
|
||||||
|
# Run client in test mode
|
||||||
|
python -m hbd.hbc -c test_config.yaml --verbose
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### My plugin isn't loading
|
||||||
|
|
||||||
|
1. Check that the filename doesn't start with an underscore
|
||||||
|
2. Verify plugin class inherits from InfoPlugin or MonitorPlugin
|
||||||
|
3. Check `initialize()` returns True
|
||||||
|
4. Look for import errors in logs
|
||||||
|
|
||||||
|
### Plugin loads but doesn't collect data
|
||||||
|
|
||||||
|
1. Check `interval` is set correctly (0 for InfoPlugin, > 0 for MonitorPlugin)
|
||||||
|
2. Verify `collect()` returns a dictionary
|
||||||
|
3. Check for exceptions in `collect()` method
|
||||||
|
4. Enable DEBUG logging to see detailed errors
|
||||||
|
|
||||||
|
### Data isn't appearing on server
|
||||||
|
|
||||||
|
1. Verify client is connected to server
|
||||||
|
2. Check server logs for PLG message handling
|
||||||
|
3. Verify returned data is JSON-serializable
|
||||||
|
4. Check for large data sizes (may exceed UDP packet size)
|
||||||
|
|
||||||
|
## Further Reading
|
||||||
|
|
||||||
|
- [Plugin Framework Source](../hbd/plugin.py) - Core plugin implementation
|
||||||
|
- [Built-in Plugins](../hbd/plugins/) - Examples of working plugins
|
||||||
|
- [Nagios Integration](NAGIOS_INTEGRATION.md) - Running external plugins
|
||||||
|
- [Configuration Guide](../hbd/config_example.yaml) - Full configuration reference
|
||||||
File diff suppressed because it is too large
Load Diff
+242
@@ -0,0 +1,242 @@
|
|||||||
|
# User Management
|
||||||
|
|
||||||
|
Heartbeat supports optional user accounts with role-based access control per host. When no users are configured the server runs in **unauthenticated mode** — all existing behaviour is unchanged.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Users are defined in the server config file. Each host can have an **owner**, zero or more **managers**, and zero or more **monitors**. A **default owner** catches any host that does not name an explicit owner.
|
||||||
|
|
||||||
|
### Roles
|
||||||
|
|
||||||
|
| Role | Inherits | Permissions |
|
||||||
|
|------|----------|-------------|
|
||||||
|
| **monitor** | — | View host status, plugin data, alerts; acknowledge alerts they were notified for |
|
||||||
|
| **manager** | monitor | + Queue commands (`/c`), trigger DNS re-registration (`/n`), queue upgrades (`/u`); add/remove monitors |
|
||||||
|
| **owner** | manager | + Drop host (`/d`); add/remove managers; transfer ownership; update host access |
|
||||||
|
| **admin** *(flag)* | owner on all hosts | Full access to every host and the user list |
|
||||||
|
|
||||||
|
`admin` is a flag on the user, not a per-host role. An admin user has owner-level access on every host without being listed as owner/manager/monitor.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
### Defining users
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
users:
|
||||||
|
andreas:
|
||||||
|
full_name: Andreas Wrede
|
||||||
|
avatar: /path/to/avatar.png # file path, URL, or base64 data URI (optional)
|
||||||
|
password: pbkdf2:sha256:... # generated with: hbd passwd andreas
|
||||||
|
admin: true # optional — grants server-wide owner access
|
||||||
|
|
||||||
|
bob:
|
||||||
|
full_name: Bob Smith
|
||||||
|
password: pbkdf2:sha256:...
|
||||||
|
notification_channels: [pushover_standard]
|
||||||
|
|
||||||
|
carol:
|
||||||
|
full_name: Carol Jones
|
||||||
|
password: pbkdf2:sha256:...
|
||||||
|
|
||||||
|
default_owner: andreas # owns hosts with no explicit owner
|
||||||
|
# falls back to the first admin user if omitted
|
||||||
|
```
|
||||||
|
|
||||||
|
### Assigning roles to hosts
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
hosts:
|
||||||
|
webserver01:
|
||||||
|
owner: andreas
|
||||||
|
managers: [bob]
|
||||||
|
monitors: [carol]
|
||||||
|
threshold_config: default
|
||||||
|
watch: true
|
||||||
|
notification_channels: [pushover_standard]
|
||||||
|
|
||||||
|
unattended-host: # no owner → owned by default_owner
|
||||||
|
threshold_config: default
|
||||||
|
watch: true
|
||||||
|
```
|
||||||
|
|
||||||
|
### Generating a password hash
|
||||||
|
|
||||||
|
```bash
|
||||||
|
hbd passwd andreas
|
||||||
|
```
|
||||||
|
|
||||||
|
Enter and confirm the password when prompted. Paste the printed hash into the config file under the user's `password` key.
|
||||||
|
|
||||||
|
You can also generate a hash non-interactively from Python:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from hbd.server.users import hash_password
|
||||||
|
print(hash_password("mysecret"))
|
||||||
|
```
|
||||||
|
|
||||||
|
Passwords are stored as PBKDF2-HMAC-SHA256 hashes (260 000 iterations). No third-party libraries are required — only Python's standard `hashlib`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Authentication
|
||||||
|
|
||||||
|
When at least one user is defined, every request must be authenticated. Unauthenticated requests to HTML pages are redirected to `/login`; unauthenticated API requests receive `401 Unauthorized`.
|
||||||
|
|
||||||
|
### Browser login
|
||||||
|
|
||||||
|
Navigate to any page — you will be redirected to `/login` automatically. After submitting valid credentials the server sets an `hbd_session` cookie (HttpOnly, SameSite=Lax, 24 h lifetime). All subsequent requests, including JavaScript `fetch()` calls on the dashboards, carry the cookie automatically.
|
||||||
|
|
||||||
|
To log out, visit `/logout`.
|
||||||
|
|
||||||
|
### API / programmatic login
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Log in and capture the token
|
||||||
|
TOKEN=$(curl -s -X POST http://localhost:50004/api/0/auth/login \
|
||||||
|
-H 'Content-Type: application/json' \
|
||||||
|
-d '{"username":"andreas","password":"mysecret"}' | jq -r .token)
|
||||||
|
|
||||||
|
# Use the token in subsequent requests
|
||||||
|
curl -H "Authorization: Bearer $TOKEN" http://localhost:50004/api/0/hosts
|
||||||
|
```
|
||||||
|
|
||||||
|
The token is identical to the session cookie value — both mechanisms work simultaneously.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Log out
|
||||||
|
curl -s -X POST http://localhost:50004/api/0/auth/logout \
|
||||||
|
-H "Authorization: Bearer $TOKEN"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## API Endpoints
|
||||||
|
|
||||||
|
### Authentication
|
||||||
|
|
||||||
|
#### POST /api/0/auth/login
|
||||||
|
Obtain a session token.
|
||||||
|
|
||||||
|
**Request body:**
|
||||||
|
```json
|
||||||
|
{ "username": "andreas", "password": "mysecret" }
|
||||||
|
```
|
||||||
|
|
||||||
|
**Response:**
|
||||||
|
```json
|
||||||
|
{ "token": "<opaque-hex-token>", "username": "andreas" }
|
||||||
|
```
|
||||||
|
Also sets the `hbd_session` cookie for browser clients.
|
||||||
|
|
||||||
|
**Status codes:** `200 OK`, `401 Unauthorized`, `404` (auth not configured)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### POST /api/0/auth/logout
|
||||||
|
Invalidate the current session.
|
||||||
|
|
||||||
|
**Headers:** `Authorization: Bearer <token>` or cookie
|
||||||
|
|
||||||
|
**Response:** `{ "success": true }`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Users
|
||||||
|
|
||||||
|
#### GET /api/0/users
|
||||||
|
List all users. **Admin only.**
|
||||||
|
|
||||||
|
**Response:**
|
||||||
|
```json
|
||||||
|
[
|
||||||
|
{ "username": "andreas", "full_name": "Andreas Wrede", "avatar": "", "admin": true, "notification_channels": [] },
|
||||||
|
{ "username": "bob", "full_name": "Bob Smith", "avatar": "", "admin": false, "notification_channels": ["pushover_standard"] }
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### GET /api/0/users/me
|
||||||
|
Return the currently authenticated user's profile.
|
||||||
|
|
||||||
|
**Response:**
|
||||||
|
```json
|
||||||
|
{ "username": "carol", "full_name": "Carol Jones", "avatar": "", "admin": false, "notification_channels": [] }
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Host Access
|
||||||
|
|
||||||
|
#### GET /api/0/hosts/{hostname}/access
|
||||||
|
Return owner/managers/monitors for a host. Requires at least **monitor** role.
|
||||||
|
|
||||||
|
**Response:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"owner": "andreas",
|
||||||
|
"managers": ["bob"],
|
||||||
|
"monitors": ["carol"]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### PUT /api/0/hosts/{hostname}/access
|
||||||
|
Update owner/managers/monitors. Requires **owner** role or admin.
|
||||||
|
|
||||||
|
**Request body** (all fields optional):
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"owner": "bob",
|
||||||
|
"managers": ["carol"],
|
||||||
|
"monitors": []
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Changes take effect immediately in memory. They are not written back to the config file — reload (`SIGHUP`) will re-apply config values. To make changes permanent, update the config file.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Host visibility
|
||||||
|
|
||||||
|
When users are configured, `GET /api/0/hosts` only returns hosts the authenticated user has at least monitor access to. Admins see all hosts.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Config reload
|
||||||
|
|
||||||
|
On `SIGHUP`, the server reloads the config file, re-loads the user registry, and re-applies `owner`/`managers`/`monitors` from config to all known hosts. Existing sessions remain valid after a reload.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## No-auth mode
|
||||||
|
|
||||||
|
If `users:` is absent or empty, the server starts in **unauthenticated mode**:
|
||||||
|
|
||||||
|
- No login required — all pages and API endpoints are accessible without credentials.
|
||||||
|
- All permission checks pass unconditionally.
|
||||||
|
- `/login`, `/logout`, and the auth/user API endpoints return `404`.
|
||||||
|
|
||||||
|
This preserves full backwards compatibility with existing deployments.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Security notes
|
||||||
|
|
||||||
|
- Session tokens are 64-character cryptographically random hex strings (`secrets.token_hex(32)`).
|
||||||
|
- Sessions expire after 24 hours (configurable via `users_mod.SESSION_TTL`).
|
||||||
|
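- Cookies are `HttpOnly` and `SameSite=Lax` — they are not accessible to JavaScript and are not sent on cross-site subrequests (such as `fetch()` calls, embedded resources, or cross-site form POSTs); they are still sent on top-level cross-site navigations such as following a link.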
|
||||||
|
- The HTTP API does not yet enforce TLS. For production use, place hbd behind a TLS-terminating reverse proxy (nginx, Caddy, etc.) or enable WSS.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## See Also
|
||||||
|
|
||||||
|
- [HTTP API Documentation](HTTP_API.md)
|
||||||
|
- [Notifications](NOTIFICATIONS.md)
|
||||||
|
- Configuration example: `hbd/config_example.yaml`
|
||||||
@@ -0,0 +1,602 @@
|
|||||||
|
# Plugin Error Checking Implementation Plan
|
||||||
|
|
||||||
|
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
|
||||||
|
|
||||||
|
**Goal:** Improve plugin error checking in hbc, especially for nagios_runner, and fix logger messages silently discarded in daemon mode.
|
||||||
|
|
||||||
|
**Architecture:** Three focused changes across three files: (1) `hbd/client/plugin.py` gains a `skip_reason` attribute on Plugin and updated PluginLoader messaging; (2) `hbd/client/plugins/nagios_runner.py` gains async subprocess execution, stderr capture, signal-killed process handling, and init-time command path validation; (3) `hbd/client/main.py` gains proper post-fork logging reconfiguration to syslog.
|
||||||
|
|
||||||
|
**Tech Stack:** Python 3.11+, asyncio, `logging.handlers.SysLogHandler`, pytest
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## File Map
|
||||||
|
|
||||||
|
| Action | Path | What changes |
|
||||||
|
|---|---|---|
|
||||||
|
| Modify | `hbd/client/plugin.py` | `Plugin.__init__` gains `skip_reason`; `PluginLoader` checks it |
|
||||||
|
| Modify | `hbd/client/plugins/nagios_runner.py` | async subprocess, stderr, signal codes, init validation, `skip_reason` |
|
||||||
|
| Modify | `hbd/client/main.py` | `_reconfigure_logging_for_daemon()` helper; remove redundant syslog calls |
|
||||||
|
| Create | `tests/test_plugin.py` | PluginLoader messaging tests |
|
||||||
|
| Create | `tests/test_nagios_runner.py` | NagiosRunnerPlugin behaviour tests |
|
||||||
|
|
||||||
|
Run tests throughout with:
|
||||||
|
```bash
|
||||||
|
python -m pytest tests/test_plugin.py tests/test_nagios_runner.py -v
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Task 1: Plugin.skip_reason + PluginLoader messaging
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `hbd/client/plugin.py:40-48` (Plugin.__init__)
|
||||||
|
- Modify: `hbd/client/plugin.py:369-381` (PluginLoader.load_from_directory)
|
||||||
|
- Create: `tests/test_plugin.py`
|
||||||
|
|
||||||
|
- [ ] **Step 1: Write failing tests**
|
||||||
|
|
||||||
|
Create `tests/test_plugin.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import textwrap
|
||||||
|
|
||||||
|
from hbd.client.plugin import Plugin, PluginLoader, PluginRegistry
|
||||||
|
|
||||||
|
|
||||||
|
def test_plugin_skip_reason_defaults_none(tmp_path):
|
||||||
|
plugin_code = textwrap.dedent("""
|
||||||
|
from hbd.client.plugin import MonitorPlugin
|
||||||
|
|
||||||
|
class MinimalPlugin(MonitorPlugin):
|
||||||
|
name = "minimal"
|
||||||
|
version = "1.0.0"
|
||||||
|
interval = 60
|
||||||
|
|
||||||
|
async def initialize(self):
|
||||||
|
return True
|
||||||
|
|
||||||
|
async def _collect_metrics(self):
|
||||||
|
return {}
|
||||||
|
""")
|
||||||
|
(tmp_path / "minimal.py").write_text(plugin_code)
|
||||||
|
registry = PluginRegistry()
|
||||||
|
loader = PluginLoader(registry)
|
||||||
|
asyncio.run(loader.load_from_directory(tmp_path))
|
||||||
|
plugin = registry.get("minimal")
|
||||||
|
assert plugin is not None
|
||||||
|
assert plugin.skip_reason is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_loader_logs_info_when_skip_reason_set(tmp_path, caplog):
|
||||||
|
plugin_code = textwrap.dedent("""
|
||||||
|
from hbd.client.plugin import MonitorPlugin
|
||||||
|
|
||||||
|
class SkippablePlugin(MonitorPlugin):
|
||||||
|
name = "skippable"
|
||||||
|
version = "1.0.0"
|
||||||
|
interval = 60
|
||||||
|
|
||||||
|
async def initialize(self):
|
||||||
|
self.skip_reason = "not configured in yaml"
|
||||||
|
return False
|
||||||
|
|
||||||
|
async def _collect_metrics(self):
|
||||||
|
return {}
|
||||||
|
""")
|
||||||
|
(tmp_path / "skippable.py").write_text(plugin_code)
|
||||||
|
registry = PluginRegistry()
|
||||||
|
loader = PluginLoader(registry)
|
||||||
|
|
||||||
|
with caplog.at_level(logging.INFO, logger="plugin.loader"):
|
||||||
|
count = asyncio.run(loader.load_from_directory(tmp_path))
|
||||||
|
|
||||||
|
assert count == 0
|
||||||
|
assert any("skipped: not configured in yaml" in r.message for r in caplog.records)
|
||||||
|
assert not any("failed initialization" in r.message for r in caplog.records)
|
||||||
|
|
||||||
|
|
||||||
|
def test_loader_logs_warning_when_no_skip_reason(tmp_path, caplog):
|
||||||
|
plugin_code = textwrap.dedent("""
|
||||||
|
from hbd.client.plugin import MonitorPlugin
|
||||||
|
|
||||||
|
class FailPlugin(MonitorPlugin):
|
||||||
|
name = "fail"
|
||||||
|
version = "1.0.0"
|
||||||
|
interval = 60
|
||||||
|
|
||||||
|
async def initialize(self):
|
||||||
|
return False
|
||||||
|
|
||||||
|
async def _collect_metrics(self):
|
||||||
|
return {}
|
||||||
|
""")
|
||||||
|
(tmp_path / "fail_plugin.py").write_text(plugin_code)
|
||||||
|
registry = PluginRegistry()
|
||||||
|
loader = PluginLoader(registry)
|
||||||
|
|
||||||
|
with caplog.at_level(logging.WARNING, logger="plugin.loader"):
|
||||||
|
count = asyncio.run(loader.load_from_directory(tmp_path))
|
||||||
|
|
||||||
|
assert count == 0
|
||||||
|
assert any("failed initialization" in r.message for r in caplog.records)
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 2: Run tests to verify they fail**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m pytest tests/test_plugin.py -v
|
||||||
|
```
|
||||||
|
Expected: `test_plugin_skip_reason_defaults_none` FAILS (attribute missing), others may error.
|
||||||
|
|
||||||
|
- [ ] **Step 3: Add `skip_reason` to `Plugin.__init__`**
|
||||||
|
|
||||||
|
In `hbd/client/plugin.py`, in `Plugin.__init__` (around line 46), add one line:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
||||||
|
self.config = config or {}
|
||||||
|
self.logger = logging.getLogger(f"plugin.{self.name}")
|
||||||
|
self._initialized = False
|
||||||
|
self.skip_reason: Optional[str] = None
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 4: Update PluginLoader messaging**
|
||||||
|
|
||||||
|
In `hbd/client/plugin.py`, replace the `if not initialized:` block (around line 372):
|
||||||
|
|
||||||
|
```python
|
||||||
|
if not initialized:
|
||||||
|
if plugin.skip_reason:
|
||||||
|
self.logger.info(
|
||||||
|
f"Plugin {plugin.name} skipped: {plugin.skip_reason}"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
self.logger.warning(
|
||||||
|
f"Plugin {plugin.name} failed initialization, skipping"
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 5: Run tests to verify they pass**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m pytest tests/test_plugin.py -v
|
||||||
|
```
|
||||||
|
Expected: all 3 tests PASS.
|
||||||
|
|
||||||
|
- [ ] **Step 6: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add hbd/client/plugin.py tests/test_plugin.py
|
||||||
|
git commit -m "feat: add skip_reason to Plugin; improve PluginLoader init messaging"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Task 2: NagiosRunnerPlugin — skip_reason when no commands
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `hbd/client/plugins/nagios_runner.py:88-105` (initialize)
|
||||||
|
- Create: `tests/test_nagios_runner.py`
|
||||||
|
|
||||||
|
- [ ] **Step 1: Write failing test**
|
||||||
|
|
||||||
|
Create `tests/test_nagios_runner.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import stat
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from hbd.client.plugins.nagios_runner import (
|
||||||
|
NagiosRunnerPlugin,
|
||||||
|
NAGIOS_OK,
|
||||||
|
NAGIOS_WARNING,
|
||||||
|
NAGIOS_CRITICAL,
|
||||||
|
NAGIOS_UNKNOWN,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_no_commands_sets_skip_reason():
|
||||||
|
plugin = NagiosRunnerPlugin(config={"commands": []})
|
||||||
|
result = asyncio.run(plugin.initialize())
|
||||||
|
assert result is False
|
||||||
|
assert plugin.skip_reason is not None
|
||||||
|
assert "nagios_runner.commands" in plugin.skip_reason
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 2: Run test to verify it fails**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m pytest tests/test_nagios_runner.py::test_no_commands_sets_skip_reason -v
|
||||||
|
```
|
||||||
|
Expected: FAIL — `plugin.skip_reason` is `None`.
|
||||||
|
|
||||||
|
- [ ] **Step 3: Set skip_reason in NagiosRunnerPlugin.initialize()**
|
||||||
|
|
||||||
|
In `hbd/client/plugins/nagios_runner.py`, replace the early-return block in `initialize()` (around line 96):
|
||||||
|
|
||||||
|
```python
|
||||||
|
if not self.commands:
|
||||||
|
self.skip_reason = "no commands configured (add nagios_runner.commands to config)"
|
||||||
|
self.logger.info("No Nagios commands configured")
|
||||||
|
return False
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 4: Run test to verify it passes**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m pytest tests/test_nagios_runner.py::test_no_commands_sets_skip_reason -v
|
||||||
|
```
|
||||||
|
Expected: PASS.
|
||||||
|
|
||||||
|
- [ ] **Step 5: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add hbd/client/plugins/nagios_runner.py tests/test_nagios_runner.py
|
||||||
|
git commit -m "feat: set skip_reason on nagios_runner when no commands configured"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Task 3: NagiosRunnerPlugin — async subprocess, stderr capture, negative return codes
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `hbd/client/plugins/nagios_runner.py` (imports + `_run_nagios_plugin`)
|
||||||
|
- Modify: `tests/test_nagios_runner.py`
|
||||||
|
|
||||||
|
- [ ] **Step 1: Write failing tests**
|
||||||
|
|
||||||
|
Append to `tests/test_nagios_runner.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def test_stderr_used_when_stdout_empty(tmp_path):
|
||||||
|
script = tmp_path / "check_err.sh"
|
||||||
|
script.write_text("#!/bin/sh\necho 'error from stderr' >&2\nexit 2\n")
|
||||||
|
script.chmod(script.stat().st_mode | stat.S_IEXEC)
|
||||||
|
|
||||||
|
config = {"commands": [{"name": "t", "command": str(script)}], "timeout": 5}
|
||||||
|
plugin = NagiosRunnerPlugin(config=config)
|
||||||
|
asyncio.run(plugin.initialize())
|
||||||
|
data = asyncio.run(plugin._collect_metrics())
|
||||||
|
|
||||||
|
assert "error from stderr" in data["t_output"]
|
||||||
|
assert data["t_status_code"] == NAGIOS_CRITICAL
|
||||||
|
|
||||||
|
|
||||||
|
def test_stderr_appended_when_both_present(tmp_path):
|
||||||
|
script = tmp_path / "check_both.sh"
|
||||||
|
script.write_text("#!/bin/sh\necho 'OK - all good'\necho 'extra detail' >&2\nexit 0\n")
|
||||||
|
script.chmod(script.stat().st_mode | stat.S_IEXEC)
|
||||||
|
|
||||||
|
config = {"commands": [{"name": "t", "command": str(script)}], "timeout": 5}
|
||||||
|
plugin = NagiosRunnerPlugin(config=config)
|
||||||
|
asyncio.run(plugin.initialize())
|
||||||
|
data = asyncio.run(plugin._collect_metrics())
|
||||||
|
|
||||||
|
assert "OK - all good" in data["t_output"]
|
||||||
|
assert "extra detail" in data["t_output"]
|
||||||
|
assert data["t_status_code"] == NAGIOS_OK
|
||||||
|
|
||||||
|
|
||||||
|
def test_negative_returncode_maps_to_unknown():
|
||||||
|
# kill -9 $$ kills the shell itself; asyncio sees returncode -9
|
||||||
|
config = {"commands": [{"name": "t", "command": "kill -9 $$"}], "timeout": 5}
|
||||||
|
plugin = NagiosRunnerPlugin(config=config)
|
||||||
|
asyncio.run(plugin.initialize())
|
||||||
|
data = asyncio.run(plugin._collect_metrics())
|
||||||
|
|
||||||
|
assert data["t_status_code"] == NAGIOS_UNKNOWN
|
||||||
|
assert "signal" in data["t_output"].lower()
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 2: Run tests to verify they fail**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m pytest tests/test_nagios_runner.py::test_stderr_used_when_stdout_empty \
|
||||||
|
tests/test_nagios_runner.py::test_stderr_appended_when_both_present \
|
||||||
|
tests/test_nagios_runner.py::test_negative_returncode_maps_to_unknown -v
|
||||||
|
```
|
||||||
|
Expected: all FAIL — current implementation ignores stderr and doesn't handle negative codes.
|
||||||
|
|
||||||
|
- [ ] **Step 3: Update imports in nagios_runner.py**
|
||||||
|
|
||||||
|
Replace the import block at the top of `hbd/client/plugins/nagios_runner.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
|
from hbd.client.plugin import MonitorPlugin
|
||||||
|
```
|
||||||
|
|
||||||
|
(Remove `import subprocess`; add `import asyncio` and `import os`.)
|
||||||
|
|
||||||
|
- [ ] **Step 4: Upgrade collection log level from DEBUG to INFO**
|
||||||
|
|
||||||
|
In `hbd/client/plugins/nagios_runner.py`, in `_collect_metrics()`, change the debug log (around line 144) so results are visible at INFO level:
|
||||||
|
|
||||||
|
```python
|
||||||
|
self.logger.info(
|
||||||
|
f"Executed {name}: {STATUS_NAMES.get(status_code, 'UNKNOWN')} - {output[:50]}"
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 5: Replace `_run_nagios_plugin` with async implementation**
|
||||||
|
|
||||||
|
Replace the entire `_run_nagios_plugin` method in `hbd/client/plugins/nagios_runner.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
async def _run_nagios_plugin(
|
||||||
|
self,
|
||||||
|
command: str
|
||||||
|
) -> Tuple[int, str, Dict[str, Any]]:
|
||||||
|
"""Execute a Nagios plugin and parse its output."""
|
||||||
|
try:
|
||||||
|
proc = await asyncio.create_subprocess_shell(
|
||||||
|
command,
|
||||||
|
stdout=asyncio.subprocess.PIPE,
|
||||||
|
stderr=asyncio.subprocess.PIPE,
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
stdout_bytes, stderr_bytes = await asyncio.wait_for(
|
||||||
|
proc.communicate(), timeout=self.timeout
|
||||||
|
)
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
proc.kill()
|
||||||
|
await proc.communicate()
|
||||||
|
self.logger.error(f"Command timed out: {command}")
|
||||||
|
return NAGIOS_UNKNOWN, f"Command timed out after {self.timeout}s", {}
|
||||||
|
|
||||||
|
status_code = proc.returncode
|
||||||
|
|
||||||
|
if status_code < 0:
|
||||||
|
return NAGIOS_UNKNOWN, f"Process killed by signal {-status_code}", {}
|
||||||
|
|
||||||
|
if status_code > 3:
|
||||||
|
status_code = NAGIOS_UNKNOWN
|
||||||
|
|
||||||
|
stdout = stdout_bytes.decode(errors="replace").strip()
|
||||||
|
stderr = stderr_bytes.decode(errors="replace").strip()
|
||||||
|
|
||||||
|
# Parse perfdata from stdout before mixing in stderr
|
||||||
|
perfdata = self._parse_perfdata(stdout)
|
||||||
|
|
||||||
|
# Build status message
|
||||||
|
status_part = stdout.split('|')[0].strip() if '|' in stdout else stdout
|
||||||
|
|
||||||
|
if not stdout and stderr:
|
||||||
|
output_msg = stderr
|
||||||
|
elif stdout and stderr:
|
||||||
|
output_msg = f"{status_part} [stderr: {stderr}]"
|
||||||
|
else:
|
||||||
|
output_msg = status_part
|
||||||
|
|
||||||
|
return status_code, output_msg, perfdata
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.error(f"Error executing command: {e}")
|
||||||
|
return NAGIOS_UNKNOWN, f"Execution error: {str(e)}", {}
|
||||||
|
```
|
||||||
|
|
||||||
|
Also remove the now-unused `self.shell` line from `__init__` (the `shell` config key is no longer used since `create_subprocess_shell` always uses a shell):
|
||||||
|
|
||||||
|
In `NagiosRunnerPlugin.__init__`, remove:
|
||||||
|
```python
|
||||||
|
self.shell: bool = config.get("shell", True) if config else True
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 6: Run tests to verify they pass**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m pytest tests/test_nagios_runner.py -v
|
||||||
|
```
|
||||||
|
Expected: all tests PASS including the 3 new ones.
|
||||||
|
|
||||||
|
- [ ] **Step 7: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add hbd/client/plugins/nagios_runner.py tests/test_nagios_runner.py
|
||||||
|
git commit -m "feat: async subprocess in nagios_runner with stderr capture and signal handling"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Task 4: NagiosRunnerPlugin — command path validation at init
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `hbd/client/plugins/nagios_runner.py` (initialize)
|
||||||
|
- Modify: `tests/test_nagios_runner.py`
|
||||||
|
|
||||||
|
- [ ] **Step 1: Write failing tests**
|
||||||
|
|
||||||
|
Append to `tests/test_nagios_runner.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def test_absolute_path_not_found_warns(caplog):
|
||||||
|
fake_cmd = "/nonexistent_hbc_test_path/check_something"
|
||||||
|
config = {"commands": [{"name": "t", "command": fake_cmd}]}
|
||||||
|
plugin = NagiosRunnerPlugin(config=config)
|
||||||
|
|
||||||
|
with caplog.at_level(logging.WARNING, logger="plugin.nagios_runner"):
|
||||||
|
asyncio.run(plugin.initialize())
|
||||||
|
|
||||||
|
assert any("not found" in r.message for r in caplog.records)
|
||||||
|
|
||||||
|
|
||||||
|
def test_absolute_path_not_executable_warns(caplog, tmp_path):
|
||||||
|
non_exec = tmp_path / "check_test"
|
||||||
|
non_exec.write_text("#!/bin/sh\necho OK\n")
|
||||||
|
non_exec.chmod(0o644) # readable but not executable
|
||||||
|
|
||||||
|
config = {"commands": [{"name": "t", "command": str(non_exec)}]}
|
||||||
|
plugin = NagiosRunnerPlugin(config=config)
|
||||||
|
|
||||||
|
with caplog.at_level(logging.WARNING, logger="plugin.nagios_runner"):
|
||||||
|
asyncio.run(plugin.initialize())
|
||||||
|
|
||||||
|
assert any("not executable" in r.message for r in caplog.records)
|
||||||
|
|
||||||
|
|
||||||
|
def test_relative_path_not_checked(caplog):
|
||||||
|
# Relative paths (resolved via PATH) must not generate warnings
|
||||||
|
config = {"commands": [{"name": "t", "command": "echo OK"}]}
|
||||||
|
plugin = NagiosRunnerPlugin(config=config)
|
||||||
|
|
||||||
|
with caplog.at_level(logging.WARNING, logger="plugin.nagios_runner"):
|
||||||
|
asyncio.run(plugin.initialize())
|
||||||
|
|
||||||
|
assert not any(
|
||||||
|
"not found" in r.message or "not executable" in r.message
|
||||||
|
for r in caplog.records
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 2: Run tests to verify they fail**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m pytest tests/test_nagios_runner.py::test_absolute_path_not_found_warns \
|
||||||
|
tests/test_nagios_runner.py::test_absolute_path_not_executable_warns \
|
||||||
|
tests/test_nagios_runner.py::test_relative_path_not_checked -v
|
||||||
|
```
|
||||||
|
Expected: `test_absolute_path_not_found_warns` and `test_absolute_path_not_executable_warns` FAIL (no warnings logged); `test_relative_path_not_checked` may pass.
|
||||||
|
|
||||||
|
- [ ] **Step 3: Add command path validation to `initialize()`**
|
||||||
|
|
||||||
|
In `hbd/client/plugins/nagios_runner.py`, extend `initialize()` by adding validation after the existing "log each command" loop (after line 103, before `return True`):
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Validate absolute command paths early
|
||||||
|
for cmd_config in self.commands:
|
||||||
|
name = cmd_config.get("name", "unnamed")
|
||||||
|
command = cmd_config.get("command", "")
|
||||||
|
if not command:
|
||||||
|
continue
|
||||||
|
exe = command.split()[0]
|
||||||
|
if os.path.isabs(exe):
|
||||||
|
if not os.path.isfile(exe):
|
||||||
|
self.logger.warning(
|
||||||
|
f"Command '{name}': executable not found: {exe}"
|
||||||
|
)
|
||||||
|
elif not os.access(exe, os.X_OK):
|
||||||
|
self.logger.warning(
|
||||||
|
f"Command '{name}': executable not executable: {exe}"
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 4: Run full test suite to verify all pass**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m pytest tests/test_plugin.py tests/test_nagios_runner.py -v
|
||||||
|
```
|
||||||
|
Expected: all tests PASS.
|
||||||
|
|
||||||
|
- [ ] **Step 5: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add hbd/client/plugins/nagios_runner.py tests/test_nagios_runner.py
|
||||||
|
git commit -m "feat: validate absolute command paths at nagios_runner init"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Task 5: Daemon mode logging — route to syslog after fork
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `hbd/client/main.py` (new helper + updated daemon block)
|
||||||
|
|
||||||
|
No automated test for daemonization itself (fork behaviour is hard to unit-test). Manual verification steps are provided below.
|
||||||
|
|
||||||
|
- [ ] **Step 1: Add `_reconfigure_logging_for_daemon` helper**
|
||||||
|
|
||||||
|
In `hbd/client/main.py`, add this function just before `def build_parser()` (around line 589):
|
||||||
|
|
||||||
|
```python
|
||||||
|
def _reconfigure_logging_for_daemon(log_level: int) -> None:
|
||||||
|
"""Replace StreamHandlers (now writing to /dev/null) with a SysLogHandler."""
|
||||||
|
from logging.handlers import SysLogHandler
|
||||||
|
|
||||||
|
root = logging.getLogger()
|
||||||
|
for handler in root.handlers[:]:
|
||||||
|
root.removeHandler(handler)
|
||||||
|
handler.close()
|
||||||
|
|
||||||
|
try:
|
||||||
|
syslog_handler = SysLogHandler(
|
||||||
|
address="/dev/log",
|
||||||
|
facility=SysLogHandler.LOG_DAEMON,
|
||||||
|
)
|
||||||
|
except OSError:
|
||||||
|
syslog_handler = SysLogHandler(
|
||||||
|
address=("localhost", 514),
|
||||||
|
facility=SysLogHandler.LOG_DAEMON,
|
||||||
|
)
|
||||||
|
# Attach the fallback first so the warning reaches syslog
|
||||||
|
syslog_handler.setFormatter(
|
||||||
|
logging.Formatter("hbc[%(process)d]: %(name)s %(levelname)s: %(message)s")
|
||||||
|
)
|
||||||
|
root.addHandler(syslog_handler)
|
||||||
|
root.setLevel(log_level)
|
||||||
|
logging.warning("/dev/log not found, using syslog UDP localhost:514")
|
||||||
|
return
|
||||||
|
|
||||||
|
syslog_handler.setFormatter(
|
||||||
|
logging.Formatter("hbc[%(process)d]: %(name)s %(levelname)s: %(message)s")
|
||||||
|
)
|
||||||
|
root.addHandler(syslog_handler)
|
||||||
|
root.setLevel(log_level)
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 2: Update the daemon block in `main()`**
|
||||||
|
|
||||||
|
In `hbd/client/main.py`, replace the entire `if args.daemon:` block (lines 664–675):
|
||||||
|
|
||||||
|
```python
|
||||||
|
if args.daemon:
|
||||||
|
print("Daemonizing...")
|
||||||
|
daemonize()
|
||||||
|
_reconfigure_logging_for_daemon(log_level)
|
||||||
|
logging.info(f"hbc starting, sending heartbeat to {', '.join(args.hosts)}")
|
||||||
|
```
|
||||||
|
|
||||||
|
This removes the `import syslog`, `syslog.openlog()`, and `syslog.syslog()` calls (now handled by the logging system) and removes the no-op second `logging.basicConfig()` call.
|
||||||
|
|
||||||
|
- [ ] **Step 3: Run existing test suite to confirm no regressions**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m pytest tests/test_plugin.py tests/test_nagios_runner.py -v
|
||||||
|
```
|
||||||
|
Expected: all tests still PASS.
|
||||||
|
|
||||||
|
- [ ] **Step 4: Manual smoke test — verify syslog output in daemon mode**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# In one terminal, tail syslog
|
||||||
|
sudo journalctl -f -t hbc
|
||||||
|
|
||||||
|
# In another terminal, start hbc in daemon mode (replace localhost with a real or dummy host)
|
||||||
|
python -m hbd.client.main -d -v localhost
|
||||||
|
|
||||||
|
# Expected in journalctl output:
|
||||||
|
# hbc[<pid>]: hbc.main INFO: Starting hbc for <hostname> -> ['localhost']
|
||||||
|
# hbc[<pid>]: hbc.main INFO: hbc starting, sending heartbeat to localhost
|
||||||
|
# hbc[<pid>]: plugin.loader INFO: ...
|
||||||
|
|
||||||
|
# Stop the daemon
|
||||||
|
pkill -f "hbd.client.main"
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 5: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add hbd/client/main.py
|
||||||
|
git commit -m "fix: reconfigure logging to syslog after daemonize() instead of no-op basicConfig"
|
||||||
|
```
|
||||||
@@ -0,0 +1,92 @@
|
|||||||
|
# Plugin Error Checking & Daemon Logging — Design Spec
|
||||||
|
|
||||||
|
**Date:** 2026-04-25
|
||||||
|
**Scope:** hbc client — daemon mode logging, nagios_runner plugin robustness, PluginLoader messaging
|
||||||
|
**Files affected:** `hbd/client/main.py`, `hbd/client/plugins/nagios_runner.py`, `hbd/client/plugin.py`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Daemon Mode Logging
|
||||||
|
|
||||||
|
### Problem
|
||||||
|
In `main()`, `logging.basicConfig()` is called before `daemonize()` (establishing a StreamHandler to stderr), then called again after `daemonize()`. The second call is a no-op — Python ignores `basicConfig()` when handlers are already configured. After daemonization, stderr is redirected to `/dev/null`, so all subsequent log output is silently discarded.
|
||||||
|
|
||||||
|
The existing `syslog.openlog()` / `syslog.syslog()` calls (lines 666–668) write a single startup message but do not integrate with the `logging` system, so plugin and connection log messages never reach syslog.
|
||||||
|
|
||||||
|
### Fix
|
||||||
|
After `daemonize()`, explicitly reconfigure the root logger:
|
||||||
|
|
||||||
|
1. Remove all existing handlers (they now write to `/dev/null`).
|
||||||
|
2. Add `logging.handlers.SysLogHandler(address='/dev/log', facility=LOG_DAEMON)`.
|
||||||
|
3. Set formatter: `hbc[%(process)d]: %(name)s %(levelname)s: %(message)s`
|
||||||
|
4. Preserve the `log_level` already determined from `-v`/`-x` CLI flags.
|
||||||
|
|
||||||
|
Remove the redundant `syslog.openlog()` / `syslog.syslog()` calls — the logging system handles routing.
|
||||||
|
|
||||||
|
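**Fallback:** If `/dev/log` does not exist (containers, some BSDs), fall back to `SysLogHandler(address=('localhost', 514))`. Log one warning through the fallback handler once it is attached (after `daemonize()` stderr already points at `/dev/null`, so a warning written there would be lost) so the operator knows.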
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. Nagios Runner Improvements
|
||||||
|
|
||||||
|
### 2a — Async Subprocess
|
||||||
|
`_run_nagios_plugin()` is declared `async def` but calls `subprocess.run()` synchronously, blocking the event loop for the full command duration.
|
||||||
|
|
||||||
|
**Fix:** Replace with `asyncio.create_subprocess_shell()` + `await proc.communicate()`. Enforce timeout with `asyncio.wait_for(..., timeout=self.timeout)` and catch `asyncio.TimeoutError`.
|
||||||
|
|
||||||
|
### 2b — Stderr Capture
|
||||||
|
Subprocess stderr is currently ignored (`capture_output=True` captures both stdout and stderr, but the sync code only uses stdout, so stderr content is lost).
|
||||||
|
|
||||||
|
**Fix:** Pass `stderr=asyncio.subprocess.PIPE` to `create_subprocess_shell`. After `communicate()`, if stdout is empty but stderr has content, use stderr as the output message. If both have content, append stderr to the output for visibility.
|
||||||
|
|
||||||
|
### 2c — Negative Return Codes
|
||||||
|
A negative `returncode` means the process was killed by a signal (SIGKILL, OOM, etc.). The current code treats these as-is, which may produce unexpected status values.
|
||||||
|
|
||||||
|
**Fix:** If `returncode < 0`, map to `NAGIOS_UNKNOWN` with message `"Process killed by signal {-returncode}"`.
|
||||||
|
|
||||||
|
### 2d — Command Path Validation at Init
|
||||||
|
`initialize()` currently only checks that the commands list is non-empty.
|
||||||
|
|
||||||
|
**Fix:** For each command entry during `initialize()`:
|
||||||
|
- Warn and skip the entry if `name` or `command` is missing.
|
||||||
|
- Extract the executable (first whitespace-delimited token of the command string).
|
||||||
|
- If the executable is an absolute path, check `os.path.isfile()` and `os.access(..., os.X_OK)`. Log a `WARNING` if either check fails.
|
||||||
|
- Commands with relative paths or shell builtins are not checked (they may be on PATH) — just noted.
|
||||||
|
- Validation warns only; all original entries in `self.commands` are retained and still attempted at collection time (where the existing missing-name/command guard already skips them). The plugin initializes successfully as long as the commands list is non-empty.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. PluginLoader Messaging
|
||||||
|
|
||||||
|
### Problem
|
||||||
|
When `initialize()` returns `False`, the loader always logs:
|
||||||
|
> `WARNING: Plugin X failed initialization, skipping`
|
||||||
|
|
||||||
|
This is alarming when the real reason is simply "no commands configured". There is no API to distinguish "not configured" from "genuinely broken".
|
||||||
|
|
||||||
|
### Fix
|
||||||
|
Add an optional `skip_reason` attribute to `Plugin.__init__()` (defaults to `None`).
|
||||||
|
|
||||||
|
In `PluginLoader.load_from_directory()`, after `initialize()` returns `False`:
|
||||||
|
- If `plugin.skip_reason` is set → `logger.info(f"Plugin {plugin.name} skipped: {plugin.skip_reason}")`
|
||||||
|
- If `plugin.skip_reason` is `None` → `logger.warning(f"Plugin {plugin.name} failed initialization, skipping")` (existing behaviour)
|
||||||
|
|
||||||
|
In `NagiosRunnerPlugin.initialize()`, when no commands are configured:
|
||||||
|
```python
|
||||||
|
self.skip_reason = "no commands configured (add nagios_runner.commands to config)"
|
||||||
|
return False
|
||||||
|
```
|
||||||
|
|
||||||
|
Genuine failures (exceptions) continue to go through the existing `except` block in the loader, logging at `ERROR` with traceback — unchanged.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Decisions
|
||||||
|
|
||||||
|
| Topic | Decision |
|
||||||
|
|---|---|
|
||||||
|
| Daemon log destination | syslog only (LOG_DAEMON facility) |
|
||||||
|
| Syslog fallback | localhost:514 UDP if `/dev/log` absent |
|
||||||
|
| Nagios result log level | INFO for all statuses (OK/WARNING/CRITICAL/UNKNOWN) |
|
||||||
|
| Invalid command handling at init | Warn and continue; still attempt at collection time |
|
||||||
|
| PluginLoader API change | `skip_reason` attribute on Plugin base class, checked by loader |
|
||||||
+14
-8
@@ -1,11 +1,17 @@
|
|||||||
"""hbd package - scaffolding for heartbeat daemon
|
"""hbd package - heartbeat monitoring system
|
||||||
|
|
||||||
This package contains the refactored modules for the original monolithic
|
This package contains both the heartbeat client (hbc) and server (hbd) components,
|
||||||
`hbd` script. The initial implementation contains small scaffolds so you can
|
organized into separate subpackages:
|
||||||
start moving functionality into the package.
|
|
||||||
|
- hbd.client: Client component with system monitoring plugins
|
||||||
|
- hbd.server: Server/daemon component with web UI and notifications
|
||||||
|
- hbd.common: Shared utilities and protocol definitions
|
||||||
|
|
||||||
|
Install options:
|
||||||
|
- pip install hbd[client] # Client only
|
||||||
|
- pip install hbd[server] # Server only
|
||||||
|
- pip install hbd[all] # Both client and server
|
||||||
"""
|
"""
|
||||||
|
|
||||||
__all__ = ["main", "__version__"]
|
__all__ = ["__version__"]
|
||||||
__version__ = "5.0"
|
__version__ = "5.1.11"
|
||||||
|
|
||||||
from .cli import main
|
|
||||||
|
|||||||
-45
@@ -1,45 +0,0 @@
|
|||||||
"""Command line interface for hbd package."""
|
|
||||||
import argparse
|
|
||||||
|
|
||||||
from .config import load_config
|
|
||||||
from .server import run as run_server
|
|
||||||
|
|
||||||
# Values accepted by the -p/--pushsrv option.
PUSHSRVS = ["all", "pushover", "mattermost"]


def build_parser():
    """Create the argument parser for the ``hbd`` command line.

    Returns:
        An ``argparse.ArgumentParser`` that understands ``-c/--config``,
        ``-f/--foreground``, ``-v/--verbose``, ``-p/--pushsrv`` and the
        repeatable ``-x/--debug`` counter.
    """
    # (flags, keyword arguments) for each option, in registration order.
    option_table = (
        (("-c", "--config"), {"dest": "configfile", "help": "Config file path (YAML)"}),
        (("-f", "--foreground"), {"action": "store_true", "help": "Run in foreground"}),
        (("-v", "--verbose"), {"action": "store_true", "help": "Verbose output"}),
        (("-p", "--pushsrv"), {"dest": "pushsrv", "choices": PUSHSRVS, "help": "Push service to use"}),
        (("-x", "--debug"), {"action": "count", "default": 0, "help": "Increase debug level"}),
    )

    cli = argparse.ArgumentParser(
        prog="hbd",
        description="HeartBeatDaemon - Wait for heartbeat messages and act on them (or their absence)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)
    return cli
|
|
||||||
|
|
||||||
|
|
||||||
def main(argv=None):
    """Parse the command line, build the configuration and start the server.

    Args:
        argv: Argument list handed to the parser; ``None`` lets argparse
            read ``sys.argv``.
    """
    args = build_parser().parse_args(argv)
    config = load_config(args.configfile)

    # Command-line switches override whatever the config file provided.
    for flag in ("foreground", "verbose"):
        if getattr(args, flag):
            config[flag] = True
    if args.pushsrv:
        config["pushsrv"] = args.pushsrv
    if args.debug:
        # -x is additive on top of any debug level from the config.
        config["debug"] = config.setdefault("debug", 0) + args.debug

    run_server(config)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
"""HeartBeat Client (hbc) - System monitoring client."""
|
||||||
|
|
||||||
|
from hbd import __version__
|
||||||
@@ -0,0 +1,58 @@
|
|||||||
|
"""Configuration loader and defaults for hbc (HeartBeat Client)."""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import logging
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)

# PyYAML is optional: without it the client runs on built-in defaults only.
try:
    import yaml
except Exception:
    yaml = None

CLIENT_DEFAULTS = {
    # Network settings
    "hb_port": 50003,      # Port where hbd servers listen
    "interval": 10,        # Heartbeat interval in seconds

    # Runtime flags
    "foreground": False,
    "verbose": False,
    "debug": 0,

    # Plugin configuration
    "plugins": {},         # Per-plugin configuration
    "thresholds": {},      # Threshold configuration for monitoring
}


def load_config(path=None):
    """Load configuration from a YAML file and merge with client defaults.

    If YAML is not available, the file does not exist, or the file is empty,
    defaults are returned. Top-level keys from the file replace the
    corresponding defaults; keys unknown to ``CLIENT_DEFAULTS`` are kept so
    plugin configs and future extensions pass through.

    Args:
        path: Path to YAML config file (default: ~/.hbc.yaml)

    Returns:
        Dictionary with configuration. It is an independent copy: mutating
        it (including nested dicts such as ``plugins``) never changes
        ``CLIENT_DEFAULTS``.
    """
    import copy

    # Deep copy: a shallow copy would share the nested "plugins" and
    # "thresholds" dicts with the module-level defaults.
    cfg = copy.deepcopy(CLIENT_DEFAULTS)
    if not path:
        # default path (~/.hbc.yaml)
        path = os.path.join(os.path.expanduser("~"), ".hbc.yaml")

    if not os.path.exists(path):
        return cfg

    if not yaml:
        # yaml not installed: do not attempt to parse; user must ensure defaults
        logger.warning("PyYAML not available - cannot load config from %s, using defaults", path)
        return cfg

    logger.info("Loading configuration from %s", path)
    with open(path) as fh:
        data = yaml.safe_load(fh)

    if data is None:
        # Empty (or comment-only) file: nothing to merge.
        return cfg
    if not isinstance(data, dict):
        logger.warning(
            "Ignoring configuration in %s: top-level YAML value must be a mapping, using defaults",
            path,
        )
        return cfg

    # Merge YAML data with defaults, keeping every key from the file.
    cfg.update(data)
    return cfg
|
||||||
@@ -0,0 +1,722 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
HeartBeat Client (hbc) - Async version with plugin support.
|
||||||
|
|
||||||
|
Sends heartbeat messages to HeartBeat Daemon (hbd) servers and collects
|
||||||
|
system information via plugins.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import signal
|
||||||
|
import socket
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from logging.handlers import SysLogHandler
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, List, Optional
|
||||||
|
|
||||||
|
# Import protocol and config
|
||||||
|
from .config import load_config
|
||||||
|
from ..common.proto import dicttos, stodict
|
||||||
|
|
||||||
|
# Import plugin system
|
||||||
|
from .plugin import PluginRegistry, PluginLoader, InfoPlugin, MonitorPlugin
|
||||||
|
|
||||||
|
# Constants
|
||||||
|
PORT = 50003
|
||||||
|
INTERVAL = 10
|
||||||
|
MAXRECV = 32767
|
||||||
|
|
||||||
|
# Global state
|
||||||
|
running = True
|
||||||
|
dorestart = False
|
||||||
|
shutdown_event: Optional[asyncio.Event] = None
|
||||||
|
active_tasks: List[asyncio.Task] = []
|
||||||
|
|
||||||
|
|
||||||
|
class AsyncConnection:
    """Async UDP connection to a heartbeat server.

    Holds the asyncio datagram transport used to reach one server together
    with send/ACK counters and a short history of round-trip times.
    """

    def __init__(self, conn_id: int, addr: str, port: int, af: int, name: str):
        """Store the peer description and reset all counters.

        Args:
            conn_id: Identifier sent as the ``id`` field of every message.
            addr: Server address that datagrams are sent to.
            port: Server UDP port.
            af: Address family passed to ``create_datagram_endpoint``.
            name: Host name; ``shortname(name)`` is sent as the ``name`` field.
        """
        self.conn_id = conn_id
        self.addr = addr
        self.port = port
        self.af = af
        self.name = name

        self.ackcount = 0        # number of ACKs handled
        self.lastack = 0.0       # timestamp of the most recent ACK
        self.send_count = 0      # number of messages sent
        self.lastsend = 0.0      # timestamp of the most recent send
        self.rtts = [0.0]        # recent RTT samples in ms (capped at 10)

        self.transport: Optional[asyncio.DatagramTransport] = None
        self.protocol: Optional[asyncio.DatagramProtocol] = None
        self._dead = False       # once True, sendto() becomes a no-op

        self.logger = logging.getLogger(f"hbc.conn.{addr}")

    async def open(self) -> bool:
        """Open the UDP connection.

        Returns:
            True if successful, False otherwise
        """
        try:
            # We are inside a coroutine, so the running loop is the one to
            # use; get_running_loop() is the documented way to obtain it.
            loop = asyncio.get_running_loop()

            # Create datagram endpoint
            self.transport, self.protocol = await loop.create_datagram_endpoint(
                lambda: HeartbeatProtocol(self),
                family=self.af
            )
            self.logger.debug(f"Opened connection to {self.addr}:{self.port}")
            return True
        except Exception as e:
            self.logger.error(f"Failed to open connection: {e}")
            return False

    def close(self):
        """Close the connection."""
        if self.transport:
            self.transport.close()
            self.transport = None
            self.protocol = None

    async def sendto(self, msg: dict, msg_id: str = "HTB"):
        """Send a message to the server.

        The transport is opened on demand. ``msg`` is modified in place: the
        ``name``, ``id`` and ``time`` fields are set before encoding.

        Args:
            msg: Message dictionary
            msg_id: Message ID (HTB, PLG, etc.)
        """
        if self._dead:
            return

        if not self.transport:
            await self.open()

        if not self.transport:
            self.logger.error("Cannot send - no transport")
            return

        # Add standard fields
        msg["name"] = shortname(self.name)
        msg["id"] = self.conn_id
        msg["time"] = time.time()

        # Encode message
        data = dicttos(msg_id, msg)

        # Send
        self.transport.sendto(data, (self.addr, self.port))
        self.send_count += 1
        self.lastsend = time.time()

        self.logger.debug(f"Sent {msg_id} message ({len(data)} bytes)")

    def handle_ack(self, msg: dict, now: float):
        """Handle ACK message from server.

        RTT is calculated as: (time ACK received) - (time HTB sent)

        Args:
            msg: Decoded ACK message (not inspected here).
            now: Timestamp at which the ACK was received.
        """
        self.lastack = now

        # Calculate RTT: time ACK received minus time HTB sent
        rtt = (now - self.lastsend) * 1000.0  # Convert to ms

        # Keep only the ten most recent samples.
        self.rtts.append(rtt)
        if len(self.rtts) > 10:
            self.rtts.pop(0)

        self.ackcount += 1
        self.logger.debug(f"ACK received, RTT: {rtt:.1f}ms")
|
||||||
|
|
||||||
|
|
||||||
|
class HeartbeatProtocol(asyncio.DatagramProtocol):
    """Protocol handler for incoming UDP messages.

    Each datagram is decoded with stodict() and dispatched on its "ID" field:
    "ACK" updates the owning connection synchronously, "CMD" and "UPD" are
    handled in background tasks, anything else is logged and dropped.
    """

    def __init__(self, connection: AsyncConnection):
        self.connection = connection
        self.logger = logging.getLogger("hbc.protocol")
        # Strong references to in-flight handler tasks. The event loop keeps
        # only weak references to tasks, so a task nobody else references
        # may be garbage collected before it finishes.
        self._tasks = set()

    def _spawn(self, coro):
        """Schedule ``coro`` as a task and hold a reference until it completes."""
        task = asyncio.create_task(coro)
        self._tasks.add(task)
        task.add_done_callback(self._tasks.discard)
        return task

    def datagram_received(self, data: bytes, addr):
        """Handle incoming datagram.

        Args:
            data: Raw datagram payload.
            addr: Sender address as supplied by the transport.
        """
        try:
            msg = stodict(data)
            if not msg:
                self.logger.warning(f"Failed to parse message from {addr}")
                return

            now = time.time()
            msg_id = msg.get("ID")

            if msg_id == "ACK":
                self.connection.handle_ack(msg, now)
            elif msg_id == "CMD":
                # Command from server
                self._spawn(handle_command(self.connection, msg))
            elif msg_id == "UPD":
                # Update from server
                self._spawn(handle_update(self.connection, msg))
            else:
                self.logger.warning(f"Unknown message type: {msg_id}")

        except Exception as e:
            # A malformed datagram must not take the protocol down.
            self.logger.error(f"Error processing datagram: {e}", exc_info=True)

    def error_received(self, exc):
        """Handle protocol errors by marking the connection dead and closing it."""
        self.logger.warning(f"Protocol error on {self.connection.addr}: {exc} — dropping connection")
        # sendto() on the connection returns early once _dead is set.
        self.connection._dead = True
        self.connection.close()
|
||||||
|
|
||||||
|
|
||||||
|
async def handle_command(conn: "AsyncConnection", msg: dict):
    """Execute a command received from server and report the outcome.

    The string in ``msg["cmd"]`` is run through the shell with stderr merged
    into stdout and a 30 second timeout. A reply of the form
    ``{"service": "command", "msg": "<status> <result>"}`` is then sent on
    ``conn``. Nothing is run or sent when the message carries no command.

    The subprocess call is executed in the default executor so that the
    event loop keeps running while the command is in progress.

    NOTE(review): the command text is taken from a received message and
    passed to a shell unchanged — confirm the sender is trusted/authenticated.

    Args:
        conn: Connection on which the reply is sent.
        msg: Decoded CMD message.
    """
    import functools
    import subprocess

    cmd = msg.get("cmd", "")
    if not cmd:
        return

    logger = logging.getLogger("hbc.command")
    logger.info(f"Executing command: {cmd}")

    # check_output() blocks until the command ends; keep it off the loop thread.
    run = functools.partial(
        subprocess.check_output,
        cmd, shell=True, stderr=subprocess.STDOUT, timeout=30,
    )

    try:
        loop = asyncio.get_running_loop()
        result = (await loop.run_in_executor(None, run)).decode()
        status = "OK"
    except subprocess.CalledProcessError as e:
        result = str(e)
        status = "CalledProcessError"
    except subprocess.TimeoutExpired:
        result = "Command timed out"
        status = "Timeout"
    except Exception as e:
        result = str(e)
        status = "Error"

    # Send response
    response = {
        "service": "command",
        "msg": f"{status} {result}"
    }
    await conn.sendto(response)
|
||||||
|
|
||||||
|
|
||||||
|
async def handle_update(conn: AsyncConnection, _msg: dict):  # pyright: ignore[reportUnusedParameter]
    """Handle self-update by running hb_install.sh.

    The installer is looked up on PATH first and then next to ``sys.argv[0]``.
    It is run as ``hb_install.sh client`` with stderr merged into stdout and a
    120 second timeout. Every outcome is reported on ``conn`` as a
    ``{"service": "update", "msg": ...}`` message. On success the restart flag
    is set and stop() is called.

    Args:
        conn: Connection on which the result is reported.
        _msg: The decoded UPD message (unused).
    """
    import shutil

    logger = logging.getLogger("hbc.update")

    async def report_failure(error: str) -> None:
        """Log ``error`` and send it to the server as the update result."""
        logger.error(error)
        await conn.sendto({"service": "update", "msg": error})

    installer = shutil.which("hb_install.sh")
    if installer is None:
        candidate = Path(sys.argv[0]).parent / "hb_install.sh"
        if candidate.exists():
            installer = str(candidate)

    if installer is None:
        await report_failure("hb_install.sh not found in PATH or alongside hbc")
        return

    logger.info(f"Running installer: {installer}")
    proc = None
    try:
        proc = await asyncio.create_subprocess_exec(
            installer, "client",
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.STDOUT,
        )
        out, _ = await asyncio.wait_for(proc.communicate(), timeout=120)
    except asyncio.TimeoutError:
        # wait_for() only cancels communicate(); the child keeps running
        # unless it is terminated and reaped explicitly.
        if proc is not None:
            try:
                proc.kill()
            except ProcessLookupError:
                pass  # already exited between the timeout and the kill
            await proc.wait()
        await report_failure("Installer timed out")
        return
    except Exception as e:
        await report_failure(f"Installer failed: {e}")
        return

    if proc.returncode != 0:
        await report_failure(f"Installer exited {proc.returncode}: {out.decode().strip()}")
        return

    logger.info("Update successful, restart required")
    await conn.sendto({"service": "update", "msg": "OK"})

    # Trigger restart: main() re-executes the program when dorestart is set
    global dorestart
    dorestart = True
    stop()
|
||||||
|
|
||||||
|
|
||||||
|
async def heartbeat_sender(conn: AsyncConnection, interval: int):
    """Send periodic heartbeats.

    Loops while the module-level ``running`` flag is set. Each round sends an
    HTB message with the ACK count, the most recent RTT sample and the
    interval, then waits ``interval`` seconds or until the shutdown event is
    set, whichever comes first.

    Args:
        conn: Connection to send on
        interval: Heartbeat interval in seconds
    """
    logger = logging.getLogger("hbc.heartbeat")

    while running:
        try:
            msg = {
                "acks": conn.ackcount,
                # NOTE(review): assumes conn.rtts is never empty here — an
                # IndexError would be logged below and the heartbeat skipped.
                "rtt": conn.rtts[-1],
                "interval": interval
            }
            await conn.sendto(msg, "HTB")

        except asyncio.CancelledError:
            # Checked first so cancellation is never treated as a send error
            # (CancelledError derives from Exception before Python 3.8).
            logger.debug("Heartbeat sender cancelled")
            raise
        except Exception as e:
            logger.error(f"Error sending heartbeat: {e}", exc_info=True)

        # Wait for next interval or shutdown event
        try:
            if shutdown_event:
                await asyncio.wait_for(
                    shutdown_event.wait(),
                    timeout=interval
                )
                # The event was set before the timeout: shutting down.
                break
            else:
                await asyncio.sleep(interval)
        except asyncio.TimeoutError:
            pass  # Normal timeout, continue loop
        except asyncio.CancelledError:
            logger.debug("Heartbeat sender cancelled during sleep")
            raise
|
||||||
|
|
||||||
|
|
||||||
|
async def plugin_collector(conn: AsyncConnection, registry: PluginRegistry):
    """Collect and send plugin data.

    InfoPlugins are collected once, immediately. MonitorPlugins are grouped
    by their ``interval`` and one plugin_collector_interval() task is started
    per group; this coroutine then waits for those tasks.

    Args:
        conn: Connection to send on
        registry: Plugin registry
    """
    from collections import defaultdict

    logger = logging.getLogger("hbc.plugins")

    # One-shot pass over the InfoPlugins
    for info in registry.get_by_type(InfoPlugin):
        try:
            payload = await info.collect()
            if payload:
                # PLG message: the plugin name plus whatever it collected
                await conn.sendto({"plugin": info.name, **payload}, "PLG")
                logger.info(f"Sent {info.name} data")
        except Exception as e:
            logger.error(f"Error collecting {info.name}: {e}", exc_info=True)

    # Bucket the MonitorPlugins by collection interval
    buckets = defaultdict(list)
    for monitor in registry.get_by_type(MonitorPlugin):
        buckets[monitor.interval].append(monitor)

    # One worker task per distinct interval
    workers = [
        asyncio.create_task(plugin_collector_interval(conn, group, every))
        for every, group in buckets.items()
    ]
    if not workers:
        return

    try:
        await asyncio.gather(*workers, return_exceptions=True)
    except asyncio.CancelledError:
        logger.debug("Plugin collector cancelled, cancelling sub-tasks")
        for worker in workers:
            if not worker.done():
                worker.cancel()
        raise
|
||||||
|
|
||||||
|
|
||||||
|
async def plugin_collector_interval(
    conn: AsyncConnection,
    plugins: List,
    interval: int
):
    """Collect a group of plugins repeatedly on one shared interval.

    Runs while the module-level ``running`` flag is set. After each pass over
    ``plugins`` it waits ``interval`` seconds, or stops early once the
    shutdown event is set.

    Args:
        conn: Connection to send on
        plugins: List of plugins to collect
        interval: Collection interval in seconds
    """
    logger = logging.getLogger(f"hbc.plugins.{interval}s")

    while running:
        for item in plugins:
            try:
                reading = await item.collect()
                if not reading:
                    continue
                # Build the PLG payload directly from the collected dict
                await conn.sendto({"plugin": item.name, **reading}, "PLG")
                logger.debug(f"Sent {item.name} data")
            except asyncio.CancelledError:
                logger.debug("Plugin collector cancelled")
                raise
            except Exception as e:
                # One failing plugin must not stop the others in the group
                logger.error(
                    f"Error collecting {item.name}: {e}",
                    exc_info=True
                )

        # Sleep until the next round, waking early on shutdown
        try:
            if not shutdown_event:
                await asyncio.sleep(interval)
            else:
                await asyncio.wait_for(
                    shutdown_event.wait(),
                    timeout=interval
                )
                break
        except asyncio.TimeoutError:
            pass  # Normal timeout, continue loop
        except asyncio.CancelledError:
            logger.debug("Plugin collector cancelled during sleep")
            raise
|
||||||
|
|
||||||
|
|
||||||
|
def shortname(name: str) -> str:
    """Return the part of ``name`` before its first dot (all of it if there is no dot)."""
    head, _sep, _rest = name.partition(".")
    return head
|
||||||
|
|
||||||
|
|
||||||
|
def stop():
    """Request shutdown: clear the run flag, set the shutdown event, cancel active tasks."""
    global running
    running = False

    # Wake up tasks that are waiting on the shutdown event
    if shutdown_event:
        shutdown_event.set()

    # Cancel whatever has not finished yet
    unfinished = [task for task in active_tasks if not task.done()]
    for task in unfinished:
        task.cancel()
|
||||||
|
|
||||||
|
|
||||||
|
async def cleanup(connections: List[AsyncConnection]):
    """Send a shutdown message on every connection, close it, then pause briefly.

    Args:
        connections: Connections to notify and close.
    """
    logger = logging.getLogger("hbc.cleanup")
    logger.info("Cleaning up connections")

    for link in connections:
        try:
            await link.sendto({"shutdown": 1, "acks": link.ackcount})
        except Exception as e:
            # Best effort: the connection is closed regardless
            logger.error(f"Error sending shutdown: {e}")

        link.close()

    # Give messages time to send
    await asyncio.sleep(0.5)
|
||||||
|
|
||||||
|
|
||||||
|
async def async_main(args, config):
    """Async main function.

    Resolves the target hosts, opens one connection per resolved address,
    optionally sends a boot/service message, loads plugins, installs signal
    handlers and then runs the heartbeat and plugin tasks until they finish
    or are cancelled by stop().

    Args:
        args: Parsed command-line arguments.
        config: Configuration mapping; "hb_port" and "interval" are read here
            and the whole mapping is passed on to the plugin loader.

    Returns:
        1 when no connection could be opened, otherwise 0.
    """
    global running, shutdown_event, active_tasks

    # Create shutdown event
    shutdown_event = asyncio.Event()
    active_tasks = []

    logger = logging.getLogger("hbc.main")

    # Setup: -n/--name overrides the local hostname
    iam = socket.gethostname()
    if args.name:
        iam = args.name

    hb_hosts = args.hosts
    hb_port = config.get("hb_port", PORT)
    interval = config.get("interval", INTERVAL)

    logger.info(f"Starting hbc for {iam} -> {hb_hosts}")
    logger.info(f"Port: {hb_port}, Interval: {interval}s")

    # Create connections: one per address each host resolves to
    connections = []
    conn_id = 1

    for host in hb_hosts:
        try:
            addrs = socket.getaddrinfo(host, hb_port, 0, 0, socket.SOL_UDP)
        except socket.gaierror as e:
            logger.error(f"Cannot resolve {host}: {e}")
            continue

        for addr_info in addrs:
            af = addr_info[0]
            addr = addr_info[4][0]

            conn = AsyncConnection(conn_id, addr, hb_port, af, iam)
            if await conn.open():
                connections.append(conn)
                conn_id += 1

    if not connections:
        logger.error("No connections established")
        return 1

    logger.info(f"Created {len(connections)} connections")

    # Send boot/message if requested
    if args.boot or args.message:
        boot_msg = {}
        if args.boot:
            boot_msg["boot"] = 1
        if args.message:
            boot_msg["service"] = "service"
            boot_msg["msg"] = args.message

        boot_msg["acks"] = 0
        for conn in connections:
            await conn.sendto(boot_msg)

        if args.message and not args.daemon:
            # Message-only mode
            await cleanup(connections)
            return 0

    # Load plugins
    registry = PluginRegistry()
    loader = PluginLoader(registry)

    plugin_dir = Path(__file__).parent / "plugins"
    if plugin_dir.exists():
        count = await loader.load_from_directory(plugin_dir, config)
        logger.info(f"Loaded {count} plugins")
    else:
        logger.warning(f"Plugin directory not found: {plugin_dir}")

    # Setup signal handlers. We are inside a coroutine, so the running loop
    # is the one to register them on.
    loop = asyncio.get_running_loop()
    for sig in (signal.SIGTERM, signal.SIGINT):
        loop.add_signal_handler(sig, stop)

    # Start async tasks
    # Heartbeat senders (one per connection)
    for conn in connections:
        task = asyncio.create_task(heartbeat_sender(conn, interval))
        active_tasks.append(task)

    # Plugin collector: plugin data is sent on the first connection only
    if connections and registry.get_enabled():
        task = asyncio.create_task(plugin_collector(connections[0], registry))
        active_tasks.append(task)

    # Wait for stop or tasks to complete
    try:
        await asyncio.gather(*active_tasks, return_exceptions=True)
    except asyncio.CancelledError:
        logger.info("Tasks cancelled")

    # Cleanup
    logger.info("Shutting down...")
    await cleanup(connections)
    await loader.unload_all()

    return 0
|
||||||
|
|
||||||
|
|
||||||
|
def daemonize(
    working_dir="/",
    stdin="/dev/zero",
    stdout="/dev/null",
    stderr="/dev/null"
):
    """UNIX double-fork daemonization.

    Forks twice (the parent of each fork exits), changes to ``working_dir``,
    starts a new session, resets the umask and finally points the standard
    file descriptors at the given paths.

    Args:
        working_dir: Directory to change into after the first fork.
        stdin: Path opened for reading and duplicated onto standard input.
        stdout: Path opened for appending and duplicated onto standard output.
        stderr: Path opened for appending and duplicated onto standard error.
    """
    try:
        pid = os.fork()
        if pid > 0:
            # First parent exits immediately
            os._exit(0)
    except OSError as e:
        sys.stderr.write(f"fork #1 failed: {e}\n")
        os._exit(1)

    os.chdir(working_dir)
    os.setsid()
    os.umask(0)

    try:
        pid = os.fork()
        if pid > 0:
            # Second parent exits; the grandchild carries on as the daemon
            os._exit(0)
    except OSError as e:
        sys.stderr.write(f"fork #2 failed: {e}\n")
        sys.exit(1)

    # Push out anything still buffered before the descriptors are replaced
    sys.stdout.flush()
    sys.stderr.flush()

    # dup2() copies each descriptor onto the standard one, so the temporary
    # file objects can be closed as soon as the block ends.
    with open(stdin, "r") as si, open(stdout, "a+") as so, open(stderr, "a+") as se:
        os.dup2(si.fileno(), sys.stdin.fileno())
        os.dup2(so.fileno(), sys.stdout.fileno())
        os.dup2(se.fileno(), sys.stderr.fileno())
|
||||||
|
|
||||||
|
|
||||||
|
def _reconfigure_logging_for_daemon(log_level: int) -> None:
    """Swap every root log handler for a single SysLogHandler.

    Logs go to the /dev/log socket when it exists, otherwise to syslog over
    UDP on localhost:514 (a warning is logged in that case).

    Args:
        log_level: Level to set on the root logger.
    """
    root = logging.getLogger()
    for stale in list(root.handlers):
        root.removeHandler(stale)
        stale.close()

    have_dev_log = os.path.exists("/dev/log")
    target = "/dev/log" if have_dev_log else ("localhost", 514)

    syslog_handler = SysLogHandler(
        address=target,
        facility=SysLogHandler.LOG_DAEMON,
    )
    syslog_handler.setFormatter(
        logging.Formatter("hbc[%(process)d]: %(name)s %(levelname)s: %(message)s")
    )
    root.addHandler(syslog_handler)
    root.setLevel(log_level)

    if not have_dev_log:
        logging.warning("/dev/log not found, using syslog UDP localhost:514")
|
||||||
|
|
||||||
|
|
||||||
|
def build_parser():
    """Create the command-line parser for hbc.

    Returns:
        An argparse.ArgumentParser with the hbc options and the positional
        ``hosts`` argument (one or more).
    """
    parser = argparse.ArgumentParser(
        prog="hbc",
        description="HeartBeatClient - send heartbeat messages to HeartBeatDaemon",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # (flags, keyword arguments) for each add_argument() call, in help order
    option_table = (
        (("-b", "--boot"), {"action": "store_true", "help": "Send a boot message"}),
        (("-c", "--config"), {"dest": "configfile", "help": "Config file path (YAML)"}),
        (("-m", "--message"), {"dest": "message", "help": "Send a message"}),
        (("-n", "--name"), {"dest": "name", "help": "Name to use in heartbeat message"}),
        (("-d", "--daemon"), {"action": "store_true", "help": "Run in daemon mode"}),
        (("-v", "--verbose"), {"action": "store_true", "help": "Verbose output"}),
        (("-x", "--debug"), {"action": "count", "default": 0, "help": "Increase debug level"}),
        (("hosts",), {"nargs": "+", "help": "Heartbeat daemon hosts to send to"}),
    )
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)

    return parser
|
||||||
|
|
||||||
|
|
||||||
|
def main(argv=None):
    """Main entry point.

    Parses arguments, configures logging, loads the configuration,
    optionally daemonizes, runs async_main() and finally either re-executes
    the program (when a restart was requested) or exits with its status.

    Args:
        argv: Argument list to parse; None lets argparse use sys.argv[1:].
    """
    global running, dorestart

    # Resolve our own path up front: daemonize() changes the working
    # directory, after which a relative sys.argv[0] would no longer point
    # at this program when it is re-executed for a restart.
    restart_path = os.path.abspath(sys.argv[0])

    parser = build_parser()
    args = parser.parse_args(argv)

    # Setup logging: WARNING by default, -v gives INFO, -x gives DEBUG
    log_level = logging.WARNING
    if args.verbose:
        log_level = logging.INFO
    if args.debug:
        log_level = logging.DEBUG

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s %(name)s %(levelname)s: %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S"
    )

    # Load config
    config = load_config(args.configfile)

    # Daemonize if requested
    if args.daemon:
        print("Daemonizing...")
        daemonize()
        # The standard streams were redirected by daemonize(); log to syslog
        _reconfigure_logging_for_daemon(log_level)
        logging.info(f"hbc starting, sending heartbeat to {', '.join(args.hosts)}")

    # Run async main
    try:
        exit_code = asyncio.run(async_main(args, config))
    except KeyboardInterrupt:
        logging.info("Interrupted by user")
        exit_code = 0
    except Exception as e:
        logging.error(f"Fatal error: {e}", exc_info=True)
        exit_code = 1

    # Handle restart: replace this process with a fresh copy of the program
    if dorestart:
        logging.info("Restarting...")
        os.execv(restart_path, sys.argv)

    sys.exit(exit_code)
|
||||||
|
|
||||||
|
|
||||||
|
# Run the client only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||||
@@ -0,0 +1,422 @@
|
|||||||
|
"""Plugin system for extending Heartbeat data collection and monitoring.
|
||||||
|
|
||||||
|
This module provides the base classes and infrastructure for the plugin system
|
||||||
|
that enables extending hbc (client) data collection and hbd (server) processing.
|
||||||
|
|
||||||
|
Plugin Types:
|
||||||
|
- InfoPlugin: Collects static or rarely-changing information (OS, hardware)
|
||||||
|
- MonitorPlugin: Collects periodic monitoring data (CPU, memory, disk usage)
|
||||||
|
|
||||||
|
Plugins run on the client (hbc) to gather data, which is then sent to the server
|
||||||
|
(hbd) for storage, threshold checking, and display.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import importlib.util
|
||||||
|
import inspect
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, List, Optional, Type
|
||||||
|
|
||||||
|
|
||||||
|
class Plugin(ABC):
    """Abstract base of every plugin.

    Attributes:
        name: Unique plugin identifier (e.g., "os_info", "cpu_monitor")
        version: Plugin version string
        description: Human-readable description
        interval: Collection interval in seconds (0 = collect once)
        enabled: Whether plugin is active
        skip_reason: Set by a plugin before it returns False from initialize();
            the loader then logs the skip at INFO instead of WARNING.
    """

    name: str = ""
    version: str = "1.0.0"
    description: str = ""
    interval: int = 0
    enabled: bool = True

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Store the plugin configuration and set up a per-plugin logger.

        Args:
            config: Plugin-specific configuration; a missing or empty value
                becomes an empty dict.
        """
        self.config = config if config else {}
        self.logger = logging.getLogger(f"plugin.{self.name}")
        self._initialized = False
        self.skip_reason: Optional[str] = None

    @abstractmethod
    async def initialize(self) -> bool:
        """Prepare the plugin for use (load resources, check dependencies).

        Returns:
            True if initialization succeeded, False otherwise
        """

    @abstractmethod
    async def collect(self) -> Dict[str, Any]:
        """Collect data from the system.

        Returns:
            Dictionary mapping metric names to collected values, or an empty
            dict on error.
        """

    async def cleanup(self) -> None:
        """Release plugin resources; the base implementation does nothing."""

    def validate_data(self, data: Dict[str, Any]) -> bool:
        """Check collected data; the base implementation only requires a dict.

        Args:
            data: Data returned from collect()

        Returns:
            True if data is valid, False otherwise
        """
        return isinstance(data, dict)
|
||||||
|
|
||||||
|
|
||||||
|
class InfoPlugin(Plugin):
    """Plugin for static or rarely-changing information.

    Typical examples are OS name and version, hardware specifications or
    network interface MAC addresses.

    The first collect() call stores its result; later calls return the
    stored value until invalidate_cache() is called. Subclasses implement
    _collect_info() rather than collect().
    """

    interval: int = 0  # Collect once at startup

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        super().__init__(config)
        # None means "nothing collected yet"
        self._cached_data: Optional[Dict[str, Any]] = None

    async def get_cached_data(self) -> Optional[Dict[str, Any]]:
        """Return the cached data without triggering a collection.

        Returns:
            Cached data dict, or None if not yet collected
        """
        return self._cached_data

    async def collect(self) -> Dict[str, Any]:
        """Return the cached information, collecting it first if necessary."""
        cached = self._cached_data
        if cached is not None:
            return cached
        self._cached_data = await self._collect_info()
        return self._cached_data

    @abstractmethod
    async def _collect_info(self) -> Dict[str, Any]:
        """Perform the actual data collection; override this in subclasses."""

    def invalidate_cache(self) -> None:
        """Discard the cached data so the next collect() call collects again."""
        self._cached_data = None
|
||||||
|
|
||||||
|
|
||||||
|
class MonitorPlugin(Plugin):
    """Plugin for periodic monitoring data.

    Typical examples are CPU usage, memory consumption, disk I/O statistics
    or network traffic.

    collect() stamps every non-empty reading with a ``_timestamp`` and keeps
    it as the last reading. Subclasses implement _collect_metrics() rather
    than collect().
    """

    interval: int = 30  # Default: collect every 30 seconds

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        super().__init__(config)
        self._last_reading: Optional[Dict[str, Any]] = None

    def get_last_reading(self) -> Optional[Dict[str, Any]]:
        """Return the most recent non-empty reading.

        Returns:
            Last reading dict with timestamp, or None if not yet collected
        """
        return self._last_reading

    async def collect(self) -> Dict[str, Any]:
        """Collect metrics, timestamp them and remember them as the last reading."""
        import time

        data = await self._collect_metrics()
        if not data:
            # Empty result: returned as-is, last reading left unchanged
            return data

        data['_timestamp'] = time.time()
        self._last_reading = data
        return data

    @abstractmethod
    async def _collect_metrics(self) -> Dict[str, Any]:
        """Perform the actual metric collection; override this in subclasses."""
|
||||||
|
|
||||||
|
|
||||||
|
class PluginRegistry:
    """Name-indexed collection of loaded plugin instances.

    Offers lookups by name, enabled state, type and collection interval.
    """

    def __init__(self):
        self._plugins: Dict[str, Plugin] = {}
        self.logger = logging.getLogger("plugin.registry")

    def register(self, plugin: Plugin) -> bool:
        """Add a plugin instance under its ``name``.

        Args:
            plugin: Plugin instance to register

        Returns:
            True if registered successfully, False if name conflict
        """
        known = self._plugins
        if plugin.name in known:
            self.logger.error(f"Plugin '{plugin.name}' already registered")
            return False

        known[plugin.name] = plugin
        self.logger.info(f"Registered plugin: {plugin.name} v{plugin.version}")
        return True

    def unregister(self, name: str) -> bool:
        """Remove the plugin registered under ``name``.

        Args:
            name: Plugin name to unregister

        Returns:
            True if unregistered, False if not found
        """
        if name not in self._plugins:
            return False

        del self._plugins[name]
        self.logger.info(f"Unregistered plugin: {name}")
        return True

    def get(self, name: str) -> Optional[Plugin]:
        """Look a plugin up by name.

        Args:
            name: Plugin name

        Returns:
            Plugin instance or None if not found
        """
        return self._plugins.get(name)

    def get_all(self) -> List[Plugin]:
        """Return every registered plugin as a new list."""
        return [*self._plugins.values()]

    def get_enabled(self) -> List[Plugin]:
        """Return the registered plugins whose ``enabled`` flag is truthy."""
        selected = []
        for candidate in self._plugins.values():
            if candidate.enabled:
                selected.append(candidate)
        return selected

    def get_by_type(self, plugin_type: Type[Plugin]) -> List[Plugin]:
        """Return the plugins that are instances of ``plugin_type``.

        Args:
            plugin_type: Plugin class (InfoPlugin or MonitorPlugin)

        Returns:
            List of plugins matching the type
        """
        selected = []
        for candidate in self._plugins.values():
            if isinstance(candidate, plugin_type):
                selected.append(candidate)
        return selected

    def get_by_interval(self, interval: int) -> List[Plugin]:
        """Return the plugins whose collection interval equals ``interval``.

        Args:
            interval: Interval in seconds (0 for one-time collection)

        Returns:
            List of plugins with matching interval
        """
        selected = []
        for candidate in self._plugins.values():
            if candidate.interval == interval:
                selected.append(candidate)
        return selected
|
||||||
|
|
||||||
|
|
||||||
|
class PluginLoader:
|
||||||
|
"""Load plugins from filesystem and instantiate them.
|
||||||
|
|
||||||
|
Scans plugin directories for Python modules containing Plugin subclasses,
|
||||||
|
loads them dynamically, and registers them with the PluginRegistry.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, registry: PluginRegistry):
    """Bind the loader to ``registry``; no modules are loaded yet."""
    # Modules imported by this loader, keyed by module name
    self._loaded_modules: Dict[str, Any] = {}
    self.logger = logging.getLogger("plugin.loader")
    self.registry = registry
|
||||||
|
|
||||||
|
async def load_from_directory(
    self,
    directory: Path,
    config: Optional[Dict[str, Any]] = None
) -> int:
    """Load all plugins from a directory.

    Scans for .py files, imports them, finds Plugin subclasses,
    instantiates them with config, initializes, and registers.

    Files are processed in sorted order so that load order (and therefore
    log output and registration order) is deterministic. A failure in one
    file or one plugin class is logged and does not prevent the remaining
    files/classes from being processed.

    Args:
        directory: Path to plugin directory
        config: Configuration dict (may contain per-plugin config, either
            under a 'plugins' sub-dict or as top-level keys named after
            the plugin)

    Returns:
        Number of plugins successfully loaded
    """
    if not directory.exists() or not directory.is_dir():
        self.logger.warning(f"Plugin directory not found: {directory}")
        return 0

    loaded_count = 0
    raw_config = config or {}
    # Per-plugin config lives under the 'plugins' key or at top-level.
    # The key may be present but null (e.g. an empty "plugins:" entry in
    # YAML) or of the wrong type; treat anything that is not a dict as
    # "no sub-config" instead of crashing on .get() for every plugin.
    plugins_subconfig = raw_config.get("plugins")
    if not isinstance(plugins_subconfig, dict):
        plugins_subconfig = {}

    # Scan for Python files
    for plugin_file in sorted(directory.glob("*.py")):
        if plugin_file.name.startswith("_"):
            continue  # Skip __init__.py and private modules

        self.logger.debug(f"Processing plugin file: {plugin_file.name}")

        try:
            # Load module dynamically
            module_name = f"plugins.{plugin_file.stem}"
            spec = importlib.util.spec_from_file_location(module_name, plugin_file)
            if not spec or not spec.loader:
                self.logger.warning(f"Could not create spec for {plugin_file}")
                continue

            module = importlib.util.module_from_spec(spec)
            # The module must be in sys.modules while it executes, but a
            # module that failed to import must not be left behind there.
            sys.modules[module_name] = module
            try:
                spec.loader.exec_module(module)
            except BaseException:
                sys.modules.pop(module_name, None)
                raise
            self._loaded_modules[module_name] = module

            self.logger.debug(f"Loaded module: {module_name}")

            # Track which plugin classes we've already processed to avoid duplicates
            processed_classes = set()

            # Find Plugin subclasses in module
            for name, obj in inspect.getmembers(module, inspect.isclass):
                # Skip base classes and non-Plugin classes
                if obj in (Plugin, InfoPlugin, MonitorPlugin):
                    self.logger.debug(f"Skipping base class: {name}")
                    continue
                if not issubclass(obj, Plugin):
                    self.logger.debug(f"Skipping non-Plugin class: {name}")
                    continue

                # Skip if we've already processed this class (handles module-level aliases)
                if id(obj) in processed_classes:
                    self.logger.debug(f"Skipping duplicate reference to: {obj.__name__}")
                    continue
                processed_classes.add(id(obj))

                self.logger.debug(f"Found plugin class: {name}")

                # Instantiate plugin with config — check plugins subdict first,
                # then top-level keys (e.g. nagios_runner: ... at root of config).
                plugin_instance_config = plugins_subconfig.get(obj.name) or raw_config.get(obj.name, {})
                try:
                    plugin = obj(config=plugin_instance_config)
                except Exception as e:
                    # A constructor failure only disqualifies this class;
                    # keep going with the other classes in the same file.
                    self.logger.error(
                        f"Error instantiating plugin class {name}: {e}",
                        exc_info=True
                    )
                    continue

                # Initialize plugin
                try:
                    initialized = await plugin.initialize()
                    if not initialized:
                        if plugin.skip_reason:
                            self.logger.info(
                                f"Plugin {plugin.name} skipped: {plugin.skip_reason}"
                            )
                        else:
                            self.logger.warning(
                                f"Plugin {plugin.name} failed initialization, skipping"
                            )
                        continue
                except Exception as e:
                    self.logger.error(
                        f"Error initializing plugin {plugin.name}: {e}",
                        exc_info=True
                    )
                    continue

                # Register with registry
                if self.registry.register(plugin):
                    loaded_count += 1
                    self.logger.info(
                        f"Loaded plugin: {plugin.name} v{plugin.version} "
                        f"(interval: {plugin.interval}s)"
                    )

        except Exception as e:
            self.logger.error(
                f"Error loading plugin from {plugin_file}: {e}",
                exc_info=True
            )

    return loaded_count
|
||||||
|
|
||||||
|
async def unload_all(self) -> None:
    """Unload all plugins and cleanup resources.

    Every registered plugin gets its ``cleanup()`` awaited (errors are
    logged, not raised) and is then unregistered. Afterwards every module
    this loader imported is removed from ``sys.modules``.
    """
    # Snapshot the plugins first: unregister() mutates the registry, and
    # if get_all() hands back a live view (e.g. dict.values()) iterating
    # it directly would raise "dictionary changed size during iteration".
    for plugin in list(self.registry.get_all()):
        try:
            await plugin.cleanup()
        except Exception as e:
            self.logger.error(
                f"Error cleaning up plugin {plugin.name}: {e}",
                exc_info=True
            )
        # Unregister even when cleanup failed so no stale entry remains.
        self.registry.unregister(plugin.name)

    # Remove loaded modules (pop tolerates modules already removed elsewhere)
    for module_name in self._loaded_modules:
        sys.modules.pop(module_name, None)
    self._loaded_modules.clear()
|
||||||
@@ -0,0 +1,129 @@
|
|||||||
|
"""CPU Monitoring Plugin for Heartbeat.
|
||||||
|
|
||||||
|
Collects CPU usage statistics including overall CPU percentage, per-core usage,
|
||||||
|
load average, and process counts.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import Any, Dict, Optional
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Import from parent package
|
||||||
|
from hbd.client.plugin import MonitorPlugin
|
||||||
|
|
||||||
|
|
||||||
|
class CPUMonitorPlugin(MonitorPlugin):
    """Monitor CPU usage and load.

    Collects:
    - Overall CPU usage percentage
    - Per-core CPU usage (if enabled in config)
    - Load average (1min, 5min, 15min)
    - Process count
    - CPU frequency (if available)
    - CPU time breakdown (user, system, idle, and iowait where reported)

    Configuration:
        interval: Collection interval in seconds (default: 300)
        per_core: Include per-core usage percentages (default: False)
    """

    name = "cpu_monitor"
    version = "1.0.0"
    description = "CPU usage and load monitoring"
    interval = 300  # MonitorPlugin: collect every 5 minutes by default

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Store configuration; psutil itself is imported in initialize().

        Args:
            config: Optional configuration dict (see class docstring).
        """
        super().__init__(config)
        self.psutil = None  # set by initialize() once the import succeeds
        self.per_core = config.get("per_core", False) if config else False
        self.interval = config.get("interval", 300) if config else 300

    async def initialize(self) -> bool:
        """Initialize the CPU monitor plugin.

        Checks if psutil is available.

        Returns:
            True if psutil is available, False otherwise
        """
        self.logger.info(f"Initializing {self.name} plugin")

        try:
            import psutil
            self.psutil = psutil
            self.logger.info(f"{self.name} initialized successfully")
            return True
        except ImportError:
            self.logger.error(
                "psutil module not available. Install with: pip install psutil"
            )
            return False

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Collect CPU metrics.

        Returns:
            Dictionary with CPU metrics; empty if psutil is unavailable or
            collection fails as a whole. Individual optional metrics (load
            average, process count, frequency, CPU times) are simply omitted
            when they cannot be read.
        """
        if not self.psutil:
            return {}

        # Imported locally because this module does not import them at top level.
        import asyncio
        from functools import partial

        try:
            data = {}

            # Overall CPU usage percentage, sampled over a 1 second window.
            # cpu_percent(interval=1) sleeps for that second, so run it in the
            # default executor instead of stalling the event loop (and every
            # other plugin scheduled on it).
            loop = asyncio.get_running_loop()
            data["cpu_percent"] = await loop.run_in_executor(
                None, partial(self.psutil.cpu_percent, interval=1)
            )

            # Per-core CPU usage (if enabled). interval=0 returns immediately
            # and reports usage since the previous per-core call.
            if self.per_core:
                per_core_percents = self.psutil.cpu_percent(interval=0, percpu=True)
                data["cpu_per_core"] = per_core_percents
                data["cpu_core_count"] = len(per_core_percents)
            else:
                # Just report core count
                data["cpu_core_count"] = self.psutil.cpu_count()

            # Load average (Unix-like systems only)
            try:
                load_avg = self.psutil.getloadavg()
                data["load_1min"] = round(load_avg[0], 2)
                data["load_5min"] = round(load_avg[1], 2)
                data["load_15min"] = round(load_avg[2], 2)
            except (AttributeError, OSError):
                # Not available on Windows
                pass

            # Process count
            try:
                data["process_count"] = len(self.psutil.pids())
            except Exception as e:
                self.logger.warning(f"Could not get process count: {e}")

            # CPU frequency (if available)
            try:
                freq = self.psutil.cpu_freq()
                if freq:
                    data["cpu_freq_current"] = round(freq.current, 2)
                    data["cpu_freq_min"] = round(freq.min, 2)
                    data["cpu_freq_max"] = round(freq.max, 2)
            except (AttributeError, OSError, RuntimeError, SystemError) as e:
                # Not available on all systems, or may fail on FreeBSD with sysctl issues
                self.logger.debug(f"CPU frequency not available: {e}")

            # CPU times (user, system, idle, etc.)
            try:
                cpu_times = self.psutil.cpu_times_percent(interval=0)
                data["cpu_user"] = round(cpu_times.user, 1)
                data["cpu_system"] = round(cpu_times.system, 1)
                data["cpu_idle"] = round(cpu_times.idle, 1)
                if hasattr(cpu_times, "iowait"):
                    data["cpu_iowait"] = round(cpu_times.iowait, 1)
            except Exception as e:
                self.logger.debug(f"Could not get CPU times: {e}")

            self.logger.debug(
                f"Collected CPU metrics: {data.get('cpu_percent', 'N/A')}% usage"
            )
            return data

        except Exception as e:
            self.logger.error(f"Error collecting CPU metrics: {e}", exc_info=True)
            return {}
|
||||||
@@ -0,0 +1,199 @@
|
|||||||
|
"""
|
||||||
|
Disk monitoring plugin for Heartbeat.
|
||||||
|
|
||||||
|
Collects disk usage and I/O statistics using psutil.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import Dict, Any, Optional, List
|
||||||
|
|
||||||
|
try:
|
||||||
|
import psutil
|
||||||
|
except ImportError:
|
||||||
|
psutil = None
|
||||||
|
|
||||||
|
from hbd.client.plugin import MonitorPlugin
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class DiskMonitorPlugin(MonitorPlugin):
    """
    Monitor disk usage and I/O statistics.

    Collects:
    - Disk partition information
    - Disk usage per partition (total, used, free, percent)
    - Disk I/O counters (read/write bytes, read/write count)
    - Disk I/O time statistics

    Configuration:
        interval: Collection interval in seconds (default: 300)
        partitions: List of mount points to monitor (default: all)
        include_io: Include disk I/O statistics (default: True)
        exclude_types: List of filesystem types to exclude (default: tmpfs, devtmpfs, squashfs)
    """

    name = "disk_monitor"
    interval = 300  # Collect every 5 minutes by default

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Initialize the disk monitor plugin.

        A missing psutil is deliberately NOT an error here: it is reported
        by initialize() returning False, so the check there is reachable and
        the plugin is skipped like any other plugin that cannot start.

        Args:
            config: Optional configuration dict with keys:
                - interval: Collection interval in seconds (default: 300)
                - partitions: List of specific mount points to monitor
                - include_io: Include I/O statistics (default: True)
                - exclude_types: List of filesystem types to exclude
        """
        super().__init__(config)
        self.partitions = self.config.get('partitions', None)  # None = all partitions
        self.include_io = self.config.get('include_io', True)
        self.exclude_types = set(self.config.get('exclude_types', ['tmpfs', 'devtmpfs', 'squashfs']))
        self.interval = self.config.get('interval', 300)

        # Store previous I/O counters for delta calculation
        self._prev_io = {}

    async def initialize(self) -> bool:
        """Initialize the plugin (check psutil availability).

        Returns:
            False if psutil could not be imported, True otherwise.
        """
        if psutil is None:
            logger.error("psutil not available - disk_monitor cannot run")
            return False

        logger.info(f"Disk monitor initialized (interval: {self.interval}s, io: {self.include_io})")

        # Initialize I/O counters if available, so the first collection can
        # already report deltas.
        if self.include_io:
            try:
                self._prev_io = psutil.disk_io_counters(perdisk=True)
            except Exception as e:
                logger.warning(f"Could not initialize disk I/O counters: {e}")

        return True

    async def collect(self) -> Dict[str, Any]:
        """
        Collect current disk statistics.

        Returns:
            Dictionary with disk metrics organized by partition:
                - partitions: Dict of partition data, keyed by mount point
                    - device: Device name (e.g., /dev/sda1)
                    - fstype: Filesystem type (e.g., ext4)
                    - total: Total space in bytes
                    - used: Used space in bytes
                    - free: Free space in bytes
                    - percent: Usage percentage
                - io_counters: Dict of I/O statistics, keyed by disk name (if include_io)
                    - read_count: Number of reads
                    - write_count: Number of writes
                    - read_bytes: Bytes read
                    - write_bytes: Bytes written
                    - read_time: Time spent reading in ms
                    - write_time: Time spent writing in ms
                    - read_bytes_delta: Bytes read since last collection
                    - write_bytes_delta: Bytes written since last collection
            An empty dict if psutil is unavailable, or {"error": <message>}
            if collection raised.
        """
        if psutil is None:
            logger.error("psutil not available")
            return {}

        try:
            data = await self._collect_metrics()
            logger.debug(f"Collected disk metrics: {len(data.get('partitions', {}))} partitions")
            return data
        except Exception as e:
            logger.error(f"Error collecting disk metrics: {e}")
            return {"error": str(e)}

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Collect disk metrics from psutil."""
        metrics = {}

        # Collect partition usage
        partitions_data = {}
        partitions = psutil.disk_partitions(all=False)

        for partition in partitions:
            # Skip unwanted filesystem types
            if partition.fstype in self.exclude_types:
                continue

            # Skip if we're only monitoring specific partitions
            if self.partitions and partition.mountpoint not in self.partitions:
                continue

            try:
                usage = psutil.disk_usage(partition.mountpoint)
                partitions_data[partition.mountpoint] = {
                    'device': partition.device,
                    'fstype': partition.fstype,
                    'total': usage.total,
                    'used': usage.used,
                    'free': usage.free,
                    'percent': usage.percent
                }
            except PermissionError:
                logger.debug(f"Permission denied accessing {partition.mountpoint}")
                continue
            except Exception as e:
                logger.warning(f"Error reading {partition.mountpoint}: {e}")
                continue

        metrics['partitions'] = partitions_data

        # Collect I/O statistics
        if self.include_io:
            try:
                io_counters = psutil.disk_io_counters(perdisk=True)
                io_data = {}

                for disk_name, counters in io_counters.items():
                    disk_stats = {
                        'read_count': counters.read_count,
                        'write_count': counters.write_count,
                        'read_bytes': counters.read_bytes,
                        'write_bytes': counters.write_bytes,
                    }

                    # Add time statistics if available
                    if hasattr(counters, 'read_time'):
                        disk_stats['read_time'] = counters.read_time
                    if hasattr(counters, 'write_time'):
                        disk_stats['write_time'] = counters.write_time
                    if hasattr(counters, 'busy_time'):
                        disk_stats['busy_time'] = counters.busy_time

                    # Calculate deltas from previous collection
                    if disk_name in self._prev_io:
                        prev = self._prev_io[disk_name]
                        disk_stats['read_bytes_delta'] = counters.read_bytes - prev.read_bytes
                        disk_stats['write_bytes_delta'] = counters.write_bytes - prev.write_bytes
                        disk_stats['read_count_delta'] = counters.read_count - prev.read_count
                        disk_stats['write_count_delta'] = counters.write_count - prev.write_count

                    io_data[disk_name] = disk_stats

                metrics['io_counters'] = io_data

                # Store current counters for next delta calculation
                self._prev_io = io_counters

            except Exception as e:
                logger.warning(f"Could not collect disk I/O statistics: {e}")

        return metrics

    async def cleanup(self) -> None:
        """Cleanup (nothing to do for this plugin)."""
        logger.info("Disk monitor cleanup")
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level alias for automatic discovery. Note that this is the plugin
# class itself, not an instance.
plugin = DiskMonitorPlugin
|
||||||
@@ -0,0 +1,168 @@
|
|||||||
|
"""
|
||||||
|
Filesystem information plugin for Heartbeat.
|
||||||
|
|
||||||
|
Collects static filesystem and partition information using psutil.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import Dict, Any, Optional
|
||||||
|
|
||||||
|
try:
|
||||||
|
import psutil
|
||||||
|
except ImportError:
|
||||||
|
psutil = None
|
||||||
|
|
||||||
|
from hbd.client.plugin import InfoPlugin
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class FilesystemInfoPlugin(InfoPlugin):
    """
    Collect filesystem and partition information.

    This is an InfoPlugin that collects static information once during startup.

    By default, only reports physical mounted filesystems (e.g., ext4, xfs, btrfs).
    Set include_pseudo=True to also include pseudo filesystems (proc, sysfs, tmpfs, etc.).

    Collects:
    - List of mounted filesystems
    - Partition details (device, mount point, filesystem type, options)
    - Filesystem capabilities and features

    Configuration:
        include_pseudo: Include pseudo/virtual filesystems (default: False)
        exclude_types: List of additional filesystem types to exclude (default: [])
    """

    name = "filesystem_info"
    interval = 0  # InfoPlugin - collect once

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Initialize the filesystem info plugin.

        A missing psutil is deliberately NOT an error here: it is reported
        by initialize() returning False, so the check there is reachable and
        the plugin is skipped like any other plugin that cannot start.

        Args:
            config: Optional configuration dict with keys:
                - include_pseudo: Include pseudo/virtual filesystems (default: False)
                - exclude_types: List of filesystem types to exclude (default: [])
        """
        super().__init__(config)
        self.include_pseudo = self.config.get('include_pseudo', False)
        # By default, no exclusions since all=False filters most pseudo filesystems
        # Users can add specific types to exclude if needed
        self.exclude_types = set(self.config.get('exclude_types', []))

    async def initialize(self) -> bool:
        """Initialize the plugin (check psutil availability).

        Returns:
            False if psutil could not be imported, True otherwise.
        """
        if psutil is None:
            logger.error("psutil not available - filesystem_info cannot run")
            return False

        logger.info(f"Filesystem info initialized (pseudo: {self.include_pseudo})")
        return True

    async def collect(self) -> Dict[str, Any]:
        """
        Collect filesystem information.

        Returns only physical mounted filesystems by default.

        Returns:
            Dictionary with filesystem data:
                - filesystems: List of filesystem dictionaries:
                    - device: Device name (e.g., /dev/sda1)
                    - mountpoint: Mount point path
                    - fstype: Filesystem type (e.g., ext4, xfs, btrfs)
                    - opts: Mount options (comma-separated string)
                    - maxfile: Maximum filename length
                    - maxpath: Maximum path length
                - filesystem_types: List of unique filesystem types found
                - mount_count: Total number of mounted filesystems
                - boot_time: Value of psutil.boot_time(), when available
            An empty dict if psutil is unavailable, or {"error": <message>}
            if collection raised.
        """
        if psutil is None:
            logger.error("psutil not available")
            return {}

        try:
            data = await self._collect_info()
            logger.info(f"Collected filesystem info: {len(data.get('filesystems', []))} filesystems")
            return data
        except Exception as e:
            logger.error(f"Error collecting filesystem info: {e}")
            return {"error": str(e)}

    async def _collect_info(self) -> Dict[str, Any]:
        """Collect filesystem information from psutil."""
        # Imported once per call (this module has no top-level os import)
        # rather than once per partition inside the loop.
        import os

        info = {}
        filesystems = []
        filesystem_types = set()

        # Get mounted disk partitions
        # all=False returns only physical devices (real mounted filesystems)
        # all=True would include pseudo filesystems (proc, sysfs, etc.)
        partitions = psutil.disk_partitions(all=self.include_pseudo)

        # Output key -> os.pathconf limit name queried for each mount point.
        pathconf_limits = (
            ('maxfile', 'PC_NAME_MAX'),  # Maximum filename length
            ('maxpath', 'PC_PATH_MAX'),  # Maximum path length
        )

        for partition in partitions:
            # Additional filtering if exclude_types is specified
            if partition.fstype in self.exclude_types:
                continue

            fs_info = {
                'device': partition.device,
                'mountpoint': partition.mountpoint,
                'fstype': partition.fstype,
                'opts': partition.opts,
            }

            # Try to get filesystem capabilities
            try:
                # os.pathconf does not exist on every platform
                if hasattr(os, 'pathconf'):
                    for key, limit_name in pathconf_limits:
                        try:
                            limit = os.pathconf(partition.mountpoint, limit_name)
                            if limit:
                                fs_info[key] = limit
                        except (OSError, ValueError):
                            # Limit unknown or unsupported for this mount point
                            pass
            except Exception as e:
                logger.debug(f"Could not get pathconf for {partition.mountpoint}: {e}")

            filesystems.append(fs_info)
            filesystem_types.add(partition.fstype)

        info['filesystems'] = filesystems
        info['filesystem_types'] = sorted(filesystem_types)
        info['mount_count'] = len(filesystems)

        # Add some additional filesystem statistics
        try:
            # Get boot time (useful for determining filesystem mount times)
            boot_time = psutil.boot_time()
            info['boot_time'] = boot_time
        except Exception as e:
            logger.debug(f"Could not get boot time: {e}")

        return info

    async def cleanup(self) -> None:
        """Cleanup (nothing to do for this plugin)."""
        logger.info("Filesystem info cleanup")
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level alias for automatic discovery. Note that this is the plugin
# class itself, not an instance.
plugin = FilesystemInfoPlugin
|
||||||
@@ -0,0 +1,147 @@
|
|||||||
|
"""
|
||||||
|
Memory monitoring plugin for Heartbeat.
|
||||||
|
|
||||||
|
Collects memory and swap usage statistics using psutil.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import Dict, Any, Optional
|
||||||
|
|
||||||
|
try:
|
||||||
|
import psutil
|
||||||
|
except ImportError:
|
||||||
|
psutil = None
|
||||||
|
|
||||||
|
from hbd.client.plugin import MonitorPlugin
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class MemoryMonitorPlugin(MonitorPlugin):
    """
    Monitor memory and swap usage.

    Collects:
    - Physical memory (RAM) usage and statistics
    - Virtual memory details
    - Swap memory usage and statistics
    - Memory available for applications

    Configuration:
        interval: Collection interval in seconds (default: 300)
        include_swap: Include swap statistics (default: True)
    """

    name = "memory_monitor"
    interval = 300  # Collect every 5 minutes by default

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Initialize the memory monitor plugin.

        A missing psutil is deliberately NOT an error here: it is reported
        by initialize() returning False, so the check there is reachable and
        the plugin is skipped like any other plugin that cannot start.

        Args:
            config: Optional configuration dict with keys:
                - interval: Collection interval in seconds (default: 300)
                - include_swap: Include swap statistics (default: True)
        """
        super().__init__(config)
        self.include_swap = self.config.get('include_swap', True)
        self.interval = self.config.get('interval', 300)

    async def initialize(self) -> bool:
        """Initialize the plugin (check psutil availability).

        Returns:
            False if psutil could not be imported, True otherwise.
        """
        if psutil is None:
            logger.error("psutil not available - memory_monitor cannot run")
            return False

        logger.info(f"Memory monitor initialized (interval: {self.interval}s, swap: {self.include_swap})")
        return True

    async def collect(self) -> Dict[str, Any]:
        """
        Collect current memory statistics.

        Returns:
            Dictionary with memory metrics:
                - memory_total: Total physical RAM in bytes
                - memory_available: Available memory in bytes
                - memory_used: Used memory in bytes
                - memory_free: Free memory in bytes
                - memory_percent: Memory usage percentage
                - memory_active: Active memory (Unix)
                - memory_inactive: Inactive memory (Unix)
                - memory_buffers: Buffers (Linux)
                - memory_cached: Cached (Linux)
                - memory_shared: Shared (Linux)
                - swap_total: Total swap in bytes (if include_swap)
                - swap_used: Used swap in bytes (if include_swap)
                - swap_free: Free swap in bytes (if include_swap)
                - swap_percent: Swap usage percentage (if include_swap)
                - swap_sin: Bytes swapped in from disk (if include_swap)
                - swap_sout: Bytes swapped out to disk (if include_swap)
            An empty dict if psutil is unavailable, or {"error": <message>}
            if collection raised.
        """
        if psutil is None:
            logger.error("psutil not available")
            return {}

        try:
            data = await self._collect_metrics()
            logger.debug(f"Collected memory metrics: {len(data)} fields")
            return data
        except Exception as e:
            logger.error(f"Error collecting memory metrics: {e}")
            return {"error": str(e)}

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Collect memory metrics from psutil."""
        metrics = {}

        # Virtual (physical) memory statistics
        vmem = psutil.virtual_memory()
        metrics['memory_total'] = vmem.total
        metrics['memory_available'] = vmem.available
        metrics['memory_used'] = vmem.used
        metrics['memory_free'] = vmem.free
        metrics['memory_percent'] = vmem.percent

        # Platform-specific memory details: only reported when psutil
        # exposes the field on this platform.
        for field in ('active', 'inactive', 'buffers', 'cached', 'shared'):
            if hasattr(vmem, field):
                metrics[f'memory_{field}'] = getattr(vmem, field)

        # Swap memory statistics
        if self.include_swap:
            try:
                swap = psutil.swap_memory()
                metrics['swap_total'] = swap.total
                metrics['swap_used'] = swap.used
                metrics['swap_free'] = swap.free
                metrics['swap_percent'] = swap.percent

                # Swap in/out counters (may not be available on all platforms)
                if hasattr(swap, 'sin'):
                    metrics['swap_sin'] = swap.sin
                if hasattr(swap, 'sout'):
                    metrics['swap_sout'] = swap.sout
            except Exception as e:
                logger.warning(f"Could not collect swap statistics: {e}")

        return metrics

    async def cleanup(self) -> None:
        """Cleanup (nothing to do for this plugin)."""
        logger.info("Memory monitor cleanup")
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level alias for automatic discovery. Note that this is the plugin
# class itself, not an instance.
plugin = MemoryMonitorPlugin
|
||||||
@@ -0,0 +1,303 @@
|
|||||||
|
"""Nagios Plugin Runner for Heartbeat.
|
||||||
|
|
||||||
|
Executes Nagios-compatible monitoring plugins and parses their output.
|
||||||
|
|
||||||
|
Nagios Plugin Standard:
|
||||||
|
- Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
|
||||||
|
- Output format: Single line status message, optional performance data
|
||||||
|
- Performance data format: 'label'=value[UOM];[warn];[crit];[min];[max]
|
||||||
|
|
||||||
|
Example configuration in ~/.hb.yaml:
|
||||||
|
```yaml
|
||||||
|
nagios_runner:
|
||||||
|
interval: 60
|
||||||
|
commands:
|
||||||
|
- name: check_disk_root
|
||||||
|
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
|
||||||
|
- name: check_procs
|
||||||
|
command: /usr/lib/nagios/plugins/check_procs -w 250 -c 400
|
||||||
|
- name: check_load
|
||||||
|
command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
|
||||||
|
```
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import shlex
|
||||||
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
|
from hbd.client.plugin import MonitorPlugin
|
||||||
|
|
||||||
|
|
||||||
|
# Nagios exit codes, as listed in the module docstring
# (0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN).
NAGIOS_OK = 0
NAGIOS_WARNING = 1
NAGIOS_CRITICAL = 2
NAGIOS_UNKNOWN = 3

# Human-readable label for each exit code above.
STATUS_NAMES = {
    NAGIOS_OK: "OK",
    NAGIOS_WARNING: "WARNING",
    NAGIOS_CRITICAL: "CRITICAL",
    NAGIOS_UNKNOWN: "UNKNOWN"
}
|
||||||
|
|
||||||
|
|
||||||
|
class NagiosRunnerPlugin(MonitorPlugin):
|
||||||
|
"""Run Nagios-compatible monitoring plugins.
|
||||||
|
|
||||||
|
This plugin executes external Nagios plugins and collects their output,
|
||||||
|
including status codes, messages, and performance data.
|
||||||
|
|
||||||
|
Configuration:
|
||||||
|
interval: Collection interval in seconds (default: 300)
|
||||||
|
commands: List of command definitions with 'name' and 'command' keys
|
||||||
|
timeout: Command execution timeout in seconds (default: 30)
|
||||||
|
|
||||||
|
Example:
|
||||||
|
nagios_runner:
|
||||||
|
interval: 300 # Check every 5 minutes
|
||||||
|
timeout: 30
|
||||||
|
commands:
|
||||||
|
- name: check_disk
|
||||||
|
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10%
|
||||||
|
- name: check_load
|
||||||
|
command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
|
||||||
|
"""
|
||||||
|
|
||||||
|
name = "nagios_runner"
|
||||||
|
version = "1.0.0"
|
||||||
|
description = "Execute Nagios-compatible monitoring plugins"
|
||||||
|
interval = 300 # MonitorPlugin: collect every 5 minutes by default
|
||||||
|
|
||||||
|
def __init__(self, config: Optional[Dict[str, Any]] = None):
    """Read the runner settings out of ``config``.

    Args:
        config: Optional configuration dict. Recognized keys are
            ``commands`` (list of dicts with 'name' and 'command'),
            ``timeout`` (seconds, default 30) and ``interval`` (seconds,
            default 300). A missing or empty config yields the defaults.
    """
    super().__init__(config)

    # Extract configuration, falling back to defaults when none was given
    if config:
        self.commands: List[Dict[str, str]] = config.get("commands", [])
        self.timeout: int = config.get("timeout", 30)
        self.interval = config.get("interval", 300)
    else:
        self.commands = []
        self.timeout = 30
        self.interval = 300
|
||||||
|
|
||||||
|
async def initialize(self) -> bool:
|
||||||
|
"""Initialize the Nagios runner plugin.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if at least one command is configured, False otherwise
|
||||||
|
"""
|
||||||
|
self.logger.info(f"Initializing {self.name} plugin")
|
||||||
|
|
||||||
|
if not self.commands:
|
||||||
|
self.skip_reason = "no commands configured (add nagios_runner.commands to config)"
|
||||||
|
return False
|
||||||
|
|
||||||
|
self.logger.info(f"Configured to run {len(self.commands)} Nagios plugin(s)")
|
||||||
|
for cmd_config in self.commands:
|
||||||
|
name = cmd_config.get("name", "unnamed")
|
||||||
|
self.logger.info(f" - {name}: {cmd_config.get('command', 'N/A')}")
|
||||||
|
|
||||||
|
# Validate absolute command paths early
|
||||||
|
for cmd_config in self.commands:
|
||||||
|
name = cmd_config.get("name", "unnamed")
|
||||||
|
command = cmd_config.get("command", "")
|
||||||
|
if not command:
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
tokens = shlex.split(command)
|
||||||
|
except ValueError:
|
||||||
|
continue # malformed command string; skip validation
|
||||||
|
if not tokens:
|
||||||
|
continue
|
||||||
|
exe = tokens[0]
|
||||||
|
if os.path.isabs(exe):
|
||||||
|
if not os.path.isfile(exe):
|
||||||
|
self.logger.warning(
|
||||||
|
f"Command '{name}': executable not found: {exe}"
|
||||||
|
)
|
||||||
|
elif not os.access(exe, os.X_OK):
|
||||||
|
self.logger.warning(
|
||||||
|
f"Command '{name}': executable not executable: {exe}"
|
||||||
|
)
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
async def _collect_metrics(self) -> Dict[str, Any]:
|
||||||
|
"""Collect metrics from all configured Nagios plugins.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dictionary with results from all plugins
|
||||||
|
"""
|
||||||
|
results = {}
|
||||||
|
|
||||||
|
# Track overall status (worst status wins)
|
||||||
|
worst_status = NAGIOS_OK
|
||||||
|
|
||||||
|
for cmd_config in self.commands:
|
||||||
|
name = cmd_config.get("name")
|
||||||
|
command = cmd_config.get("command")
|
||||||
|
|
||||||
|
if not name or not command:
|
||||||
|
self.logger.warning("Skipping command with missing name or command")
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Execute plugin
|
||||||
|
try:
|
||||||
|
status_code, output, perfdata = await self._run_nagios_plugin(command)
|
||||||
|
|
||||||
|
# Store results
|
||||||
|
results[f"{name}_status"] = STATUS_NAMES.get(status_code, "UNKNOWN")
|
||||||
|
results[f"{name}_status_code"] = status_code
|
||||||
|
results[f"{name}_output"] = output
|
||||||
|
|
||||||
|
# Track worst status
|
||||||
|
if status_code > worst_status:
|
||||||
|
worst_status = status_code
|
||||||
|
|
||||||
|
# Parse and add performance data
|
||||||
|
if perfdata:
|
||||||
|
for metric_name, metric_value in perfdata.items():
|
||||||
|
results[f"{name}_{metric_name}"] = metric_value
|
||||||
|
|
||||||
|
self.logger.info(
|
||||||
|
f"Executed {name}: {STATUS_NAMES.get(status_code, 'UNKNOWN')} - {output[:50]}"
|
||||||
|
)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.error(f"Error running {name}: {e}", exc_info=True)
|
||||||
|
results[f"{name}_status"] = "ERROR"
|
||||||
|
results[f"{name}_status_code"] = NAGIOS_UNKNOWN
|
||||||
|
results[f"{name}_output"] = str(e)
|
||||||
|
worst_status = NAGIOS_UNKNOWN
|
||||||
|
|
||||||
|
# Add overall status
|
||||||
|
results["overall_status"] = STATUS_NAMES.get(worst_status, "UNKNOWN")
|
||||||
|
results["overall_status_code"] = worst_status
|
||||||
|
results["plugin_count"] = len(self.commands)
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
|
async def _run_nagios_plugin(
|
||||||
|
self,
|
||||||
|
command: str
|
||||||
|
) -> Tuple[int, str, Dict[str, Any]]:
|
||||||
|
"""Execute a Nagios plugin and parse its output."""
|
||||||
|
try:
|
||||||
|
proc = await asyncio.create_subprocess_shell(
|
||||||
|
command,
|
||||||
|
stdout=asyncio.subprocess.PIPE,
|
||||||
|
stderr=asyncio.subprocess.PIPE,
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
stdout_bytes, stderr_bytes = await asyncio.wait_for(
|
||||||
|
proc.communicate(), timeout=self.timeout
|
||||||
|
)
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
proc.kill()
|
||||||
|
await proc.communicate()
|
||||||
|
self.logger.error(f"Command timed out: {command}")
|
||||||
|
return NAGIOS_UNKNOWN, f"Command timed out after {self.timeout}s", {}
|
||||||
|
|
||||||
|
status_code = proc.returncode
|
||||||
|
|
||||||
|
if status_code < 0:
|
||||||
|
return NAGIOS_UNKNOWN, f"Process killed by signal {-status_code}", {}
|
||||||
|
|
||||||
|
if status_code > 3:
|
||||||
|
status_code = NAGIOS_UNKNOWN
|
||||||
|
|
||||||
|
stdout = stdout_bytes.decode(errors="replace").strip()
|
||||||
|
stderr = stderr_bytes.decode(errors="replace").strip()
|
||||||
|
|
||||||
|
# Parse perfdata from stdout before mixing in stderr
|
||||||
|
perfdata = self._parse_perfdata(stdout)
|
||||||
|
|
||||||
|
# Build status message
|
||||||
|
status_part = stdout.split('|')[0].strip() if '|' in stdout else stdout
|
||||||
|
|
||||||
|
if not stdout and stderr:
|
||||||
|
output_msg = stderr
|
||||||
|
elif stdout and stderr:
|
||||||
|
output_msg = f"{status_part} [stderr: {stderr}]"
|
||||||
|
else:
|
||||||
|
output_msg = status_part
|
||||||
|
|
||||||
|
return status_code, output_msg, perfdata
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.error(f"Error executing command: {e}")
|
||||||
|
return NAGIOS_UNKNOWN, f"Execution error: {str(e)}", {}
|
||||||
|
|
||||||
|
def _parse_perfdata(self, output: str) -> Dict[str, Any]:
|
||||||
|
"""Parse Nagios performance data from plugin output.
|
||||||
|
|
||||||
|
Nagios performance data format:
|
||||||
|
'label'=value[UOM];[warn];[crit];[min];[max]
|
||||||
|
|
||||||
|
Multiple metrics separated by spaces.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
output: Plugin output string
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dictionary of metric_name: value
|
||||||
|
"""
|
||||||
|
perfdata = {}
|
||||||
|
|
||||||
|
# Performance data comes after the pipe character
|
||||||
|
if '|' not in output:
|
||||||
|
return perfdata
|
||||||
|
|
||||||
|
perf_section = output.split('|', 1)[1].strip()
|
||||||
|
|
||||||
|
# Regex to match performance data format
|
||||||
|
# Matches: 'label'=value or label=value
|
||||||
|
perf_regex = r"'?([^'=]+)'?=([\d.]+)([a-zA-Z%]*);?([\d.]*);?([\d.]*);?([\d.]*);?([\d.]*)"
|
||||||
|
|
||||||
|
for match in re.finditer(perf_regex, perf_section):
|
||||||
|
label = match.group(1).strip()
|
||||||
|
value_str = match.group(2)
|
||||||
|
uom = match.group(3) or ""
|
||||||
|
warn = match.group(4)
|
||||||
|
crit = match.group(5)
|
||||||
|
min_val = match.group(6)
|
||||||
|
max_val = match.group(7)
|
||||||
|
|
||||||
|
# Convert value to float
|
||||||
|
try:
|
||||||
|
value = float(value_str)
|
||||||
|
except ValueError:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Store the value
|
||||||
|
perfdata[label] = value
|
||||||
|
|
||||||
|
# Optionally store UOM as separate field
|
||||||
|
if uom:
|
||||||
|
perfdata[f"{label}_uom"] = uom
|
||||||
|
|
||||||
|
# Store thresholds if present
|
||||||
|
if warn:
|
||||||
|
try:
|
||||||
|
perfdata[f"{label}_warn"] = float(warn)
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
if crit:
|
||||||
|
try:
|
||||||
|
perfdata[f"{label}_crit"] = float(crit)
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
if min_val:
|
||||||
|
try:
|
||||||
|
perfdata[f"{label}_min"] = float(min_val)
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
if max_val:
|
||||||
|
try:
|
||||||
|
perfdata[f"{label}_max"] = float(max_val)
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
return perfdata
|
||||||
@@ -0,0 +1,240 @@
|
|||||||
|
"""
|
||||||
|
Network monitoring plugin for Heartbeat.
|
||||||
|
|
||||||
|
Collects network interface statistics and connection information using psutil.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import Dict, Any, Optional, List
|
||||||
|
|
||||||
|
try:
|
||||||
|
import psutil
|
||||||
|
except ImportError:
|
||||||
|
psutil = None
|
||||||
|
|
||||||
|
from hbd.client.plugin import MonitorPlugin
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class NetworkMonitorPlugin(MonitorPlugin):
    """
    Monitor network interface statistics and connections.

    Collects:
    - Network interface I/O counters (bytes sent/received, packets, errors, drops)
    - Per-interface statistics
    - Network connection counts by state
    - Interface addresses and configuration

    Configuration:
        interval: Collection interval in seconds (default: 300)
        interfaces: List of interfaces to monitor (default: all)
        include_connections: Include connection statistics (default: True)
        include_addresses: Include interface addresses (default: False)
    """

    name = "network_monitor"
    interval = 300  # Collect every 5 minutes by default

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Initialize the network monitor plugin.

        Args:
            config: Optional configuration dict with keys:
                - interval: Collection interval in seconds (default: 300)
                - interfaces: List of specific interfaces to monitor
                - include_connections: Include connection stats (default: True)
                - include_addresses: Include interface addresses (default: False)

        Raises:
            ImportError: If psutil is not installed.
        """
        super().__init__(config)
        interfaces = self.config.get('interfaces', None)  # None = all interfaces
        if isinstance(interfaces, str):
            # A bare string would turn the `in` filter into a substring test
            # (e.g. "lo" would match "wlo1"); treat it as a one-item list.
            interfaces = [interfaces]
        self.interfaces = interfaces
        self.include_connections = self.config.get('include_connections', True)
        self.include_addresses = self.config.get('include_addresses', False)
        self.interval = self.config.get('interval', 300)

        if psutil is None:
            raise ImportError("psutil library is required for network_monitor plugin")

        # Store previous I/O counters for delta calculation
        self._prev_io = {}

    def _is_selected(self, iface_name: str) -> bool:
        """Return True if the interface passes the configured `interfaces` filter."""
        return not self.interfaces or iface_name in self.interfaces

    async def initialize(self):
        """Initialize the plugin (check psutil availability).

        Takes a first snapshot of the per-interface I/O counters so the first
        collection can already report deltas.

        Returns:
            False if psutil is unavailable, True otherwise.
        """
        if psutil is None:
            logger.error("psutil not available - network_monitor cannot run")
            return False

        logger.info(f"Network monitor initialized (interval: {self.interval}s, "
                    f"connections: {self.include_connections})")

        # Initialize I/O counters
        try:
            self._prev_io = psutil.net_io_counters(pernic=True)
        except Exception as e:
            logger.warning(f"Could not initialize network I/O counters: {e}")

        return True

    async def collect(self) -> Dict[str, Any]:
        """
        Collect current network statistics.

        Returns:
            Dictionary with network metrics:
            - interfaces: Dict of interface statistics, keyed by interface name
                - bytes_sent: Total bytes sent
                - bytes_recv: Total bytes received
                - packets_sent: Total packets sent
                - packets_recv: Total packets received
                - errin: Total incoming errors
                - errout: Total outgoing errors
                - dropin: Total incoming packets dropped
                - dropout: Total outgoing packets dropped
                - bytes_sent_delta: Bytes sent since last collection
                - bytes_recv_delta: Bytes received since last collection
                - packets_sent_delta: Packets sent since last collection
                - packets_recv_delta: Packets received since last collection
            - connections: Connection statistics by state (if include_connections)
                - ESTABLISHED: Count of established connections
                - LISTEN: Count of listening sockets
                - TIME_WAIT: Count of TIME_WAIT connections
                - etc.
            - addresses: Interface address information (if include_addresses)
                - Dict keyed by interface name with address details
            - interface_stats: Per-interface isup/duplex/speed/mtu

            An empty dict if psutil is unavailable, or {"error": message} if
            collection raised.
        """
        if psutil is None:
            logger.error("psutil not available")
            return {}

        try:
            data = await self._collect_metrics()
            logger.debug(f"Collected network metrics: {len(data.get('interfaces', {}))} interfaces")
            return data
        except Exception as e:
            logger.error(f"Error collecting network metrics: {e}")
            return {"error": str(e)}

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Collect network metrics from psutil.

        Each section (I/O counters, connections, addresses, interface stats)
        is gathered independently; a failure in one is logged and the others
        are still returned.
        """
        metrics = {}

        # Collect per-interface I/O counters
        try:
            io_counters = psutil.net_io_counters(pernic=True)
            interfaces_data = {}

            for iface_name, counters in io_counters.items():
                # Skip if we're only monitoring specific interfaces
                if not self._is_selected(iface_name):
                    continue

                iface_stats = {
                    'bytes_sent': counters.bytes_sent,
                    'bytes_recv': counters.bytes_recv,
                    'packets_sent': counters.packets_sent,
                    'packets_recv': counters.packets_recv,
                    'errin': counters.errin,
                    'errout': counters.errout,
                    'dropin': counters.dropin,
                    'dropout': counters.dropout,
                }

                # Calculate deltas from previous collection
                if iface_name in self._prev_io:
                    prev = self._prev_io[iface_name]
                    iface_stats['bytes_sent_delta'] = counters.bytes_sent - prev.bytes_sent
                    iface_stats['bytes_recv_delta'] = counters.bytes_recv - prev.bytes_recv
                    iface_stats['packets_sent_delta'] = counters.packets_sent - prev.packets_sent
                    iface_stats['packets_recv_delta'] = counters.packets_recv - prev.packets_recv

                interfaces_data[iface_name] = iface_stats

            metrics['interfaces'] = interfaces_data

            # Store current counters for next delta calculation
            self._prev_io = io_counters

        except Exception as e:
            logger.warning(f"Could not collect network I/O counters: {e}")

        # Collect connection statistics
        if self.include_connections:
            try:
                connections = psutil.net_connections(kind='inet')
                conn_stats = {}

                # Count connections by state
                for conn in connections:
                    state = conn.status
                    conn_stats[state] = conn_stats.get(state, 0) + 1

                metrics['connections'] = conn_stats

            except (PermissionError, psutil.AccessDenied):
                logger.debug("Permission denied for net_connections (requires root/admin)")
            except Exception as e:
                logger.warning(f"Could not collect connection statistics: {e}")

        # Collect interface addresses
        if self.include_addresses:
            try:
                addresses = psutil.net_if_addrs()
                addr_data = {}

                for iface_name, addrs in addresses.items():
                    # Skip if we're only monitoring specific interfaces
                    if not self._is_selected(iface_name):
                        continue

                    iface_addrs = []
                    for addr in addrs:
                        addr_info = {
                            'family': str(addr.family),
                            'address': addr.address,
                        }
                        if addr.netmask:
                            addr_info['netmask'] = addr.netmask
                        if addr.broadcast:
                            addr_info['broadcast'] = addr.broadcast
                        iface_addrs.append(addr_info)

                    addr_data[iface_name] = iface_addrs

                metrics['addresses'] = addr_data

            except Exception as e:
                logger.warning(f"Could not collect interface addresses: {e}")

        # Add interface stats (up/down status, speed, mtu)
        try:
            if_stats = psutil.net_if_stats()
            stats_data = {}

            for iface_name, stats in if_stats.items():
                # Skip if we're only monitoring specific interfaces
                if not self._is_selected(iface_name):
                    continue

                stats_data[iface_name] = {
                    'isup': stats.isup,
                    'duplex': str(stats.duplex) if hasattr(stats, 'duplex') else None,
                    'speed': stats.speed,
                    'mtu': stats.mtu,
                }

            metrics['interface_stats'] = stats_data

        except Exception as e:
            logger.warning(f"Could not collect interface stats: {e}")

        return metrics

    async def cleanup(self):
        """Cleanup (nothing to do for this plugin)."""
        logger.info("Network monitor cleanup")
|
||||||
|
|
||||||
|
|
||||||
|
# Plugin class exported for automatic discovery
|
||||||
|
plugin = NetworkMonitorPlugin
|
||||||
@@ -0,0 +1,139 @@
|
|||||||
|
"""OS Information Plugin for Heartbeat.
|
||||||
|
|
||||||
|
Collects static operating system information including OS name, version,
|
||||||
|
kernel, architecture, and distribution details.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import platform
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, Optional
|
||||||
|
|
||||||
|
# Import from parent package
|
||||||
|
from hbd.client.plugin import InfoPlugin
|
||||||
|
|
||||||
|
|
||||||
|
class OSInfoPlugin(InfoPlugin):
    """Collect operating system information.

    This plugin gathers static OS information that rarely changes:
    - OS name and version
    - Kernel version
    - Architecture (x86_64, arm64, etc.)
    - Distribution details (for Linux)
    - Python version (used by hbc)
    """

    name = "os_info"
    version = "1.0.0"
    description = "Operating system and platform information"
    interval = 0  # InfoPlugin: collect once at startup

    # KEY in /etc/os-release -> key in the reported data
    _OS_RELEASE_KEYS = {
        "NAME": "distro_name",
        "VERSION": "distro_version",
        "ID": "distro_id",
        "VERSION_ID": "distro_version_id",
        "PRETTY_NAME": "distro_pretty_name",
    }

    # KEY in /etc/lsb-release -> key in the reported data
    _LSB_RELEASE_KEYS = {
        "DISTRIB_ID": "distro_id",
        "DISTRIB_RELEASE": "distro_version",
        "DISTRIB_DESCRIPTION": "distro_name",
    }

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Pass the optional plugin configuration to the base class."""
        super().__init__(config)

    async def initialize(self) -> bool:
        """Initialize the OS info plugin.

        Returns:
            True (always succeeds - platform module is stdlib)
        """
        self.logger.info(f"Initializing {self.name} plugin")
        return True

    async def _collect_info(self) -> Dict[str, Any]:
        """Collect OS information.

        Returns:
            Dictionary with OS details, or an empty dict if collection fails
        """
        try:
            from hbd import __version__ as hbc_version

            system = platform.system()  # e.g., "Linux", "Darwin", "Windows"
            data = {
                "system": system,
                "node": platform.node(),  # hostname
                "release": platform.release(),  # kernel version
                "version": platform.version(),  # detailed version
                "machine": platform.machine(),  # e.g., "x86_64", "arm64"
                "processor": platform.processor(),  # processor name
                "architecture": platform.architecture()[0],  # e.g., "64bit"
                "python_version": platform.python_version(),
                "python_implementation": platform.python_implementation(),
                "hbc_version": hbc_version,
                "hbc_type": "full",
            }

            if system == "Linux":
                # Add Linux-specific distribution info
                data.update(self._get_linux_distro())
            elif system == "Darwin":
                # Add macOS-specific info
                data["macos_version"] = platform.mac_ver()[0]
            elif system == "Windows":
                # Add Windows-specific info
                win_ver = platform.win32_ver()
                data["windows_release"] = win_ver[0]
                data["windows_version"] = win_ver[1]
                data["windows_sp"] = win_ver[2]
                data["windows_type"] = win_ver[3]

            self.logger.debug(f"Collected OS info: {data['system']} {data['release']}")
            return data

        except Exception as e:
            self.logger.error(f"Error collecting OS info: {e}", exc_info=True)
            return {}

    def _read_release_file(self, path, key_map: Dict[str, str]) -> Dict[str, str]:
        """Parse a KEY=value file and return the entries listed in key_map.

        Comment lines are ignored and surrounding quotes are removed from
        values. A read error is logged as a warning; entries parsed before
        the error are still returned.
        """
        found: Dict[str, str] = {}
        try:
            with open(path, encoding="utf-8", errors="replace") as f:
                for line in f:
                    line = line.strip()
                    if "=" not in line or line.startswith("#"):
                        continue
                    key, value = line.split("=", 1)
                    if key in key_map:
                        # Remove quotes from value
                        found[key_map[key]] = value.strip('"').strip("'")
        except Exception as e:
            self.logger.warning(f"Could not read {path}: {e}")
        return found

    def _get_linux_distro(self) -> Dict[str, str]:
        """Get Linux distribution information.

        Reads /etc/os-release when it exists; otherwise falls back to
        /etc/lsb-release.

        Returns:
            Dictionary with distribution details (empty if neither file exists)
        """
        # Try reading /etc/os-release (standard on modern Linux)
        os_release = Path("/etc/os-release")
        if os_release.exists():
            return self._read_release_file(os_release, self._OS_RELEASE_KEYS)

        # Fallback: try lsb_release (older systems)
        lsb_release = Path("/etc/lsb-release")
        if lsb_release.exists():
            return self._read_release_file(lsb_release, self._LSB_RELEASE_KEYS)

        return {}
|
||||||
@@ -0,0 +1,151 @@
|
|||||||
|
"""Ping Monitor Plugin for Heartbeat.
|
||||||
|
|
||||||
|
Pings one or more hosts and reports round-trip time. Results are sent as
|
||||||
|
plugin metrics so the server-side threshold system can raise WARNING/CRITICAL
|
||||||
|
alerts using the same RTT threshold configuration format used for heartbeat RTT.
|
||||||
|
|
||||||
|
Example configuration in ~/.hbc.yaml (or the plugins section of ~/.hb.yaml):
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
plugins:
|
||||||
|
ping_monitor:
|
||||||
|
interval: 60 # ping every 60 seconds (default)
|
||||||
|
count: 3 # ICMP packets per ping run (default 3)
|
||||||
|
timeout: 5 # seconds before a host is considered unreachable (default 5)
|
||||||
|
hosts:
|
||||||
|
8.8.8.8:
|
||||||
|
warning: 20.0 # ms
|
||||||
|
critical: 100.0 # ms
|
||||||
|
192.168.1.1:
|
||||||
|
warning: 5.0
|
||||||
|
critical: 20.0
|
||||||
|
```
|
||||||
|
|
||||||
|
Reported metrics per host (metric key uses the hostname with dots/colons replaced
|
||||||
|
by underscores so it is a valid identifier):
|
||||||
|
|
||||||
|
<host_key>_rtt_avg – average RTT in ms (float, or inf if unreachable)
|
||||||
|
<host_key>_rtt_min – minimum RTT in ms
|
||||||
|
<host_key>_rtt_max – maximum RTT in ms
|
||||||
|
<host_key>_loss – packet loss percentage (0–100)
|
||||||
|
|
||||||
|
Server-side threshold config example:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
threshold_configs:
|
||||||
|
default:
|
||||||
|
thresholds:
|
||||||
|
ping_monitor:
|
||||||
|
8_8_8_8_rtt_avg:
|
||||||
|
warning: 20.0
|
||||||
|
critical: 100.0
|
||||||
|
```
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from typing import Any, Dict, Optional
|
||||||
|
|
||||||
|
from hbd.client.plugin import MonitorPlugin
|
||||||
|
|
||||||
|
|
||||||
|
def _host_key(host: str) -> str:
|
||||||
|
"""Convert a hostname/IP to a safe metric key (replace . and : with _)."""
|
||||||
|
return re.sub(r"[^a-zA-Z0-9_]", "_", host)
|
||||||
|
|
||||||
|
|
||||||
|
class PingMonitorPlugin(MonitorPlugin):
    """Ping one or more configured hosts and report RTT metrics."""

    name = "ping_monitor"
    version = "1.0.0"
    description = "ICMP ping latency monitoring"
    interval = 60

    # "N% packet loss" (Linux/macOS/BusyBox) or "(N% loss)" (Windows)
    _LOSS_RE = re.compile(r"(\d+(?:\.\d+)?)\s*%\s*(?:packet\s*)?loss")
    # Linux:   "rtt min/avg/max/mdev = x/x/x/x ms"
    # macOS:   "round-trip min/avg/max/stddev = x/x/x/x ms"
    # BusyBox: "round-trip min/avg/max = x/x/x ms"
    _RTT_RE = re.compile(
        r"(?:rtt|round-trip)\s+min/avg/max(?:/\S+)?\s*=\s*([\d.]+)/([\d.]+)/([\d.]+)"
    )
    # Windows (English): "Minimum = 1ms, Maximum = 3ms, Average = 2ms"
    _WIN_RTT_RE = re.compile(
        r"Minimum\s*=\s*(\d+)\s*ms,\s*Maximum\s*=\s*(\d+)\s*ms,\s*Average\s*=\s*(\d+)\s*ms"
    )

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Read interval, count, timeout and hosts from the configuration.

        Args:
            config: Optional plugin configuration. ``hosts`` may be a dict of
                {hostname: {warning: x, critical: y}}, a list of hostnames,
                a single hostname string, or absent/null.
        """
        super().__init__(config)
        cfg = config or {}
        self.interval = cfg.get("interval", 60)
        self.count = int(cfg.get("count", 3))
        self.timeout = int(cfg.get("timeout", 5))
        # `or {}` covers an explicit `hosts: null`; a bare string is one host
        raw_hosts = cfg.get("hosts") or {}
        if isinstance(raw_hosts, str):
            raw_hosts = [raw_hosts]
        if isinstance(raw_hosts, (list, tuple)):
            self.hosts = {h: {} for h in raw_hosts}
        else:
            self.hosts = dict(raw_hosts)

    async def initialize(self) -> bool:
        """Enable the plugin only when at least one host is configured.

        Returns:
            False if no hosts are configured, True otherwise.
        """
        if not self.hosts:
            self.logger.warning("ping_monitor: no hosts configured, plugin disabled")
            return False
        self.logger.info(
            "ping_monitor initialized: %d host(s), interval=%ds, count=%d, timeout=%ds",
            len(self.hosts), self.interval, self.count, self.timeout,
        )
        return True

    @staticmethod
    def _unreachable(loss: float = 100.0) -> Dict[str, float]:
        """Result used when no RTT could be measured."""
        inf = float("inf")
        return {"rtt_min": inf, "rtt_avg": inf, "rtt_max": inf, "loss": loss}

    def _build_command(self, host: str) -> list:
        """Build the ping command line for the current platform."""
        if sys.platform == "win32":
            # Windows: -n count, -w per-reply timeout in milliseconds
            return ["ping", "-n", str(self.count), "-w", str(self.timeout * 1000), host]
        if sys.platform == "darwin":
            # macOS ping interprets -W in milliseconds, not seconds
            return ["ping", "-c", str(self.count), "-W", str(self.timeout * 1000), host]
        return ["ping", "-c", str(self.count), "-W", str(self.timeout), host]

    def _parse_output(self, output: str) -> Dict[str, float]:
        """Extract rtt_min/avg/max and loss from ping's textual output.

        Loss defaults to 100.0 when no loss figure is found; RTT values are
        inf when no summary line is found.
        """
        loss = 100.0
        loss_match = self._LOSS_RE.search(output)
        if loss_match:
            loss = float(loss_match.group(1))

        rtt_match = self._RTT_RE.search(output)
        if rtt_match:
            return {
                "rtt_min": float(rtt_match.group(1)),
                "rtt_avg": float(rtt_match.group(2)),
                "rtt_max": float(rtt_match.group(3)),
                "loss": loss,
            }

        win_match = self._WIN_RTT_RE.search(output)
        if win_match:
            # Windows prints Minimum, Maximum, Average in that order
            return {
                "rtt_min": float(win_match.group(1)),
                "rtt_avg": float(win_match.group(3)),
                "rtt_max": float(win_match.group(2)),
                "loss": loss,
            }

        # Host unreachable or all packets lost
        return self._unreachable(loss)

    async def _ping(self, host: str) -> Dict[str, float]:
        """Run a system ping command and return rtt_min/avg/max/loss."""
        cmd = self._build_command(host)

        proc = None
        try:
            proc = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
            stdout, _ = await asyncio.wait_for(
                proc.communicate(),
                timeout=self.timeout * self.count + 2,
            )
            output = stdout.decode(errors="replace")
        except (asyncio.TimeoutError, FileNotFoundError, OSError) as e:
            # Do not leave a ping process running after giving up on it
            if proc is not None and proc.returncode is None:
                try:
                    proc.kill()
                except ProcessLookupError:
                    pass  # exited on its own in the meantime
                else:
                    await proc.communicate()
            self.logger.warning("ping_monitor: ping failed for %s: %s", host, e)
            return self._unreachable()

        return self._parse_output(output)

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Ping all configured hosts concurrently and flatten the results.

        Returns:
            Dict with ``<host_key>_rtt_min``, ``<host_key>_rtt_avg``,
            ``<host_key>_rtt_max`` and ``<host_key>_loss`` for every host.
        """
        data: Dict[str, Any] = {}
        tasks = {host: asyncio.create_task(self._ping(host)) for host in self.hosts}
        for host, task in tasks.items():
            try:
                result = await task
            except Exception as e:
                self.logger.error("ping_monitor: error pinging %s: %s", host, e)
                result = self._unreachable()
            key = _host_key(host)
            for metric, value in result.items():
                data[f"{key}_{metric}"] = value
            status = "unreachable" if result["loss"] == 100.0 else f"{result['rtt_avg']:.1f}ms"
            self.logger.debug("ping_monitor: %s -> %s", host, status)
        return data
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
"""Common utilities shared between hbc and hbd."""
|
||||||
|
|
||||||
|
from hbd import __version__
|
||||||
@@ -0,0 +1,162 @@
|
|||||||
|
"""Message encoding/decoding utilities for hbd protocol.
|
||||||
|
|
||||||
|
Message Types:
|
||||||
|
HTB: Heartbeat message (client -> server)
|
||||||
|
ACK: Acknowledgment (server -> client)
|
||||||
|
CMD: Command message (server -> client)
|
||||||
|
UPD: Update message (server -> client)
|
||||||
|
PLG: Plugin data message (client -> server)
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import Dict, Any, Union
|
||||||
|
import json
|
||||||
|
import zlib
|
||||||
|
|
||||||
|
|
||||||
|
def encode_value(v: Any) -> str:
    """Encode a value for protocol transmission.

    Floats are rendered with five decimal places, lists and dicts as JSON
    prefixed with '@', booleans as 1/0, and everything else via str().

    Args:
        v: Value to encode (int, float, str, bool, list, dict, etc.)

    Returns:
        String representation suitable for protocol
    """
    if isinstance(v, float):
        return f"{v:0.5f}"
    if isinstance(v, (list, dict)):
        # Use JSON encoding for complex types, prefixed with @.
        # ';' separates key=value pairs in the message, so a literal ';'
        # inside a JSON string would split the payload. In JSON output ';'
        # can only occur inside strings, where the \u003b escape decodes
        # back to the same character.
        return "@" + json.dumps(v).replace(";", "\\u003b")
    if isinstance(v, bool):
        return str(int(v))  # True->1, False->0
    return str(v)
|
||||||
|
|
||||||
|
|
||||||
|
def decode_value(val: str) -> Any:
    """Decode a value from protocol format.

    Values starting with '@' are parsed as JSON (falling back to the text
    after '@' if parsing fails). Values that start with a digit, or with
    '-' followed by a digit, are converted to int or float when possible.
    Everything else is returned unchanged.

    Args:
        val: String value from protocol

    Returns:
        Decoded Python object
    """
    if not val:
        return val

    # Check for JSON-encoded complex types
    if val.startswith("@"):
        try:
            return json.loads(val[1:])
        except Exception:
            return val[1:]  # Return as string without @

    # Try numeric conversion (avoid eval to prevent SyntaxWarnings on version strings)
    looks_numeric = val[0].isdigit() or (
        val[0] == "-" and len(val) > 1 and val[1].isdigit()
    )
    # int()/float() accept '_' digit separators ("1_000" -> 1000); such
    # strings are identifiers or labels here, not numbers, so leave them alone.
    if looks_numeric and "_" not in val:
        for convert in (int, float):
            try:
                return convert(val)
            except ValueError:
                continue

    return val
|
||||||
|
|
||||||
|
|
||||||
|
def dicttos(ID: str, d: Dict[str, Any]):
    """Serialize a dict to protocol message bytes.

    Every item is rendered as ``key=value`` (value passed through
    encode_value) and the items are joined with ';'. The joined text is
    zlib-compressed at level 6 and prefixed with ``!ID:``. The payload is
    always compressed.

    Args:
        ID: Message ID placed in the header.
        d: Key/value pairs to serialize.

    Returns:
        The header bytes followed by the compressed payload.
    """
    payload = ";".join(f"{key}={encode_value(value)}" for key, value in d.items())
    header = f"!{ID}:".encode()
    return header + zlib.compress(payload.encode(), 6)
|
||||||
|
|
||||||
|
|
||||||
|
def stodict(msg: bytes):
    """Deserialize a protocol message into a dict.

    Two wire formats are recognised:

    * ``b"!ID:" + zlib-compressed payload`` (leading ``!``)
    * ``b"ID:" + plain payload``

    The payload is a ``;``-separated list of ``key=value`` items. Keys and
    values are stripped of surrounding whitespace, values are passed through
    ``decode_value``, and an item without ``=`` maps its key to ``None``.

    The ID of a compressed message is everything between the ``!`` and the
    first ``:``, so IDs of any length round-trip. If a compressed message
    contains no ``:`` at all, the historical fixed layout (3-byte ID, payload
    from byte 5) is assumed.

    Args:
        msg: Raw message bytes.

    Returns:
        A dict with key ``'ID'`` set to the message ID plus the parsed
        key/value pairs, or an empty dict if the message is malformed
        (missing separator, bad compression, undecodable bytes).
    """
    d = {}
    try:
        if msg[:1] == b"!":
            # message is: b'!ID:' + compressed_payload
            sep = msg.find(b":", 1)
            if sep == -1:
                sep = 4  # legacy fixed-width header
            msg_id = msg[1:sep].decode()
            pk = zlib.decompress(msg[sep + 1:]).decode()
        else:
            head, body = msg.split(b":", 1)
            msg_id = head.decode()
            pk = body.decode()
    except Exception:
        # Malformed header, compressed payload or text encoding. The ID is
        # decoded inside the guard so bad bytes from the wire cannot raise.
        return {}

    d["ID"] = msg_id
    if not pk:
        return d

    for item in pk.split(";"):
        if not item:
            continue
        key, eq, raw = item.partition("=")
        key = key.strip()
        d[key] = decode_value(raw.strip()) if eq else None
    return d
|
||||||
|
|
||||||
|
|
||||||
|
def oldmtodict(msg: bytes):
    """Parse an old-style message that carries no ID prefix.

    The payload is given an ``HTB:`` prefix and then handed to ``stodict``,
    which is how the original implementation treated these messages.
    """
    prefixed = b"HTB:" + msg
    return stodict(prefixed)
|
||||||
|
|
||||||
|
|
||||||
|
def encode_plugin_data(plugin_name: str, data: Dict[str, Any]) -> bytes:
    """Encode plugin data into a PLG message.

    The message payload contains a ``plugin`` field holding ``plugin_name``
    followed by the items of ``data``. The message is built with
    ``dicttos("PLG", ...)``.

    Args:
        plugin_name: Name of the plugin (e.g., "os_info", "cpu_monitor")
        data: Plugin data dictionary. A ``"plugin"`` key in this dictionary
            is ignored so that it cannot replace ``plugin_name``.

    Returns:
        Encoded message bytes
    """
    # The explicit plugin_name is authoritative: keep it first in the payload
    # and do not let a same-named data field overwrite it.
    full_data = {"plugin": plugin_name}
    full_data.update((k, v) for k, v in data.items() if k != "plugin")
    return dicttos("PLG", full_data)
|
||||||
|
|
||||||
|
|
||||||
|
def decode_plugin_data(msg: bytes) -> Dict[str, Any]:
    """Decode a PLG message into plugin data.

    This is a thin wrapper: the message is passed to ``stodict`` and its
    result is returned as-is, without checking the message ID or the
    presence of a ``plugin`` field.

    Args:
        msg: Raw message bytes

    Returns:
        The dictionary produced by ``stodict`` for ``msg``.
    """
    decoded = stodict(msg)
    return decoded
|
||||||
|
|
||||||
@@ -1,5 +1,4 @@
|
|||||||
"""Utility helpers extracted from the original script."""
|
"""Utility helpers extracted from the original script."""
|
||||||
import time
|
|
||||||
|
|
||||||
|
|
||||||
def shortname(name: str) -> str:
|
def shortname(name: str) -> str:
|
||||||
@@ -1,56 +0,0 @@
|
|||||||
"""Configuration loader and defaults for hbd."""
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
|
|
||||||
try:
|
|
||||||
import yaml
|
|
||||||
except Exception:
|
|
||||||
yaml = None
|
|
||||||
|
|
||||||
# Built-in configuration defaults. load_config() starts from a copy of this
# mapping and only accepts keys from the config file that are listed here.
DEFAULTS = {
    "hb_port": 50003,
    "hbd_port": 50004,
    "hbd_host": "",
    # NOTE(review): fixed, predictable path under /tmp - confirm how the file
    # is written and whether it is safe on multi-user hosts.
    "pickfile": "/tmp/hb.pick",
    "logfile": "/var/log/heartbeat.log",
    "logfmt": "text",
    "pushsrv": "pushover",
    "pushover_token": "",
    "pushover_user": "",
    "interval": 20,
    "grace": 2,
    "dyndomains": ["wrede.org"],
    "watchhosts": [],
    "dyndnshosts": [],
    "drophosts": [],
    "nsupdate_bin": "/usr/bin/nsupdate",
    "foreground": False,
    "verbose": False,
    "debug": 0,
}
|
|
||||||
|
|
||||||
|
|
||||||
def load_config(path=None):
    """Load configuration from a YAML file and merge it with the defaults.

    The result starts as a deep copy of ``DEFAULTS`` so that callers may
    mutate list values without affecting the defaults or other configs.
    Only keys already present in ``DEFAULTS`` are taken from the file; any
    other key is reported with a warning and ignored.

    Defaults are returned unchanged when the file does not exist, when the
    ``yaml`` module is not available, when the file is empty, or when its
    top level is not a mapping.

    Args:
        path: Path to the YAML file. If falsy, ``~/.hb.yaml`` is used.

    Returns:
        A dict containing every key of ``DEFAULTS``.
    """
    import copy  # local import: keeps this block self-contained

    cfg = copy.deepcopy(DEFAULTS)
    if not path:
        # default path (~/.hb.yaml)
        path = os.path.join(os.path.expanduser("~"), ".hb.yaml")

    if not os.path.exists(path):
        return cfg
    if not yaml:
        # yaml not installed: do not attempt to parse; user must ensure defaults
        return cfg

    with open(path) as fh:
        data = yaml.safe_load(fh)

    if data is None:
        # Empty file (or only comments): nothing to override.
        return cfg
    if not isinstance(data, dict):
        logging.warning("ignoring config %s: top level is not a mapping", path)
        return cfg

    # only keep known keys
    for k, v in data.items():
        if k in cfg:
            cfg[k] = v
        else:
            logging.warning("unknown config key %s in %s", k, path)
    return cfg
|
|
||||||
@@ -0,0 +1,196 @@
|
|||||||
|
# Example Heartbeat Client Configuration
|
||||||
|
# This file demonstrates all available configuration options for the heartbeat client (hbc)
|
||||||
|
# and its plugin system.
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# Server Configuration
|
||||||
|
# ==============================================================================
|
||||||
|
server: hbd.example.com # Heartbeat server hostname or IP
|
||||||
|
port: 50003 # Server UDP port (default: 50003)
|
||||||
|
interval: 30 # Heartbeat interval in seconds (default: 30)
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# Plugin Configuration
|
||||||
|
# ==============================================================================
|
||||||
|
# Plugins are configured under the "plugins" section. Each plugin can be enabled/disabled
|
||||||
|
# and configured with plugin-specific settings.
|
||||||
|
|
||||||
|
plugins:
|
||||||
|
# --------------------------------------------------------------------------
|
||||||
|
# OS Information Plugin (InfoPlugin - runs once at startup)
|
||||||
|
# --------------------------------------------------------------------------
|
||||||
|
os_info:
|
||||||
|
enabled: true
|
||||||
|
# No additional configuration needed
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------
|
||||||
|
# CPU Monitor Plugin (MonitorPlugin - periodic collection)
|
||||||
|
# --------------------------------------------------------------------------
|
||||||
|
cpu_monitor:
|
||||||
|
enabled: true
|
||||||
|
interval: 300 # Collection interval in seconds (default: 300 = 5 minutes)
|
||||||
|
per_core: false # Collect per-core CPU statistics (default: false)
|
||||||
|
# When per_core is true, will report CPU usage for each core separately
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------
|
||||||
|
# Memory Monitor Plugin (MonitorPlugin)
|
||||||
|
# --------------------------------------------------------------------------
|
||||||
|
memory_monitor:
|
||||||
|
enabled: true
|
||||||
|
interval: 300 # Collection interval in seconds (default: 300 = 5 minutes)
|
||||||
|
include_swap: true # Include swap memory statistics (default: true)
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------
|
||||||
|
# Disk Monitor Plugin (MonitorPlugin)
|
||||||
|
# --------------------------------------------------------------------------
|
||||||
|
disk_monitor:
|
||||||
|
enabled: true
|
||||||
|
interval: 300 # Collection interval in seconds (default: 300 = 5 minutes)
|
||||||
|
include_io: true # Include I/O statistics (default: true)
|
||||||
|
# Optional: Monitor only specific partitions
|
||||||
|
# partitions:
|
||||||
|
# - /
|
||||||
|
# - /home
|
||||||
|
# - /var
|
||||||
|
# Optional: Exclude specific filesystem types
|
||||||
|
exclude_types:
|
||||||
|
- tmpfs
|
||||||
|
- devtmpfs
|
||||||
|
- squashfs
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------
|
||||||
|
# Network Monitor Plugin (MonitorPlugin)
|
||||||
|
# --------------------------------------------------------------------------
|
||||||
|
network_monitor:
|
||||||
|
enabled: true
|
||||||
|
interval: 300 # Collection interval in seconds (default: 300 = 5 minutes)
|
||||||
|
include_connections: true # Include connection statistics (default: true)
|
||||||
|
include_addresses: false # Include interface addresses (default: false)
|
||||||
|
# Optional: Monitor only specific interfaces
|
||||||
|
# interfaces:
|
||||||
|
# - eth0
|
||||||
|
# - wlan0
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------
|
||||||
|
# Filesystem Info Plugin (InfoPlugin - runs once at startup)
|
||||||
|
# --------------------------------------------------------------------------
|
||||||
|
filesystem_info:
|
||||||
|
enabled: true
|
||||||
|
include_pseudo: false # Include pseudo/virtual filesystems (default: false)
|
||||||
|
# When false (default), only reports physical mounted filesystems (ext4, zfs, xfs, etc.)
|
||||||
|
# When true, also includes pseudo filesystems (proc, sysfs, tmpfs, devtmpfs, etc.)
|
||||||
|
# Optional: Exclude additional specific filesystem types
|
||||||
|
# exclude_types:
|
||||||
|
# - squashfs
|
||||||
|
# - iso9660
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------
|
||||||
|
# Nagios Runner Plugin (MonitorPlugin)
|
||||||
|
# --------------------------------------------------------------------------
|
||||||
|
nagios_runner:
|
||||||
|
enabled: true
|
||||||
|
interval: 300 # Collection interval in seconds (default: 300 = 5 minutes)
|
||||||
|
timeout: 30 # Plugin execution timeout in seconds (default: 30)
|
||||||
|
|
||||||
|
# List of Nagios plugins to execute
|
||||||
|
# Each command is executed as-is, so provide full paths and arguments
|
||||||
|
commands:
|
||||||
|
# System load monitoring
|
||||||
|
- /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
|
||||||
|
|
||||||
|
# Disk space monitoring
|
||||||
|
- /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
|
||||||
|
- /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /home
|
||||||
|
|
||||||
|
# Process monitoring
|
||||||
|
- /usr/lib/nagios/plugins/check_procs -w 250 -c 400 -s RSZDT
|
||||||
|
|
||||||
|
# Swap usage
|
||||||
|
- /usr/lib/nagios/plugins/check_swap -w 20% -c 10%
|
||||||
|
|
||||||
|
# Custom script example
|
||||||
|
# - /usr/local/bin/check_my_app.sh
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# Advanced Options
|
||||||
|
# ==============================================================================
|
||||||
|
# These options control client behavior
|
||||||
|
|
||||||
|
# Compression: Enable zlib compression for heartbeat messages (default: true)
|
||||||
|
compress: true
|
||||||
|
|
||||||
|
# Hostname: Override the system hostname (default: auto-detect)
|
||||||
|
# hostname: myhost.example.com
|
||||||
|
|
||||||
|
# Message: Custom message included in heartbeat (optional)
|
||||||
|
# message: "Production web server"
|
||||||
|
|
||||||
|
# Logging
|
||||||
|
log_level: INFO # Log level: DEBUG, INFO, WARNING, ERROR (default: INFO)
|
||||||
|
# logfile: /var/log/hbc.log # Optional log file path
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# Example Profiles
|
||||||
|
# ==============================================================================
|
||||||
|
# Below are example configuration profiles for different use cases
|
||||||
|
|
||||||
|
# Minimal Configuration (default settings):
|
||||||
|
# -----------------------------------------
|
||||||
|
# server: hbd.example.com
|
||||||
|
# interval: 30
|
||||||
|
|
||||||
|
# Monitoring Server (comprehensive metrics):
|
||||||
|
# ------------------------------------------
|
||||||
|
# server: monitoring.example.com
|
||||||
|
# interval: 30
|
||||||
|
# plugins:
|
||||||
|
# cpu_monitor:
|
||||||
|
# enabled: true
|
||||||
|
# interval: 15
|
||||||
|
# per_core: true
|
||||||
|
# memory_monitor:
|
||||||
|
# enabled: true
|
||||||
|
# interval: 15
|
||||||
|
# disk_monitor:
|
||||||
|
# enabled: true
|
||||||
|
# interval: 60
|
||||||
|
# network_monitor:
|
||||||
|
# enabled: true
|
||||||
|
# interval: 30
|
||||||
|
# include_connections: true
|
||||||
|
|
||||||
|
# Nagios Integration (leverage existing plugins):
|
||||||
|
# -----------------------------------------------
|
||||||
|
# server: hbd.example.com
|
||||||
|
# plugins:
|
||||||
|
# nagios_runner:
|
||||||
|
# enabled: true
|
||||||
|
# interval: 300 # Check every 5 minutes
|
||||||
|
# commands:
|
||||||
|
# - /usr/lib/nagios/plugins/check_http -H localhost -p 80
|
||||||
|
# - /usr/lib/nagios/plugins/check_mysql -H localhost -u monitor -p password
|
||||||
|
# - /usr/lib/nagios/plugins/check_smtp -H mail.example.com
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# Threshold Configuration (for Heartbeat Daemon)
|
||||||
|
# ==============================================================================
|
||||||
|
# NOTE: Thresholds are configured on the SERVER side (hbd), not the client (hbc).
|
||||||
|
# This is just an example - see config_thresholds_example.yaml for comprehensive examples.
|
||||||
|
#
|
||||||
|
# Basic threshold example:
|
||||||
|
# thresholds:
|
||||||
|
# cpu_monitor:
|
||||||
|
# cpu_percent:
|
||||||
|
# warning: 80.0
|
||||||
|
# critical: 90.0
|
||||||
|
# memory_monitor:
|
||||||
|
# percent:
|
||||||
|
# warning: 85.0
|
||||||
|
# critical: 95.0
|
||||||
|
# disk_monitor:
|
||||||
|
# partitions:
|
||||||
|
# /:
|
||||||
|
# percent:
|
||||||
|
# warning: 80.0
|
||||||
|
# critical: 90.0
|
||||||
|
|
||||||
@@ -0,0 +1,296 @@
|
|||||||
|
# ==============================================================================
|
||||||
|
# Heartbeat Daemon Multi-Threshold Configuration Example
|
||||||
|
# ==============================================================================
|
||||||
|
# This file demonstrates the new multi-threshold configuration feature that allows
|
||||||
|
# different threshold settings for different hosts/clients.
|
||||||
|
#
|
||||||
|
# Features:
|
||||||
|
# - Define multiple named threshold configurations
|
||||||
|
# - Map specific hosts to specific threshold configurations
|
||||||
|
# - Set a default configuration for unmapped hosts
|
||||||
|
# - Backward compatible with single threshold configuration
|
||||||
|
# ==============================================================================
|
||||||
|
|
||||||
|
# Global threshold settings
|
||||||
|
threshold_renotify_interval: 3600 # Re-notify every hour for ongoing alerts (seconds)
|
||||||
|
|
||||||
|
# Optional: Set default threshold config (defaults to "default" if not specified)
|
||||||
|
default_threshold_config: "default"
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
# Multiple Named Threshold Configurations
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
# Define multiple threshold configurations with different sensitivity levels
|
||||||
|
threshold_configs:
|
||||||
|
|
||||||
|
# Default configuration - moderate thresholds for most servers
|
||||||
|
default:
|
||||||
|
thresholds:
|
||||||
|
cpu_monitor:
|
||||||
|
cpu_percent:
|
||||||
|
warning: 80.0
|
||||||
|
critical: 90.0
|
||||||
|
operator: ">"
|
||||||
|
load_1min:
|
||||||
|
warning: 4.0
|
||||||
|
critical: 8.0
|
||||||
|
operator: ">"
|
||||||
|
|
||||||
|
memory_monitor:
|
||||||
|
percent:
|
||||||
|
warning: 85.0
|
||||||
|
critical: 95.0
|
||||||
|
operator: ">"
|
||||||
|
|
||||||
|
disk_monitor:
|
||||||
|
partitions:
|
||||||
|
/:
|
||||||
|
percent:
|
||||||
|
warning: 85.0
|
||||||
|
critical: 95.0
|
||||||
|
operator: ">"
|
||||||
|
|
||||||
|
rtt:
|
||||||
|
# RTT thresholds (applies to all hosts)
|
||||||
|
warning: 50.0 # ms
|
||||||
|
critical: 200.0
|
||||||
|
|
||||||
|
# High sensitivity configuration - lower thresholds for critical systems
|
||||||
|
high_sensitivity:
|
||||||
|
thresholds:
|
||||||
|
cpu_monitor:
|
||||||
|
cpu_percent:
|
||||||
|
warning: 60.0 # Alert earlier
|
||||||
|
critical: 75.0
|
||||||
|
operator: ">"
|
||||||
|
hysteresis: 0.15 # More hysteresis to reduce flapping
|
||||||
|
load_1min:
|
||||||
|
warning: 2.0
|
||||||
|
critical: 4.0
|
||||||
|
operator: ">"
|
||||||
|
|
||||||
|
memory_monitor:
|
||||||
|
percent:
|
||||||
|
warning: 75.0 # Alert at lower memory usage
|
||||||
|
critical: 85.0
|
||||||
|
operator: ">"
|
||||||
|
display: "(threshold: {op_symbol} {threshold_value}%, total: {total_gb} GB)"
|
||||||
|
|
||||||
|
disk_monitor:
|
||||||
|
partitions:
|
||||||
|
/:
|
||||||
|
percent:
|
||||||
|
warning: 75.0
|
||||||
|
critical: 85.0
|
||||||
|
operator: ">"
|
||||||
|
/var:
|
||||||
|
percent:
|
||||||
|
warning: 80.0
|
||||||
|
critical: 90.0
|
||||||
|
operator: ">"
|
||||||
|
|
||||||
|
rtt:
|
||||||
|
warning: 30.0
|
||||||
|
critical: 100.0
|
||||||
|
|
||||||
|
# Low sensitivity configuration - higher thresholds for development/test systems
|
||||||
|
low_sensitivity:
|
||||||
|
thresholds:
|
||||||
|
cpu_monitor:
|
||||||
|
cpu_percent:
|
||||||
|
warning: 90.0 # Only alert at very high usage
|
||||||
|
critical: 95.0
|
||||||
|
operator: ">"
|
||||||
|
|
||||||
|
memory_monitor:
|
||||||
|
percent:
|
||||||
|
warning: 90.0
|
||||||
|
critical: 98.0
|
||||||
|
operator: ">"
|
||||||
|
|
||||||
|
disk_monitor:
|
||||||
|
partitions:
|
||||||
|
/:
|
||||||
|
percent:
|
||||||
|
warning: 90.0
|
||||||
|
critical: 95.0
|
||||||
|
operator: ">"
|
||||||
|
|
||||||
|
rtt:
|
||||||
|
warning: 100.0
|
||||||
|
critical: 500.0
|
||||||
|
|
||||||
|
# Production database servers - specialized thresholds
|
||||||
|
database:
|
||||||
|
thresholds:
|
||||||
|
cpu_monitor:
|
||||||
|
cpu_percent:
|
||||||
|
warning: 70.0
|
||||||
|
critical: 85.0
|
||||||
|
operator: ">"
|
||||||
|
|
||||||
|
memory_monitor:
|
||||||
|
percent:
|
||||||
|
warning: 90.0 # Databases can use high memory
|
||||||
|
critical: 97.0
|
||||||
|
operator: ">"
|
||||||
|
display: "(threshold: {op_symbol} {threshold_value}%, total: {total_gb} GB, available: {available_gb} GB)"
|
||||||
|
|
||||||
|
disk_monitor:
|
||||||
|
partitions:
|
||||||
|
/:
|
||||||
|
percent:
|
||||||
|
warning: 80.0
|
||||||
|
critical: 90.0
|
||||||
|
operator: ">"
|
||||||
|
/var/lib/mysql: # Database data partition
|
||||||
|
percent:
|
||||||
|
warning: 75.0 # Alert earlier for DB partition
|
||||||
|
critical: 85.0
|
||||||
|
operator: ">"
|
||||||
|
|
||||||
|
rtt:
|
||||||
|
warning: 20.0 # Stricter latency requirements
|
||||||
|
critical: 50.0
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
# Host to Threshold Configuration Mapping
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
# Hosts are mapped to threshold configurations in the "hosts:" section further below
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
# Notification Channels
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
# Define notification providers centrally with their credentials
|
||||||
|
# Each channel has a type (pushover, email, signal, mattermost) and type-specific config
|
||||||
|
notification_channels:
|
||||||
|
# Signal notifications
|
||||||
|
signal_ops:
|
||||||
|
type: signal
|
||||||
|
cli_path: /usr/local/bin/signal-cli
|
||||||
|
user: "+1234567890"  # quoted: unquoted, YAML reads this as an integer and drops the "+"
|
||||||
|
recipient: "+1234567890"
|
||||||
|
|
||||||
|
signal_oncall:
|
||||||
|
type: signal
|
||||||
|
cli_path: /usr/local/bin/signal-cli
|
||||||
|
user: "+1234567890"
|
||||||
|
recipient: "+0987654321"
|
||||||
|
|
||||||
|
# Email notifications
|
||||||
|
email_ops:
|
||||||
|
type: email
|
||||||
|
recipients: [ops@example.com, alerts@example.com]
|
||||||
|
sender: heartbeat@example.com
|
||||||
|
smtp_server: smtp.example.com
|
||||||
|
smtp_port: 587
|
||||||
|
smtp_user: heartbeat@example.com
|
||||||
|
smtp_password: your-smtp-password
|
||||||
|
|
||||||
|
# Pushover notifications
|
||||||
|
pushover_urgent:
|
||||||
|
type: pushover
|
||||||
|
token: your-pushover-app-token
|
||||||
|
user: your-pushover-user-key
|
||||||
|
|
||||||
|
# Mattermost notifications
|
||||||
|
mattermost_devops:
|
||||||
|
type: mattermost
|
||||||
|
host: mattermost.example.com
|
||||||
|
token: your-webhook-token
|
||||||
|
channel: devops-alerts
|
||||||
|
username: heartbeat-bot
|
||||||
|
icon: https://example.com/heartbeat-icon.png
|
||||||
|
|
||||||
|
# Default notification channels (used if host doesn't specify channels)
|
||||||
|
default_notification_channels: [email_ops]
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
# Host Definitions (New Unified Format)
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
# Define hosts with threshold configs, monitoring, DNS, and notification settings
|
||||||
|
hosts:
|
||||||
|
# Critical production servers - high sensitivity, multiple notification channels
|
||||||
|
prod-web-01:
|
||||||
|
threshold_config: high_sensitivity
|
||||||
|
watch: true
|
||||||
|
notification_channels: [signal_oncall, pushover_urgent, email_ops]
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
prod-web-02:
|
||||||
|
threshold_config: high_sensitivity
|
||||||
|
watch: true
|
||||||
|
notification_channels: [signal_oncall, pushover_urgent, email_ops]
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
prod-api-01:
|
||||||
|
threshold_config: high_sensitivity
|
||||||
|
watch: true
|
||||||
|
notification_channels: [signal_oncall, email_ops]
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
# Database servers - database-specific thresholds
|
||||||
|
prod-db-01:
|
||||||
|
threshold_config: database
|
||||||
|
watch: true
|
||||||
|
notification_channels: [signal_ops, email_ops]
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
prod-db-02:
|
||||||
|
threshold_config: database
|
||||||
|
watch: true
|
||||||
|
notification_channels: [signal_ops, email_ops]
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
prod-db-replica:
|
||||||
|
threshold_config: database
|
||||||
|
watch: true
|
||||||
|
notification_channels: [email_ops] # Replica gets email only
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
# Development servers - low sensitivity, minimal notifications
|
||||||
|
dev-server-01:
|
||||||
|
threshold_config: low_sensitivity
|
||||||
|
watch: false # Don't monitor dev servers closely
|
||||||
|
notification_channels: [email_ops]
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
dev-server-02:
|
||||||
|
threshold_config: low_sensitivity
|
||||||
|
watch: false
|
||||||
|
notification_channels: [email_ops]
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
# Test servers
|
||||||
|
test-server-01:
|
||||||
|
threshold_config: low_sensitivity
|
||||||
|
watch: false
|
||||||
|
dyndns: false
|
||||||
|
# No notification channels - uses default_notification_channels
|
||||||
|
|
||||||
|
# Home server with dynamic DNS
|
||||||
|
home-server:
|
||||||
|
threshold_config: default
|
||||||
|
watch: true
|
||||||
|
notification_channels: [signal_ops]
|
||||||
|
dyndns: true # Update DNS when IP changes
|
||||||
|
|
||||||
|
# Hosts not listed in the hosts section will use:
|
||||||
|
# - default_threshold_config for thresholds (falls back to "default")
|
||||||
|
# - default_notification_channels for notifications
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
# Notes on Configuration Structure
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
#
|
||||||
|
# All configuration is centralized in the hosts section. Each host can specify:
|
||||||
|
# - threshold_config: Name of threshold configuration to use
|
||||||
|
# - watch: Whether to monitor this host actively (send notifications)
|
||||||
|
# - notification_channels: List of channels to use for this host
|
||||||
|
# - dyndns: Whether to update DNS when IP address changes
|
||||||
|
#
|
||||||
|
# Notification channels are defined once at the top level and referenced
|
||||||
|
# by name in host definitions, allowing easy reuse and updates.
|
||||||
|
#
|
||||||
|
# For hosts not explicitly listed, the system will still accept heartbeats
|
||||||
|
# and track their state, but won't apply thresholds or send notifications
|
||||||
|
# unless default settings are configured.
|
||||||
@@ -0,0 +1,111 @@
|
|||||||
|
# Heartbeat Configuration Example with Nagios Plugin Runner
|
||||||
|
|
||||||
|
# This example shows how to configure the Nagios Runner plugin
|
||||||
|
# to execute existing Nagios-compatible monitoring plugins
|
||||||
|
|
||||||
|
# Basic server settings (existing config)
|
||||||
|
hb_port: 50003
|
||||||
|
hbd_port: 50004
|
||||||
|
interval: 20
|
||||||
|
grace: 2
|
||||||
|
|
||||||
|
# Plugin configuration
|
||||||
|
# Each plugin can have its own configuration section
|
||||||
|
|
||||||
|
# CPU Monitor Plugin
|
||||||
|
cpu_monitor:
|
||||||
|
interval: 300 # Collect every 5 minutes (default)
|
||||||
|
per_core: false # Set to true to get per-core CPU usage
|
||||||
|
|
||||||
|
# Nagios Runner Plugin
|
||||||
|
nagios_runner:
|
||||||
|
interval: 300 # Run Nagios plugins every 5 minutes (default)
|
||||||
|
timeout: 30 # Command execution timeout in seconds
|
||||||
|
shell: true # Execute commands via shell
|
||||||
|
|
||||||
|
# List of Nagios plugins to run
|
||||||
|
commands:
|
||||||
|
|
||||||
|
# Example 1: Check disk space
|
||||||
|
- name: check_disk_root
|
||||||
|
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
|
||||||
|
|
||||||
|
# Example 2: Check disk space for /home
|
||||||
|
- name: check_disk_home
|
||||||
|
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /home
|
||||||
|
|
||||||
|
# Example 3: Check system load
|
||||||
|
- name: check_load
|
||||||
|
command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
|
||||||
|
|
||||||
|
# Example 4: Check process count
|
||||||
|
- name: check_procs
|
||||||
|
command: /usr/lib/nagios/plugins/check_procs -w 250 -c 400
|
||||||
|
|
||||||
|
# Example 5: Check SSH service
|
||||||
|
- name: check_ssh
|
||||||
|
command: /usr/lib/nagios/plugins/check_ssh localhost
|
||||||
|
|
||||||
|
# Example 6: Check HTTP service
|
||||||
|
- name: check_http
|
||||||
|
command: /usr/lib/nagios/plugins/check_http -H localhost
|
||||||
|
|
||||||
|
# Example 7: Check swap usage
|
||||||
|
- name: check_swap
|
||||||
|
command: /usr/lib/nagios/plugins/check_swap -w 20% -c 10%
|
||||||
|
|
||||||
|
# Example 8: Custom script (Nagios plugin format)
|
||||||
|
- name: check_custom
|
||||||
|
command: /usr/local/bin/my_custom_check.sh
|
||||||
|
|
||||||
|
# Example 9: Check specific log file
|
||||||
|
- name: check_logs
|
||||||
|
command: /usr/lib/nagios/plugins/check_log -F /var/log/syslog -O /var/tmp/check_log.old -q "ERROR"
|
||||||
|
|
||||||
|
# Notes:
|
||||||
|
#
|
||||||
|
# 1. Nagios Plugin Output Format:
|
||||||
|
# - Single line: STATUS - Message | performance_data
|
||||||
|
# - Performance data format: 'label'=value[UOM];[warn];[crit];[min];[max]
|
||||||
|
#
|
||||||
|
# 2. Exit Codes:
|
||||||
|
# - 0 = OK
|
||||||
|
# - 1 = WARNING
|
||||||
|
# - 2 = CRITICAL
|
||||||
|
# - 3 = UNKNOWN
|
||||||
|
#
|
||||||
|
# 3. Performance Data:
|
||||||
|
# - Automatically parsed and included in heartbeat data
|
||||||
|
# - Metrics are stored as: {plugin_name}_{metric_name}
|
||||||
|
# - Example: check_disk_root_/ will contain the disk usage percentage
|
||||||
|
#
|
||||||
|
# 4. Overall Status:
|
||||||
|
# - The plugin reports the worst status from all commands
|
||||||
|
# - Useful for quick health checks
|
||||||
|
#
|
||||||
|
# 5. Plugin Paths:
|
||||||
|
# Common Nagios plugin directories:
|
||||||
|
# - Debian/Ubuntu: /usr/lib/nagios/plugins/
|
||||||
|
# - RHEL/CentOS: /usr/lib64/nagios/plugins/
|
||||||
|
# - Custom installs: /usr/local/nagios/libexec/
|
||||||
|
#
|
||||||
|
# 6. Installing Nagios Plugins:
|
||||||
|
# Debian/Ubuntu: sudo apt-get install nagios-plugins
|
||||||
|
# RHEL/CentOS: sudo yum install nagios-plugins-all
|
||||||
|
# Arch Linux: sudo pacman -S monitoring-plugins
|
||||||
|
#
|
||||||
|
# 7. Writing Custom Nagios Plugins:
|
||||||
|
# Any script can be a Nagios plugin if it:
|
||||||
|
# - Returns appropriate exit codes (0-3)
|
||||||
|
# - Prints status message to stdout
|
||||||
|
# - Optionally includes performance data after "|"
|
||||||
|
#
|
||||||
|
# Example custom plugin (save as /usr/local/bin/check_example.sh):
|
||||||
|
# #!/bin/bash
|
||||||
|
# if [ "$(who | wc -l)" -gt 50 ]; then
|
||||||
|
# echo "CRITICAL - Too many users | users=52;40;50;0"
|
||||||
|
# exit 2
|
||||||
|
# else
|
||||||
|
# echo "OK - Normal user count | users=25;40;50;0"
|
||||||
|
# exit 0
|
||||||
|
# fi
|
||||||
@@ -0,0 +1,254 @@
|
|||||||
|
# ==============================================================================
|
||||||
|
# Heartbeat Daemon Threshold Configuration Example
|
||||||
|
# ==============================================================================
|
||||||
|
# This file demonstrates threshold configuration for the Heartbeat monitoring system.
|
||||||
|
# Thresholds can be defined for any metric collected by monitoring plugins.
|
||||||
|
#
|
||||||
|
# Threshold levels:
|
||||||
|
# - WARNING: First level of concern, typically for early notification
|
||||||
|
# - CRITICAL: Severe condition requiring immediate attention
|
||||||
|
#
|
||||||
|
# Alert notifications are sent when:
|
||||||
|
# - A metric crosses from OK to WARNING or CRITICAL
|
||||||
|
# - A metric crosses from WARNING to CRITICAL
|
||||||
|
# - A metric recovers (returns to a lower severity level)
|
||||||
|
#
|
||||||
|
# Re-notifications are sent for ongoing alerts based on threshold_renotify_interval.
|
||||||
|
# ==============================================================================
|
||||||
|
|
||||||
|
# Global threshold settings
|
||||||
|
threshold_renotify_interval: 3600 # Re-notify every hour for ongoing alerts (seconds)
|
||||||
|
|
||||||
|
# Threshold definitions per plugin
|
||||||
|
thresholds:
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
# CPU Monitor Thresholds
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
cpu_monitor:
|
||||||
|
# Overall CPU usage percentage (0-100)
|
||||||
|
cpu_percent:
|
||||||
|
warning: 80.0 # Warn when CPU usage exceeds 80%
|
||||||
|
critical: 90.0 # Critical when CPU usage exceeds 90%
|
||||||
|
operator: ">" # Alert when value is GREATER than threshold
|
||||||
|
hysteresis: 0.1 # 10% hysteresis to prevent flapping
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# 1-minute load average
|
||||||
|
load_1min:
|
||||||
|
warning: 4.0 # Warn when 1-min load exceeds 4.0
|
||||||
|
critical: 8.0 # Critical when 1-min load exceeds 8.0
|
||||||
|
operator: ">"
|
||||||
|
hysteresis: 0.15 # 15% hysteresis
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# 5-minute load average
|
||||||
|
load_5min:
|
||||||
|
warning: 3.0
|
||||||
|
critical: 6.0
|
||||||
|
operator: ">"
|
||||||
|
hysteresis: 0.15
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# 15-minute load average
|
||||||
|
load_15min:
|
||||||
|
warning: 2.0
|
||||||
|
critical: 4.0
|
||||||
|
operator: ">"
|
||||||
|
hysteresis: 0.15
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
# Memory Monitor Thresholds
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
memory_monitor:
|
||||||
|
# Memory usage percentage
|
||||||
|
percent:
|
||||||
|
warning: 85.0 # Warn at 85% memory usage
|
||||||
|
critical: 95.0 # Critical at 95% memory usage
|
||||||
|
operator: ">"
|
||||||
|
hysteresis: 0.1
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# Available memory in MB (inverse threshold - alert when LOW)
|
||||||
|
available_mb:
|
||||||
|
warning: 1000 # Warn when less than 1GB available
|
||||||
|
critical: 500 # Critical when less than 500MB available
|
||||||
|
operator: "<" # Alert when value is LESS than threshold
|
||||||
|
hysteresis: 0.1
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# Swap usage percentage
|
||||||
|
swap_percent:
|
||||||
|
warning: 50.0 # Warn at 50% swap usage
|
||||||
|
critical: 80.0 # Critical at 80% swap usage
|
||||||
|
operator: ">"
|
||||||
|
hysteresis: 0.1
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
# Disk Monitor Thresholds
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
disk_monitor:
|
||||||
|
# Partition-specific thresholds
|
||||||
|
# Use the mount point as the key
|
||||||
|
partitions:
|
||||||
|
# Root filesystem
|
||||||
|
/:
|
||||||
|
percent:
|
||||||
|
warning: 80.0 # Warn at 80% disk usage
|
||||||
|
critical: 90.0 # Critical at 90% disk usage
|
||||||
|
operator: ">"
|
||||||
|
hysteresis: 0.05 # 5% hysteresis for disk (more stable)
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
free_gb:
|
||||||
|
warning: 10.0 # Warn when less than 10GB free
|
||||||
|
critical: 5.0 # Critical when less than 5GB free
|
||||||
|
operator: "<"
|
||||||
|
hysteresis: 0.1
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# Home filesystem (if separate partition)
|
||||||
|
/home:
|
||||||
|
percent:
|
||||||
|
warning: 85.0
|
||||||
|
critical: 95.0
|
||||||
|
operator: ">"
|
||||||
|
hysteresis: 0.05
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# Var filesystem (logs, etc.)
|
||||||
|
/var:
|
||||||
|
percent:
|
||||||
|
warning: 80.0
|
||||||
|
critical: 90.0
|
||||||
|
operator: ">"
|
||||||
|
hysteresis: 0.05
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
free_gb:
|
||||||
|
warning: 5.0 # Var needs space for logs
|
||||||
|
critical: 2.0
|
||||||
|
operator: "<"
|
||||||
|
hysteresis: 0.1
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
# Network Monitor Thresholds
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
network_monitor:
|
||||||
|
# Total error count across all interfaces
|
||||||
|
errors_total:
|
||||||
|
warning: 100 # Warn at 100 errors
|
||||||
|
critical: 1000 # Critical at 1000 errors
|
||||||
|
operator: ">"
|
||||||
|
hysteresis: 0.2 # 20% hysteresis for counters
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# Total dropped packets
|
||||||
|
dropin_total:
|
||||||
|
warning: 50
|
||||||
|
critical: 200
|
||||||
|
operator: ">"
|
||||||
|
hysteresis: 0.2
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
dropout_total:
|
||||||
|
warning: 50
|
||||||
|
critical: 200
|
||||||
|
operator: ">"
|
||||||
|
hysteresis: 0.2
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# TCP connections in TIME_WAIT state
|
||||||
|
connections_TIME_WAIT:
|
||||||
|
warning: 1000 # Warn at 1000 TIME_WAIT connections
|
||||||
|
critical: 5000 # Critical at 5000 TIME_WAIT connections
|
||||||
|
operator: ">"
|
||||||
|
hysteresis: 0.2
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# Total established connections
|
||||||
|
connections_ESTABLISHED:
|
||||||
|
warning: 500
|
||||||
|
critical: 1000
|
||||||
|
operator: ">"
|
||||||
|
hysteresis: 0.1
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
# Nagios Plugin Thresholds (if using nagios_runner)
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
nagios_runner:
|
||||||
|
# Nagios plugins report exit codes:
|
||||||
|
# 0 = OK, 1 = WARNING, 2 = CRITICAL, 3 = UNKNOWN
|
||||||
|
# We can threshold on the exit_code directly
|
||||||
|
exit_code:
|
||||||
|
warning: 1 # Map Nagios WARNING to our WARNING
|
||||||
|
critical: 2 # Map Nagios CRITICAL to our CRITICAL
|
||||||
|
operator: ">=" # Alert when exit code >= threshold
|
||||||
|
hysteresis: 0.0 # No hysteresis for exit codes
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# Notification Configuration
|
||||||
|
# ==============================================================================
|
||||||
|
# Configure notification methods (email, pushover, etc.)
|
||||||
|
# These are used when threshold violations occur
|
||||||
|
|
||||||
|
# Email notifications
|
||||||
|
toemail:
|
||||||
|
- admin@example.com
|
||||||
|
- oncall@example.com
|
||||||
|
fromemail: heartbeat@example.com
|
||||||
|
smtpserver: smtp.example.com
|
||||||
|
smtpport: 587
|
||||||
|
smtpuser: heartbeat@example.com
|
||||||
|
smtppassword: your-password-here
|
||||||
|
|
||||||
|
# Pushover notifications (optional)
|
||||||
|
# pushover_token: your-pushover-app-token
|
||||||
|
# pushover_user: your-pushover-user-key
|
||||||
|
|
||||||
|
# Mattermost webhook (optional)
|
||||||
|
# mattermost_url: https://mattermost.example.com/hooks/your-webhook-id
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# Watched Hosts
|
||||||
|
# ==============================================================================
|
||||||
|
# Hosts in this list will trigger notifications for:
|
||||||
|
# - Heartbeat timeouts/overdue
|
||||||
|
# - Threshold violations
|
||||||
|
# - Boot messages
|
||||||
|
watchhosts:
|
||||||
|
- webserver01
|
||||||
|
- database01
|
||||||
|
- mailserver
|
||||||
|
- critical-app
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# Additional Server Settings
|
||||||
|
# ==============================================================================
|
||||||
|
hb_port: 50003 # UDP port for heartbeat messages
|
||||||
|
hbd_port: 50004 # HTTP port for web interface
|
||||||
|
grace: 10 # Grace period for overdue detection (seconds)
|
||||||
|
debug: 0 # Debug level (0-3)
|
||||||
|
verbose: false # Verbose output
|
||||||
|
|
||||||
|
# Journal settings (message logging)
|
||||||
|
journal_enabled: true
|
||||||
|
journal_path: /var/log/heartbeat/messages.journal
|
||||||
|
journal_max_size: 104857600 # 100MB before rotation
|
||||||
|
journal_max_backups: 10
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# Example: Production Configuration with Conservative Thresholds
|
||||||
|
# ==============================================================================
|
||||||
|
# For production systems, consider:
|
||||||
|
# - Higher warning thresholds to reduce alert fatigue
|
||||||
|
# - Appropriate hysteresis values (5-15% typical)
|
||||||
|
# - Re-notification intervals matching on-call rotation
|
||||||
|
# - Multiple escalation contacts
|
||||||
|
# - Integration with incident management systems
|
||||||
|
# ==============================================================================
|
||||||
-593
@@ -1,593 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
# $Id: hbc,v 1.9 2012/03/29 02:08:36 andreas Exp $
|
|
||||||
# NEW
|
|
||||||
import argparse
|
|
||||||
import sys
|
|
||||||
import time
|
|
||||||
import socket
|
|
||||||
import os
|
|
||||||
import signal
|
|
||||||
import getopt
|
|
||||||
import string
|
|
||||||
import select
|
|
||||||
import errno
|
|
||||||
import traceback
|
|
||||||
from hashlib import md5
|
|
||||||
import shutil
|
|
||||||
import zlib
|
|
||||||
import subprocess
|
|
||||||
import syslog
|
|
||||||
import codecs
|
|
||||||
|
|
||||||
from .config import load_config
|
|
||||||
|
|
||||||
# Protocol / runtime defaults (overridable through the config file in main()).
PORT = 50003  # default UDP port of the heartbeat daemon
INTERVAL = 10  # default seconds between heartbeats
REOPENC = 6  # the socket is closed and reopened every REOPENC sends
PIDFILE = "/tmp/hbc.pid"
VER = 6  # protocol version placed in every message
MAXRECV = 32767  # maximum datagram size passed to recvfrom()

# Main-loop state shared between process(), cleanup() and the signal handler.
running = True
dorestart = False
warned1 = False  # set once the first socket error has been logged

# Command-line derived state (filled in by main()).
msgonly = False
helpflag = False
verbose = False
fdaemon = False
daemonized = False
optlist = []
msgboot = {}
# HOME may be unset when started from an init system; fall back to "/"
# instead of failing with KeyError at import time.
home = os.environ.get("HOME", "/")
configfile = "%s/.hbrc" % home
cmdargs = []
iam = socket.gethostname()
|
|
||||||
|
|
||||||
def log(msg):
    """Report *msg*: to syslog (LOG_ERR) in daemon mode, otherwise to stdout."""
    if not fdaemon:
        print(msg)
        return
    syslog.syslog(syslog.LOG_ERR, msg)
|
|
||||||
|
|
||||||
|
|
||||||
def handler(signum, frame):
    """Signal handler: run cleanup() on SIGTERM; every other signal is ignored."""
    if signum != signal.SIGTERM:
        return
    cleanup()
|
|
||||||
|
|
||||||
|
|
||||||
class NullDevice:
    """Minimal write-only sink: everything written to it is discarded."""

    def write(self, s):
        """Accept *s* and drop it."""
        return None
|
|
||||||
|
|
||||||
|
|
||||||
class Conn:
    """One UDP heartbeat channel to a single daemon address.

    Attributes:
        conId:    numeric id placed into every outgoing message ("id").
        addr/port/af: destination address, port and socket address family.
        ackcount: number of ACKs processed by ack().
        lastack:  timestamp recorded by the most recent ack().
        send:     number of successful sendto() calls.
        lastsend: time.time() of the last successful send.
        rtts:     last (up to 10) round-trip estimates in milliseconds.
        sock:     the UDP socket, or None while closed.
    """

    def __init__(self, conId, addr, port, af):
        self.conId = conId
        self.addr = addr
        self.port = port
        self.af = af

        self.ackcount = 0  # number of ACKs received
        self.lastack = 0  # timestamp taken from / for the last ACK
        self.send = 0  # number of messages sent successfully
        self.lastsend = 0  # time() the last message was sent
        self.rtts = [0]
        self.sock = None

    def __str__(self):
        return "Con(%s, %s %s)" % (self.addr, self.port, self.af)

    def open(self):
        """Create the datagram socket and switch SO_REUSEADDR on."""
        self.sock = socket.socket(self.af, socket.SOCK_DGRAM)
        self.sock.setsockopt(
            socket.SOL_SOCKET,
            socket.SO_REUSEADDR,
            self.sock.getsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR) | 1,
        )

    def sendto(self, msg, ID="HTB"):  # default ID is HearTBeat
        """Stamp *msg* and send it, compressed, to the peer.

        Note: *msg* is modified in place ("name", "id", "ver" and "time" are
        added). On a socket error the socket is closed, the error is logged
        once per process (``warned1``) and the send counter is not advanced.
        """
        global warned1

        # Periodically start over with a fresh socket.
        if self.send % REOPENC == 0:
            self.close()
        if not self.sock:
            self.open()
        msg["name"] = shortname(iam)
        msg["id"] = self.conId
        msg["ver"] = VER
        msg["time"] = time.time()
        m = dicttos(ID, msg)  # always compress
        if verbose:
            log("conn.send('%s', (%s:%s) %s)" % (msg, self.addr, self.port, len(m)))
        try:
            self.sock.sendto(m, (self.addr, self.port))
        except socket.error as e:
            if not warned1:
                log("socket error: %s %s:%s" % (e, self.addr, self.port))
                warned1 = True
            self.close()
            return
        self.send += 1
        self.lastsend = time.time()

    def ack(self, msgDict, now):
        """Record an ACK and append a round-trip estimate (ms) to ``rtts``.

        If the ACK carries a "time" field that value is used and the
        difference to ``lastsend`` is doubled; otherwise *now* is used as-is.
        Only the ten most recent estimates are kept.
        """
        try:
            self.lastack = msgDict["time"]
            mul = 2
        except (KeyError, TypeError):
            # No timestamp in the ACK (or no dict at all): fall back to the
            # local receive time. Narrowed from a bare ``except`` so that
            # unrelated errors are no longer swallowed.
            self.lastack = now
            mul = 1
        rtt = (self.lastack - self.lastsend) * mul
        if verbose:
            log("ack RTT: %0.1f ms (now %s)" % (rtt * 1000.0, now))
        self.rtts.append(rtt * 1000.0)
        if len(self.rtts) > 10:
            del self.rtts[0]
        self.ackcount += 1

    def close(self):
        """Close the socket if it is open; safe to call repeatedly."""
        if self.sock:
            self.sock.close()
            self.sock = None
|
|
||||||
|
|
||||||
|
|
||||||
def shortname(name):
    """Return the part of *name* before its first dot (the bare host name)."""
    head, _, _ = name.partition(".")
    return head
|
|
||||||
|
|
||||||
|
|
||||||
def dicttos(ID, d):
    """Serialize dict *d* into a wire packet.

    The fields are rendered as ``key=value`` joined by ";" (floats with five
    decimals), zlib-compressed at level 6 and prefixed with ``!<ID>:``.
    Returns bytes.
    """

    def render(key):
        value = d[key]
        template = "%s=%0.5f" if type(value) == type(1.2) else "%s=%s"
        return template % (key, value)

    body = ";".join([render(key) for key in d])
    packed = zlib.compress(body.encode(), 6)
    header = "!" + ID + ":"
    return header.encode() + packed
|
|
||||||
|
|
||||||
|
|
||||||
def stodict(msg):
    """Parse a received packet into a dict.

    Two formats are accepted:
      * compressed:   b"!" + 3-character ID + b":" + zlib(b"k=v;k=v;...")
      * uncompressed: "ID:k=v;k=v;..." (bytes or str)

    The result always contains "ID". A field without "=" maps to None.
    Values that are Python literals (numbers, quoted strings, bytes, ...)
    are converted; everything else stays a stripped string.

    Raises an exception (zlib.error, ValueError, UnicodeDecodeError, ...) on
    malformed input; the caller is expected to catch it.
    """
    # Local import so this function does not depend on a file-level import.
    import ast

    d = {}
    if msg[:1] == b"!":
        # The header has a fixed layout: "!" + 3-character ID + ":".
        pk = zlib.decompress(msg[5:]).decode()
        d["ID"] = msg[1:4].decode()
    else:
        # Data from recvfrom() is bytes; decode before splitting with str
        # separators (splitting bytes with ":" raised TypeError before).
        text = msg.decode() if isinstance(msg, (bytes, bytearray)) else msg
        r0 = text.split(":", 1)
        if len(r0) != 2:
            raise ValueError("packet has no ':' separator")
        d["ID"], pk = r0

    for field in pk.split(";"):
        key, sep, raw = field.partition("=")
        key = key.strip()
        if not sep:
            d[key] = None
            continue
        raw = raw.strip()
        try:
            # SECURITY: packets come from the network. literal_eval only
            # accepts literals, unlike eval() which would execute arbitrary
            # expressions sent by a peer.
            value = ast.literal_eval(raw)
        except Exception:
            value = raw  # not a literal: keep the text
        d[key] = value
    if verbose:
        print("msg is %s" % d)
    return d
|
|
||||||
|
|
||||||
|
|
||||||
def XXstodict(msg):
    """Older packet parser (not called anywhere in the visible code).

    Splits *msg* at the first ":" into ID and payload and parses the payload
    as ";"-separated ``key=value`` fields. Returns None when there is no ":".
    A value starting with a digit is converted if it is a Python literal;
    all other values stay stripped strings, a field without "=" maps to None.

    NOTE(review): the "!" (compressed) branch passes a slice of *msg* to
    zlib.decompress and then splits the result with a str separator, which
    looks like it cannot work for either str or bytes input on Python 3 —
    confirm before reviving this function.
    """
    import ast  # local import, see the security note below

    d = {}
    r0 = msg.split(":", 1)
    if len(r0) == 1:
        return None
    if r0[0][:1] == "!":  # compressed
        pk = zlib.decompress(msg[len(r0[0]) + 1 :])
        d["ID"] = r0[0][1:]
    else:
        pk = r0[1]
        d["ID"] = r0[0]
    for field in pk.split(";"):
        parts = field.split("=", 1)
        key = parts[0].strip()
        if len(parts) == 1:
            d[key] = None
            continue
        value = parts[1].strip()
        if value[:1].isdigit():
            try:
                # SECURITY: literal_eval instead of eval(); a leading digit
                # does not make an expression safe to evaluate.
                value = ast.literal_eval(value)
            except Exception:
                pass  # not a plain literal: keep the text
        d[key] = value
    return d
|
|
||||||
|
|
||||||
|
|
||||||
def syslogtrace(note):
    """Log the current exception's traceback, tagged with *note*.

    The report goes through log(), then line by line to syslog, and is
    additionally printed when verbose is set.
    """
    report = "%s hbc died: \n%s" % (note, traceback.format_exc())
    log(report)
    for tb_line in report.split("\n"):
        syslog.syslog(syslog.LOG_ERR, " tb: %s" % tb_line)
    if verbose:
        print(report)
|
|
||||||
|
|
||||||
|
|
||||||
conId = 1
|
|
||||||
|
|
||||||
|
|
||||||
def createConnections(hosts):
    """Resolve every host in *hosts* and register one Conn per address.

    New connections are stored in the global ``conns`` dict under a fresh
    ``conId``. If resolving a host fails, the function returns immediately
    (remaining hosts are not tried); the caller retries later. An address
    family other than IPv4/IPv6 terminates the program.
    """
    global conId
    for host in hosts:
        if verbose:
            log("createConnections for %s" % host)
        try:
            # IPPROTO_UDP is the proper constant for the "proto" argument.
            rs = socket.getaddrinfo(host, hb_port, 0, 0, socket.IPPROTO_UDP)
        except socket.gaierror:
            logm = "%s hbc died: \n%s" % ("createConnections", traceback.format_exc())
            if verbose:
                log(logm)
            return None
        for r in rs:
            if verbose:
                log("address %s" % str(r))
            family = r[0]
            # Compare against the platform's own constants instead of a list
            # of per-OS numeric values.
            if family == socket.AF_INET6:
                af = socket.AF_INET6
            elif family == socket.AF_INET:
                af = socket.AF_INET
            else:
                # r[0] is the family itself; indexing it raised TypeError.
                print("dont know this net type: %s" % family)
                sys.exit(1)

            addr = r[4][0]
            conns[conId] = Conn(conId, addr, hb_port, af)
            if verbose:
                print("cons[%s] = %s" % (conId, str(conns[conId])))
            conId += 1
|
|
||||||
|
|
||||||
|
|
||||||
def doexec(conn, data):
    """Run *data* as a shell command and send the outcome back on *conn*.

    The reply is a {"service": "command", "msg": ...} message whose text is
    "OK <output>", "CalledProcessError <error>" or "cmd failed: <error> N/A".

    SECURITY NOTE(review): *data* arrives over the network and is executed
    with shell=True. This is remote command execution by design; make sure
    only trusted peers can reach this client.
    """
    try:
        raw = subprocess.check_output(data, stderr=subprocess.STDOUT, shell=True)
        # Command output is not guaranteed to be UTF-8; a strict decode used
        # to turn a successful command into a "cmd failed" reply.
        ro = raw.decode(errors="replace")
        fail = "OK"
    except subprocess.CalledProcessError as e:
        ro = str(e)
        fail = "CalledProcessError"
    except Exception as e:
        syslogtrace("System")
        ro = "N/A"
        fail = "cmd failed: %s" % e
    msg = {"service": "command", "msg": fail + " " + ro}
    conns[conn].sendto(msg)
|
|
||||||
|
|
||||||
|
|
||||||
def doupdate(conn, msgDict):
    """Handle an update packet: decode the code, install it, report back.

    Reads "code" (base64) and "csum" from *msgDict*, hands them to
    doupdateone() and sends an {"service": "update"} reply on *conn*.
    Returns None on success, otherwise the error text.
    """
    try:
        code = codecs.decode(msgDict["code"], "base64").decode()
        csum = msgDict["csum"]
    except Exception as e:
        fail = "csum/code missing: %s" % e
    else:
        fail = doupdateone(code, csum)

    reply = {"service": "update", "msg": fail if fail else "OK"}
    conns[conn].sendto(reply)
    if not fail:
        log("hc updates, fs = %s" % (len(code)))

    return fail
|
|
||||||
|
|
||||||
|
|
||||||
def doupdateone(code, csum):
    """Replace the running script (sys.argv[0]) with *code*.

    *csum* must be the MD5 hex digest of *code*. The current script is first
    copied to "<script>.sav". Returns None on success, otherwise a short
    error string.

    NOTE(review): MD5 only detects corruption here; it does not authenticate
    the sender of the update.
    """
    digest = md5(code.encode()).hexdigest()
    if digest != csum:
        return "checksum error"

    fn = sys.argv[0]
    ofn = "%s.sav" % fn
    try:
        shutil.copy2(fn, ofn)
    except Exception as e:
        return "cannot make backup copy: %s" % e

    try:
        # "with" closes the file even when write() fails.
        with open(fn, "w") as fh:
            fh.write(code)
    except Exception as e:
        return "cannot write new code: %s" % e

    return None
|
|
||||||
|
|
||||||
|
|
||||||
def restart():
    """Re-exec this program with the saved command-line arguments.

    os.execv() does not return on success; the trailing messages are only
    reached when the exec itself fails.
    """
    if verbose:
        print("restart: execv %s %s" % (sys.argv[0], [sys.argv[0]] + cmdargs))
    syslog.syslog(syslog.LOG_ERR, "restart %s" % (sys.argv[0]))
    err = "fallthrough"
    try:
        os.execv(sys.argv[0], [sys.argv[0]] + cmdargs)
    except Exception as exc:
        # Python 3 unbinds the "as" name when the except block ends, so the
        # error must be copied to another variable to be reported below.
        err = exc
    print("should not be here:", str(err))
    log("restart failed: %s" % err)
|
|
||||||
|
|
||||||
|
|
||||||
def process():
    """Main loop: send a heartbeat on every connection each ``interval``
    seconds and handle incoming packets in between.

    Incoming packets are dispatched on their "ID":
      ACK -> Conn.ack(); UPD -> doupdate() and, on success, request a restart;
      CMD -> doexec() with the "cmd" field; anything else -> doexec() with the
      raw packet (deprecated legacy behaviour).

    The loop ends when ``running`` becomes False (signal, interrupt, select
    failure) or when an update asks for a restart (``dorestart``).
    """
    global running, dorestart

    nextReport = time.time()

    while running:
        # Wait for the next report time, servicing sockets meanwhile.
        while time.time() < nextReport:
            ifiles = {}  # fileno -> socket
            conIds = {}  # fileno -> connection id
            for conn in conns:
                if conns[conn].sock:
                    ifiles[conns[conn].sock.fileno()] = conns[conn].sock
                    conIds[conns[conn].sock.fileno()] = conn

            sleep = nextReport - time.time()
            if sleep <= 0:
                break
            try:
                r = select.select(list(ifiles.keys()), [], [], sleep)
                # nb: delay from actual packet arrival to select is ca. 105ms!
                now = time.time()
            except KeyboardInterrupt:
                running = False
                break
            except SystemExit:
                log("daemon exit, running was %s" % running)
                running = False
                break
            except Exception:
                if running:
                    syslogtrace("select")
                running = False
                break
            for rfh in r[0]:
                conn = conIds[rfh]
                data, addr = ifiles[rfh].recvfrom(MAXRECV)
                if verbose:
                    print("sock.recvfrom: %s (%s) %s" % (addr, len(data), data[:4]))
                try:
                    msgDict = stodict(data)
                except Exception as e:
                    print(
                        "failed to parse incoming data from %s: %s (%s)"
                        % (addr, data, e)
                    )
                    continue

                if verbose:
                    print(
                        "sock.recvfrom: %s (%s) %s"
                        % (addr, len(data), str(msgDict)[:80])
                    )
                if msgDict is None:
                    print("bad backet from %s (%s) %s" % (addr, len(data), data))
                elif msgDict["ID"] == "ACK":
                    conns[conn].ack(msgDict, now)
                elif msgDict["ID"] == "UPD":
                    if doupdate(conn, msgDict) is None:
                        if verbose:
                            print("process: restart after update")
                        dorestart = True
                        break
                elif msgDict["ID"] == "CMD":
                    # A CMD packet without a "cmd" field used to raise
                    # KeyError and terminate the whole loop.
                    if "cmd" in msgDict:
                        doexec(conn, msgDict["cmd"])
                    else:
                        print("CMD packet without cmd from %s" % (addr,))
                else:
                    # SECURITY NOTE(review): unknown IDs cause the raw packet
                    # to be executed as a shell command.
                    doexec(conn, data)  # deprecated until no more VER - hbc
            if dorestart:
                running = False
                break
        if not running:
            break
        for conn in conns:
            msg = {"acks": conns[conn].ackcount, "rtt": conns[conn].rtts[-1]}
            conns[conn].sendto(msg)
            # N.B. Linux (i.e. Raspberry Pi 3) drops the second pkg unless delayed
            time.sleep(0.1)
        # Keep a fixed cadence unless we have fallen behind by more than
        # one interval; in that case restart the schedule from now.
        if nextReport + interval >= time.time():
            nextReport += interval
        else:
            nextReport = time.time() + interval

    if verbose:
        log("process: done running")
|
|
||||||
|
|
||||||
|
|
||||||
def cleanup():
    """Announce shutdown on every connection and close all sockets.

    Does nothing when ``running`` is already False; otherwise clears it,
    sends a {"shutdown": 1} message per connection, waits one second and
    calls closeall().
    """
    global running
    if running:
        if verbose:
            log("cleanup")
        running = False
        for link in conns.values():
            farewell = {"shutdown": 1, "acks": link.ackcount}
            link.sendto(farewell)
            link.close()
        time.sleep(1)
        closeall()
|
|
||||||
|
|
||||||
|
|
||||||
def closeall():
    """Close the socket of every registered connection."""
    if verbose:
        syslog.syslog(syslog.LOG_ERR, "closecall")
    for link in conns.values():
        link.close()
|
|
||||||
|
|
||||||
|
|
||||||
def daemonize(
    working_dir="/", stdin="/dev/zero", stdout="/dev/null", stderr="/dev/null"
):
    """
    Detach from the controlling terminal using the UNIX double fork, see
    Stevens' "Advanced Programming in the UNIX Environment" (ISBN 0201563177)
    http://www.yendor.com/programming/unix/apue/proc/fork2.c

    After the second fork the process runs in *working_dir* with umask 0 in
    its own session, and file descriptors 0/1/2 are redirected to *stdin*,
    *stdout* and *stderr*.
    """

    # fork #1: the original parent exits
    try:
        if os.fork() > 0:
            os._exit(0)
    except OSError as e:
        sys.stderr.write("fork #1 failed: %d (%s)\n" % (e.errno, e.strerror))
        os._exit(1)

    # decouple from parent environment
    os.chdir(working_dir)
    os.setsid()
    os.umask(0)

    # fork #2: the session leader exits
    try:
        if os.fork() > 0:
            os._exit(0)
    except OSError as e:
        sys.stderr.write("fork #2 failed: %d (%s)\n" % (e.errno, e.strerror))
        sys.exit(1)

    # redirect the standard file descriptors
    sys.stdout.flush()
    sys.stderr.flush()
    replacements = [
        open(path, mode)
        for path, mode in ((stdin, "r"), (stdout, "a+"), (stderr, "a+"))
    ]
    for replacement, stream in zip(replacements, (sys.stdin, sys.stdout, sys.stderr)):
        os.dup2(replacement.fileno(), stream.fileno())
|
|
||||||
|
|
||||||
#
|
|
||||||
# Main program
|
|
||||||
#
|
|
||||||
def build_parser():
    """Build the argparse parser for the hbc command line.

    Options: -b/--boot, -c/--config, -m/--message, -n/--name, -f/--daemon,
    -v/--verbose, -x/--debug (repeatable) and one or more positional hosts.
    """
    parser = argparse.ArgumentParser(
        prog="hbc",
        description="HeartBeatClient - send a heatbeat message to a HeartBeatDaemon",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    option_table = (
        (("-b", "--boot"), dict(action="store_true", help="Send a boot message")),
        (("-c", "--config"), dict(dest="configfile", help="Config file path (YAML)")),
        (("-m", "--message"), dict(dest="message", help="Send a message")),
        (("-n", "--name"), dict(dest="name", help="Name to use in heartbeat message")),
        (("-f", "--daemon"), dict(action="store_true", help="Run in daemon mode")),
        (("-v", "--verbose"), dict(action="store_true", help="Verbose output")),
        (("-x", "--debug"), dict(action="count", default=0, help="Increase debug level")),
        (("hosts",), dict(nargs="+", help="Heartbeat daemon hosts to send to")),
    )
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)
    return parser
|
|
||||||
|
|
||||||
def main(argv=None):
    """Entry point of the heartbeat client.

    Parses *argv* (``sys.argv[1:]`` when None), loads the configuration,
    resolves the daemon hosts, optionally sends a boot/one-shot message,
    optionally daemonizes and then runs process() until shutdown. If an
    update requested it, the program re-executes itself at the end.
    """
    global msgonly, helpflag, verbose, fdaemon, daemonized, optlist, msgboot, home, configfile, cmdargs, iam, hb_port, conns, interval, hb_hosts, running
    parser = build_parser()
    args = parser.parse_args(argv)

    config = load_config(args.configfile)

    # Apply CLI overrides
    if args.boot:
        msgboot["boot"] = 1
    if args.message:
        msgboot["service"] = "service"
        msgboot["msg"] = args.message
        msgonly = True
    if args.name:
        iam = args.name
    if args.daemon:
        fdaemon = True
    if args.verbose:
        verbose = True
    if args.debug:
        config.setdefault("debug", 0)
        config["debug"] += args.debug

    # Remember the arguments for restart(). argv is None when started as a
    # script; "cmdargs += None" raised TypeError in that case.
    cmdargs += list(sys.argv[1:] if argv is None else argv)
    if verbose:
        print("cmdargs for restart are %s" % cmdargs)

    #
    # set defaults

    hb_hosts = args.hosts
    hb_port = config.get("hb_port", PORT)
    interval = config.get("interval", INTERVAL)

    #
    if verbose:
        print("notice: hb_hosts: %s" % str(hb_hosts))
        print("notice: hb_port: %s" % hb_port)
        print("notice: interval: %s" % interval)
        print("notice: iam: %s" % iam)
        print("notice: msgonly: %s" % msgonly)
        print("notice: msgboot: %s" % msgboot)

    if not msgonly:
        msgboot["interval"] = interval

    # Keep trying until at least one daemon address could be resolved.
    conns = {}
    while True:
        if verbose:
            log("create connections")
        createConnections(hb_hosts)
        if len(conns) != 0:
            break
        if verbose:
            log("no connections yet, sleep a bit")
        time.sleep(2)

    if verbose:
        log("%s connections created" % (len(conns)))

    if len(msgboot) > 0:
        if verbose:
            print("on boot")
        msgboot["acks"] = 0
        for conn in conns:
            conns[conn].sendto(msgboot)

    if msgonly:
        if verbose:
            print("msgboot done msgonly=%s" % msgonly)
        closeall()
        sys.exit(0)

    #
    syslog.openlog("hbc", syslog.LOG_PID, syslog.LOG_DAEMON)
    if fdaemon:
        print("daemoinizing.")
        daemonize()
        daemonized = True
    syslog.syslog(syslog.LOG_ERR, "starting heartbeat to %s" % ",".join(hb_hosts))

    signal.signal(signal.SIGTERM, handler)
    # "running" is declared global above so that this assignment really
    # (re)arms the loop flag read by process().
    running = True
    try:
        process()
    except Exception as e:
        syslogtrace("process")
        if verbose:
            print("err: process exit: %s" % e)

    if verbose:
        log("main: cleanup")
    cleanup()
    if dorestart:
        restart()
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
-380
@@ -1,380 +0,0 @@
|
|||||||
"""
|
|
||||||
host and connection class shared between hbd and
|
|
||||||
the website's heartbeat.py
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
import time
|
|
||||||
import json
|
|
||||||
import copy
|
|
||||||
import queue
|
|
||||||
|
|
||||||
num = 0
|
|
||||||
|
|
||||||
MAXRTTS = 10
|
|
||||||
|
|
||||||
DEBUG = 2
|
|
||||||
|
|
||||||
|
|
||||||
def log(host, m):
    """Print a debug line for *host* when the module-level DEBUG is truthy."""
    if not DEBUG:
        return
    print("class log: %s %s" % (host, m))
|
|
||||||
|
|
||||||
|
|
||||||
class Connection:
|
|
||||||
# map of addrs to names
|
|
||||||
|
|
||||||
htab = {}
|
|
||||||
UNKNOWN = "unknown"
|
|
||||||
UP = "up"
|
|
||||||
DOWN = "down"
|
|
||||||
OVERDUE = "overdue"
|
|
||||||
|
|
||||||
def __init__(self, host, cid, addr, afam):
|
|
||||||
self.host = host
|
|
||||||
self.cid = cid
|
|
||||||
if addr[0:7] == "::ffff:":
|
|
||||||
addr = addr[7:]
|
|
||||||
self.addr = addr
|
|
||||||
self.afam = afam
|
|
||||||
self.rtts = [0]
|
|
||||||
self.lastbeat = time.time()
|
|
||||||
self.statetime = self.lastbeat
|
|
||||||
self.deltastatetime = "computed"
|
|
||||||
self.state = Connection.UNKNOWN
|
|
||||||
|
|
||||||
if host:
|
|
||||||
Connection.htab[addr] = self.host.name
|
|
||||||
if self.host.isDynDns():
|
|
||||||
log(self.host.name, "dns update %s" % self.addr)
|
|
||||||
Host.dnsQ.put((self.host.name, self.addr))
|
|
||||||
|
|
||||||
def registerDns(self):
|
|
||||||
Host.dnsQ.put((self.host.name, self.addr))
|
|
||||||
|
|
||||||
def clearstate(self):
|
|
||||||
d = {}
|
|
||||||
d["addr"] = ""
|
|
||||||
d["rtt"] = ""
|
|
||||||
d["lastbeat"] = ""
|
|
||||||
d["state"] = ""
|
|
||||||
d["statetime"] = ""
|
|
||||||
d["deltastatetime"] = ""
|
|
||||||
d["rttstate"] = ""
|
|
||||||
return d
|
|
||||||
|
|
||||||
def statedict(self, Null=False):
|
|
||||||
d = self.clearstate()
|
|
||||||
now = time.time()
|
|
||||||
if not Null:
|
|
||||||
d["addr"] = self.addr
|
|
||||||
if self.rtts[-1]:
|
|
||||||
d["rtt"] = "%0.1f" % self.rtts[-1]
|
|
||||||
elif self.state == Connection.UNKNOWN:
|
|
||||||
d["rtt"] = ""
|
|
||||||
else:
|
|
||||||
d["rtt"] = "?"
|
|
||||||
d["lastbeat"] = self.lastbeat
|
|
||||||
if self.state == Connection.OVERDUE:
|
|
||||||
d["state"] = "<b>%s</b>" % self.state
|
|
||||||
else:
|
|
||||||
d["state"] = self.state
|
|
||||||
if self.state == Connection.UP:
|
|
||||||
d["rttstate"] = d["rtt"]
|
|
||||||
elif self.state == Connection.OVERDUE:
|
|
||||||
d["rttstate"] = ""
|
|
||||||
else:
|
|
||||||
d["rttstate"] = d["state"]
|
|
||||||
d["statetime"] = time.strftime(
|
|
||||||
"%Y-%m-%d %H:%M:%S", time.localtime(self.statetime)
|
|
||||||
)
|
|
||||||
delta = now - self.statetime
|
|
||||||
|
|
||||||
if self.state == Connection.UNKNOWN:
|
|
||||||
d["deltastatetime"] = ""
|
|
||||||
elif delta > 86400:
|
|
||||||
# d['deltastatetime'] = time.strftime("%d %H:%M:%S", time.gmtime(delta))
|
|
||||||
d["deltastatetime"] = "%0.1f days" % (delta / 86400.0)
|
|
||||||
elif delta > 3600:
|
|
||||||
# d['deltastatetime'] = time.strftime("%H:%M:%S", time.gmtime(delta))
|
|
||||||
d["deltastatetime"] = time.strftime("%k:%M hrs", time.gmtime(delta))
|
|
||||||
# d['deltastatetime'] = "%0.1f hrs" % (delta / 3600.)
|
|
||||||
elif delta > 60:
|
|
||||||
# d['deltastatetime'] = time.strftime("%M:%S", time.gmtime(delta))
|
|
||||||
d["deltastatetime"] = time.strftime("%M:%S mins", time.gmtime(delta))
|
|
||||||
# d['deltastatetime'] = "%0.1f mins" % (delta / 60.)
|
|
||||||
else:
|
|
||||||
# d['deltastatetime'] = time.strftime("%S", time.gmtime(delta))
|
|
||||||
d["deltastatetime"] = "%i secs" % (delta)
|
|
||||||
if self.state == Connection.UNKNOWN and now - self.lastbeat > 86400 * 10:
|
|
||||||
d = self.clearstate()
|
|
||||||
|
|
||||||
return d
|
|
||||||
|
|
||||||
def headerdict(self, afam):
    """Return the column titles for the connection fields.

    Args:
        afam: address-family label (e.g. "IPv4") used in the address title.

    Returns:
        dict with the same keys that statedict() fills in, mapped to titles.
    """
    return {
        "addr": "%s Addr" % afam,
        "rtt": "Latency",  # was misspelled "Latencey"
        "lastbeat": "Last Contact",
        "state": "State",
        "statetime": "Last State",
        "rttstate": "Reach",
        "deltastatetime": "Last State",
    }
|
|
||||||
|
|
||||||
def jsons(self):
    """Return this connection's attributes as a JSON string.

    The ``host`` attribute is a back-pointer to the owning host object, which
    json cannot serialize; it is replaced by the host's name in the output
    (the same substitution Host.stateinfo performs).  The instance itself is
    not modified.
    """
    state = dict(self.__dict__)
    owner = state.get("host")
    if owner is not None and not isinstance(owner, str):
        state["host"] = owner.name
    return json.dumps(state)
|
|
||||||
|
|
||||||
# Switch to a new state and report how long the previous state lasted.
def newstate(self, state, now, when=0):
    """Record a state change.

    Args:
        state: the new state value.
        now: current timestamp.
        when: offset subtracted from ``now`` to obtain the change time.

    Returns:
        Seconds between the previous ``statetime`` and the new one.
    """
    changed_at = now - when
    elapsed = changed_at - self.statetime
    self.state = state
    self.statetime = changed_at
    return elapsed
|
|
||||||
|
|
||||||
def getstate(self):
    """Return the connection's current state value."""
    current = self.state
    return current
|
|
||||||
|
|
||||||
def newaddr(self, addr, rtt, now):
    """Record a heartbeat and track address changes.

    Updates ``lastbeat``, appends ``rtt`` to the bounded ``rtts`` history
    (at most MAXRTTS entries) and, when ``addr`` differs from the stored
    address, moves the Connection.htab entry to the new address and queues
    a DNS update for dynamic-DNS hosts.

    Args:
        addr: source address of the heartbeat.
        rtt: round-trip time reported with this heartbeat.
        now: timestamp of the heartbeat.

    Returns:
        None when the address is unchanged, otherwise a string describing
        the change.
    """
    self.lastbeat = now
    self.rtts.append(rtt)
    if len(self.rtts) > MAXRTTS:
        del self.rtts[0]

    if self.addr == addr:
        return None

    r = "changed from %s to %s" % (self.addr, addr)
    # The old address may never have been registered; only that case is
    # ignored (the original swallowed every exception here).
    try:
        del Connection.htab[self.addr]
    except KeyError:
        pass
    self.addr = addr
    Connection.htab[addr] = self.host.name
    if self.host.isDynDns():
        Host.dnsQ.put((self.host.name, self.addr))
    return r
|
|
||||||
|
|
||||||
|
|
||||||
#
|
|
||||||
class Host:
    """A monitored host together with its per-address-family connections.

    Class attributes:
        hosts: registry mapping host name to its Host instance.
        dnsQ: queue of (host name, address) tuples awaiting a DNS update.
        hostfields_long / hostfields_short: column lists used by htmltable();
            an entry is either a field name or a (field name, html attribute)
            tuple.
    """

    # Table of Hosts
    hosts = {}
    dnsQ = queue.Queue()

    def __init__(self, name):
        """Create a host; a truthy ``name`` registers it in Host.hosts.

        A falsy name creates an unregistered instance and does not advance
        the module-level ``num`` counter.
        """
        global num
        self.name = name
        if name:
            num += 1
            Host.hosts[name] = self
        self.num = num
        self.dyn = False
        self.watched = False
        self.upcount = 0
        self.interval = 0
        self.doesack = -1
        self.cmds = []
        self.cver = 0
        self.connections = {}
        self.hdwcounts = [[0, 0], [0, 0], [0, 0]]

    def statedict(self):
        """Return display values for this host and both address families.

        Connection fields are stored under "<family>.<field>" keys; a family
        without a connection gets the blank values of
        ``ubConnection.statedict(True)``.
        """
        d = {}
        d["name"] = self.name
        if self.dyn:
            d["name"] += "*"
        if self.watched:
            d["name"] = "<b>%s</b>" % d["name"]
        d["dyn"] = str(self.dyn)
        d["ver"] = str(self.cver)
        d["num"] = self.num
        for c in ["IPv4", "IPv6"]:
            if c in self.connections:
                cs = self.connections[c].statedict()
            else:
                cs = ubConnection.statedict(True)
            for csv in cs:
                d["%s.%s" % (c, csv)] = cs[csv]

        return d

    def headerdict(self):
        """Return column titles keyed like statedict()."""
        d = {}
        d["name"] = "Name"
        d["dyn"] = "Dyn"
        d["ver"] = "Ver"
        d["num"] = "??"
        for c in ["IPv4", "IPv6"]:
            cs = ubConnection.headerdict(c)
            for csv in cs:
                d["%s.%s" % (c, csv)] = cs[csv]
        return d

    def registerDns(self):
        """Call registerDns() on every connection of this host."""
        for af in self.connections:
            self.connections[af].registerDns()

    def stateinfo(self):
        """Return the instance attributes as a plain dict.

        ``connections`` becomes a list of per-connection attribute dicts in
        which the ``host`` back-pointer is replaced by the host's name.
        """
        ddict = {}
        for d in self.__dict__:
            if d == "connections":
                cl = []
                for c in self.connections:
                    # Copy every attribute except the back-pointer: deep-copying
                    # it would duplicate the whole Host just to read its name.
                    cld = {
                        key: (value.name if key == "host" else copy.deepcopy(value))
                        for key, value in self.connections[c].__dict__.items()
                    }
                    cl.append(cld)
                ddict[d] = cl
            else:
                ddict[d] = self.__dict__[d]
        return ddict

    def jsons(self):
        """Return stateinfo() encoded as a JSON string."""
        return json.dumps(self.stateinfo())

    def setcver(self, cver):
        """Store the version value ``cver`` on this host."""
        self.cver = cver

    def isDynDns(self):
        """Return the ``dyn`` flag."""
        return self.dyn

    def isIPv4(self, addr):
        """Return True when ``addr`` (a string, or a tuple whose first item
        is the address string) contains a "." after its first character."""
        if isinstance(addr, tuple):
            return addr[0].find(".") > 0
        else:
            return addr.find(".") > 0

    def conndata(self, cid, addr, rtt, now):
        """Record a heartbeat from ``addr`` on the matching connection.

        A leading "::ffff:" (IPv4-mapped IPv6 prefix) is stripped first.  The
        connection for the address family is created on first use.

        Returns:
            (connection, result of connection.newaddr(addr, rtt, now)).
        """
        if addr[0:7] == "::ffff:":
            addr = addr[7:]
        if self.isIPv4(addr):
            afam = "IPv4"
        else:
            afam = "IPv6"

        if afam not in self.connections:
            self.connections[afam] = Connection(self, cid, addr, afam)

        conn = self.connections[afam]
        res = conn.newaddr(addr, rtt, now)
        return conn, res

    # called when reloading class from pickle, add new fields here
    def fixup(self):
        """Strip a leading "::ffff:" from stored connection addresses."""
        for c in ["IPv4", "IPv6"]:
            if c in self.connections:
                addr = self.connections[c].addr
                if addr[0:7] == "::ffff:":
                    addr = addr[7:]
                    self.connections[c].addr = addr

    def dispstats(self):
        """Return three HTML table cells summarising acknowledgement stats.

        For each of the three ``hdwcounts`` entries the cell holds
        100 minus the percentage (doesack - a) / (upcount - u), rounded to an
        integer ("" when that rounds to 0, "-" when the denominator is 0).
        """
        if self.doesack != -1:
            if self.upcount > 0:
                r = ""
                for v in range(3):
                    a, u = self.hdwcounts[v]
                    if (self.upcount - u) != 0:
                        vs = "%0.0f" % (
                            100.0 - (((self.doesack - a) * 100.0) / (self.upcount - u))
                        )
                        if vs == "0":
                            vs = ""
                    else:
                        vs = "-"
                    r += '<td align="right">%s</td>' % vs
                return r
            else:
                return "<td>(%s)</td><td></td><td></td>" % (self.doesack)
        # Fixed: the original emitted the malformed "<td></td<td></td>>".
        return '<td align="right">N/A</td><td></td><td></td>'

    hostfields_long = [
        "name",
        "IPv4.addr",
        "IPv4.state",
        ("IPv4.rtt", 'style="text-align: right;"'),
        ("IPv4.statetime", 'style="text-align: right;"'),
        "IPv6.addr",
        "IPv6.state",
        ("IPv6.rtt", 'style="text-align: right;"'),
        ("IPv6.statetime", 'style="text-align: right;"'),
        "ver",
    ]

    hostfields_short = [
        "name",
        ("IPv4.rttstate", 'style="text-align: right;"'),
        ("IPv4.deltastatetime", 'style="text-align: right;"'),
        ("IPv6.rttstate", 'style="text-align: right;"'),
        ("IPv6.deltastatetime", 'style="text-align: right;"'),
    ]

    def gene(self, tag, v, attrib=None):
        """Wrap ``v`` in an HTML element ``tag`` with optional attribute text."""
        if attrib:
            a = " %s" % attrib
        else:
            a = ""
        return "<%s%s>%s</%s>" % (tag, a, v, tag)

    def htmltable(self, tag, hd, short):
        """Render one table row from the dict ``hd``.

        Args:
            tag: cell element name ("th" or "td").
            hd: dict keyed like statedict()/headerdict().
            short: choose hostfields_short instead of hostfields_long.
        """
        if short:
            hostfields = Host.hostfields_short
        else:
            hostfields = Host.hostfields_long
        h = []
        for f in hostfields:
            if isinstance(f, tuple):
                h.append(self.gene(tag, hd[f[0]], f[1]))
            else:
                h.append(self.gene(tag, hd[f]))
        return self.gene("tr", "\n".join(h))

    def buildhosttable(self, short=False):
        """Return the host table as a list of HTML lines, hosts sorted by name."""
        if DEBUG > 1:
            print("DBG buildhosttable: start")
        res = []
        res.append('<table id="ntable" class="sortable">')
        res.append(ubHost.htmltable("th", ubHost.headerdict(), short))
        for h in sorted(Host.hosts.keys()):
            res.append(ubHost.htmltable("td", Host.hosts[h].statedict(), short))
        res.append("</table>")
        if DEBUG > 1:
            print("DBG buildhosttable: %s" % res)
        return res

    def buildmsgtable(self, msgs):
        """Return the tail of ``msgs`` as HTML lines under a heading.

        The number of messages shown shrinks as the host table grows:
        max(40 - number of hosts, 3).
        """
        res = []
        le = max(40 - len(Host.hosts), 3)
        res.append("<h4>Log of Events</h4>")
        for m in msgs[len(msgs) - le:]:
            res.append("%s<BR>" % m)
        return res
|
|
||||||
|
|
||||||
|
|
||||||
# create fake "unbound objects", remove in Python 3.0
# These placeholder instances let helper methods be called without a real
# host or connection, e.g. ubHost.htmltable(...), ubConnection.headerdict(c)
# and ubConnection.statedict(True).
ubHost = Host(None)
ubConnection = Connection(None, "", "", "")
|
|
||||||
-199
@@ -1,199 +0,0 @@
|
|||||||
"""HTTP server implementation using aiohttp and jinja2."""
|
|
||||||
import asyncio
|
|
||||||
import json
|
|
||||||
import time
|
|
||||||
import urllib.parse
|
|
||||||
import os
|
|
||||||
import logging
|
|
||||||
from aiohttp import web
|
|
||||||
from fastapi.templating import Jinja2Templates
|
|
||||||
import jinja2
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
def _render_template(html_str: str, **context) -> str:
    """Render the Jinja2 template source ``html_str`` with ``context``."""
    return jinja2.Template(html_str).render(**context)
|
|
||||||
|
|
||||||
async def start(
    host: str,
    port: int,
    config,
    hbdclass,
    msgs_getter,
    log=None,
    email=None,
    pushmsg=None,
    msg_to_websockets=None,
    tcss=None,
    DEBUG=0,
    verbose=False,
    get_now=None,
    VER="",
):
    """Start an aiohttp web server and block until cancelled.

    This function is intended to be awaited inside the main asyncio event loop.

    Args:
        host, port: address the TCP site binds to.
        config: mapping read with ``.get()`` for "tz", "templates_dir",
            "http_extra_scripts" and "ws_port".
        hbdclass: object exposing ``Host.hosts`` and ``ubHost``.
        msgs_getter: callable returning the current list of log messages.
        log: optional callable ``log(host, message)``.
        tcss: optional HTML inserted into the <head> of the index page.
        verbose: print a start-up line when true.
        get_now: optional clock callable; defaults to time.time.
        VER: version text shown in the index heading.
        email, pushmsg, msg_to_websockets, DEBUG: accepted but unused here.
    """
    get_now = get_now or (lambda: time.time())

    def _host_only(hostport):
        """Drop the port from a Host header value, keeping IPv6 brackets."""
        if hostport.startswith("["):
            end = hostport.find("]")
            if end != -1:
                return hostport[: end + 1]
        return hostport.split(":")[0]

    async def index(request):
        """Static HTML status page: host table plus recent messages."""
        res = []
        res.append('<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">')
        res.append("<html>")
        res.append("<head>")
        res.append("<title>Heartbeat</title>")
        if tcss:
            res.append(tcss)
        res.append("</head>")
        res.append('<body BGCOLOR = "#FFFFFF" LINK = "#008000" VLINK = "#008000">')
        res.append(f"<H2>Heartbeat status {VER}</h2>")
        res += hbdclass.ubHost.buildhosttable()
        res += hbdclass.ubHost.buildmsgtable(msgs_getter())
        res.append(
            "<p> %s (%s)</p>"
            % (time.strftime("%H:%M:%S", time.localtime(get_now())), config.get("tz", "CET-1CDT"))
        )
        res.append("</body></html>")
        body = "\n".join(res)
        return web.Response(text=body, content_type="text/html")

    async def api_hosts(request):
        """JSON list with one object per known host."""
        lst = [hbdclass.Host.hosts[h].jsons() for h in hbdclass.Host.hosts]
        return web.json_response(json.loads("[" + ",".join(lst) + "]"))

    async def api_messages(request):
        """JSON list of the last 30 log messages."""
        lst = msgs_getter()[-30:]
        return web.json_response(lst)

    async def cmd(request):
        """Queue a ("CMD", {"cmd": ...}) entry for host ``h``."""
        qa = request.rel_url.query
        uname = qa.get("h")
        ucmd = qa.get("c")
        if not ucmd or not uname:
            return web.Response(status=400, text="need h= and c= arguments")
        if uname not in hbdclass.Host.hosts:
            return web.Response(status=400, text=f"h={uname} not found")
        # NOTE(review): query values are presumably already percent-decoded
        # by aiohttp, so this unquote() may decode twice -- confirm.
        hbdclass.Host.hosts[uname].cmds.append(("CMD", {"cmd": urllib.parse.unquote(ucmd)}))
        return web.Response(text=f"cmd {uname} queued")

    async def drop(request):
        """Remove host ``h`` from the host registry."""
        qa = request.rel_url.query
        uname = qa.get("h")
        if not uname:
            return web.Response(status=400, text="need h= argument")
        if uname not in hbdclass.Host.hosts:
            return web.Response(status=400, text=f"h={uname} not found")
        if log:
            log(uname, "dropped")
        del hbdclass.Host.hosts[uname]
        return web.Response(text="Done")

    async def register(request):
        """Call registerDns() on host ``h`` and report its return value."""
        qa = request.rel_url.query
        uname = qa.get("h")
        if not uname:
            return web.Response(status=400, text="need h= argument")
        if uname not in hbdclass.Host.hosts:
            return web.Response(status=400, text=f"h={uname} not found")
        ll = hbdclass.Host.hosts[uname].registerDns()
        if log:
            log(uname, ll)
        return web.Response(text=str(ll))

    async def update(request):
        """Queue an ("UPD", ...) entry for host ``h``, or for every host with
        cver >= 2 when ``h`` is "All"."""
        qa = request.rel_url.query
        uname = urllib.parse.unquote(qa.get("h", ""))
        ucode = qa.get("c")
        if not ucode or not uname:
            return web.Response(status=400, text="need h= and c= arguments")
        if uname != "All" and uname not in hbdclass.Host.hosts:
            return web.Response(status=400, text=f"h={uname} not found")
        if uname != "All":
            names = [uname]
        else:
            names = [n for n in hbdclass.Host.hosts if hbdclass.Host.hosts[n].cver >= 2]
        out = []
        for n in names:
            err = None
            try:
                r = {"csum": None, "code": ucode}
                hbdclass.Host.hosts[n].cmds.append(("UPD", r))
            except Exception as e:
                err = str(e)
            out.append(f"update started for {n}: {err if err else 'OK'}")
        return web.Response(text="\n".join(out))

    async def restart(request):
        """Log the request only; no restart is performed here."""
        # signal main application to perform restart if needed
        # not implemented here - return OK
        if log:
            log(None, "restart request")
        return web.Response(text="restart request")

    async def live(request):
        """Render templates/live.html with hosts, messages and the ws URL."""
        # Resolve templates directory relative to the hbd package
        pkg_dir = os.path.dirname(__file__)
        templates_dir = config.get("templates_dir", os.path.join(pkg_dir, "templates"))
        env = jinja2.Environment(loader=jinja2.FileSystemLoader(templates_dir))
        extra_scripts = config.get("http_extra_scripts", "")
        # The websocket URL reuses the host name the browser used for this
        # request; a bracketed IPv6 literal is kept intact.
        ws_host = _host_only(request.host)
        heartbeat_ws_url = f"ws://{ws_host}:{config.get('ws_port', 50005)}/hbd"
        tmpl = env.get_template("live.html")
        body = tmpl.render(
            title="Heartbeat",
            header="Heartbeat",
            request=request,
            heartbeat_ws_url=heartbeat_ws_url,
            extra_scripts=extra_scripts,
            hosts=[hbdclass.Host.hosts[h].stateinfo() for h in sorted(hbdclass.Host.hosts)],
            messages=msgs_getter()[-30:],
        )
        return web.Response(text=body, content_type="text/html")

    async def static(request):
        """Serve files from the package static directory.

        URL form: /static/<path>
        """
        p = request.match_info.get("path", "")
        base = os.path.abspath(os.path.join(os.path.dirname(__file__), "static"))
        # normalize and prevent directory traversal
        target = os.path.abspath(os.path.normpath(os.path.join(base, p)))
        if not target.startswith(base + os.sep) and target != base:
            return web.Response(status=403, text="Forbidden")
        if not os.path.exists(target) or not os.path.isfile(target):
            return web.Response(status=404, text="Not Found")
        logger.info("serving static file: %s", target)
        return web.FileResponse(path=target)

    app = web.Application()
    app.add_routes(
        [
            web.get("/", index),
            web.get("/api/0/hosts", api_hosts),
            web.get("/api/0/messages", api_messages),
            web.get("/c", cmd),
            web.get("/d", drop),
            web.get("/n", register),
            web.get("/u", update),
            web.get("/r", restart),
            web.get("/live", live),
            web.get("/static/{path:.*}", static),
        ]
    )

    runner = web.AppRunner(app)
    await runner.setup()
    site = web.TCPSite(runner, host, port)
    await site.start()

    if verbose:
        print(f"HTTP server started on {host}:{port}")

    # Wait forever; cancellation of this coroutine triggers the cleanup.
    try:
        await asyncio.Future()
    finally:
        await runner.cleanup()
|
|
||||||
|
|
||||||
@@ -1,46 +0,0 @@
|
|||||||
"""monitor helper and thread for heartbeat daemon."""
|
|
||||||
from __future__ import annotations
|
|
||||||
import asyncio
|
|
||||||
import threading
|
|
||||||
import subprocess
|
|
||||||
import time
|
|
||||||
from subprocess import Popen, PIPE, STDOUT
|
|
||||||
from typing import Optional
|
|
||||||
from . import hbdclass
|
|
||||||
# Seconds without a heartbeat after which an overdue connection is set to UNKNOWN.
DROPOVERDUE = 7 * 24 * 3600


def checkoverdue(config: dict, hbdclass, log: callable, email: callable, pushmsg: callable, msg_to_websockets: callable):
    """Mark connections whose heartbeat is late and send notifications.

    For every host, each connection that is not DOWN is examined:
      * UP and silent for longer than host.interval + config["grace"]
        (default 10) -> OVERDUE;
      * OVERDUE and silent for longer than DROPOVERDUE -> UNKNOWN.
    When at least one connection of a host became overdue the event is
    logged and pushed to the websockets; hosts listed in
    config["watchhosts"] additionally trigger an email and a push message.

    Args:
        config: mapping read with ``.get()``.
        hbdclass: object exposing ``Host.hosts`` and the ``Connection`` states.
        log: callable ``log(host, message)``.
        email: callable ``email(subject, message)``.
        pushmsg: callable ``pushmsg(message)``.
        msg_to_websockets: callable ``msg_to_websockets(kind, payload)``.
    """
    now = time.time()
    grace = config.get("grace", 10)
    states = hbdclass.Connection
    for h in list(hbdclass.Host.hosts.keys()):
        host = hbdclass.Host.hosts[h]
        pmsg = []
        for c in host.connections:
            conn = host.connections[c]
            if conn.state == states.DOWN:
                continue
            timeout = host.interval + grace
            if conn.state == states.UP and (now - conn.lastbeat) > timeout:
                conn.newstate(states.OVERDUE, now, grace)
                pmsg.append(conn.afam)
            if conn.state == states.OVERDUE and (now - conn.lastbeat) > DROPOVERDUE:
                conn.newstate(states.UNKNOWN, conn.lastbeat)
        if pmsg != []:
            families = " and ".join(pmsg)
            if h in config.get("watchhosts", []):
                # The email now names the host, like the push message does;
                # previously it only said e.g. "IPv4 overdue".
                email("overdue", "%s %s overdue" % (h, families))
                pushmsg("%s %s overdue" % (h, families))
            log(h, "%s overdue" % families)
            msg_to_websockets("host", host.stateinfo())
|
|
||||||
|
|
||||||
async def start(
    config: dict,
    hbdclass: callable,
    log=None,
    email=None,
    pushmsg=None,
    msg_to_websockets=None,
):
    """Run the overdue check in an endless loop, sleeping 15 seconds before
    each pass.  Never returns; stop it by cancelling the task."""
    pause = 15  # seconds between checks
    while True:
        await asyncio.sleep(pause)
        checkoverdue(config, hbdclass, log, email, pushmsg, msg_to_websockets)
|
|
||||||
-155
@@ -1,155 +0,0 @@
|
|||||||
"""Notification helpers: email, pushover, mattermost, signal and dispatcher."""
|
|
||||||
import logging
|
|
||||||
from typing import Optional
|
|
||||||
import http.client
|
|
||||||
import urllib.parse
|
|
||||||
import subprocess
|
|
||||||
import smtplib
|
|
||||||
import time
|
|
||||||
import traceback
|
|
||||||
|
|
||||||
# Provider names; these are the values pushmsg() documents for cfg['pushsrv'].
DEFAULT_PUSHPROVIDERS = ["all", "pushover", "mattermost", "signal"]

# module-level configuration set via setup()
# (read by email() and pushmsg_from_config())
_config = {}
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
def setup(cfg: dict):
|
|
||||||
"""Initialize notifier defaults from a configuration dict."""
|
|
||||||
global _config
|
|
||||||
_config = dict(cfg)
|
|
||||||
|
|
||||||
|
|
||||||
def send_email(aemail, smtpserver, sender, subject, body, debug=0):
    """Send a plain email via SMTP. Returns True on success.

    Args:
        aemail: recipient address or list of addresses passed to sendmail().
        smtpserver: host (optionally "host:port") of the SMTP server.
        sender: envelope sender address.
        subject: unused here; the caller already placed it in ``body``.
        body: complete message text including headers.
        debug: values above 0 enable smtplib's debug output.

    Returns:
        True when sendmail() succeeded, False on any error (which is logged).
    """
    # Pre-bind so the cleanup below works when the connection itself fails;
    # the original referenced an unbound name in that case.
    server = None
    try:
        server = smtplib.SMTP(smtpserver)
        if debug > 0:
            server.set_debuglevel(1)
        server.sendmail(sender, aemail, body)
    except Exception as e:
        logger.warning("email send failed: %s", e)
        return False
    finally:
        if server is not None:
            # Best effort: a failing QUIT must not change the result.
            try:
                server.quit()
            except Exception:
                pass
    return True
|
|
||||||
|
|
||||||
|
|
||||||
def email(subject: str, msg: str, debug: int = 0) -> bool:
    """Convenience wrapper exposed to the rest of the application.

    Uses module-level configuration to supply recipient list, smtp server
    and sender address:
      * recipients: "AEMAIL", "aemail" or "email_to" (first non-empty);
      * sender: "fromemail" or "sender", else aew.heartbeat@<domain>;
      * server: "SMTPSERVER" or "smtpserver", else "localhost".

    Returns:
        The result of send_email().
    """
    toaddrs = _config.get("AEMAIL") or _config.get("aemail") or _config.get("email_to") or []
    # A single address given as a plain string would otherwise be indexed
    # character by character for the To: header.
    if isinstance(toaddrs, str):
        toaddrs = [toaddrs]
    fromemail = _config.get("fromemail") or _config.get("sender") or f"aew.heartbeat@{_config.get('domain','local') }"
    # Fall back to "localhost" for missing *and* empty settings.
    smtpserver = _config.get("SMTPSERVER") or _config.get("smtpserver") or "localhost"
    date = time.strftime("%a, %d %b %Y %H:%M:%S %z", time.localtime())
    body = "To: %s\nFrom: %s\nSubject: %s\nDate: %s\n\n%s" % (
        toaddrs[0] if toaddrs else "",
        fromemail,
        subject,
        date,
        msg,
    )
    return send_email(toaddrs, smtpserver, fromemail, subject, body, debug=debug)
|
|
||||||
|
|
||||||
|
|
||||||
def pushover(token: str, user: str, msg: str, debug: int = 0) -> bool:
    """Send message via Pushover API.

    Args:
        token: Pushover application token.
        user: Pushover user key.
        msg: message text.
        debug: unused.

    Returns:
        True when the API answered with HTTP status 200, False otherwise
        (errors are logged, not raised).
    """
    conn = http.client.HTTPSConnection("api.pushover.net:443")
    try:
        conn.request(
            "POST",
            "/1/messages.json",
            urllib.parse.urlencode({"token": token, "user": user, "message": msg}),
            {"Content-type": "application/x-www-form-urlencoded"},
        )
        r = conn.getresponse()
        logger.debug("pushover response: %s %s", r.status, r.reason)
        return r.status == 200
    except Exception as e:
        logger.error("pushover error: %s", e)
        return False
    finally:
        # Release the socket on every path; it was previously left open.
        conn.close()
|
|
||||||
|
|
||||||
|
|
||||||
def pushmattermost(host: str, token: str, channel: str, msg: str, username: str = "hbd", icon: Optional[str] = None, debug: int = 0) -> bool:
    """Post ``msg`` to a Mattermost incoming webhook.

    Needs the optional ``mattermostdriver`` package; when it cannot be
    imported the function gives up and returns False.

    Returns:
        True when the webhook call returned None or "", False on an error
        or any other return value.
    """
    try:
        from mattermostdriver import Driver
    except Exception:
        return False

    driver = Driver({"url": host, "scheme": "http", "basepath": "/api/v4", "port": 8065})
    message = {"text": msg, "channel": channel, "username": username}
    if icon:
        message["icon_url"] = icon

    try:
        outcome = driver.webhooks.call_webhook(token, message)
        logger.debug("mattermost rc: %s", outcome)
        return bool(outcome is None or outcome == "")
    except Exception as exc:
        logger.error("mattermost error: %s", exc)
        return False
|
|
||||||
|
|
||||||
|
|
||||||
def pushsignal(signal_cli_bin: str, user: str, recipient: str, msg: str, debug: int = 0) -> bool:
    """Send a message via signal-cli (requires local installation).

    Uses subprocess to call signal-cli. Returns True if the command succeeded.

    Args:
        signal_cli_bin: path of the signal-cli executable.
        user: sender account passed with ``-u``.
        recipient: destination of the message.
        msg: message text passed with ``-m``.
        debug: unused.
    """
    CLI = [signal_cli_bin, "-u", user, "send", "-m", msg, recipient]
    logger.debug("signal cli: %s", CLI)
    try:
        res = subprocess.run(CLI, capture_output=True)
        if res.returncode != 0:
            # Fixed: a "." instead of "," here raised AttributeError, so the
            # real stderr text was never logged.
            logger.error("signal failed: %s", res.stderr.decode())
            return False
        logger.debug("signal sent: %s", res.stdout.decode())
        return True
    except Exception as e:
        logger.exception("signal exception: %s", e)
        return False
|
|
||||||
|
|
||||||
|
|
||||||
def pushmsg(cfg: dict, msg: str, debug: int = 0):
    """Send ``msg`` through the push provider(s) chosen by ``cfg['pushsrv']``.

    ``cfg['pushsrv']`` is one of 'all', 'pushover', 'mattermost', 'signal'
    (default 'pushover').  Per-provider settings are read from:
    - cfg['pushover_token'], cfg['pushover_user']
    - cfg['matter_host'], cfg['matter_token'], cfg['matter_channel'],
      cfg['matter_username'], cfg['matter_icon']
    - cfg['signal_cli'], cfg['signal_user'], cfg['signal_recipient']

    Returns a dict mapping each provider that was tried to its result.
    """
    selected = cfg.get("pushsrv", "pushover")
    results = {}

    if selected in ("all", "pushover"):
        results["pushover"] = pushover(
            cfg.get("pushover_token", ""),
            cfg.get("pushover_user", ""),
            msg,
            debug=debug,
        )

    if selected in ("all", "mattermost"):
        results["mattermost"] = pushmattermost(
            cfg.get("matter_host", ""),
            cfg.get("matter_token", ""),
            cfg.get("matter_channel", ""),
            msg,
            username=cfg.get("matter_username", "hbd"),
            icon=cfg.get("matter_icon"),
            debug=debug,
        )

    if selected in ("all", "signal"):
        results["signal"] = pushsignal(
            cfg.get("signal_cli", "/usr/local/bin/signal-cli"),
            cfg.get("signal_user", ""),
            cfg.get("signal_recipient", ""),
            msg,
            debug=debug,
        )

    logger.debug("push results: %s", results)
    return results
|
|
||||||
|
|
||||||
|
|
||||||
def pushmsg_from_config(msg: str, debug: int = 0) -> dict:
    """Dispatch ``msg`` via pushmsg() using the module-level configuration."""
    outcome = pushmsg(_config, msg, debug=debug)
    return outcome
|
|
||||||
|
|
||||||
@@ -1,81 +0,0 @@
|
|||||||
"""Message encoding/decoding utilities for hbd protocol."""
|
|
||||||
from typing import Dict, Any
|
|
||||||
import zlib
|
|
||||||
|
|
||||||
|
|
||||||
def dicttos(ID: str, d: Dict[str, Any], compress: bool = False):
    """Encode ``d`` as a protocol message and return it as bytes.

    Plain form:       ``ID:key=value;key=value...``
    Compressed form:  ``!ID:`` followed by the zlib-compressed
                      ``key=value;...`` payload.

    Float values are written with five decimals; everything else uses its
    default string form.
    """
    fields = [
        f"{key}={value:0.5f}" if isinstance(value, float) else f"{key}={value}"
        for key, value in d.items()
    ]
    payload = ";".join(fields)
    if not compress:
        return (ID + ":" + payload).encode()
    header = ("!" + ID + ":").encode()
    return header + zlib.compress(payload.encode(), 6)
|
|
||||||
|
|
||||||
|
|
||||||
def stodict(msg: bytes):
    """Deserialize a protocol message into a dict.

    Two wire formats are accepted:
      * plain:      ``ID:key=value;key=value...``
      * compressed: ``!ID:`` followed by a zlib-compressed payload of the
        same ``key=value;...`` form.

    Returns:
        dict with key 'ID' set to the message ID plus the parsed pairs.
        A key without "=" maps to None.  Values starting with a digit are
        converted to Python literals when possible (e.g. int, float) and
        kept as strings otherwise.  Malformed input yields ``{}``.
    """
    import ast

    d = {}
    try:
        if msg[:1] == b"!":
            # The ID ends at the first colon, so IDs of any length work
            # (the header was previously assumed to be exactly "!XXX:").
            sep = msg.index(b":", 1)
            pk = zlib.decompress(msg[sep + 1:]).decode()
            d["ID"] = msg[1:sep].decode()
        else:
            r0 = msg.split(b":", 1)
            pk = r0[1].decode()
            d["ID"] = r0[0].decode()
    except Exception:
        # no colon, bad compression or undecodable bytes
        return {}
    if not pk:
        return d
    for v in pk.split(";"):
        if not v:
            continue
        vr = v.split("=", 1)
        k = vr[0].strip()
        if len(vr) == 1:
            d[k] = None
            continue
        val = vr[1].strip()
        if val and val[0].isdigit():
            # SECURITY: this used eval() on data received from the network,
            # which executes arbitrary expressions.  ast.literal_eval parses
            # the same numeric literals but never runs code; anything that
            # is not a literal stays a string.
            try:
                d[k] = ast.literal_eval(val)
            except Exception:
                d[k] = val
        else:
            d[k] = val
    return d
|
|
||||||
|
|
||||||
|
|
||||||
def oldmtodict(msg: bytes):
    """Parse an old-style message that carries no ID prefix.

    The payload is prefixed with ``HTB:`` and handed to stodict(), so the
    result has 'ID' set to "HTB".
    """
    prefixed = b"HTB:" + msg
    return stodict(prefixed)
|
|
||||||
-323
@@ -1,323 +0,0 @@
|
|||||||
"""Server runtime: starts UDP listener, HTTP server and websocket stubs."""
|
|
||||||
import asyncio
|
|
||||||
import logging
|
|
||||||
import atexit
|
|
||||||
import time
|
|
||||||
import signal
|
|
||||||
import sys
|
|
||||||
from . import __version__
|
|
||||||
|
|
||||||
from . import udp
|
|
||||||
from . import hbdclass
|
|
||||||
|
|
||||||
from . import ws as ws_mod
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
# Alias for the websocket broadcast function; log() calls it for every message.
msg_to_websockets = ws_mod.broadcast

# Log file object; log() writes to it only when it is set.
logf = None
# Saved to the pickle file by cleanup_function() after hosts and msgs.
lastfm = ["", "", ""]

# shared runtime collections and helpers
# msgs collects every formatted line produced by log().
msgs = []
|
|
||||||
|
|
||||||
def initlog(logfile):
    """Open ``logfile`` for appending and return the file object.

    When the file cannot be opened a notice is printed and ``sys.stderr``
    is returned instead, so callers always get a writable stream.
    """
    try:
        return open(logfile, "a+")
    except Exception as e:
        # message typo fixed ("loffile" -> "logfile")
        print("cannot open logfile %s, using STDERR: %s" % (logfile, e))
        return sys.stderr
|
|
||||||
|
|
||||||
def log(host, m, service=None):
    """Record one event line.

    The line "<timestamp> <host> <m>" is appended to ``msgs``, sent to the
    module logger, written to ``logf`` when that is set, and broadcast to
    the websockets as a "message".  ``service`` is accepted but unused.
    """
    stamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    line = f"{stamp} {host or ''} {m}"
    msgs.append(line)
    logger.info(line)
    if logf:
        # A broken log file must not stop event processing.
        try:
            logf.write(line + "\n")
            logf.flush()
        except Exception as exc:
            logger.warning("failed to write to logfile: %s", exc)
    msg_to_websockets("message", line)
|
|
||||||
|
|
||||||
def cleanup_function(config):
    """This function will be executed upon program exit.

    Pickles, in this order, the host registry, the message list and
    ``lastfm`` into the file named by config["pickfile"] (default
    "hbd.pickle").
    """
    logger.info("Running cleanup function...")
    import pickle

    pickfile = config.get("pickfile", "hbd.pickle")

    # "with" closes the file even when one of the dumps raises.
    with open(pickfile, "wb") as pickf:
        pick = pickle.Pickler(pickf)
        pick.dump(hbdclass.Host.hosts)
        pick.dump(msgs)
        pick.dump(lastfm)

    logger.info("Cleanup complete.")
|
|
||||||
|
|
||||||
async def _run_async(config):
    """Start all hbd services on the running loop and block until shutdown.

    Starts, in order: the UDP heartbeat endpoint, the HTTP server task, the
    DNS update worker, the websocket server task and the monitor task.
    Then waits until SIGINT or SIGTERM sets the shutdown event and tears
    everything down again.

    Args:
        config: Mapping-like configuration object providing ``get``.
    """
    global msgs
    loop = asyncio.get_running_loop()
    shutdown_event = asyncio.Event()

    # Signal handlers for graceful shutdown
    def signal_handler(signum, frame):
        sig_name = signal.Signals(signum).name if hasattr(signal, 'Signals') else signum
        logger.info(f"Received {sig_name}, initiating shutdown...")
        loop.call_soon_threadsafe(shutdown_event.set)

    # Register signal handlers
    loop.add_signal_handler(signal.SIGINT, signal_handler, signal.SIGINT, None)
    loop.add_signal_handler(signal.SIGTERM, signal_handler, signal.SIGTERM, None)

    # prepare runtime dependencies
    from . import http as http_mod
    from . import dns as dns_mod
    from . import notify as notify_mod
    from . import monitor as monitor_mod

    notify_mod.setup(config)

    email = notify_mod.email
    pushmsg = notify_mod.pushmsg_from_config

    # UDP server endpoint (handler wired to handle_datagram with context)
    bind_addr = ("0.0.0.0", config.get("hb_port", 50003))
    logger.info("Starting UDP server on %s:%s", *bind_addr)

    def udp_handler(msg, addr, transport):
        ctx = dict(
            config=config,
            hbdclass=hbdclass,
            log=log,
            email=email,
            pushmsg=pushmsg,
            msg_to_websockets=msg_to_websockets,
            DEBUG=config.get("debug", 0),
            verbose=config.get("verbose", False),
        )
        udp.handle_datagram(msg, addr, transport, ctx)

    transport, protocol = await loop.create_datagram_endpoint(
        lambda: udp.EchoServerProtocol(config=config, handler=udp_handler),
        local_addr=bind_addr,
    )

    # Pre-bind every task handle so the shutdown code below can always
    # reference them, even when one of the start-up blocks fails before
    # its assignment (previously that produced a NameError in ``finally``).
    http_task = None
    ws_task = None
    monitor_task = None
    dns_task = None

    # HTTP server (asyncio-based via aiohttp)
    try:
        http_task = asyncio.create_task(
            http_mod.start(
                host=config.get("hbd_host", ""),
                port=config.get("hbd_port", 50004),
                config=config,
                hbdclass=hbdclass,
                msgs_getter=lambda: msgs,
                log=log,
                email=email,
                pushmsg=pushmsg,
                msg_to_websockets=msg_to_websockets,
                tcss=None,
                DEBUG=config.get("debug", 0),
                verbose=config.get("verbose", False),
                get_now=lambda: time.time(),
                VER="",
            )
        )
        logger.info("HTTP server started on %s:%s", config.get("hbd_host", ""), config.get("hbd_port", 50004))
    except Exception as e:
        logger.exception("failed to start HTTP server: %s", e)

    # start dns update worker (async)
    try:
        dns_task = dns_mod.start_dns_worker(hbdclass, config, log=log, email=email, loop=loop)
        logger.info("dns update worker started")
    except Exception as e:
        logger.exception("dns worker failed to start: %s", e)

    # Start the websocket servers as a background task
    try:
        ws_task = asyncio.create_task(
            ws_mod.start(
                host=config.get("hbd_host", ""),
                ws_port=config.get("ws_port", 50005),
                wss_port=config.get("wss_port", None),
                ssl_context=None,
                get_hosts=lambda: [hbdclass.Host.hosts[h].stateinfo() for h in sorted(hbdclass.Host.hosts)],
                get_msgs=lambda: msgs,
                verbose=config.get("verbose", False),
            )
        )
        logger.info("WebSocket task started")
    except Exception as e:
        logger.exception("websocket server failed to start: %s", e)

    # Start the monitor thread as a background task
    try:
        monitor_task = asyncio.create_task(
            monitor_mod.start(
                config=config,
                hbdclass=hbdclass,
                log=log,
                email=email,
                pushmsg=pushmsg,
                msg_to_websockets=msg_to_websockets,
            )
        )
        logger.info("Monitor task started")
    except Exception as e:
        logger.exception("monitor task failed to start: %s", e)

    try:
        # run forever until shutdown event is set
        await shutdown_event.wait()
        logger.info("Shutdown signal received, stopping services...")
    except Exception as e:
        logger.exception("Error in main loop: %s", e)
    finally:
        # Cancel all running tasks
        logger.info("Cancelling tasks...")
        try:
            transport.close()
        except Exception as e:
            logger.warning("Error closing UDP transport: %s", e)

        tasks_to_cancel = [http_task, ws_task, monitor_task]
        for task in tasks_to_cancel:
            if task:
                try:
                    task.cancel()
                    logger.debug("Cancelled task: %s", task)
                except Exception as e:
                    logger.warning("Error cancelling task: %s", e)

        # Wait for tasks to finish cancellation with timeout
        remaining_tasks = [t for t in tasks_to_cancel if t]
        if remaining_tasks:
            try:
                await asyncio.wait_for(asyncio.gather(*remaining_tasks, return_exceptions=True), timeout=2.0)
            except asyncio.TimeoutError:
                logger.warning("Timeout waiting for tasks to cancel")
            except Exception as e:
                logger.debug("Exception during task cancellation: %s", e)

        # Signal DNS worker to exit and await it
        try:
            if dns_task:
                try:
                    # A ``None`` item on the queue tells the worker to stop
                    hbdclass.Host.dnsQ.put(None)
                except Exception:
                    pass
                try:
                    await asyncio.wait_for(dns_task, timeout=2.0)
                    logger.info("DNS worker finished")
                except asyncio.TimeoutError:
                    logger.warning("Timeout waiting for DNS worker to finish")
                    dns_task.cancel()
                except asyncio.CancelledError:
                    logger.info("DNS worker was cancelled")
                except Exception as e:
                    logger.warning("Error awaiting DNS worker: %s", e)
                finally:
                    # Clear queue bridge to release any held references
                    hbdclass.Host.dnsQ = None
        except Exception as e:
            logger.warning("Error stopping DNS worker: %s", e)

        logger.info("All tasks cancelled")
|
|
||||||
|
|
||||||
|
|
||||||
def load_pickled_hosts(config, hbdclass):
    """Load pickled hosts from file, if available.

    Reads up to three consecutive pickle records from ``config["pickfile"]``
    (default ``"hbd.pickle"``): the host table (stored on
    ``hbdclass.Host.hosts``), the message list (module global ``msgs``) and
    ``lastfm`` (module global; falls back to ``["", "", ""]`` when the third
    record cannot be read). Afterwards every loaded host gets its ``dyn`` and
    ``watched`` flags refreshed from the config, ``fixup()`` is called on it,
    and hosts listed in ``drophosts`` are removed.

    If loading fails the error is logged and the pickle file is deleted.

    Args:
        config: Mapping-like object providing ``get``.
        hbdclass: Module/namespace exposing ``Host.hosts`` and
            ``Connection.htab``.
    """
    global lastfm, msgs
    import os
    import pickle

    pickfile = config.get("pickfile", "hbd.pickle")
    dyndnshosts = config.get("dyndnshosts", [])
    watchhosts = config.get("watchhosts", [])
    drophosts = config.get("drophosts", [])
    verbose = config.get("verbose", False)

    if not os.path.exists(pickfile):
        if verbose:
            logger.info("no pickled data")
        return

    if verbose:
        logger.info("opening pickls %s", pickfile)

    # NOTE(security): pickle executes arbitrary code while loading; the
    # pickfile must only ever be writable by the user running the daemon.
    load_failed = False
    with open(pickfile, "rb") as pickf:
        pick = pickle.Unpickler(pickf)
        try:
            hbdclass.Host.hosts = pick.load()
            msgs = pick.load()
            try:
                lastfm = pick.load()
            except Exception:
                # Third record is optional; use the empty default instead
                lastfm = ["", "", ""]
        except Exception as e:
            logger.exception("load pickled failed: %s", e)
            load_failed = True

    # Remove the broken file only after the handle has been closed
    if load_failed:
        os.unlink(pickfile)

    hbdclass.Connection.htab = {}
    for h in list(hbdclass.Host.hosts.keys()):
        host = hbdclass.Host.hosts[h]
        host.dyn = h in dyndnshosts
        host.watched = h in watchhosts
        host.fixup()
    for h in drophosts:
        if h in hbdclass.Host.hosts:
            del hbdclass.Host.hosts[h]
    if verbose:
        logger.info("%s pickled hosts loaded", len(hbdclass.Host.hosts))
|
|
||||||
|
|
||||||
def run(config):
    """Start the hbd service (blocking).

    Manually manages the event loop to ensure clean shutdown: configures
    logging, restores pickled state, opens the message log, runs
    ``_run_async`` to completion on a fresh event loop, then persists state
    via ``cleanup_function`` and terminates the process with ``os._exit(0)``.

    Args:
        config: Mapping-like configuration object providing ``get``.
    """
    global logf
    import os

    logging.basicConfig(level=logging.DEBUG if config.get("debug", 0) > 0 else logging.INFO)
    load_pickled_hosts(config, hbdclass)

    logf = initlog(logfile=config.get("logfile", "messages.log"))
    log(None, f"hbd version {__version__} starting up")

    # Create and set the event loop manually
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    try:
        loop.run_until_complete(_run_async(config))
    except KeyboardInterrupt:
        logger.info("Received KeyboardInterrupt, shutting down...")
    except Exception as e:
        logger.exception("Unhandled exception in main: %s", e)
    finally:
        try:
            cleanup_function(config)
            logger.info("hbd shutdown complete")
        finally:
            # Release the log file and the loop even when persisting state
            # failed; the failure itself still propagates to the caller.
            if logf and logf != sys.stderr:
                try:
                    logf.close()
                except Exception:
                    pass
            # Explicitly close the loop
            try:
                # Cancel all remaining tasks
                pending = asyncio.all_tasks(loop)
                for task in pending:
                    task.cancel()
                # Run one more cycle to process cancellations
                if pending:
                    loop.run_until_complete(asyncio.gather(*pending, return_exceptions=True))
            except Exception:
                pass
            finally:
                loop.close()

    # Exit immediately, bypassing normal interpreter shutdown
    os._exit(0)
|
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
"""HeartBeat Daemon (hbd) - Server/daemon component."""
|
||||||
|
|
||||||
|
from hbd import __version__
|
||||||
@@ -0,0 +1,302 @@
|
|||||||
|
"""Command line interface for hbd package."""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import getpass
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from .config import load_config
|
||||||
|
from .main import run as run_server
|
||||||
|
|
||||||
|
PUSHSRVS = ["all", "pushover", "mattermost"]


def build_parser():
    """Build the ``hbd`` command line parser.

    Sub-commands: ``serve`` (default when no sub-command is given),
    ``passwd``, ``notify``, ``stop``, ``reload`` and ``restart``. The
    server flags are also accepted at top level, before or without a
    sub-command, for backward compatibility.

    Returns:
        argparse.ArgumentParser: The configured parser. The chosen
        sub-command is stored in ``args.command`` (``None`` if omitted).
    """
    # Options duplicated on a sub-parser use SUPPRESS as their default.
    # argparse copies the whole sub-parser namespace over the top-level
    # one, so an ordinary default there would erase a value already parsed
    # at top level (e.g. ``hbd -v serve`` would end up with verbose=False).
    # With SUPPRESS the sub-parser only sets what was actually given and
    # the top-level parser supplies the defaults.
    unset = argparse.SUPPRESS

    parser = argparse.ArgumentParser(
        prog="hbd",
        description="HeartBeatDaemon - Wait for heartbeat messages and act on them (or their absence)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    subparsers = parser.add_subparsers(dest="command")

    # --- serve (default) ---
    serve_p = subparsers.add_parser("serve", help="Start the hbd server (default)")
    serve_p.add_argument("-c", "--config", dest="configfile", default=unset,
                         help="Config file path (YAML)")
    serve_p.add_argument("-f", "--foreground", action="store_true", default=unset,
                         help="Run in foreground")
    serve_p.add_argument("-v", "--verbose", action="store_true", default=unset,
                         help="Verbose output")
    serve_p.add_argument("-p", "--pushsrv", dest="pushsrv", choices=PUSHSRVS, default=unset,
                         help="Push service to use")
    serve_p.add_argument("-x", "--debug", action="count", default=unset,
                         help="Increase debug level")

    # Legacy top-level flags (no subcommand) — kept for backward compatibility.
    # These also provide the defaults for every sub-command.
    parser.add_argument("-c", "--config", dest="configfile", help="Config file path (YAML)")
    parser.add_argument("-f", "--foreground", action="store_true", help="Run in foreground")
    parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output")
    parser.add_argument("-p", "--pushsrv", dest="pushsrv", choices=PUSHSRVS,
                        help="Push service to use")
    parser.add_argument("-x", "--debug", action="count", default=0, help="Increase debug level")

    # --- passwd ---
    passwd_p = subparsers.add_parser(
        "passwd",
        help="Generate a password hash for use in the config file",
    )
    passwd_p.add_argument(
        "username",
        nargs="?",
        help="Username (informational only, for display)",
    )

    # --- notify ---
    notify_p = subparsers.add_parser(
        "notify",
        help="Send a test message via a configured notification channel",
    )
    notify_p.add_argument("-c", "--config", dest="configfile", default=unset,
                          help="Config file path (YAML)")
    notify_p.add_argument(
        "channel",
        help="Channel name as defined in notification_channels",
    )
    notify_p.add_argument(
        "message",
        nargs="?",
        default="Test notification from hbd",
        help="Message body (default: 'Test notification from hbd')",
    )
    notify_p.add_argument(
        "--level",
        default="WARNING",
        choices=["INFO", "WARNING", "CRITICAL", "RECOVER"],
        help="Notification level (default: WARNING)",
    )
    notify_p.add_argument(
        "--title",
        default=None,
        help="Notification title (default: '[LEVEL] test')",
    )

    # --- stop ---
    stop_p = subparsers.add_parser("stop", help="Stop the running hbd instance")
    stop_p.add_argument("-c", "--config", dest="configfile", default=unset,
                        help="Config file path (YAML)")

    # --- reload ---
    reload_p = subparsers.add_parser("reload", help="Reload configuration (SIGHUP)")
    reload_p.add_argument("-c", "--config", dest="configfile", default=unset,
                          help="Config file path (YAML)")

    # --- restart ---
    restart_p = subparsers.add_parser("restart", help="Restart the running hbd instance")
    restart_p.add_argument("-c", "--config", dest="configfile", default=unset,
                           help="Config file path (YAML)")
    restart_p.add_argument("-f", "--foreground", action="store_true", default=unset,
                           help="Run in foreground after restart")
    restart_p.add_argument("-v", "--verbose", action="store_true", default=unset,
                           help="Verbose output after restart")

    return parser
|
||||||
|
|
||||||
|
|
||||||
|
def cmd_passwd(args):
    """Prompt for a password twice and print its hash for the config file.

    Keeps asking until a non-empty password has been entered and confirmed.
    ``args.username`` is optional and only used in the prompt and in the
    printed hint.
    """
    from .users import hash_password

    user = args.username or ""
    if user:
        first_prompt = f"New password for {user}: "
    else:
        first_prompt = "New password: "

    password = None
    while password is None:
        entered = getpass.getpass(first_prompt)
        if not entered:
            print("Password must not be empty.", file=sys.stderr)
            continue
        confirmed = getpass.getpass("Confirm password: ")
        if entered != confirmed:
            print("Passwords do not match, try again.", file=sys.stderr)
            continue
        password = entered

    digest = hash_password(password)
    if user:
        hint = f"\nAdd the following to your config under users: -> {user}:"
    else:
        hint = "\nPassword hash (paste into config file under the user's 'password' key):"
    print(hint)
    print(f" password: {digest}")
|
||||||
|
|
||||||
|
|
||||||
|
def cmd_notify(args):
    """Send one test notification through a single configured channel.

    Looks up ``args.channel`` in the ``notification_channels`` section of
    the config, builds a ``Notification`` from ``args.message``,
    ``args.level`` and ``args.title``, and dispatches it according to the
    channel's ``type``. Prints ``OK`` on success. Exits with status 1 when
    the channel or its type is unknown, or when sending fails.
    """
    from .config import load_config
    from .notify import Notification, _dispatch_to_channel, setup

    config = load_config(args.configfile)
    setup(config)

    channels = config.get("notification_channels", {})
    if args.channel not in channels:
        if channels:
            available = ", ".join(channels.keys())
        else:
            available = "(none)"
        print(f"Error: channel '{args.channel}' not found in notification_channels.", file=sys.stderr)
        print(f"Available channels: {available}", file=sys.stderr)
        sys.exit(1)

    channel_cfg = channels[args.channel]
    level = args.level.upper()
    title = args.title or f"[{level}] test"
    base_url = config.get("base_url", "").rstrip("/")
    link = f"{base_url}/plugins" if base_url else ""

    notif = Notification(title=title, body=args.message, level=level, url=link)

    import asyncio
    from .notify import _send_matrix_async, _send_sms_voipms_async, _DRIVERS

    ch_type = channel_cfg.get("type", "")
    print(f"Sending via {args.channel} ({ch_type}): {title} — {args.message}")

    # Channel types whose senders are coroutines and need an event loop
    async_senders = {
        "matrix": _send_matrix_async,
        "sms_voipms": _send_sms_voipms_async,
    }
    sender = async_senders.get(ch_type)
    if sender is not None:
        ok = asyncio.run(sender(channel_cfg, notif))
    else:
        driver = _DRIVERS.get(ch_type)
        if driver is None:
            print(f"Error: unknown channel type '{ch_type}'", file=sys.stderr)
            sys.exit(1)
        ok = driver(channel_cfg, notif)

    if not ok:
        print("FAILED — check logs for details", file=sys.stderr)
        sys.exit(1)
    print("OK")
|
||||||
|
|
||||||
|
|
||||||
|
def _read_pid(configfile) -> int | None:
    """Look up the PID of the running hbd instance via the configured pidfile.

    Returns the PID when the pidfile can be read and signal 0 can be
    delivered to that process. In every other case an explanation is
    printed to stderr and ``None`` is returned.
    """
    import os

    pidfile = load_config(configfile).get("pidfile", "")
    if not pidfile:
        print("Error: no pidfile configured.", file=sys.stderr)
        return None

    found = None
    try:
        with open(pidfile) as handle:
            pid = int(handle.read().strip())
        # Signal 0 performs only the existence/permission check
        os.kill(pid, 0)
        found = pid
    except FileNotFoundError:
        print(f"PID file not found ({pidfile}). Is hbd running?", file=sys.stderr)
    except ProcessLookupError:
        print(f"PID file exists but process {pid} is not running.", file=sys.stderr)
    except Exception as e:
        print(f"Error reading pidfile: {e}", file=sys.stderr)
    return found
|
||||||
|
|
||||||
|
|
||||||
|
def cmd_stop(args):
    """Send SIGTERM to the running hbd instance and wait for it to exit.

    Polls every 0.5 s, at most 20 times. Exits with status 1 when no
    running instance is found or when it is still alive after the last poll.
    """
    import os
    import signal as _signal
    import time

    pid = _read_pid(args.configfile)
    if pid is None:
        sys.exit(1)

    print(f"Stopping hbd (pid {pid})...")
    os.kill(pid, _signal.SIGTERM)

    # Wait up to 10 s for the process to exit
    polls_left = 20
    while polls_left > 0:
        polls_left -= 1
        time.sleep(0.5)
        try:
            os.kill(pid, 0)
        except ProcessLookupError:
            print("hbd stopped.")
            return

    print("Warning: hbd did not stop within 10 seconds.", file=sys.stderr)
    sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
def cmd_reload(args):
    """Send SIGHUP to the running hbd instance.

    Exits with status 1 when no running instance can be determined from
    the pidfile.
    """
    import os
    import signal as _signal

    target = _read_pid(args.configfile)
    if target is None:
        sys.exit(1)

    print(f"Sending SIGHUP to hbd (pid {target})...")
    os.kill(target, _signal.SIGHUP)
    print("Reload signal sent.")
|
||||||
|
|
||||||
|
|
||||||
|
def cmd_restart(args):
    """Stop the running hbd instance (if any) and launch a new one.

    A running instance receives SIGTERM and is polled every 0.5 s, at most
    20 times; if it is still alive afterwards the command exits with
    status 1. The new server is started as ``python -m hbd.server.cli
    serve`` with the ``-c``/``-f``/``-v`` options carried over from
    ``args``. With ``--foreground`` the current process is replaced via
    ``os.execv``; otherwise the server is spawned in a new session.
    """
    import os
    import signal as _signal
    import subprocess
    import time

    pid = _read_pid(args.configfile)
    if pid is None:
        print("hbd does not appear to be running — starting fresh.")
    else:
        print(f"Stopping hbd (pid {pid})...")
        os.kill(pid, _signal.SIGTERM)
        stopped = False
        for _attempt in range(20):
            time.sleep(0.5)
            try:
                os.kill(pid, 0)
            except ProcessLookupError:
                stopped = True
                print("hbd stopped.")
                break
        if not stopped:
            print("Warning: hbd did not stop within 10 seconds.", file=sys.stderr)
            sys.exit(1)

    # Re-launch hbd with the same config
    run_in_foreground = getattr(args, "foreground", False)
    cmd = [sys.executable, "-m", "hbd.server.cli", "serve"]
    if args.configfile:
        cmd.extend(["-c", args.configfile])
    if run_in_foreground:
        cmd.append("-f")
    if getattr(args, "verbose", False):
        cmd.append("-v")

    if run_in_foreground:
        # Run in foreground — replace current process
        os.execv(sys.executable, cmd)
    else:
        subprocess.Popen(cmd, start_new_session=True)
        print("hbd restarted.")
|
||||||
|
|
||||||
|
|
||||||
|
def main(argv=None):
    """CLI entry point: parse ``argv`` and run the selected sub-command.

    The ``passwd``, ``notify``, ``stop``, ``reload`` and ``restart``
    commands are delegated to their ``cmd_*`` handlers. Anything else
    (``serve`` or no sub-command) loads the config, applies the command
    line overrides and starts the server.
    """
    args = build_parser().parse_args(argv)

    handlers = {
        "passwd": cmd_passwd,
        "notify": cmd_notify,
        "stop": cmd_stop,
        "reload": cmd_reload,
        "restart": cmd_restart,
    }
    handler = handlers.get(args.command)
    if handler is not None:
        handler(args)
        return

    # Default: run the server (supports both `hbd serve ...` and `hbd ...`)
    config = load_config(args.configfile)

    # Apply CLI overrides; flags that were not given leave the config as is
    for key in ("foreground", "verbose"):
        if getattr(args, key):
            config[key] = True
    if args.pushsrv:
        config["pushsrv"] = args.pushsrv
    if args.debug > 0:
        config["debug"] = args.debug

    # Pass config_path for reloading support
    run_server(config, config_path=args.configfile)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -0,0 +1,320 @@
|
|||||||
|
"""Configuration loader and defaults for hbd (HeartBeat Daemon/Server)."""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
|
||||||
|
try:
|
||||||
|
import yaml
|
||||||
|
except Exception:
|
||||||
|
yaml = None
|
||||||
|
|
||||||
|
# Built-in server configuration. load_config() starts from a copy of this
# mapping and overrides it key by key with the values from the YAML file.
SERVER_DEFAULTS = {
    # Network settings
    "hb_port": 50003,  # Port to listen for heartbeats
    "hbd_port": 50004,  # HTTP API port
    "hbd_host": "",  # Bind address (empty = all interfaces)

    # Persistence
    "pickfile": os.path.join(os.path.expanduser("~"), ".hb.pick"),  # File to store host state between restarts
    "pidfile": os.path.join(os.path.expanduser("~"), ".hb.pid"),  # PID file for stop/restart/reload

    # Logging
    "logfile": os.path.join(os.path.expanduser("~"), ".hb.log"),
    # Notification channels
    "notification_channels": {},  # Named channels with type and credentials
    "base_url": "",  # Base URL for notification links (e.g. https://hbd.example.com)

    # Monitoring settings
    "interval": 20,  # Expected heartbeat interval (for server checks)
    "grace": 2,  # Grace multiplier (interval * grace = timeout)
    "threshold_renotify_interval": 3600,  # Seconds between threshold re-notifications

    # User management
    "users": {},  # username -> {full_name, avatar, password, admin, notification_channels}
    "default_owner": None,  # Username that owns hosts with no explicit owner

    # Host management
    "hosts": {},  # Unified host definitions
    "dyndnshosts": [],  # Hosts with dynamic DNS (legacy)
    "drophosts": [],  # Hosts to ignore
    "dyndomains": ["wrede.org"],  # NOTE(review): site-specific default domain — confirm it is intended

    # DNS updates
    "nsupdate_bin": "/usr/bin/nsupdate",

    # WebSocket settings
    "ws_port": 50005,
    "wss_port": None,
    "cert_path": "/usr/local/etc/ssl/",
    "wss_pem": "fullchain.pem",
    "wss_key": "privkey.pem",

    # Message journal configuration
    "journal_enabled": True,
    "journal_dir": "/var/log/heartbeat",
    "journal_file": "messages.journal",
    "journal_max_size": 100 * 1024 * 1024,  # 100MB
    "journal_max_backups": 10,

    # Runtime flags
    "foreground": False,
    "verbose": False,
    "debug": 0,

    # Plugin/threshold configs (for clients reporting to this server)
    "plugins": {},
    "thresholds": {},
}
|
||||||
|
|
||||||
|
# Default warning/critical limits, grouped under a single 'thresholds' key.
# The first three entries are keyed by monitor name and then by metric; for
# disk_monitor the limits are additionally keyed by partition path. The
# 'rtt' entry holds its limits directly.
# NOTE(review): units are not stated here (presumably percent for the
# monitors and milliseconds for rtt) — confirm against the consumers.
THRESHOLD_DEFAULTS = {
    'thresholds': {
        'cpu_monitor': {
            'cpu_percent': {
                'warning': 80.0,
                'critical': 90.0
            }
        },
        'memory_monitor': {
            'percent': {
                'warning': 85.0,
                'critical': 95.0
            }
        },
        'disk_monitor': {
            'partitions': {
                '/': {
                    'percent': {
                        'warning': 85.0,
                        'critical': 90.0
                    }
                }
            }
        },
        'rtt': {
            'warning': 200,
            'critical': 250.0,
            'count': 3  # Optional: number of consecutive breaches before alerting
        }
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
def load_config(path=None):
    """Load configuration from a YAML file and merge with server defaults.

    If YAML is not available or the file does not exist, defaults are
    returned. An empty YAML file is treated the same way. Keys from the
    file replace the default of the same name as a whole; unknown keys are
    kept to support plugin configs and future extensions.

    Args:
        path: Path to YAML config file (default: ~/.hb.yaml)

    Returns:
        Dictionary with configuration. It is an independent deep copy, so
        mutating it never alters ``SERVER_DEFAULTS``.

    Raises:
        ValueError: If the top level of the YAML document is not a mapping.
    """
    import copy

    # Deep copy: the defaults contain mutable dicts/lists ("hosts",
    # "users", "dyndomains", ...) that a shallow copy would share between
    # every returned config and the module-level defaults.
    cfg = copy.deepcopy(SERVER_DEFAULTS)
    if not path:
        # default path (~/.hb.yaml)
        path = os.path.join(os.path.expanduser("~"), ".hb.yaml")

    if not os.path.exists(path):
        return cfg
    if not yaml:
        # yaml not installed: do not attempt to parse; user must ensure defaults
        return cfg

    with open(path) as fh:
        data = yaml.safe_load(fh)

    if data is None:
        # Empty (or comment-only) file: nothing to merge
        return cfg
    if not isinstance(data, dict):
        raise ValueError(
            f"Config file {path} must contain a mapping at top level, "
            f"got {type(data).__name__}"
        )

    # Merge YAML data with defaults
    # Keep all keys from YAML to support plugin configs and future extensions
    cfg.update(data)
    return cfg
|
||||||
|
|
||||||
|
|
||||||
|
class ReloadableConfig:
    """Configuration holder whose contents can be replaced at runtime.

    Wraps a configuration dictionary and offers:
    - ``reload()`` to re-read the config file, serialized by an
      ``asyncio.Lock`` so that reloads never overlap
    - read-only dict-style access (``get``, ``[]``, ``in``, ``keys``,
      ``items``, ``values``)
    """

    def __init__(self, initial_config, config_path=None):
        """Store the initial configuration.

        Args:
            initial_config: Initial configuration dictionary
            config_path: Path to config file for reloading (optional)
        """
        self._config = initial_config
        self._config_path = config_path
        self._lock = asyncio.Lock()
        self._logger = logging.getLogger(__name__)

    async def reload(self, config_path=None):
        """Re-read the configuration file and swap it in.

        Args:
            config_path: Path to config file (uses stored path if not provided)

        Returns:
            New configuration dictionary

        Raises:
            ValueError: If neither ``config_path`` nor a stored path is set.
            Exception: Whatever loading raised; the existing config is kept.
        """
        path = config_path if config_path else self._config_path
        if not path:
            raise ValueError("No config path specified for reload")

        async with self._lock:
            try:
                fresh = load_config(path)
            except Exception as e:
                self._logger.error(f"Failed to reload config from {path}: {e}", exc_info=True)
                # self._config has not been touched, so the old config stays active
                raise
            try:
                self._config = fresh
                self._logger.info(f"Configuration reloaded from {path}")
            except Exception as e:
                self._logger.error(f"Failed to reload config from {path}: {e}", exc_info=True)
                raise
            return fresh

    @property
    def config(self):
        """The underlying config dict (for components that need full dict)."""
        return self._config

    def get(self, key, default=None):
        """Return the value for ``key`` or ``default`` (dict-compatible)."""
        current = self._config
        return current.get(key, default)

    def __getitem__(self, key):
        """Return the value for ``key`` via subscript (dict-compatible)."""
        current = self._config
        return current[key]

    def __contains__(self, key):
        """Report whether ``key`` is present (dict-compatible)."""
        current = self._config
        return key in current

    def keys(self):
        """Return the config keys view (dict-compatible)."""
        current = self._config
        return current.keys()

    def items(self):
        """Return the config items view (dict-compatible)."""
        current = self._config
        return current.items()

    def values(self):
        """Return the config values view (dict-compatible)."""
        current = self._config
        return current.values()
|
||||||
|
|
||||||
|
|
||||||
|
def get_watchhosts(config):
    """Collect the names of all hosts whose config entry has a truthy ``watch``.

    Entries of the ``hosts`` section that are not dicts are skipped; a
    ``hosts`` section that is not a dict yields an empty list.

    Returns:
        List of hostnames to watch
    """
    hosts_section = config.get("hosts", {})
    if not isinstance(hosts_section, dict):
        return []
    return [
        name
        for name, attrs in hosts_section.items()
        if isinstance(attrs, dict) and attrs.get("watch", False)
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def get_dyndnshosts(config):
    """Collect dynamic-DNS hostnames from both the new and the legacy format.

    Args:
        config: Configuration dictionary

    Returns:
        List of hostnames with dynamic DNS, without duplicates and in no
        particular order
    """
    collected = []

    # New format: hosts section with dyndns attribute
    if "hosts" in config:
        hosts_section = config["hosts"]
        if isinstance(hosts_section, dict):
            collected += [
                name
                for name, attrs in hosts_section.items()
                if isinstance(attrs, dict) and attrs.get("dyndns", False)
            ]

    # Legacy format: dyndnshosts list/set
    if "dyndnshosts" in config:
        legacy = config.get("dyndnshosts", [])
        if isinstance(legacy, (list, set)):
            collected.extend(legacy)

    # Going through a set removes duplicates
    unique = set(collected)
    return list(unique)
|
||||||
|
|
||||||
|
|
||||||
|
def get_host_config(config, hostname):
    """Look up the attribute dict of ``hostname`` in the ``hosts`` section.

    Returns:
        Dictionary with host attributes, or an empty dict when the section
        is not a dict, the host is not listed, or its entry is not a dict
    """
    hosts_section = config.get("hosts", {})
    if not isinstance(hosts_section, dict) or hostname not in hosts_section:
        return {}
    entry = hosts_section[hostname]
    if isinstance(entry, dict):
        return entry
    return {}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# User / host-access helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def get_default_owner(config) -> str | None:
    """Determine the fallback owner for hosts without an explicit owner.

    A truthy ``default_owner`` setting wins. Otherwise the first user in
    the ``users`` section whose attributes have a truthy ``admin`` is
    returned, or ``None`` when there is no such user.
    """
    owner = config.get("default_owner")
    if not owner:
        # Fall back to first admin user found in config
        users_section = config.get("users", {})
        candidates = users_section.items() if isinstance(users_section, dict) else ()
        owner = next(
            (
                name
                for name, attrs in candidates
                if isinstance(attrs, dict) and attrs.get("admin", False)
            ),
            None,
        )
    return owner
|
||||||
|
|
||||||
|
|
||||||
|
def get_host_access(config, hostname) -> dict:
    """Build the access description of *hostname*: owner, managers, monitors.

    The owner falls back to ``get_default_owner(config)`` when the host
    has none. A single string given for ``managers`` or ``monitors`` is
    treated as a one-element list.

    Returns:
        {
            "owner": str | None,
            "managers": list[str],
            "monitors": list[str],
        }
    """
    host_cfg = get_host_config(config, hostname)

    def _as_list(key):
        # Accept both a single name and a collection of names
        value = host_cfg.get(key, [])
        return [value] if isinstance(value, str) else list(value)

    owner = host_cfg.get("owner") or get_default_owner(config)
    managers = _as_list("managers")
    monitors = _as_list("monitors")
    return {"owner": owner, "managers": managers, "monitors": monitors}
|
||||||
@@ -0,0 +1,12 @@
|
|||||||
|
msgs = [] # in-memory list of recent messages for new websocket clients; also logged to file via notify.eventlog
|
||||||
|
class Data:
    """Small key/value holder that also keeps a reference to the config."""

    def __init__(self, config):
        # Start with an empty store; the config object is kept as given.
        self.data = {}
        self.config = config

    def update(self, new_data):
        """Merge *new_data* into the store (same semantics as dict.update)."""
        store = self.data
        store.update(new_data)

    def get(self, key, default=None):
        """Return the stored value for *key*, or *default* when absent."""
        store = self.data
        return store.get(key, default)
|
||||||
@@ -1,13 +1,23 @@
|
|||||||
"""DNS update helper and pure asyncio worker for heartbeat daemon."""
|
"""DNS update helper and pure asyncio worker for heartbeat daemon."""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
import subprocess
|
|
||||||
from subprocess import Popen, PIPE, STDOUT
|
from subprocess import Popen, PIPE, STDOUT
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
import asyncio
|
import asyncio
|
||||||
|
|
||||||
|
|
||||||
def create_nsupdate_payload(hostname: str, newip: str, dyndomain: str, dnsttl: str = "5") -> str:
|
def create_nsupdate_payload(
|
||||||
D = {"domain": dyndomain, "fqdn": f"{hostname}.dy.{dyndomain}", "dnsttl": dnsttl, "newip": newip, "ts": __import__("time").strftime("%Y-%m-%d.%H:%M:%S", __import__("time").gmtime())}
|
hostname: str, newip: str, dyndomain: str, dnsttl: str = "5"
|
||||||
|
) -> str:
|
||||||
|
D = {
|
||||||
|
"domain": dyndomain,
|
||||||
|
"fqdn": f"{hostname}.dy.{dyndomain}",
|
||||||
|
"dnsttl": dnsttl,
|
||||||
|
"newip": newip,
|
||||||
|
"ts": __import__("time").strftime(
|
||||||
|
"%Y-%m-%d.%H:%M:%S", __import__("time").gmtime()
|
||||||
|
),
|
||||||
|
}
|
||||||
if ":" in newip:
|
if ":" in newip:
|
||||||
nsup = (
|
nsup = (
|
||||||
"""update delete %(fqdn)s AAAA
|
"""update delete %(fqdn)s AAAA
|
||||||
@@ -17,7 +27,8 @@ update add %(fqdn)s %(dnsttl)s TXT "Created: %(ts)s"
|
|||||||
send
|
send
|
||||||
answer
|
answer
|
||||||
|
|
||||||
""" % D
|
"""
|
||||||
|
% D
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
nsup = (
|
nsup = (
|
||||||
@@ -28,12 +39,19 @@ update add %(fqdn)s %(dnsttl)s TXT "Created: %(ts)s"
|
|||||||
send
|
send
|
||||||
answer
|
answer
|
||||||
|
|
||||||
""" % D
|
"""
|
||||||
|
% D
|
||||||
)
|
)
|
||||||
return nsup
|
return nsup
|
||||||
|
|
||||||
|
|
||||||
def nsupdate(hostname: str, newip: str, dyndomain: str, nsupdate_bin: str = "/usr/local/bin/nsupdate", rndc_key: str = "/etc/dhcpc/rndc-key") -> Optional[str]:
|
def nsupdate(
|
||||||
|
hostname: str,
|
||||||
|
newip: str,
|
||||||
|
dyndomain: str,
|
||||||
|
nsupdate_bin: str = "/usr/local/bin/nsupdate",
|
||||||
|
rndc_key: str = "/etc/dhcpc/rndc-key",
|
||||||
|
) -> Optional[str]:
|
||||||
"""Perform DNS update via nsupdate command.
|
"""Perform DNS update via nsupdate command.
|
||||||
|
|
||||||
Returns None on success, else returns combined stdout/stderr as a string.
|
Returns None on success, else returns combined stdout/stderr as a string.
|
||||||
@@ -54,7 +72,14 @@ def nsupdate(hostname: str, newip: str, dyndomain: str, nsupdate_bin: str = "/us
|
|||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
async def dns_update_worker(hbdclass, cfg: dict, async_queue=None, log: Optional[callable] = None, email: Optional[callable] = None, loop: Optional[asyncio.AbstractEventLoop] = None):
|
async def dns_update_worker(
|
||||||
|
hbdclass,
|
||||||
|
cfg: dict,
|
||||||
|
async_queue=None,
|
||||||
|
log: Optional[callable] = None,
|
||||||
|
pushmsg: Optional[callable] = None,
|
||||||
|
loop: Optional[asyncio.AbstractEventLoop] = None,
|
||||||
|
):
|
||||||
"""Pure async DNS worker that processes updates from asyncio.Queue.
|
"""Pure async DNS worker that processes updates from asyncio.Queue.
|
||||||
|
|
||||||
Exits when it receives a None sentinel.
|
Exits when it receives a None sentinel.
|
||||||
@@ -66,7 +91,9 @@ async def dns_update_worker(hbdclass, cfg: dict, async_queue=None, log: Optional
|
|||||||
if not dnsq:
|
if not dnsq:
|
||||||
if log:
|
if log:
|
||||||
try:
|
try:
|
||||||
await loop.run_in_executor(None, log, None, "dns_update_worker: no queue available")
|
await loop.run_in_executor(
|
||||||
|
None, log, None, "dns_update_worker: no queue available"
|
||||||
|
)
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
return
|
return
|
||||||
@@ -77,7 +104,9 @@ async def dns_update_worker(hbdclass, cfg: dict, async_queue=None, log: Optional
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
if log:
|
if log:
|
||||||
try:
|
try:
|
||||||
await loop.run_in_executor(None, log, None, f"dns_update_worker: error getting item: {e}")
|
await loop.run_in_executor(
|
||||||
|
None, log, None, f"dns_update_worker: error getting item: {e}"
|
||||||
|
)
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
break
|
break
|
||||||
@@ -96,14 +125,18 @@ async def dns_update_worker(hbdclass, cfg: dict, async_queue=None, log: Optional
|
|||||||
|
|
||||||
m = f"changed address to {addr}"
|
m = f"changed address to {addr}"
|
||||||
for dyndomain in cfg.get("dyndomains", []):
|
for dyndomain in cfg.get("dyndomains", []):
|
||||||
err = await loop.run_in_executor(None, nsupdate, name, addr, dyndomain, cfg.get("nsupdate_bin", "/usr/local/bin/nsupdate"), cfg.get("rndc_key", "/etc/dhcpc/rndc-key"))
|
err = await loop.run_in_executor(
|
||||||
|
None,
|
||||||
|
nsupdate,
|
||||||
|
name,
|
||||||
|
addr,
|
||||||
|
dyndomain,
|
||||||
|
cfg.get("nsupdate_bin", "/usr/local/bin/nsupdate"),
|
||||||
|
cfg.get("rndc_key", "/etc/dhcpc/rndc-key"),
|
||||||
|
)
|
||||||
if err:
|
if err:
|
||||||
m += f", DNS update failed: {err}"
|
m += f", DNS update failed: {err}"
|
||||||
if email:
|
logger.error("DNS update failed for %s: %s", name, err)
|
||||||
try:
|
|
||||||
await loop.run_in_executor(None, email, "error: nsupdate failed", f"{name}.dy.{dyndomain}: {m}")
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
else:
|
else:
|
||||||
m += ", DNS updated."
|
m += ", DNS updated."
|
||||||
|
|
||||||
@@ -125,7 +158,12 @@ async def dns_update_worker(hbdclass, cfg: dict, async_queue=None, log: Optional
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
def start_dns_worker(hbdclass, cfg: dict, log: Optional[callable] = None, email: Optional[callable] = None, loop: Optional[asyncio.AbstractEventLoop] = None):
|
def start_dns_worker(
|
||||||
|
hbdclass,
|
||||||
|
cfg: dict,
|
||||||
|
log: Optional[callable] = None,
|
||||||
|
loop: Optional[asyncio.AbstractEventLoop] = None,
|
||||||
|
):
|
||||||
"""Start the async DNS worker and return the Task.
|
"""Start the async DNS worker and return the Task.
|
||||||
|
|
||||||
Replaces Host.dnsQ with an asyncio.Queue wrapped in a thread-safe bridge
|
Replaces Host.dnsQ with an asyncio.Queue wrapped in a thread-safe bridge
|
||||||
@@ -139,6 +177,7 @@ def start_dns_worker(hbdclass, cfg: dict, log: Optional[callable] = None, email:
|
|||||||
|
|
||||||
class _QueueBridge:
|
class _QueueBridge:
|
||||||
"""Thread-safe wrapper around asyncio.Queue for synchronous callers."""
|
"""Thread-safe wrapper around asyncio.Queue for synchronous callers."""
|
||||||
|
|
||||||
def __init__(self, loop, aq):
|
def __init__(self, loop, aq):
|
||||||
self._loop = loop
|
self._loop = loop
|
||||||
self._aq = aq
|
self._aq = aq
|
||||||
@@ -167,5 +206,9 @@ def start_dns_worker(hbdclass, cfg: dict, log: Optional[callable] = None, email:
|
|||||||
bridge = _QueueBridge(loop, async_q)
|
bridge = _QueueBridge(loop, async_q)
|
||||||
hbdclass.Host.dnsQ = bridge
|
hbdclass.Host.dnsQ = bridge
|
||||||
|
|
||||||
task = loop.create_task(dns_update_worker(hbdclass, cfg, async_queue=async_q, log=log, email=email, loop=loop))
|
task = loop.create_task(
|
||||||
|
dns_update_worker(
|
||||||
|
hbdclass, cfg, async_queue=async_q, log=log, loop=loop
|
||||||
|
)
|
||||||
|
)
|
||||||
return task
|
return task
|
||||||
@@ -0,0 +1,637 @@
|
|||||||
|
"""
|
||||||
|
host and connection class shared between hbd and
|
||||||
|
the website's heartbeat.py
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
import time
|
||||||
|
import json
|
||||||
|
import copy
|
||||||
|
import queue
|
||||||
|
|
||||||
|
num = 0
|
||||||
|
|
||||||
|
MAXRTTS = 10
|
||||||
|
|
||||||
|
DEBUG = 2
|
||||||
|
|
||||||
|
|
||||||
|
def log(host, m):
    """Print a ``class log:`` line for *host* and message *m* when DEBUG is truthy."""
    if not DEBUG:
        return
    print("class log: %s %s" % (host, m))
|
||||||
|
|
||||||
|
|
||||||
|
class Connection:
    """One address-family connection (IPv4 or IPv6) of a monitored host.

    Tracks the current address, recent round-trip times, the reachability
    state and the time of the last state change.  The class-level ``htab``
    maps addresses to host names.
    """

    # map of addrs to names
    htab = {}

    UNKNOWN = "unknown"
    UP = "up"
    DOWN = "down"
    OVERDUE = "overdue"

    # Attributes holding timer state; they are excluded from pickles and
    # from JSON output.
    _TIMER_FIELDS = ("overdue_timer", "overdue_callback", "timeout_duration")

    def __init__(self, host, cid, addr, afam):
        """Create a connection for *host* (may be None) at address *addr*.

        A leading ``::ffff:`` (IPv4-mapped IPv6 prefix) is stripped from
        *addr*.  When *host* is truthy the address is registered in ``htab``
        and, for dynamic-DNS hosts, a DNS update is queued on ``Host.dnsQ``.
        """
        self.host = host
        self.cid = cid
        if addr[0:7] == "::ffff:":
            addr = addr[7:]
        self.addr = addr
        self.afam = afam
        self.rtts = [0]
        self.lastbeat = time.time()
        self.statetime = self.lastbeat
        self.deltastatetime = "computed"
        self.state = Connection.UNKNOWN

        # Timer-based reachability monitoring
        self.overdue_timer = None
        self.overdue_callback = None
        self.timeout_duration = None

        if host:
            Connection.htab[addr] = self.host.name
            if self.host.isDynDns():
                log(self.host.name, "dns update %s" % self.addr)
                Host.dnsQ.put((self.host.name, self.addr))

    def __getstate__(self):
        """Prepare Connection for pickling by excluding non-serializable timer objects."""
        state = self.__dict__.copy()
        # Timer objects can't be pickled; the fields are reset to None in the
        # pickled copy.
        for field in Connection._TIMER_FIELDS:
            state[field] = None
        return state

    def __setstate__(self, state):
        """Restore Connection from pickle, reinitializing timer fields."""
        self.__dict__.update(state)
        # Pickles that lack the timer fields get them set to None here.
        for field in Connection._TIMER_FIELDS:
            if not hasattr(self, field):
                setattr(self, field, None)

    def registerDns(self):
        """Queue a DNS update for this connection's host name and address."""
        Host.dnsQ.put((self.host.name, self.addr))

    def clearstate(self):
        """Return a state dict with every display field set to ''."""
        return {
            "addr": "",
            "rtt": "",
            "lastbeat": "",
            "state": "",
            "statetime": "",
            "deltastatetime": "",
            "rttstate": "",
        }

    @staticmethod
    def _format_delta(delta):
        """Render *delta* seconds as a short human-readable duration."""
        if delta > 86400:
            return "%0.1f days" % (delta / 86400.0)
        if delta > 3600:
            # Equivalent to strftime("%k:%M hrs") but without relying on
            # "%k", which is a glibc extension and not available everywhere.
            tm = time.gmtime(delta)
            return "%2d:%02d hrs" % (tm.tm_hour, tm.tm_min)
        if delta > 60:
            return time.strftime("%M:%S mins", time.gmtime(delta))
        return "%i secs" % (delta)

    def statedict(self, Null=False):
        """Return the display fields of this connection as a dict of strings.

        With ``Null=True`` an all-empty dict (see ``clearstate``) is returned.
        The same happens for a connection that is still in state UNKNOWN and
        whose last beat is more than ten days old.
        """
        d = self.clearstate()
        now = time.time()
        if not Null:
            d["addr"] = self.addr
            if self.rtts[-1]:
                d["rtt"] = "%0.1f" % self.rtts[-1]
            elif self.state == Connection.UNKNOWN:
                d["rtt"] = ""
            else:
                d["rtt"] = "?"
            d["lastbeat"] = self.lastbeat
            if self.state == Connection.OVERDUE:
                d["state"] = "<b>%s</b>" % self.state
            else:
                d["state"] = self.state
            if self.state == Connection.UP:
                d["rttstate"] = d["rtt"]
            elif self.state == Connection.OVERDUE:
                d["rttstate"] = ""
            else:
                d["rttstate"] = d["state"]
            d["statetime"] = time.strftime(
                "%Y-%m-%d %H:%M:%S", time.localtime(self.statetime)
            )
            if self.state == Connection.UNKNOWN:
                d["deltastatetime"] = ""
            else:
                d["deltastatetime"] = self._format_delta(now - self.statetime)

        if self.state == Connection.UNKNOWN and now - self.lastbeat > 86400 * 10:
            d = self.clearstate()

        return d

    def headerdict(self, afam):
        """Return the column header labels, keyed like ``statedict``."""
        return {
            "addr": "%s Addr" % afam,
            "rtt": "Latency",
            "lastbeat": "Last Contact",
            "state": "State",
            "statetime": "Last State",
            "rttstate": "Reach",
            "deltastatetime": "Last State",
        }

    def jsons(self):
        """Serialize connection to JSON, excluding non-serializable timer objects."""
        data = {}
        for key, value in self.__dict__.items():
            # Skip timer-related fields that can't be serialized
            if key in Connection._TIMER_FIELDS:
                continue
            # Handle host backpointer by converting to name
            if key == "host":
                data[key] = value.name if value else None
            else:
                data[key] = value
        return json.dumps(data)

    def newstate(self, state, now, when=0):
        """Set a new state; return the number of seconds spent in the previous one.

        The state-change time is recorded as ``now - when``.
        """
        self.state = state
        changed_at = now - when
        previous_duration = changed_at - self.statetime
        self.statetime = changed_at
        return previous_duration

    def getstate(self):
        """Return the current state string."""
        return self.state

    def newaddr(self, addr, rtt, now):
        """Record a heartbeat received at *now* from *addr*.

        Appends *rtt* (when not None) to the RTT history, trimmed to MAXRTTS
        entries.  Returns None when the address is unchanged, otherwise a
        "changed from ... to ..." message; on a change ``htab`` is updated and
        a DNS update is queued for dynamic-DNS hosts.
        """
        self.lastbeat = now
        if rtt is not None:
            self.rtts.append(rtt)
            if len(self.rtts) > MAXRTTS:
                del self.rtts[0]

        if self.addr == addr:
            return None

        r = "changed from %s to %s" % (self.addr, addr)
        # The old address may already be gone from the table; that is fine.
        Connection.htab.pop(self.addr, None)
        self.addr = addr
        Connection.htab[addr] = self.host.name
        if self.host.isDynDns():
            Host.dnsQ.put((self.host.name, self.addr))
        return r

    def reset_overdue_timer(self, timeout_seconds, callback):
        """Reset the overdue timer for this connection.

        Cancels any existing timer and schedules a new one that awaits
        ``callback(self)`` after *timeout_seconds*.

        Args:
            timeout_seconds: Seconds before the callback fires
            callback: Async function to call when timer expires
        """
        import asyncio

        # Cancel existing timer if any
        if self.overdue_timer and not self.overdue_timer.cancelled():
            self.overdue_timer.cancel()

        # Store parameters for later reference
        self.timeout_duration = timeout_seconds
        self.overdue_callback = callback

        async def timer_expired():
            await callback(self)

        try:
            loop = asyncio.get_event_loop()
            # call_later needs a plain callable, so the coroutine is wrapped
            # in a task when the timer fires.
            self.overdue_timer = loop.call_later(
                timeout_seconds, lambda: asyncio.create_task(timer_expired())
            )
        except RuntimeError:
            # No event loop available; no timer is scheduled.
            pass

    def cancel_overdue_timer(self):
        """Cancel the overdue timer if it exists and clear all timer references."""
        if self.overdue_timer:
            try:
                if not self.overdue_timer.cancelled():
                    self.overdue_timer.cancel()
            except Exception:
                # Best effort: the references are dropped below regardless.
                pass
        # Clear all timer-related references
        self.overdue_timer = None
        self.overdue_callback = None
        self.timeout_duration = None

    def get_avg_rtt(self):
        """Get average RTT from recent samples (samples <= 0 are ignored)."""
        valid_rtts = [r for r in self.rtts if r > 0]
        if valid_rtts:
            return sum(valid_rtts) / len(valid_rtts)
        return 0

    def get_current_rtt(self):
        """Get most recent RTT value, or 0 when there are no samples."""
        return self.rtts[-1] if self.rtts else 0

    def check_rtt_threshold(self, warning_threshold=None, critical_threshold=None):
        """Check if the most recent RTT exceeds thresholds.

        Args:
            warning_threshold: RTT above which the level is 'WARNING'
            critical_threshold: RTT above which the level is 'CRITICAL'

        Returns:
            Tuple of (level, rtt_value) where level is None, 'WARNING', or 'CRITICAL'
        """
        rtt = self.get_current_rtt()
        if rtt <= 0:
            return (None, rtt)

        if critical_threshold and rtt > critical_threshold:
            return ("CRITICAL", rtt)
        if warning_threshold and rtt > warning_threshold:
            return ("WARNING", rtt)

        return (None, rtt)
|
||||||
|
|
||||||
|
|
||||||
|
#
|
||||||
|
class Host:
    """A monitored host with its per-address-family connections.

    Besides connection state a host carries plugin samples, alert states and
    the user-access lists (owner / managers / monitors).  The class-level
    ``hosts`` table maps names to instances and ``dnsQ`` receives
    ``(name, addr)`` tuples for DNS updates.
    """

    # Table of Hosts
    hosts = {}
    dnsQ = queue.Queue()

    def __init__(self, name):
        """Create a host; a truthy *name* registers it in ``Host.hosts``."""
        global num
        self.name = name
        if name:
            num += 1
            Host.hosts[name] = self
        self.num = num
        self.dyn = False
        self.watched = False
        self.upcount = 0
        self.interval = 0
        self.doesack = -1
        self.cmds = []
        self.connections = {}
        # Plugin data storage: {plugin_name: [(timestamp, data), ...]}
        self.plugin_data = {}
        self.plugin_retention = 100  # Keep last N samples per plugin
        # Alert state tracking: {metric_path: AlertState}
        self.alert_states = {}
        # User access control
        self.owner: str | None = None  # username of owner
        self.managers: list = []  # usernames with manager role
        self.monitors: list = []  # usernames with monitor role

    def _alert_counts(self):
        """Count WARNING/CRITICAL alerts, split by acknowledged status.

        Returns a dict with the keys ``alert_warning_unacked``,
        ``alert_warning_acked``, ``alert_critical_unacked`` and
        ``alert_critical_acked`` (in that order).
        """
        counts = {
            "alert_warning_unacked": 0,
            "alert_warning_acked": 0,
            "alert_critical_unacked": 0,
            "alert_critical_acked": 0,
        }
        states = getattr(self, "alert_states", None)
        if not states:
            return counts

        # Import AlertLevel here to avoid circular imports
        from .threshold import AlertLevel

        for alert_state in states.values():
            if alert_state.level == AlertLevel.WARNING:
                severity = "warning"
            elif alert_state.level == AlertLevel.CRITICAL:
                severity = "critical"
            else:
                continue
            status = "acked" if alert_state.acknowledged else "unacked"
            counts["alert_%s_%s" % (severity, status)] += 1
        return counts

    @staticmethod
    def _connection_state(conn):
        """Copy a connection's attributes into a plain dict.

        Timer fields are left out and the host back-pointer is replaced by
        the host name.
        """
        state = {}
        for key, value in conn.__dict__.items():
            # Skip timer-related fields that can't be serialized
            if key in ["overdue_timer", "overdue_callback", "timeout_duration"]:
                continue
            # Handle host backpointer by converting to name
            if key == "host":
                state[key] = value.name if value else None
                continue
            try:
                state[key] = copy.deepcopy(value)
            except Exception:
                # If deepcopy fails, fall back to sharing the value
                state[key] = value
        return state

    def statedict(self):
        """Return the flat display dict used to render one host table row."""
        d = {}
        d["name"] = self.name
        if self.dyn:
            d["name"] += "*"
        if self.watched:
            d["name"] = "<b>%s</b>" % d["name"]
        d["dyn"] = str(self.dyn)
        d["num"] = self.num

        # Add alert counts (split by acknowledged status)
        d.update(self._alert_counts())

        for c in ["IPv4", "IPv6"]:
            if c in self.connections:
                cs = self.connections[c].statedict()
            else:
                # No connection for this family: use the all-empty fields.
                cs = ubConnection.statedict(True)
            for csv in cs:
                d["%s.%s" % (c, csv)] = cs[csv]

        return d

    def headerdict(self):
        """Return the column header labels, keyed like ``statedict``."""
        d = {}
        d["name"] = "Name"
        d["dyn"] = "Dyn"
        d["num"] = "??"
        for c in ["IPv4", "IPv6"]:
            cs = ubConnection.headerdict(c)
            for csv in cs:
                d["%s.%s" % (c, csv)] = cs[csv]
        return d

    def registerDns(self):
        """Queue a DNS update for every connection of this host."""
        for af in self.connections:
            self.connections[af].registerDns()

    def stateinfo(self):
        """Return this host's state as a dict (serialized by ``jsons``).

        Contains the instance attributes except ``alert_states`` and
        ``plugin_data``; ``connections`` becomes a list of per-connection
        dicts (IPv4 first, then IPv6).  Alert counts, the access lists and
        the ``hbc_version`` from the latest ``os_info`` plugin sample are
        added.
        """
        ddict = {}
        for d in self.__dict__:
            if d in ["alert_states", "plugin_data"]:
                continue
            if d == "connections":
                ddict[d] = [
                    self._connection_state(self.connections[c])
                    for c in ["IPv4", "IPv6"]
                    if c in self.connections
                ]
            else:
                ddict[d] = self.__dict__[d]

        # Add alert counts (computed from alert_states)
        ddict.update(self._alert_counts())

        # User access
        ddict["owner"] = getattr(self, "owner", None)
        ddict["managers"] = list(getattr(self, "managers", []))
        ddict["monitors"] = list(getattr(self, "monitors", []))

        # hbc version from latest os_info plugin data
        hbc_version = None
        latest_os = self.get_latest_plugin_data("os_info")
        if latest_os:
            _, os_data = latest_os
            # Only a dict payload can carry the version; anything else is
            # reported as "no version" instead of raising.
            if isinstance(os_data, dict):
                hbc_version = os_data.get("hbc_version")
        ddict["hbc_version"] = hbc_version

        return ddict

    def jsons(self):
        """Return ``stateinfo()`` encoded as a JSON string."""
        return json.dumps(self.stateinfo())

    def isDynDns(self):
        """Return the ``dyn`` flag of this host."""
        return self.dyn

    def isIPv4(self, addr):
        """Return True when *addr* (a string, or a tuple whose first item is
        the address string) contains a '.' after its first character."""
        if isinstance(addr, tuple):
            return addr[0].find(".") > 0
        else:
            return addr.find(".") > 0

    def conndata(self, cid, addr, rtt, now):
        """Record a heartbeat from *addr* and return ``(connection, change)``.

        The connection for the address family is created on first use;
        ``change`` is the result of ``Connection.newaddr``.
        """
        if addr[0:7] == "::ffff:":
            addr = addr[7:]
        afam = "IPv4" if self.isIPv4(addr) else "IPv6"

        if afam not in self.connections:
            self.connections[afam] = Connection(self, cid, addr, afam)

        conn = self.connections[afam]
        res = conn.newaddr(addr, rtt, now)
        return conn, res

    # called when reloading class from pickle, add new fields here
    def fixup(self):
        """Bring an instance loaded from an older pickle up to date."""
        for c in ["IPv4", "IPv6"]:
            if c in self.connections:
                addr = self.connections[c].addr
                if addr[0:7] == "::ffff:":
                    addr = addr[7:]
                self.connections[c].addr = addr

        # Add plugin_data if missing (for backward compatibility)
        if not hasattr(self, "plugin_data"):
            self.plugin_data = {}
        if not hasattr(self, "plugin_retention"):
            self.plugin_retention = 100
        if not hasattr(self, "alert_states"):
            self.alert_states = {}
        # User access fields (added in user-management feature)
        if not hasattr(self, "owner"):
            self.owner = None
        if not hasattr(self, "managers"):
            self.managers = []
        if not hasattr(self, "monitors"):
            self.monitors = []

    def add_plugin_data(self, plugin_name, data, timestamp=None):
        """Store plugin data with timestamp.

        Args:
            plugin_name: Name of the plugin (e.g., "cpu_monitor")
            data: Dict of plugin data
            timestamp: Optional timestamp (default: current time)
        """
        if timestamp is None:
            timestamp = time.time()

        samples = self.plugin_data.setdefault(plugin_name, [])
        samples.append((timestamp, data))

        # Enforce retention limit (keep last N samples)
        if len(samples) > self.plugin_retention:
            self.plugin_data[plugin_name] = samples[-self.plugin_retention:]

    def get_plugin_data(self, plugin_name, limit=None):
        """Retrieve plugin data for a specific plugin.

        Args:
            plugin_name: Name of the plugin
            limit: Optional limit on number of recent samples to return

        Returns:
            List of (timestamp, data) tuples, most recent last
        """
        data = self.plugin_data.get(plugin_name, [])
        if limit and len(data) > limit:
            return data[-limit:]
        return data

    def get_latest_plugin_data(self, plugin_name):
        """Get the most recent plugin data for a plugin.

        Args:
            plugin_name: Name of the plugin

        Returns:
            (timestamp, data) tuple or None if no data
        """
        data = self.plugin_data.get(plugin_name, [])
        return data[-1] if data else None

    def get_all_plugin_data(self):
        """Get all plugin data for this host.

        Returns:
            Dict of {plugin_name: [(timestamp, data), ...]}
        """
        return self.plugin_data

    # ------------------------------------------------------------------
    # User-role helpers
    # ------------------------------------------------------------------

    def apply_access(self, owner, managers, monitors):
        """Set owner/managers/monitors on this host (called from config load)."""
        self.owner = owner
        self.managers = list(managers)
        self.monitors = list(monitors)

    def is_owner(self, username: str) -> bool:
        """Return True when *username* is the owner."""
        return self.owner == username

    def is_manager(self, username: str) -> bool:
        """Return True when *username* is a manager or the owner."""
        return username in self.managers or self.is_owner(username)

    def is_monitor(self, username: str) -> bool:
        """Return True when *username* is a monitor, a manager or the owner."""
        return username in self.monitors or self.is_manager(username)

    def access_dict(self) -> dict:
        """Return owner and copies of the managers/monitors lists."""
        return {
            "owner": self.owner,
            "managers": list(self.managers),
            "monitors": list(self.monitors),
        }

    # Columns of the host table; a tuple entry is (field, cell attributes).
    hostfields_long = [
        "name",
        "IPv4.addr",
        "IPv4.state",
        ("IPv4.rtt", 'style="text-align: right;"'),
        ("IPv4.statetime", 'style="text-align: right;"'),
        "IPv6.addr",
        "IPv6.state",
        ("IPv6.rtt", 'style="text-align: right;"'),
        ("IPv6.statetime", 'style="text-align: right;"'),
    ]

    hostfields_short = [
        "name",
        ("IPv4.rttstate", 'style="text-align: right;"'),
        ("IPv4.deltastatetime", 'style="text-align: right;"'),
        ("IPv6.rttstate", 'style="text-align: right;"'),
        ("IPv6.deltastatetime", 'style="text-align: right;"'),
    ]

    def gene(self, tag, v, attrib=None):
        """Wrap *v* in an HTML element *tag*, with optional attribute text.

        NOTE(review): *v* is inserted without HTML escaping; the state dicts
        deliberately contain ``<b>`` markup.
        """
        if attrib:
            a = " %s" % attrib
        else:
            a = ""
        return "<%s%s>%s</%s>" % (tag, a, v, tag)

    def htmltable(self, tag, hd, short):
        """Render one ``<tr>`` from dict *hd*, one *tag* cell per column of
        the short or long column list."""
        hostfields = Host.hostfields_short if short else Host.hostfields_long
        h = []
        for f in hostfields:
            if isinstance(f, tuple):
                h.append(self.gene(tag, hd[f[0]], f[1]))
            else:
                h.append(self.gene(tag, hd[f]))
        return self.gene("tr", "\n".join(h))

    def buildhosttable(self, short=False):
        """Return the HTML lines of the table of all hosts, sorted by name."""
        if DEBUG > 1:
            print("DBG buildhosttable: start")
        res = []
        res.append('<table id="ntable" class="sortable">')
        res.append(self.htmltable("th", self.headerdict(), short))
        for h in sorted(Host.hosts.keys()):
            res.append(self.htmltable("td", Host.hosts[h].statedict(), short))
        res.append("</table>")
        if DEBUG > 1:
            print("DBG buildhosttable: %s" % res)
        return res

    def buildmsgtable(self, msgs):
        """Return HTML lines showing the tail of the event log *msgs*.

        The number of messages shown is ``40 - number of hosts``, but at
        least 3.
        """
        res = []
        le = max(40 - len(Host.hosts), 3)
        res.append("<h4>Log of Events</h4>")
        for m in msgs[-le:]:
            res.append("%s<BR>" % m)
        return res
|
||||||
|
|
||||||
|
|
||||||
|
# create fake "unbound objects", remove in Python 3.0
|
||||||
|
ubHost = Host(None)
|
||||||
|
ubConnection = Connection(None, "", "", "")
|
||||||
@@ -0,0 +1,928 @@
|
|||||||
|
"""HTTP server implementation using aiohttp and jinja2."""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import datetime
|
||||||
|
import json
|
||||||
|
import platform
|
||||||
|
import socket
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import urllib.parse
|
||||||
|
import os
|
||||||
|
import logging
|
||||||
|
from aiohttp import web
|
||||||
|
import jinja2
|
||||||
|
from . import data
|
||||||
|
from . import notify as notify_mod
|
||||||
|
from . import settings as settings_mod
|
||||||
|
from . import users as users_mod
|
||||||
|
from . import ws as ws_mod
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
eventlog = notify_mod.eventlog
|
||||||
|
|
||||||
|
def _render_template(html_str: str, **context) -> str:
    """Compile *html_str* as a jinja2 template and render it with *context*."""
    return jinja2.Template(html_str).render(**context)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Auth helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
SESSION_COOKIE = "hbd_session"
|
||||||
|
|
||||||
|
|
||||||
|
def _get_token(request) -> str:
|
||||||
|
"""Extract session token from Bearer header, X-Auth-Token header, or cookie."""
|
||||||
|
auth = request.headers.get("Authorization", "")
|
||||||
|
if auth.lower().startswith("bearer "):
|
||||||
|
return auth[7:].strip()
|
||||||
|
header_token = request.headers.get("X-Auth-Token", "").strip()
|
||||||
|
if header_token:
|
||||||
|
return header_token
|
||||||
|
return request.cookies.get(SESSION_COOKIE, "")
|
||||||
|
|
||||||
|
|
||||||
|
def _current_user(request):
|
||||||
|
"""Return the authenticated User, or None when auth is not enabled."""
|
||||||
|
if not users_mod.users_enabled():
|
||||||
|
return None # unauthenticated mode — all access allowed
|
||||||
|
return users_mod.get_session_user(_get_token(request))
|
||||||
|
|
||||||
|
|
||||||
|
def _require_auth(request):
|
||||||
|
"""Return (user, None) or (None, error Response)."""
|
||||||
|
if not users_mod.users_enabled():
|
||||||
|
return None, None
|
||||||
|
user = users_mod.get_session_user(_get_token(request))
|
||||||
|
if user is None:
|
||||||
|
return None, web.json_response({"error": "Unauthorized"}, status=401)
|
||||||
|
return user, None
|
||||||
|
|
||||||
|
|
||||||
|
def _require_auth_redirect(request):
|
||||||
|
"""Like _require_auth but returns a redirect to /login for browser requests."""
|
||||||
|
if not users_mod.users_enabled():
|
||||||
|
return None, None
|
||||||
|
user = users_mod.get_session_user(_get_token(request))
|
||||||
|
if user is None:
|
||||||
|
raise web.HTTPFound("/login")
|
||||||
|
return user, None
|
||||||
|
|
||||||
|
|
||||||
|
def _can_view_host(user, host) -> bool:
|
||||||
|
"""Return True if *user* may see *host* (monitor or higher, or no auth)."""
|
||||||
|
if user is None:
|
||||||
|
return True
|
||||||
|
if user.admin:
|
||||||
|
return True
|
||||||
|
return host.is_monitor(user.username)
|
||||||
|
|
||||||
|
|
||||||
|
def _can_operate_host(user, host) -> bool:
|
||||||
|
"""Manager-level: queue commands, DNS, upgrade."""
|
||||||
|
if user is None:
|
||||||
|
return True
|
||||||
|
if user.admin:
|
||||||
|
return True
|
||||||
|
return host.is_manager(user.username)
|
||||||
|
|
||||||
|
|
||||||
|
def _can_own_host(user, host) -> bool:
|
||||||
|
"""Owner-level: drop host, transfer ownership."""
|
||||||
|
if user is None:
|
||||||
|
return True
|
||||||
|
if user.admin:
|
||||||
|
return True
|
||||||
|
return host.is_owner(user.username)
|
||||||
|
|
||||||
|
|
||||||
|
async def start(
|
||||||
|
host: str,
|
||||||
|
port: int,
|
||||||
|
config,
|
||||||
|
hbdclass,
|
||||||
|
tcss=None,
|
||||||
|
verbose=False,
|
||||||
|
get_now=None,
|
||||||
|
VER="",
|
||||||
|
threshold_checker=None,
|
||||||
|
):
|
||||||
|
"""Start an aiohttp web server and block until cancelled.
|
||||||
|
|
||||||
|
This function is intended to be awaited inside the main asyncio event loop.
|
||||||
|
"""
|
||||||
|
get_now = get_now or (lambda: time.time())
|
||||||
|
_start_epoch = time.time()
|
||||||
|
|
||||||
|
async def old_index(request):
    """Serve the legacy plain-HTML status page (host table plus event log).

    When auth is enabled and the request has no valid session, the helper
    raises a redirect to /login.
    """
    _require_auth_redirect(request)

    page = [
        '<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">',
        "<html>",
        "<head>",
        "<title>Heartbeat</title>",
    ]
    # Optional caller-supplied markup placed inside <head>.
    if tcss:
        page.append(tcss)
    page.extend(
        [
            "</head>",
            '<body BGCOLOR = "#FFFFFF" LINK = "#008000" VLINK = "#008000">',
            f"<H2>Heartbeat status {VER}</h2>",
        ]
    )
    page += hbdclass.ubHost.buildhosttable()
    page += hbdclass.ubHost.buildmsgtable(data.msgs)

    clock = time.strftime("%H:%M:%S", time.localtime(get_now()))
    zone = config.get("tz", "CET-1CDT")
    page.append("<p> %s (%s)</p>" % (clock, zone))
    page.append("</body></html>")

    return web.Response(text="\n".join(page), content_type="text/html")
|
||||||
|
|
||||||
|
async def api_hosts(request):
    """Return, as a JSON array, every host the caller is allowed to view."""
    user, err = _require_auth(request)
    # Explicit None check: the denial is a response object, and aiohttp
    # responses implement the mapping protocol, so their truthiness must not
    # decide whether the request is rejected.
    if err is not None:
        return err
    visible = [h for h in hbdclass.Host.hosts.values() if _can_view_host(user, h)]
    # Each host serialises itself to a JSON string; stitch them into one
    # array and parse it so json_response() can re-serialise the result.
    payload = "[" + ",".join(h.jsons() for h in visible) + "]"
    return web.json_response(json.loads(payload))
|
||||||
|
|
||||||
|
async def api_messages(request):
    """Return the 30 most recent event-log messages as a JSON array.

    Like the other API handlers, this requires a valid session when user
    management is enabled; previously the event log was served to anyone.
    """
    _user, err = _require_auth(request)
    # Explicit None check: the truthiness of a response object must not
    # decide whether the request is rejected.
    if err is not None:
        return err
    return web.json_response(data.msgs[-30:])
|
||||||
|
|
||||||
|
async def cmd(request):
    """Queue a command for a host.

    Query arguments: ``h`` (host name) and ``c`` (command, URL-unquoted
    before queuing). Responds 400 for missing arguments or an unknown host
    and 403 when the caller lacks manager-level access.
    """
    user, err = _require_auth(request)
    # Explicit None check: the truthiness of a response object must not
    # decide whether the request is rejected.
    if err is not None:
        return err

    query = request.rel_url.query
    uname = query.get("h")
    ucmd = query.get("c")
    if not ucmd or not uname:
        return web.Response(status=400, text="need h= and c= arguments")
    if uname not in hbdclass.Host.hosts:
        return web.Response(status=400, text=f"h={uname} not found")

    host = hbdclass.Host.hosts[uname]
    if not _can_operate_host(user, host):
        return web.json_response({"error": "Forbidden"}, status=403)

    host.cmds.append(("CMD", {"cmd": urllib.parse.unquote(ucmd)}))
    return web.Response(text=f"cmd {uname} queued")
|
||||||
|
|
||||||
|
async def drop(request):
    """Remove a host from the live host table.

    Query argument: ``h`` (host name). Responds 400 for a missing argument
    or an unknown host and 403 when the caller lacks owner-level access.
    The removal is recorded in the event log.
    """
    user, err = _require_auth(request)
    # Explicit None check: the truthiness of a response object must not
    # decide whether the request is rejected.
    if err is not None:
        return err

    uname = request.rel_url.query.get("h")
    if not uname:
        return web.Response(status=400, text="need h= argument")
    if uname not in hbdclass.Host.hosts:
        return web.Response(status=400, text=f"h={uname} not found")

    if not _can_own_host(user, hbdclass.Host.hosts[uname]):
        return web.json_response({"error": "Forbidden"}, status=403)

    eventlog(uname, "INFO", "dropped")
    del hbdclass.Host.hosts[uname]
    return web.Response(text="Done")
|
||||||
|
|
||||||
|
async def register(request):
    """Trigger DNS registration for a host via host.registerDns().

    Query argument: ``h`` (host name). Responds 400 for a missing argument
    or an unknown host and 403 when the caller lacks manager-level access.
    The result is written to the event log and returned as text.
    """
    user, err = _require_auth(request)
    # Explicit None check: the truthiness of a response object must not
    # decide whether the request is rejected.
    if err is not None:
        return err

    uname = request.rel_url.query.get("h")
    if not uname:
        return web.Response(status=400, text="need h= argument")
    if uname not in hbdclass.Host.hosts:
        return web.Response(status=400, text=f"h={uname} not found")

    host = hbdclass.Host.hosts[uname]
    if not _can_operate_host(user, host):
        return web.json_response({"error": "Forbidden"}, status=403)

    outcome = host.registerDns()
    eventlog(uname, "INFO", outcome)
    return web.Response(text=str(outcome))
|
||||||
|
|
||||||
|
async def update(request):
    """Queue an update ("UPD") command for one host or for all hosts.

    Query argument: ``h`` (host name, or the literal "All"). Responds 400
    for a missing argument or an unknown host. Hosts the caller may not
    operate are skipped and reported as such; the response is one text line
    per host.
    """
    user, err = _require_auth(request)
    # Explicit None check: the truthiness of a response object must not
    # decide whether the request is rejected.
    if err is not None:
        return err

    uname = urllib.parse.unquote(request.rel_url.query.get("h", ""))
    if not uname:
        return web.Response(status=400, text="need h= argument")
    if uname != "All" and uname not in hbdclass.Host.hosts:
        return web.Response(status=400, text=f"h={uname} not found")

    targets = list(hbdclass.Host.hosts) if uname == "All" else [uname]
    report = []
    for name in targets:
        host = hbdclass.Host.hosts[name]
        if not _can_operate_host(user, host):
            report.append(f"update skipped for {name}: Forbidden")
            continue
        try:
            host.cmds.append(("UPD", {}))
            status = "OK"
        except Exception as exc:
            # An empty error text is reported as "OK", as before.
            status = str(exc) or "OK"
        report.append(f"update started for {name}: {status}")

    return web.Response(text="\n".join(report))
|
||||||
|
|
||||||
|
async def live(request):
    """Render the live dashboard from the "live.html" Jinja2 template.

    Templates are loaded from config["templates_dir"], defaulting to the
    "templates" directory next to this module. Only hosts the current user
    may view are passed to the template, matching api_hosts and
    plugins_page; with auth disabled every host is included.
    """
    current_user, _ = _require_auth_redirect(request)

    pkg_dir = os.path.dirname(__file__)
    templates_dir = config.get("templates_dir", os.path.join(pkg_dir, "templates"))
    env = jinja2.Environment(loader=jinja2.FileSystemLoader(templates_dir))
    extra_scripts = config.get("http_extra_scripts", "")

    # Build the websocket URL from the address the browser actually used.
    host = request.host  # includes port if non-standard
    forwarded_proto = request.headers.get("X-Forwarded-Proto", "")
    # X-Forwarded-Proto is also consulted, presumably for TLS-terminating
    # reverse proxies.
    is_secure = request.secure or forwarded_proto.lower() == "https"
    scheme = "wss" if is_secure else "ws"
    heartbeat_ws_url = f"{scheme}://{host}/ws"

    from hbd import __version__ as hbd_version

    visible_hosts = [
        hbdclass.Host.hosts[name].stateinfo()
        for name in sorted(hbdclass.Host.hosts)
        if _can_view_host(current_user, hbdclass.Host.hosts[name])
    ]

    tmpl = env.get_template("live.html")
    body = tmpl.render(
        title="Heartbeat",
        header="Heartbeat",
        request=request,
        heartbeat_ws_url=heartbeat_ws_url,
        extra_scripts=extra_scripts,
        hbd_version=hbd_version,
        hosts=visible_hosts,
        messages=data.msgs[-30:],
        current_user=current_user.to_dict() if current_user else None,
        active_page="live",
    )
    return web.Response(text=body, content_type="text/html")
|
||||||
|
|
||||||
|
async def static(request):
    """Serve a file from the "static" directory next to this module.

    URL form: /static/<path>

    Responds 403 when the resolved path leaves the static directory and 404
    when it does not name an existing regular file.
    """
    rel_path = request.match_info.get("path", "")
    logger.debug("static file requested: %s", rel_path)

    root = os.path.abspath(os.path.join(os.path.dirname(__file__), "static"))
    # Normalise before comparing so ".." segments cannot escape the root.
    resolved = os.path.abspath(os.path.normpath(os.path.join(root, rel_path)))
    inside_root = resolved == root or resolved.startswith(root + os.sep)
    if not inside_root:
        return web.Response(status=403, text="Forbidden")

    # isfile() is False for missing paths as well as for directories.
    if not os.path.isfile(resolved):
        return web.Response(status=404, text="Not Found")

    logger.info("serving static file: %s", resolved)
    return web.FileResponse(path=resolved)
|
||||||
|
|
||||||
|
async def favicon(request):
    """Serve favicon.ico from "static/images" next to this module (404 if absent)."""
    images_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "static/images"))
    icon_path = os.path.join(images_dir, "favicon.ico")
    # isfile() is False for missing paths as well as for directories.
    if os.path.isfile(icon_path):
        return web.FileResponse(path=icon_path)
    return web.Response(status=404, text="Not Found")
|
||||||
|
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
|
# Plugin Data API Endpoints
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
|
|
||||||
|
async def api_host_plugins(request):
    """Get all plugin data for a specific host.

    For every plugin with at least one sample, the response carries the most
    recent sample (timestamp and data) and the number of stored samples.
    Responds 404 for an unknown host and 403 when the caller may not view it.
    """
    user, err = _require_auth(request)
    # Explicit None check: the truthiness of a response object must not
    # decide whether the request is rejected.
    if err is not None:
        return err

    hostname = request.match_info.get("hostname")
    if hostname not in hbdclass.Host.hosts:
        return web.json_response({"error": f"Host '{hostname}' not found"}, status=404)

    host = hbdclass.Host.hosts[hostname]
    if not _can_view_host(user, host):
        return web.json_response({"error": "Forbidden"}, status=403)

    plugins_summary = {}
    for plugin_name, samples in host.plugin_data.items():
        if not samples:
            continue
        # The last element is the most recent sample. The local is not
        # called "data" so it cannot shadow the imported data module.
        timestamp, sample = samples[-1]
        plugins_summary[plugin_name] = {
            "timestamp": timestamp,
            "data": sample,
            "sample_count": len(samples),
        }

    return web.json_response({"hostname": hostname, "plugins": plugins_summary})
|
||||||
|
|
||||||
|
async def api_host_plugin_detail(request):
    """Get detailed data for a specific plugin on a host.

    The optional "limit" query parameter (default 10; non-integer values
    fall back to 10) is passed to host.get_plugin_data(). Responds 404 for
    an unknown host or when no samples are returned, and 403 when the
    caller may not view the host.
    """
    user, err = _require_auth(request)
    # Explicit None check: the truthiness of a response object must not
    # decide whether the request is rejected.
    if err is not None:
        return err

    hostname = request.match_info.get("hostname")
    plugin_name = request.match_info.get("plugin_name")

    if hostname not in hbdclass.Host.hosts:
        return web.json_response({"error": f"Host '{hostname}' not found"}, status=404)

    host = hbdclass.Host.hosts[hostname]
    if not _can_view_host(user, host):
        return web.json_response({"error": "Forbidden"}, status=403)

    try:
        limit = int(request.rel_url.query.get("limit", "10"))
    except ValueError:
        limit = 10

    samples = host.get_plugin_data(plugin_name, limit=limit)
    if not samples:
        return web.json_response(
            {"error": f"No data for plugin '{plugin_name}' on host '{hostname}'"},
            status=404,
        )

    # Samples are (timestamp, data) pairs; the local is not called "data" so
    # it cannot shadow the imported data module.
    formatted_samples = [{"timestamp": ts, "data": sample} for ts, sample in samples]

    return web.json_response({
        "hostname": hostname,
        "plugin": plugin_name,
        "samples": formatted_samples,
        "sample_count": len(formatted_samples),
    })
|
||||||
|
|
||||||
|
async def api_host_alerts(request):
    """Get alert states for a specific host.

    The response lists every alert state of the host plus a per-level
    summary; the summary stays all-zero when no threshold_checker was
    supplied. Responds 404 for an unknown host and 403 when the caller may
    not view it.
    """
    user, err = _require_auth(request)
    # Explicit None check: the truthiness of a response object must not
    # decide whether the request is rejected.
    if err is not None:
        return err

    hostname = request.match_info.get("hostname")
    if hostname not in hbdclass.Host.hosts:
        return web.json_response({"error": f"Host '{hostname}' not found"}, status=404)

    host = hbdclass.Host.hosts[hostname]
    if not _can_view_host(user, host):
        return web.json_response({"error": "Forbidden"}, status=403)

    alerts = [state.to_dict() for state in host.alert_states.values()]

    if threshold_checker:
        summary = threshold_checker.get_alert_summary(host.alert_states)
    else:
        summary = {"ok": 0, "warning": 0, "critical": 0, "unknown": 0}

    return web.json_response({
        "hostname": hostname,
        "alerts": alerts,
        "summary": summary,
    })
|
||||||
|
|
||||||
|
async def api_all_alerts(request):
    """Get all active alerts across all hosts the caller may view.

    Alerts are sorted by level (critical first), then hostname, then metric
    path. The response also carries per-level counts and the total number
    of known hosts.
    """
    user, err = _require_auth(request)
    # Explicit None check: the truthiness of a response object must not
    # decide whether the request is rejected.
    if err is not None:
        return err

    all_alerts = []
    for hostname, host in hbdclass.Host.hosts.items():
        if not _can_view_host(user, host):
            continue
        if threshold_checker:
            active_alerts = threshold_checker.get_active_alerts(host.alert_states)
        else:
            # Fallback if no threshold checker: anything that is not OK.
            from hbd.server.threshold import AlertLevel

            active_alerts = [
                state for state in host.alert_states.values()
                if state.level != AlertLevel.OK
            ]

        for alert in active_alerts:
            alert_dict = alert.to_dict()
            alert_dict["hostname"] = hostname
            all_alerts.append(alert_dict)

    # Unknown level names sort after all known ones.
    level_order = {"CRITICAL": 0, "WARNING": 1, "UNKNOWN": 2, "OK": 3}
    all_alerts.sort(
        key=lambda a: (level_order.get(a["level"], 99), a["hostname"], a["metric_path"])
    )

    summary = {"critical": 0, "warning": 0, "unknown": 0, "total": len(all_alerts)}
    for alert in all_alerts:
        level = alert["level"].lower()
        if level in summary:
            summary[level] += 1

    return web.json_response({
        "alerts": all_alerts,
        "summary": summary,
        "host_count": len(hbdclass.Host.hosts),
    })
|
||||||
|
|
||||||
|
async def api_acknowledge_alert(request):
    """Acknowledge an alert to stop reminder notifications.

    Expects a JSON object with "hostname" and "metric_path". Responds 400
    for a malformed body or missing fields, 404 for an unknown host or
    alert, and 403 when the caller may not view the host.
    """
    user, err = _require_auth(request)
    # Explicit None check: the truthiness of a response object must not
    # decide whether the request is rejected.
    if err is not None:
        return err

    try:
        payload = await request.json()
    except Exception:
        return web.json_response(
            {"error": "Invalid JSON in request body"},
            status=400,
        )
    # Valid JSON that is not an object (list, string, number, null) has no
    # .get(); report it as a client error instead of crashing with a 500.
    if not isinstance(payload, dict):
        return web.json_response(
            {"error": "Invalid JSON in request body"},
            status=400,
        )

    hostname = payload.get("hostname")
    metric_path = payload.get("metric_path")
    if not hostname or not metric_path:
        return web.json_response(
            {"error": "Missing required fields: hostname and metric_path"},
            status=400,
        )

    if hostname not in hbdclass.Host.hosts:
        return web.json_response(
            {"error": f"Host '{hostname}' not found"},
            status=404,
        )

    host = hbdclass.Host.hosts[hostname]
    if not _can_view_host(user, host):
        return web.json_response({"error": "Forbidden"}, status=403)

    if metric_path not in host.alert_states:
        return web.json_response(
            {"error": f"Alert '{metric_path}' not found for host '{hostname}'"},
            status=404,
        )

    alert_state = host.alert_states[metric_path]
    alert_state.acknowledge()

    return web.json_response({
        "success": True,
        "hostname": hostname,
        "metric_path": metric_path,
        "acknowledged_at": alert_state.acknowledged_at,
    })
|
||||||
|
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
|
# UI Pages
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
|
|
||||||
|
async def plugins_page(request):
    """Render the plugin metrics page from the "plugins.html" template.

    Only hosts that the current user may view and that have plugin data are
    listed, sorted by name.
    """
    current_user, _ = _require_auth_redirect(request)

    default_templates = os.path.join(os.path.dirname(__file__), "templates")
    loader = jinja2.FileSystemLoader(config.get("templates_dir", default_templates))
    env = jinja2.Environment(loader=loader)

    hosts_with_plugins = []
    for hostname in sorted(hbdclass.Host.hosts.keys()):
        host = hbdclass.Host.hosts[hostname]
        if _can_view_host(current_user, host) and host.plugin_data:
            hosts_with_plugins.append(
                {"name": hostname, "plugins": list(host.plugin_data.keys())}
            )

    html = env.get_template("plugins.html").render(
        title="Host Overview - Heartbeat",
        header="Host Overview",
        hosts=hosts_with_plugins,
        current_user=current_user.to_dict() if current_user else None,
        active_page="plugins",
    )
    return web.Response(text=html, content_type="text/html")
|
||||||
|
|
||||||
|
async def alerts_page(request):
    """Render the alerts dashboard from the "alerts.html" template."""
    current_user, _ = _require_auth_redirect(request)

    default_templates = os.path.join(os.path.dirname(__file__), "templates")
    loader = jinja2.FileSystemLoader(config.get("templates_dir", default_templates))
    env = jinja2.Environment(loader=loader)

    html = env.get_template("alerts.html").render(
        title="Alerts Dashboard - Heartbeat",
        header="Alerts Dashboard",
        current_user=current_user.to_dict() if current_user else None,
        active_page="alerts",
    )
    return web.Response(text=html, content_type="text/html")
|
||||||
|
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
|
# Auth endpoints
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
|
|
||||||
|
async def api_login(request):
    """POST /api/0/auth/login {username, password} -> {token}

    Also sets an hbd_session cookie for browser clients. Responds 404 when
    user management is disabled, 400 when the body is not a JSON object,
    and 401 for invalid credentials.
    """
    if not users_mod.users_enabled():
        return web.json_response({"error": "Auth not configured"}, status=404)
    try:
        body = await request.json()
    except Exception:
        return web.json_response({"error": "Invalid JSON"}, status=400)
    # Valid JSON that is not an object (list, string, number, null) has no
    # .get(); report it as a client error instead of crashing with a 500.
    if not isinstance(body, dict):
        return web.json_response({"error": "Invalid JSON"}, status=400)

    username = body.get("username", "")
    password = body.get("password", "")
    user = users_mod.authenticate(username, password)
    if user is None:
        return web.json_response({"error": "Invalid credentials"}, status=401)

    token = users_mod.create_session(username)
    resp = web.json_response({"token": token, "username": username})
    resp.set_cookie(
        SESSION_COOKIE,
        token,
        max_age=users_mod.SESSION_TTL,
        httponly=True,
        samesite="Lax",
    )
    return resp
|
||||||
|
|
||||||
|
async def login_page(request):
    """GET /login — show login form; POST /login — process and redirect.

    After a successful POST the browser is sent to the "next" query
    parameter, but only when it is a same-site absolute path; anything else
    falls back to "/", so the login form cannot be abused as an open
    redirect. When user management is disabled the page redirects to "/".
    """
    if not users_mod.users_enabled():
        raise web.HTTPFound("/")

    error = ""
    if request.method == "POST":
        form = await request.post()
        username = form.get("username", "")
        password = form.get("password", "")
        user = users_mod.authenticate(username, password)
        if user:
            token = users_mod.create_session(username)
            redirect_to = request.rel_url.query.get("next", "/")
            # "next" is attacker-controllable. Accept only local paths:
            # reject absolute URLs, scheme-relative "//host" targets, and
            # backslashes, which browsers may treat like "/".
            if (
                not redirect_to.startswith("/")
                or redirect_to.startswith("//")
                or "\\" in redirect_to
            ):
                redirect_to = "/"
            resp = web.HTTPFound(redirect_to)
            resp.set_cookie(
                SESSION_COOKIE,
                token,
                max_age=users_mod.SESSION_TTL,
                httponly=True,
                samesite="Lax",
            )
            raise resp
        error = "Invalid username or password."

    # Braces are doubled in the CSS because this is an f-string; the only
    # interpolated value is the fixed error message above.
    html = f"""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Heartbeat — Login</title>
<style>
body {{ font-family: sans-serif; background: #f5f5f5; display: flex;
justify-content: center; align-items: center; height: 100vh; margin: 0; }}
.box {{ background: #fff; padding: 2em 2.5em; border-radius: 8px;
box-shadow: 0 2px 12px rgba(0,0,0,.15); min-width: 300px; }}
h2 {{ margin: 0 0 1.2em; color: #333; font-size: 1.4em; }}
label {{ display: block; margin-bottom: .3em; font-size: .9em; color: #555; }}
input {{ width: 100%; padding: .5em .7em; border: 1px solid #ccc;
border-radius: 4px; font-size: 1em; box-sizing: border-box; }}
button {{ margin-top: 1.2em; width: 100%; padding: .6em; background: #0066cc;
color: #fff; border: none; border-radius: 4px; font-size: 1em; cursor: pointer; }}
button:hover {{ background: #0055aa; }}
.error {{ color: #c00; font-size: .9em; margin-bottom: .8em; }}
.field {{ margin-bottom: .9em; }}
</style>
</head>
<body>
<div class="box">
<h2>Heartbeat</h2>
{'<p class="error">' + error + '</p>' if error else ''}
<form method="post">
<div class="field"><label>Username</label><input name="username" autofocus></div>
<div class="field"><label>Password</label><input name="password" type="password"></div>
<button type="submit">Sign in</button>
</form>
</div>
</body>
</html>"""
    return web.Response(text=html, content_type="text/html")
|
||||||
|
|
||||||
|
async def web_logout(request):
    """GET /logout — end the cookie session and send the browser to /login."""
    users_mod.delete_session(request.cookies.get(SESSION_COOKIE, ""))
    redirect = web.HTTPFound("/login")
    redirect.del_cookie(SESSION_COOKIE)
    raise redirect
|
||||||
|
|
||||||
|
async def api_logout(request):
    """POST /api/0/auth/logout — delete the caller's session and clear the cookie."""
    users_mod.delete_session(_get_token(request))
    reply = web.json_response({"success": True})
    reply.del_cookie(SESSION_COOKIE)
    return reply
|
||||||
|
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
|
# User endpoints
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
|
|
||||||
|
async def api_user_avatar(request):
    """GET /api/0/users/{username}/avatar — serve a local avatar file.

    Only reachable when the user's avatar config value starts with '/'.
    Falls back to 404 for external URLs (the browser fetches those directly),
    for unknown users, and when the configured file does not exist.
    """
    user, err = _require_auth(request)
    # Explicit None check: the truthiness of a response object must not
    # decide whether the request is rejected.
    if err is not None:
        return err

    username = request.match_info.get("username")
    target_user = users_mod.get_user(username)
    if target_user is None:
        return web.Response(status=404, text="User not found")
    if not target_user.avatar_is_local():
        return web.Response(status=404, text="No local avatar configured")

    path = target_user.avatar
    if not os.path.isfile(path):
        return web.Response(status=404, text="Avatar file not found")

    # Infer content-type from extension; unknown types are sent as binary.
    ext = os.path.splitext(path)[1].lower()
    mime = {
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".gif": "image/gif",
        ".webp": "image/webp",
        ".svg": "image/svg+xml",
    }.get(ext, "application/octet-stream")
    return web.FileResponse(path=path, headers={"Content-Type": mime})
|
||||||
|
|
||||||
|
async def api_users(request):
    """GET /api/0/users — admin only.

    Returns every configured user as a JSON array. When user management is
    enabled, non-admin callers receive 403.
    """
    user, err = _require_auth(request)
    # Explicit None check: the truthiness of a response object must not
    # decide whether the request is rejected.
    if err is not None:
        return err
    if users_mod.users_enabled() and (user is None or not user.admin):
        return web.json_response({"error": "Forbidden"}, status=403)
    return web.json_response([u.to_dict() for u in users_mod.users.values()])
|
||||||
|
|
||||||
|
async def api_user_self(request):
    """GET /api/0/users/me — own profile.

    Responds 404 when user management is disabled, since there is then no
    current user.
    """
    user, err = _require_auth(request)
    # Explicit None check: the truthiness of a response object must not
    # decide whether the request is rejected.
    if err is not None:
        return err
    if user is None:
        return web.json_response({"error": "Auth not configured"}, status=404)
    return web.json_response(user.to_dict())
|
||||||
|
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
|
# Host access endpoints
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
|
|
||||||
|
async def api_host_access_get(request):
    """GET /api/0/hosts/{hostname}/access

    Returns host.access_dict() as JSON. Responds 404 for an unknown host
    and 403 when the caller may not view it.
    """
    user, err = _require_auth(request)
    # Explicit None check: the truthiness of a response object must not
    # decide whether the request is rejected.
    if err is not None:
        return err

    hostname = request.match_info.get("hostname")
    if hostname not in hbdclass.Host.hosts:
        return web.json_response({"error": f"Host '{hostname}' not found"}, status=404)

    host = hbdclass.Host.hosts[hostname]
    if not _can_view_host(user, host):
        return web.json_response({"error": "Forbidden"}, status=403)
    return web.json_response(host.access_dict())
|
||||||
|
|
||||||
|
async def api_host_access_put(request):
    """PUT /api/0/hosts/{hostname}/access — owner or admin only.

    Body: {owner?: str, managers?: [str], monitors?: [str]}

    Only the keys present in the body are changed; an empty or null owner
    clears the owner. The whole body is validated before anything is
    applied, so a rejected request leaves the host untouched. Responds 404
    for an unknown host, 403 without owner-level access, and 400 for a
    malformed body.
    """
    user, err = _require_auth(request)
    # Explicit None check: the truthiness of a response object must not
    # decide whether the request is rejected.
    if err is not None:
        return err

    hostname = request.match_info.get("hostname")
    if hostname not in hbdclass.Host.hosts:
        return web.json_response({"error": f"Host '{hostname}' not found"}, status=404)

    host = hbdclass.Host.hosts[hostname]
    if not _can_own_host(user, host):
        return web.json_response({"error": "Forbidden"}, status=403)

    try:
        body = await request.json()
    except Exception:
        return web.json_response({"error": "Invalid JSON"}, status=400)
    if not isinstance(body, dict):
        return web.json_response({"error": "Invalid JSON"}, status=400)

    changes = {}
    if "owner" in body:
        owner = body["owner"]
        if owner is not None and not isinstance(owner, str):
            return web.json_response({"error": "owner must be a string or null"}, status=400)
        changes["owner"] = owner or None
    for field in ("managers", "monitors"):
        if field not in body:
            continue
        names = body[field]
        # list("alice") would silently become ['a', 'l', 'i', 'c', 'e'], and
        # list(None) raises; insist on a real list of user names.
        if not isinstance(names, list) or not all(isinstance(n, str) for n in names):
            return web.json_response(
                {"error": f"{field} must be a list of strings"}, status=400
            )
        changes[field] = list(names)

    for attr, value in changes.items():
        setattr(host, attr, value)

    return web.json_response(host.access_dict())
|
||||||
|
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
|
# User profile page
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
|
|
||||||
|
async def profile_page(request):
    """GET /profile — current user's settings and host access summary.

    Hosts are grouped by the user's highest role (owner, then manager, then
    monitor). Live hosts and hosts that only exist in the configuration are
    both considered, so the page reflects the config file immediately after
    a reload.
    """
    current_user, _ = _require_auth_redirect(request)

    default_templates = os.path.join(os.path.dirname(__file__), "templates")
    loader = jinja2.FileSystemLoader(config.get("templates_dir", default_templates))
    env = jinja2.Environment(loader=loader)

    from . import config as config_mod

    owned, managed, monitored = [], [], []
    notif_channels = []

    if current_user:
        me = current_user.username
        known = set(config.get("hosts", {}).keys()) | set(hbdclass.Host.hosts.keys())

        for hostname in sorted(known):
            live_host = hbdclass.Host.hosts.get(hostname)
            if live_host is None:
                # Config-only host — read access directly from config.
                access = config_mod.get_host_access(config, hostname)
                is_own = access["owner"] == me
                is_mgr = me in access["managers"]
                is_mon = me in access["monitors"]
            else:
                # Live host object: lower roles are only queried when the
                # higher ones do not apply.
                is_own = live_host.is_owner(me)
                is_mgr = not is_own and live_host.is_manager(me)
                is_mon = not is_own and not is_mgr and live_host.is_monitor(me)

            # A host is listed once, under the highest matching role.
            if is_own:
                owned.append(hostname)
            elif is_mgr:
                managed.append(hostname)
            elif is_mon:
                monitored.append(hostname)

        # Resolve notification channel configs for display.
        for ch_name in (current_user.notification_channels or []):
            ch_cfg = config.get("notification_channels", {}).get(ch_name, {})
            notif_channels.append({"name": ch_name, "type": ch_cfg.get("type", "")})

    html = env.get_template("profile.html").render(
        title="Profile - Heartbeat",
        header="My Profile",
        current_user=current_user.to_dict() if current_user else None,
        owned_hosts=owned,
        managed_hosts=managed,
        monitored_hosts=monitored,
        notification_channels=notif_channels,
        active_page="profile",
    )
    return web.Response(text=html, content_type="text/html")
|
||||||
|
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
|
# About page
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
|
|
||||||
|
async def about_page(request):
    """GET /about — render the About page.

    The page shows the hbd version, the Python runtime, the server's
    hostname, the process start time and uptime, and the number of hosts
    currently held in ``hbdclass.Host.hosts``.
    """
    current_user, _ = _require_auth_redirect(request)

    # Templates come from the configured directory, falling back to the
    # "templates" folder that sits next to this module.
    default_templates = os.path.join(os.path.dirname(__file__), "templates")
    loader = jinja2.FileSystemLoader(config.get("templates_dir", default_templates))
    env = jinja2.Environment(loader=loader)
    from hbd import __version__ as hbd_version

    # Break the elapsed whole seconds down into d/h/m/s components.
    elapsed = int(time.time() - _start_epoch)
    days, leftover = divmod(elapsed, 86400)
    hours, leftover = divmod(leftover, 3600)
    mins, secs = divmod(leftover, 60)

    # Show only the three (or two) most significant units.
    if days:
        pieces = (f"{days}d", f"{hours}h", f"{mins}m")
    elif hours:
        pieces = (f"{hours}h", f"{mins}m", f"{secs}s")
    else:
        pieces = (f"{mins}m", f"{secs}s")
    uptime_str = " ".join(pieces)

    start_time_str = datetime.datetime.fromtimestamp(_start_epoch).strftime(
        "%Y-%m-%d %H:%M:%S"
    )

    python_version = "{}.{}.{} ({})".format(
        sys.version_info.major,
        sys.version_info.minor,
        sys.version_info.micro,
        platform.python_implementation(),
    )

    context = {
        "title": "About - Heartbeat",
        "header": "About",
        "hbd_version": hbd_version,
        "python_version": python_version,
        "server_hostname": socket.gethostname(),
        "start_epoch": int(_start_epoch),
        "start_time_str": start_time_str,
        "uptime_str": uptime_str,
        "host_count": len(hbdclass.Host.hosts),
        "current_user": current_user.to_dict() if current_user else None,
        "active_page": "about",
    }
    body = env.get_template("about.html").render(**context)
    return web.Response(text=body, content_type="text/html")
|
||||||
|
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
|
# Settings page (admin only)
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
|
|
||||||
|
async def settings_page(request):
    """GET /settings — read-only view of the current server configuration.

    Raises ``web.HTTPForbidden`` when an authenticated user is not an admin.
    When ``_require_auth_redirect`` yields no user object the page is
    rendered without the admin check.
    """
    current_user, _ = _require_auth_redirect(request)
    if current_user and not current_user.admin:
        raise web.HTTPForbidden(reason="Admin access required")

    # Templates come from the configured directory, falling back to the
    # "templates" folder that sits next to this module.
    fallback_dir = os.path.join(os.path.dirname(__file__), "templates")
    loader = jinja2.FileSystemLoader(config.get("templates_dir", fallback_dir))
    template = jinja2.Environment(loader=loader).get_template("settings.html")

    context = {
        "title": "Settings - Heartbeat",
        "sections": settings_mod.get_settings_sections(config),
        "current_user": current_user.to_dict() if current_user else None,
        "active_page": "settings",
    }
    html = template.render(**context)
    return web.Response(text=html, content_type="text/html")
|
||||||
|
|
||||||
|
app = web.Application()
|
||||||
|
app.add_routes(
|
||||||
|
[
|
||||||
|
web.get("/", live),
|
||||||
|
web.get("/old", old_index),
|
||||||
|
# Auth
|
||||||
|
web.get("/login", login_page),
|
||||||
|
web.post("/login", login_page),
|
||||||
|
web.get("/logout", web_logout),
|
||||||
|
web.post("/api/0/auth/login", api_login),
|
||||||
|
web.post("/api/0/auth/logout", api_logout),
|
||||||
|
# Users
|
||||||
|
web.get("/api/0/users", api_users),
|
||||||
|
web.get("/api/0/users/me", api_user_self),
|
||||||
|
web.get("/api/0/users/{username}/avatar", api_user_avatar),
|
||||||
|
# Hosts
|
||||||
|
web.get("/api/0/hosts", api_hosts),
|
||||||
|
web.get("/api/0/messages", api_messages),
|
||||||
|
web.get("/api/0/hosts/{hostname}/plugins", api_host_plugins),
|
||||||
|
web.get("/api/0/hosts/{hostname}/plugins/{plugin_name}", api_host_plugin_detail),
|
||||||
|
web.get("/api/0/hosts/{hostname}/alerts", api_host_alerts),
|
||||||
|
web.get("/api/0/hosts/{hostname}/access", api_host_access_get),
|
||||||
|
web.put("/api/0/hosts/{hostname}/access", api_host_access_put),
|
||||||
|
web.get("/api/0/alerts", api_all_alerts),
|
||||||
|
web.post("/api/0/alerts/acknowledge", api_acknowledge_alert),
|
||||||
|
web.get("/c", cmd),
|
||||||
|
web.get("/d", drop),
|
||||||
|
web.get("/n", register),
|
||||||
|
web.get("/u", update),
|
||||||
|
web.get("/live", live),
|
||||||
|
web.get("/plugins", plugins_page),
|
||||||
|
web.get("/alerts", alerts_page),
|
||||||
|
web.get("/about", about_page),
|
||||||
|
web.get("/profile", profile_page),
|
||||||
|
web.get("/settings", settings_page),
|
||||||
|
web.get("/static/{path:.*}", static),
|
||||||
|
web.get("/favicon.ico", favicon),
|
||||||
|
web.get("/ws", ws_mod.handler),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
runner = web.AppRunner(app)
|
||||||
|
await runner.setup()
|
||||||
|
site = web.TCPSite(runner, host, port)
|
||||||
|
await site.start()
|
||||||
|
|
||||||
|
logger.info(f"HTTP server started on {host}:{port}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
await asyncio.Future()
|
||||||
|
finally:
|
||||||
|
await runner.cleanup()
|
||||||
@@ -0,0 +1,342 @@
|
|||||||
|
"""
|
||||||
|
Journal logging for heartbeat messages.
|
||||||
|
|
||||||
|
Provides size-based rotating log files for all received heartbeat messages.
|
||||||
|
Messages are logged in JSON format for easy parsing and analysis.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import asyncio
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, Any, Optional
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class MessageJournal:
    """
    Journal logger for heartbeat messages with size-based rotation.

    Features:
    - Logs received messages (``HTB`` heartbeats are skipped) and threshold
      events, one JSON object per line
    - Automatic rotation when a write would push the file past the size limit
    - Keeps a configurable number of rotated logs
    - Writes are serialized with an ``asyncio.Lock``; this orders coroutines
      on one event loop and does not protect against use from other threads
    - Configurable log directory and file naming

    Configuration keys (read from the ``config`` mapping):
        journal_dir: Directory for journal files (default: /var/log/heartbeat)
        journal_file: Base filename (default: messages.journal)
        journal_max_size: Maximum file size in bytes before rotation (default: 100MB)
        journal_max_backups: Number of backup files to keep (default: 10)
        journal_enabled: Enable/disable journaling (default: True)
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Initialize the message journal.

        No file is opened here; call ``initialize()`` before logging.

        Args:
            config: Configuration mapping with journal settings
        """
        self.config = config or {}

        # Configuration options
        self.journal_dir = Path(self.config.get('journal_dir', '/var/log/heartbeat'))
        self.journal_file = self.config.get('journal_file', 'messages.journal')
        self.max_size = self.config.get('journal_max_size', 100 * 1024 * 1024)  # 100MB default
        self.max_backups = self.config.get('journal_max_backups', 10)
        self.enabled = self.config.get('journal_enabled', True)

        # Runtime state
        self._file_handle = None
        self._current_size = 0
        self._lock = asyncio.Lock()
        self._initialized = False

        # Full path to current journal file
        self.journal_path = self.journal_dir / self.journal_file

    async def initialize(self) -> bool:
        """
        Initialize the journal.

        Creates the journal directory if needed and opens the journal file in
        append mode. On failure the journal disables itself.

        Returns:
            True if initialization succeeded (or journaling is disabled by
            configuration), False otherwise
        """
        if not self.enabled:
            logger.info("Message journal disabled in configuration")
            return True

        try:
            # Create journal directory if it doesn't exist
            self.journal_dir.mkdir(parents=True, exist_ok=True)

            # A repeated initialize() must not leak the earlier handle.
            if self._file_handle:
                self._file_handle.close()
                self._file_handle = None

            # Open journal file in append mode
            self._file_handle = open(self.journal_path, 'a', encoding='utf-8')
            self._current_size = self._size_on_disk()

            self._initialized = True
            logger.info(f"Message journal initialized: {self.journal_path} "
                        f"(current size: {self._current_size:,} bytes, "
                        f"max: {self.max_size:,} bytes)")
            return True

        except Exception as e:
            logger.error(f"Failed to initialize message journal: {e}")
            self.enabled = False
            return False

    def _size_on_disk(self) -> int:
        """Return the current journal file size in bytes, or 0 if it cannot be read."""
        try:
            return os.path.getsize(self.journal_path)
        except OSError:
            return 0

    async def _write_line(self, line: str) -> bool:
        """
        Append one already-serialized line, rotating first when needed.

        Must be called with ``self._lock`` held.

        Returns:
            True if the line was written, False if no file handle is available
        """
        size = len(line.encode('utf-8'))

        # Rotate before the write that would exceed the limit. An empty file
        # is never rotated: that would only produce empty backups when a
        # single line is larger than max_size.
        if self._current_size > 0 and self._current_size + size > self.max_size:
            await self._rotate()

        # _rotate() leaves the handle as None when it could not reopen the file.
        if not self._file_handle:
            return False

        self._file_handle.write(line)
        self._file_handle.flush()  # Ensure data is written
        self._current_size += size
        return True

    async def log_message(
        self,
        msg: Dict[str, Any],
        addr: tuple,
        timestamp: Optional[float] = None
    ):
        """
        Log a received message to the journal.

        Messages whose ``ID`` is ``HTB`` are skipped. Errors are logged and
        never raised to the caller.

        Args:
            msg: Parsed message dictionary
            addr: Source address (ip, port) tuple; any other value is
                recorded as ``str(addr)`` with no port
            timestamp: Message timestamp (defaults to current time)
        """
        if not self.enabled or not self._initialized:
            return

        # Skip HTB (heartbeat) messages - too verbose
        if msg.get('ID', '') == 'HTB':
            return

        try:
            if timestamp is None:
                import time
                timestamp = time.time()

            if isinstance(addr, (tuple, list)):
                source_ip = addr[0] if addr else None
                source_port = addr[1] if len(addr) > 1 else None
            else:
                source_ip = str(addr)
                source_port = None

            entry = {
                'timestamp': timestamp,
                'datetime': datetime.fromtimestamp(timestamp).isoformat(),
                'source_ip': source_ip,
                'source_port': source_port,
                'message': msg
            }

            # Serialize to compact JSON (one line per entry). default=str keeps
            # an entry with a non-JSON value (e.g. bytes) instead of dropping it.
            json_line = json.dumps(entry, separators=(',', ':'), default=str) + '\n'

            async with self._lock:
                written = await self._write_line(json_line)

            if written:
                logger.debug(f"Logged message from {source_ip}: {msg.get('ID', 'UNKNOWN')}")

        except Exception as e:
            logger.error(f"Error writing to journal: {e}")

    async def _rotate(self):
        """
        Rotate the journal file.

        Renames the current file to ``<journal_file>.<YYYYmmdd-HHMMSS>``, opens
        a new file, and removes old backups exceeding the max_backups limit.
        If rotation fails, tries to reopen the journal; if that also fails,
        journaling is disabled.
        """
        try:
            # Close current file
            if self._file_handle:
                self._file_handle.close()
                self._file_handle = None

            # Generate backup filename with timestamp
            timestamp_str = datetime.now().strftime('%Y%m%d-%H%M%S')
            backup_path = self.journal_dir / f"{self.journal_file}.{timestamp_str}"

            # The timestamp has one-second resolution and rename() replaces an
            # existing target on POSIX, so a second rotation within the same
            # second would destroy the earlier backup. Add a zero-padded
            # serial so names stay unique and still sort chronologically.
            serial = 0
            while backup_path.exists():
                serial += 1
                backup_path = (
                    self.journal_dir
                    / f"{self.journal_file}.{timestamp_str}.{serial:03d}"
                )

            # Rename current file to backup
            if self.journal_path.exists():
                self.journal_path.rename(backup_path)
                logger.info(f"Rotated journal: {backup_path} "
                            f"(size: {self._current_size:,} bytes)")

            # Open new journal file
            self._file_handle = open(self.journal_path, 'a', encoding='utf-8')
            self._current_size = 0

            # Clean up old backups
            await self._cleanup_old_backups()

        except Exception as e:
            logger.error(f"Error rotating journal: {e}")
            # Try to reopen the file even if rotation failed
            try:
                if self._file_handle is None:
                    self._file_handle = open(self.journal_path, 'a', encoding='utf-8')
                # Re-read the size: depending on where rotation failed, the
                # file may be the old full one or a fresh empty one.
                self._current_size = self._size_on_disk()
            except Exception as e2:
                logger.error(f"Failed to reopen journal after rotation error: {e2}")
                self.enabled = False

    async def _cleanup_old_backups(self):
        """
        Remove old backup files exceeding max_backups limit.

        Backups are every file in the journal directory matching
        ``<journal_file>.*``. They are ordered by filename (which embeds the
        rotation timestamp) and the oldest are deleted first.
        """
        try:
            # Find all backup files
            backup_files = sorted(self.journal_dir.glob(f"{self.journal_file}.*"))

            # Remove oldest backups if we have too many
            excess = len(backup_files) - self.max_backups
            if excess > 0:
                for backup_file in backup_files[:excess]:
                    try:
                        backup_file.unlink()
                        logger.info(f"Removed old backup: {backup_file.name}")
                    except Exception as e:
                        logger.warning(f"Failed to remove old backup {backup_file}: {e}")

        except Exception as e:
            logger.error(f"Error cleaning up old backups: {e}")

    async def log_threshold_event(
        self,
        host_name: str,
        metric_path: str,
        old_level: str,
        new_level: str,
        value: Any,
        timestamp: Optional[float] = None
    ):
        """
        Log a threshold state change event.

        Errors are logged and never raised to the caller.

        Args:
            host_name: Name of the host
            metric_path: Full metric path (e.g., "cpu_monitor.cpu_percent")
            old_level: Previous alert level
            new_level: New alert level
            value: Current metric value
            timestamp: Event timestamp (default: current time)
        """
        if not self.enabled or not self._initialized:
            return

        try:
            if timestamp is None:
                import time
                timestamp = time.time()

            event = {
                'timestamp': timestamp,
                'iso_time': datetime.fromtimestamp(timestamp).isoformat(),
                'event_type': 'threshold',
                'host': host_name,
                'metric': metric_path,
                'old_level': old_level,
                'new_level': new_level,
                'value': value,
            }

            # default=str keeps an event whose value is not JSON-native.
            line = json.dumps(event, default=str) + '\n'

            # Same rotate-then-write path as log_message, so the size limit
            # is enforced identically for both record types.
            async with self._lock:
                await self._write_line(line)

        except Exception as e:
            logger.error(f"Error logging threshold event: {e}")

    async def close(self):
        """
        Close the journal and release resources.

        Should be called during shutdown. After this call the journal is
        marked uninitialized, so further log calls are ignored.
        """
        async with self._lock:
            if self._file_handle:
                try:
                    self._file_handle.close()
                    logger.info("Message journal closed")
                except Exception as e:
                    logger.error(f"Error closing journal: {e}")
                finally:
                    self._file_handle = None
            self._initialized = False

    def get_stats(self) -> Dict[str, Any]:
        """
        Get journal statistics.

        Returns:
            Dictionary with the enabled/initialized flags, the journal path,
            the tracked size, the configured limits, and ``rotation_threshold``
            (tracked size as a percentage of max_size, or ``"n/a"`` when
            max_size is not a positive number)
        """
        if self.max_size and self.max_size > 0:
            fill = f"{(self._current_size / self.max_size * 100):.1f}%"
        else:
            # Avoid ZeroDivisionError for a zero/empty max_size setting.
            fill = "n/a"

        return {
            'enabled': self.enabled,
            'initialized': self._initialized,
            'current_file': str(self.journal_path),
            'current_size': self._current_size,
            'max_size': self.max_size,
            'max_backups': self.max_backups,
            'rotation_threshold': fill
        }
|
||||||
|
|
||||||
|
|
||||||
|
# Process-wide journal singleton; populated lazily by get_journal().
_journal_instance: Optional[MessageJournal] = None


def get_journal(config: Optional[Dict[str, Any]] = None) -> MessageJournal:
    """
    Return the shared MessageJournal, creating it on first use.

    Args:
        config: Configuration dictionary; consulted only by the call that
            actually creates the instance and ignored afterwards

    Returns:
        The module-wide MessageJournal instance
    """
    global _journal_instance
    if _journal_instance is not None:
        return _journal_instance
    _journal_instance = MessageJournal(config)
    return _journal_instance
|
||||||
|
|
||||||
|
|
||||||
|
async def log_message(msg: Dict[str, Any], addr: tuple, timestamp: Optional[float] = None):
    """
    Log a message through the global journal instance.

    Thin wrapper that forwards its arguments to
    ``get_journal().log_message``.

    Args:
        msg: Parsed message dictionary
        addr: Source address (ip, port) tuple
        timestamp: Message timestamp (defaults to current time)
    """
    await get_journal().log_message(msg, addr, timestamp)
|
||||||
@@ -0,0 +1,531 @@
|
|||||||
|
"""Server runtime: starts UDP listener, HTTP server and websocket stubs."""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import socket
|
||||||
|
import time
|
||||||
|
import signal
|
||||||
|
import sys
|
||||||
|
import ssl
|
||||||
|
from . import __version__
|
||||||
|
|
||||||
|
from . import udp
|
||||||
|
from . import hbdclass
|
||||||
|
|
||||||
|
from . import ws as ws_mod
|
||||||
|
from . import notify as notify_mod
|
||||||
|
from . import data
|
||||||
|
from . import users as users_mod
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
msg_to_websockets = ws_mod.broadcast
|
||||||
|
eventlog = notify_mod.eventlog
|
||||||
|
|
||||||
|
# shared runtime collections and helpers
|
||||||
|
|
||||||
|
def save_state(config, hbdclass):
    """Persist hosts, messages and sessions to the pickle file.

    The data is written to ``<pickfile>.tmp`` and then moved over the real
    file with ``os.replace``. Failures while writing are logged and the
    temporary file is removed; they are not raised.

    Args:
        config: Mapping providing ``pickfile`` (default ``hbd.pickle``)
        hbdclass: Module/object exposing ``Host.hosts``
    """
    import pickle
    import os
    from . import users as users_mod

    # Timer-related attributes can't be serialized, so drop them first.
    # NOTE(review): this also cancels the overdue timer of every connection
    # on each call, including the periodic autosave — confirm that timers
    # are re-armed elsewhere (e.g. on the next heartbeat).
    timer_attrs = ('overdue_timer', 'overdue_callback', 'timeout_duration')
    for host in list(hbdclass.Host.hosts.values()):
        for conn in host.connections.values():
            if hasattr(conn, 'cancel_overdue_timer'):
                conn.cancel_overdue_timer()
            for attr in timer_attrs:
                if hasattr(conn, attr):
                    setattr(conn, attr, None)

    target = config.get("pickfile", "hbd.pickle")
    scratch = target + ".tmp"

    try:
        with open(scratch, "wb") as out:
            # One Pickler for all three objects: load_pickled_hosts reads
            # them back, in this order, from a single Unpickler.
            writer = pickle.Pickler(out)
            for payload in (hbdclass.Host.hosts, data.msgs, users_mod.save_sessions()):
                writer.dump(payload)
        os.replace(scratch, target)
    except Exception as e:
        logger.error("Failed to save state: %s", e)
        # Best effort: don't leave a partial temp file behind.
        try:
            os.unlink(scratch)
        except Exception:
            pass
|
||||||
|
|
||||||
|
|
||||||
|
def cleanup_function(config, hbdclass):
    """Exit hook: save the current state, logging before and after.

    Args:
        config: Configuration mapping, passed through to ``save_state``
        hbdclass: Module/object exposing ``Host.hosts``, passed through
    """
    logger.info("Running cleanup function...")
    save_state(config=config, hbdclass=hbdclass)
    logger.info("Cleanup complete.")
|
||||||
|
|
||||||
|
|
||||||
|
async def reload_configuration(config_obj, config_path, components):
    """Re-read the configuration and push it into the running components.

    Steps, in order: reload the config file, hand the new config to the
    notify module, reload users, re-apply per-host attributes (dyn, watched,
    access lists) to every known host, and reload the threshold checker when
    one is present in ``components``.

    Any exception is logged with a traceback and reported as a failed reload.

    Args:
        config_obj: ReloadableConfig instance
        config_path: Path to config file
        components: Dict with threshold_checker and other components

    Returns:
        True if reload succeeded, False otherwise
    """
    rule = "=" * 60
    try:
        for text in (rule, "Starting configuration reload...", rule):
            logger.info(text)

        # Reload config file
        fresh = await config_obj.reload(config_path)

        # Update notify module, then users
        notify_mod.reload_config(fresh)
        users_mod.load_users(fresh)

        # Re-apply host attributes from updated config to all known hosts
        from . import config as config_mod
        dyn_names = config_mod.get_dyndnshosts(fresh)
        watch_names = config_mod.get_watchhosts(fresh)
        for name, host_obj in hbdclass.Host.hosts.items():
            host_obj.dyn = name in dyn_names
            host_obj.watched = name in watch_names
            rights = config_mod.get_host_access(fresh, name)
            host_obj.apply_access(
                rights["owner"], rights["managers"], rights["monitors"]
            )

        # Reload threshold checker
        if 'threshold_checker' in components:
            components['threshold_checker'].reload(fresh)

        # Note: Changes to the following require restart:
        # - hb_port, hbd_port, ws_port (already bound)
        # - SSL certificates (already loaded)
        # - pickfile (already opened)
        # - journal settings (journal already initialized)

        # These are reloadable and effective immediately:
        # - notification_channels
        # - threshold_configs
        # - hosts (watchhosts, dyndnshosts, notification_channels)
        # - grace period (used on next heartbeat)
        # - debug/verbose flags (used on next message)

        for text in (rule, "Configuration reload completed successfully", rule):
            logger.info(text)
    except Exception as e:
        logger.error(rule)
        logger.error(f"Failed to reload configuration: {e}", exc_info=True)
        logger.error("Keeping previous configuration")
        logger.error(rule)
        return False
    else:
        return True
|
||||||
|
|
||||||
|
|
||||||
|
async def _run_async(config, config_path=None):
    """Run the hbd service until a shutdown signal arrives.

    Sets up, in order: signal handlers (SIGINT/SIGTERM request shutdown,
    SIGHUP requests a config reload), the notify module, the message
    journal, the threshold checker, the UDP listener, connection timers for
    hosts restored from the pickle, the HTTP server task, the DNS update
    worker, the WebSocket state and the periodic autosave task. It then
    waits for shutdown/reload events and, on exit, closes the UDP transport,
    cancels the tasks, closes the journal, stops the DNS worker and saves
    state.

    Args:
        config: Configuration mapping; wrapped in ``ReloadableConfig`` when
            it is not one already
        config_path: Path of the config file, needed for SIGHUP reloads
    """
    from .config import ReloadableConfig
    if not isinstance(config, ReloadableConfig):
        config = ReloadableConfig(config, config_path)

    loop = asyncio.get_running_loop()
    shutdown_event = asyncio.Event()
    reload_event = asyncio.Event()

    # Signal handlers for graceful shutdown and reload
    def signal_handler(signum, frame):
        sig_name = signal.Signals(signum).name if hasattr(signal, "Signals") else signum
        logger.info(f"Received {sig_name}, initiating shutdown...")
        loop.call_soon_threadsafe(shutdown_event.set)

    def reload_handler(signum, frame):
        sig_name = signal.Signals(signum).name if hasattr(signal, "Signals") else signum
        logger.info(f"Received {sig_name}, initiating config reload...")
        loop.call_soon_threadsafe(reload_event.set)

    # Register signal handlers. The extra positional arguments are passed to
    # the callback, matching the (signum, frame) signature above.
    loop.add_signal_handler(signal.SIGINT, signal_handler, signal.SIGINT, None)
    loop.add_signal_handler(signal.SIGTERM, signal_handler, signal.SIGTERM, None)
    loop.add_signal_handler(signal.SIGHUP, reload_handler, signal.SIGHUP, None)

    from . import http as http_mod
    from . import dns as dns_mod
    from . import notify as notify_mod
    from . import journal as journal_mod
    from . import threshold as threshold_mod

    notify_mod.setup(config, loop=loop)

    # Initialize message journal
    msg_journal = journal_mod.get_journal(config)
    await msg_journal.initialize()

    # Initialize threshold checker
    threshold_checker = threshold_mod.ThresholdChecker(
        config=config,
        renotify_interval=config.get("threshold_renotify_interval", 3600),
        journal=msg_journal,
    )
    logger.info("Threshold checker initialized")

    # Components dict for reload orchestration
    components = {
        'threshold_checker': threshold_checker,
        'msg_journal': msg_journal,
    }

    sock = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM)
    # Disable IPV6_V6ONLY option to enable dual-stack (listen on IPv4 as well)
    # This option is system-dependent; on many systems, setting it to False enables
    # the socket to handle both IPv4 and IPv6 traffic.
    try:
        sock.setsockopt(socket.IPPROTO_IPV6, socket.IPV6_V6ONLY, False)
    except OSError as e:
        logger.warning(
            f"Warning: Could not reset IPV6_V6ONLY not supported or dual-stack is unavailable. Error: {e}"
        )

    bind_addr = ("::", config.get("hb_port", 50003))
    sock.bind(bind_addr)
    logger.info("Starting UDP server on %s:%s", *bind_addr)

    # Try to enable kernel receive timestamps (Linux SO_TIMESTAMP).
    # If supported, read datagrams via recvmsg() so RTT uses the kernel
    # timestamp rather than the time.time() call after asyncio scheduling.
    use_kernel_ts = udp.enable_kernel_timestamps(sock)
    if use_kernel_ts:
        logger.info("SO_TIMESTAMP enabled: using kernel receive timestamps for RTT")
    else:
        logger.info("SO_TIMESTAMP not available: using time.time() for RTT")

    def udp_handler(msg, addr, transport, recv_ts=None):
        # The context is rebuilt per datagram, so debug/verbose are read
        # from the config on every message.
        ctx = dict(
            config=config,
            hbdclass=hbdclass,
            msg_to_websockets=msg_to_websockets,
            msg_journal=msg_journal,
            threshold_checker=threshold_checker,
            DEBUG=config.get("debug", 0),
            verbose=config.get("verbose", False),
            recv_ts=recv_ts,
        )
        udp.handle_datagram(msg, addr, transport, ctx)

    if use_kernel_ts:
        # recvmsg path: manage the socket ourselves with loop.add_reader()
        sock.setblocking(False)
        transport = udp.RecvmsgTransport(loop, sock)
        reader = udp.make_recvmsg_reader(sock, udp_handler, transport)
        loop.add_reader(sock.fileno(), reader)
        protocol = None
    else:
        transport, protocol = await loop.create_datagram_endpoint(
            lambda: udp.EchoServerProtocol(config=config, handler=udp_handler),
            sock=sock,
        )

    # Restore connection timers for hosts loaded from pickle
    restore_ctx = dict(
        config=config,
        hbdclass=hbdclass,
        msg_to_websockets=msg_to_websockets,
        threshold_checker=threshold_checker,
    )
    udp.restore_connection_timers(hbdclass, restore_ctx)

    # HTTP server (asyncio-based via aiohttp).
    # http_task is pre-set to None so the shutdown code below can reference
    # it even when creating the task raised.
    http_task = None
    try:
        http_task = asyncio.create_task(
            http_mod.start(
                host=config.get("hbd_host", ""),
                port=config.get("hbd_port", 50004),
                config=config,
                hbdclass=hbdclass,
                tcss=None,
                verbose=config.get("verbose", False),
                get_now=lambda: time.time(),
                VER="",
            )
        )
        # The task has only been scheduled at this point; a bind failure
        # would surface inside the task, not here.
        logger.info(
            "HTTP server started on %s:%s",
            config.get("hbd_host", ""),
            config.get("hbd_port", 50004),
        )
    except Exception as e:
        logger.exception("failed to start HTTP server: %s", e)

    # start dns update worker (async)
    dns_task = None
    try:
        dns_task = dns_mod.start_dns_worker(
            hbdclass, config, log=eventlog, loop=loop
        )
        logger.info("dns update worker started")
    except Exception as e:
        logger.exception("dns worker failed to start: %s", e)

    # Register WebSocket state — connections are now served through /ws on the HTTP port
    ws_mod.setup(
        loop=loop,
        get_hosts=lambda: [
            hbdclass.Host.hosts[h].stateinfo()
            for h in sorted(hbdclass.Host.hosts)
        ],
        verbose=config.get("verbose", False),
    )
    logger.info("WebSocket handler registered on /ws (HTTP port %s)", config.get("hbd_port", 50004))

    # Periodic autosave task
    autosave_interval = config.get("autosave_interval", 300)  # default: 5 minutes

    async def autosave_task():
        while True:
            await asyncio.sleep(autosave_interval)
            logger.debug("Autosaving state...")
            save_state(config, hbdclass)
            logger.debug("Autosave complete (%d hosts)", len(hbdclass.Host.hosts))

    autosave = asyncio.create_task(autosave_task())
    logger.info("Autosave task started (interval: %ds)", autosave_interval)

    # Main event loop - monitor shutdown and reload events
    try:
        while True:
            # Wait for either shutdown or reload event
            done, pending = await asyncio.wait(
                [
                    asyncio.create_task(shutdown_event.wait()),
                    asyncio.create_task(reload_event.wait()),
                ],
                return_when=asyncio.FIRST_COMPLETED
            )

            # Check which event was triggered; shutdown wins over reload
            if shutdown_event.is_set():
                logger.info("Shutdown signal received, stopping services...")
                # Cancel pending wait tasks
                for task in pending:
                    task.cancel()
                break

            if reload_event.is_set():
                # Clear the event for next reload
                reload_event.clear()

                # Cancel pending wait tasks
                for task in pending:
                    task.cancel()

                # Perform reload if config_path is available
                if config_path:
                    await reload_configuration(config, config_path, components)
                else:
                    logger.warning("Cannot reload: no config path available")

                # Continue main loop
                continue

    except Exception as e:
        logger.exception("Error in main loop: %s", e)
    finally:
        # Cancel all running tasks
        logger.info("Cancelling tasks...")
        try:
            transport.close()
        except Exception as e:
            logger.warning("Error closing UDP transport: %s", e)

        tasks_to_cancel = [http_task, autosave]
        for task in tasks_to_cancel:
            if task:
                try:
                    task.cancel()
                    logger.debug("Cancelled task: %s", task)
                except Exception as e:
                    logger.warning("Error cancelling task: %s", e)

        # Wait for tasks to finish cancellation with timeout
        remaining_tasks = [t for t in tasks_to_cancel if t]
        if remaining_tasks:
            try:
                await asyncio.wait_for(
                    asyncio.gather(*remaining_tasks, return_exceptions=True),
                    timeout=2.0,
                )
            except asyncio.TimeoutError:
                logger.warning("Timeout waiting for tasks to cancel")
            except Exception as e:
                logger.debug("Exception during task cancellation: %s", e)

        # Close message journal
        try:
            await msg_journal.close()
        except Exception as e:
            logger.warning("Error closing message journal: %s", e)

        # Signal DNS worker to exit and await it
        try:
            if dns_task:
                # None is queued as the exit signal for the worker.
                # NOTE(review): assumes dnsQ.put() is a synchronous call —
                # confirm against the dns module.
                try:
                    hbdclass.Host.dnsQ.put(None)
                except Exception as e:
                    # Best effort: the worker is still awaited/cancelled below.
                    logger.debug("Could not queue DNS worker exit signal: %s", e)
                try:
                    await asyncio.wait_for(dns_task, timeout=2.0)
                    logger.info("DNS worker finished")
                except asyncio.TimeoutError:
                    logger.warning("Timeout waiting for DNS worker to finish")
                    dns_task.cancel()
                except asyncio.CancelledError:
                    logger.info("DNS worker was cancelled")
                except Exception as e:
                    logger.warning("Error awaiting DNS worker: %s", e)
                finally:
                    # Clear queue bridge to release any held references
                    hbdclass.Host.dnsQ = None
        except Exception as e:
            logger.warning("Error stopping DNS worker: %s", e)

        # Save state (hosts + sessions) on clean shutdown
        try:
            save_state(config, hbdclass)
            logger.info("State saved on shutdown")
        except Exception as e:
            logger.warning("Error saving state on shutdown: %s", e)

        logger.info("All tasks cancelled")
|
||||||
|
|
||||||
|
|
||||||
|
def load_pickled_hosts(config, hbdclass):
    """Restore hosts, messages and sessions from the pickle file, if present.

    The file named by ``config["pickfile"]`` (default ``"hbd.pickle"``) is
    read as three consecutive pickles: the host table, the message list and
    the user sessions (the sessions entry may be missing in older files).
    After loading, each host has its ``dyn``/``watched`` flags and access
    lists refreshed from *config*, and hosts listed in
    ``config["drophosts"]`` are removed.

    If unpickling fails the error is logged and the pickle file is deleted.

    Args:
        config: Configuration dictionary.
        hbdclass: Namespace providing the ``Host`` and ``Connection`` classes.
    """
    import os
    import pickle
    from . import config as config_mod
    from . import users as users_mod

    verbose = config.get("verbose", False)
    pickfile = config.get("pickfile", "hbd.pickle")
    dyndnshosts = config_mod.get_dyndnshosts(config)
    watchhosts = config_mod.get_watchhosts(config)
    drophosts = config.get("drophosts", [])

    if not os.path.exists(pickfile):
        if verbose:
            logger.info("no pickled data")
        return

    if verbose:
        logger.info("opening pickls %s", pickfile)

    # NOTE(security): unpickling runs arbitrary code embedded in the file, so
    # the pickle file must only ever be writable by the daemon itself.
    load_failed = False
    # The context manager guarantees the handle is closed on every path, in
    # particular before the file is unlinked after a failed load.
    with open(pickfile, "rb") as pickf:
        pick = pickle.Unpickler(pickf)
        try:
            hbdclass.Host.hosts = pick.load()
            data.msgs = pick.load()
            try:
                users_mod.load_sessions(pick.load())
            except Exception:
                pass  # older pickle without sessions — fine
        except Exception as e:
            logger.exception("load pickled failed: %s", e)
            load_failed = True
    if load_failed:
        os.unlink(pickfile)

    hbdclass.Connection.htab = {}
    # Iterate over a snapshot of the keys; the table is edited below.
    for h in list(hbdclass.Host.hosts.keys()):
        host = hbdclass.Host.hosts[h]
        host.dyn = h in dyndnshosts
        host.watched = h in watchhosts
        host.fixup()
        access = config_mod.get_host_access(config, h)
        host.apply_access(access["owner"], access["managers"], access["monitors"])
    for h in drophosts:
        if h in hbdclass.Host.hosts:
            del hbdclass.Host.hosts[h]
    if verbose:
        logger.info("%s pickled hosts loaded", len(hbdclass.Host.hosts))
|
||||||
|
|
||||||
|
|
||||||
|
def run(config, config_path=None):
    """Run the hbd service until it stops (blocking).

    The asyncio event loop is created, driven and closed by hand so that
    shutdown steps happen in a fixed order.  The process is terminated with
    ``os._exit(0)`` once shutdown has finished.

    Args:
        config: Configuration dictionary
        config_path: Path to config file (for reload support)
    """
    import os

    # "debug" outranks "verbose"; with neither set only warnings are shown.
    if config.get("debug", 0) > 0:
        level = logging.DEBUG
    elif config.get("verbose", False):
        level = logging.INFO
    else:
        level = logging.WARNING
    logging.basicConfig(level=level)
    load_pickled_hosts(config, hbdclass)

    notify_mod.initlog(logfile=config.get("logfile", "messages.log"))
    users_mod.load_users(config)

    # Write pidfile (best effort: a failure is only logged)
    pidfile = config.get("pidfile", "")
    if pidfile:
        try:
            with open(pidfile, "w") as pid_fh:
                pid_fh.write(str(os.getpid()))
        except Exception as exc:
            logger.warning("Failed to write pidfile %s: %s", pidfile, exc)

    eventlog(None, "INFO", f"hbd version {__version__} starting up")

    if not config_path:
        logger.warning("No config path provided - reload via SIGHUP disabled")
    else:
        logger.info(f"Config file: {config_path} (reload with SIGHUP)")

    # Create and install the event loop manually
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    try:
        loop.run_until_complete(_run_async(config, config_path=config_path))
    except KeyboardInterrupt:
        logger.info("Received KeyboardInterrupt, shutting down...")
    except Exception as exc:
        logger.exception("Unhandled exception in main: %s", exc)
    finally:
        cleanup_function(config, hbdclass)
        logger.info("hbd shutdown complete")
        eventlog(None, "INFO", f"hbd version {__version__} shutdown")
        notify_mod.closelog()

        # Remove pidfile; ignore any error while doing so
        if pidfile:
            try:
                os.unlink(pidfile)
            except Exception:
                pass

        # Cancel whatever is still scheduled, give the loop one more run so
        # the cancellations are processed, then close the loop.
        try:
            leftovers = asyncio.all_tasks(loop)
            for leftover in leftovers:
                leftover.cancel()
            if leftovers:
                loop.run_until_complete(
                    asyncio.gather(*leftovers, return_exceptions=True)
                )
        except Exception:
            pass
        finally:
            loop.close()

    # Exit
    os._exit(0)
|
||||||
@@ -0,0 +1,28 @@
|
|||||||
|
"""Monitor helper for heartbeat daemon.
|
||||||
|
|
||||||
|
This module provides monitoring tasks for the heartbeat daemon.
|
||||||
|
The primary reachability monitoring is now event-driven (timers set/reset
|
||||||
|
on HTB arrival in udp.py) rather than periodic polling.
|
||||||
|
|
||||||
|
This module can be extended for additional monitoring tasks.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
import asyncio
|
||||||
|
import time
|
||||||
|
from . import notify as notify_mod
|
||||||
|
|
||||||
|
DROPOVERDUE = 7 * 24 * 3600
|
||||||
|
eventlog = notify_mod.eventlog
|
||||||
|
|
||||||
|
|
||||||
|
async def cleanup_connections(hbdclass):
    """Cancel the overdue timer of every connection of every known host.

    Intended for shutdown, so that no timer callback fires afterwards.
    Connections without a ``cancel_overdue_timer`` attribute are skipped.
    """
    # Snapshot the host table before walking it.
    for host in list(hbdclass.Host.hosts.values()):
        for conn in host.connections.values():
            if not hasattr(conn, 'cancel_overdue_timer'):
                continue
            conn.cancel_overdue_timer()
|
||||||
|
|
||||||
@@ -0,0 +1,483 @@
|
|||||||
|
"""Notification helpers: email, pushover, matrix, mattermost, signal, sms and dispatcher.
|
||||||
|
|
||||||
|
Channel types supported:
|
||||||
|
pushover - Pushover app notifications
|
||||||
|
email - SMTP email
|
||||||
|
matrix - Matrix (via matrix-nio)
|
||||||
|
mattermost - Mattermost webhook
|
||||||
|
signal - Signal via signal-cli subprocess
|
||||||
|
sms_voipms - SMS via voip.ms REST API
|
||||||
|
|
||||||
|
Each channel can specify ``min_level: WARNING|CRITICAL`` (default: WARNING).
|
||||||
|
|
||||||
|
Notifications are dispatched to the owner + managers of the host, each via
|
||||||
|
their own ``notification_channels`` list. When no users are configured the
|
||||||
|
server runs silently (no notifications sent).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import smtplib
|
||||||
|
import subprocess
|
||||||
|
import time
|
||||||
|
import sys
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from . import data
|
||||||
|
from . import ws as ws_mod
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
msg_to_websockets = ws_mod.broadcast
|
||||||
|
|
||||||
|
# Module-level state set via setup()
|
||||||
|
_config: dict = {}
|
||||||
|
|
||||||
|
# Tracks which channels fired a WARNING/CRITICAL per host.
|
||||||
|
# {host_name: set of channel_names} — used to route RECOVER to the same channels.
|
||||||
|
_alerted_channels: dict = {}
|
||||||
|
|
||||||
|
logf = None
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Level ordering
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
_LEVEL_ORDER = {"RECOVER": 0, "INFO": 0, "WARNING": 1, "CRITICAL": 2}
|
||||||
|
|
||||||
|
def _level_value(level: str) -> int:
|
||||||
|
return _LEVEL_ORDER.get(level.upper(), 0)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Notification dataclass
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@dataclass
class Notification:
    """Structured notification payload handed to the channel drivers."""

    # Headline, e.g. "[CRITICAL] webserver01"
    title: str
    # Detail message
    body: str
    # One of RECOVER | WARNING | CRITICAL | INFO
    level: str
    # Link to plugin metrics page (empty when not set)
    url: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Module setup
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def setup(cfg: dict, loop: Optional[asyncio.AbstractEventLoop] = None):
|
||||||
|
"""Initialize notifier from configuration dict."""
|
||||||
|
global _config
|
||||||
|
_config = dict(cfg)
|
||||||
|
|
||||||
|
|
||||||
|
def reload_config(cfg: dict):
    """Replace the module configuration with a shallow copy of *cfg*.

    Meant to be called when the daemon reloads its configuration (SIGHUP);
    the swap is announced at INFO level.
    """
    global _config
    fresh = dict(cfg)
    _config = fresh
    logger.info("Notification configuration reloaded")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Event log (websocket + file + in-memory)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def initlog(logfile):
    """Open *logfile* in append mode and install it as the event-log sink.

    If the file cannot be opened a message is printed and ``sys.stderr``
    is used as the sink instead.  Returns the sink that was installed.
    """
    global logf
    try:
        sink = open(logfile, "a+")
    except Exception as exc:
        print("cannot open logfile %s, using STDERR: %s" % (logfile, exc))
        sink = sys.stderr
    logf = sink
    return logf
|
||||||
|
|
||||||
|
|
||||||
|
def closelog():
    """Close the event-log file and uninstall it as the log sink.

    Does nothing when no sink is installed or when the sink is
    ``sys.stderr`` (which is never closed here).  After a real file has
    been closed the module-level ``logf`` is reset to ``None`` so that
    later ``eventlog`` calls skip the file instead of writing to a closed
    handle.
    """
    global logf
    if logf and logf is not sys.stderr:
        try:
            logf.close()
        except Exception as e:
            # Best effort on shutdown: report, never raise.
            logger.debug("error closing logfile: %s", e)
        logf = None
|
||||||
|
|
||||||
|
|
||||||
|
def eventlog(host, lvl, m, service=None):
    """Record one event line everywhere events are kept.

    The line has the form ``<local timestamp> <lvl> [<host> ]<m>``.  It is
    appended to ``data.msgs``, logged at INFO level, written (and flushed)
    to the log file when one is installed, and broadcast to websockets as a
    ``"message"``.  ``service`` is accepted but not used here.
    """
    stamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    prefix = f"{stamp} {lvl} "
    if host:
        prefix = prefix + f"{host} "
    line = prefix + m

    data.msgs.append(line)
    logger.info(line)
    if logf:
        # A failing log file must not stop the event from being broadcast.
        try:
            logf.write(line + "\n")
            logf.flush()
        except Exception as exc:
            logger.warning("failed to write to logfile: %s", exc)
    msg_to_websockets("message", line)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Low-level channel drivers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _send_pushover(channel_cfg: dict, notif: Notification) -> bool:
    """Send *notif* through the Pushover HTTPS API.

    Args:
        channel_cfg: Channel settings; ``token`` and ``user`` are required.
        notif: Notification to send.  When ``notif.url`` is set it is added
            as a supplementary link titled "Plugin metrics".

    Returns:
        True when the API answered with HTTP 200, otherwise False (missing
        settings, network error, or any other status).
    """
    import http.client
    import urllib.parse

    token = channel_cfg.get("token", "")
    user = channel_cfg.get("user", "")
    if not token or not user:
        logger.warning("pushover: missing token or user")
        return False

    params: dict = {"token": token, "user": user, "title": notif.title, "message": notif.body}
    if notif.url:
        params["url"] = notif.url
        params["url_title"] = "Plugin metrics"

    # A socket timeout keeps a stalled request from blocking its thread forever.
    conn = http.client.HTTPSConnection("api.pushover.net:443", timeout=_TIMEOUT)
    try:
        conn.request(
            "POST",
            "/1/messages.json",
            urllib.parse.urlencode(params),
            {"Content-type": "application/x-www-form-urlencoded"},
        )
        r = conn.getresponse()
        logger.debug("pushover response: %s %s", r.status, r.reason)
        return r.status == 200
    except Exception as e:
        logger.error("pushover error: %s", e)
        return False
    finally:
        # Always release the socket, on success as well as on failure.
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def _send_email(channel_cfg: dict, notif: Notification) -> bool:
    """Send *notif* as a plain-text e-mail over SMTP.

    Args:
        channel_cfg: Channel settings.  ``recipients`` (list or single
            address string), ``sender`` and ``smtp_server`` are required;
            ``smtp_port`` defaults to 587; ``smtp_user``/``smtp_password``
            enable authentication when both are set.
        notif: Notification to send.  The title becomes the subject and
            ``notif.url``, when set, is appended to the body.

    Returns:
        True when the message was handed to the server, False otherwise.
    """
    from email.message import EmailMessage
    from email.utils import formatdate

    recipients = channel_cfg.get("recipients", [])
    sender = channel_cfg.get("sender", "")
    smtp_server = channel_cfg.get("smtp_server", "")
    smtp_port = channel_cfg.get("smtp_port", 587)
    smtp_user = channel_cfg.get("smtp_user")
    smtp_password = channel_cfg.get("smtp_password")

    if not recipients or not sender or not smtp_server:
        logger.warning("email: missing recipients, sender, or smtp_server")
        return False

    to_addrs = [recipients] if isinstance(recipients, str) else list(recipients)

    body_text = notif.body
    if notif.url:
        body_text += f"\n\n{notif.url}"

    # Build the message with the email package so that non-ASCII text is
    # encoded correctly and every recipient appears in the To header.
    msg = EmailMessage()
    msg["To"] = ", ".join(to_addrs)
    msg["From"] = sender
    # Headers must be a single line; fold any line breaks in the title.
    msg["Subject"] = " ".join(str(notif.title).splitlines())
    msg["Date"] = formatdate(localtime=True)
    msg.set_content(body_text)

    server = None
    try:
        server = smtplib.SMTP(smtp_server, smtp_port, timeout=_TIMEOUT)
        if smtp_port == 587:
            server.starttls()
        server.ehlo()
        if smtp_user and smtp_password:
            server.login(smtp_user, smtp_password)
        server.send_message(msg, from_addr=sender, to_addrs=to_addrs)
        server.quit()
        return True
    except Exception as e:
        logger.warning("email send failed: %s", e)
        # Only try to close a connection that was actually established.
        if server is not None:
            try:
                server.quit()
            except Exception:
                pass
        return False
|
||||||
|
|
||||||
|
|
||||||
|
def _send_mattermost(channel_cfg: dict, notif: Notification) -> bool:
    """Post *notif* to a Mattermost incoming webhook via ``mattermostdriver``.

    ``host``, ``token`` and ``channel`` are required in *channel_cfg*;
    ``username`` (default "hbd") and ``icon`` are optional.  The driver is
    configured for plain http on port 8065 with base path ``/api/v4``.

    Returns False when the driver package is missing, a required setting is
    absent, or the webhook call raises; otherwise True only if the call
    returned ``None`` or an empty string.
    """
    try:
        from mattermostdriver import Driver
    except ImportError:
        logger.error("mattermostdriver not installed")
        return False

    server_host = channel_cfg.get("host", "")
    hook_token = channel_cfg.get("token", "")
    target = channel_cfg.get("channel", "")
    if not (server_host and hook_token and target):
        logger.warning("mattermost: missing host, token, or channel")
        return False

    pieces = [f"**{notif.title}**", f"{notif.body}"]
    if notif.url:
        pieces.append(f"[Plugin metrics]({notif.url})")
    message = "\n".join(pieces)

    driver = Driver({"url": server_host, "scheme": "http", "basepath": "/api/v4", "port": 8065})

    payload: dict = {
        "text": message,
        "channel": target,
        "username": channel_cfg.get("username", "hbd"),
    }
    icon_url = channel_cfg.get("icon")
    if icon_url:
        payload["icon_url"] = icon_url

    try:
        reply = driver.webhooks.call_webhook(hook_token, payload)
    except Exception as exc:
        logger.error("mattermost error: %s", exc)
        return False
    return bool(reply is None or reply == "")
|
||||||
|
|
||||||
|
|
||||||
|
def _send_signal(channel_cfg: dict, notif: Notification) -> bool:
    """Send *notif* as a Signal message by running ``signal-cli``.

    Args:
        channel_cfg: Channel settings.  ``user`` and ``recipient`` are
            required; ``cli_path`` defaults to ``/usr/local/bin/signal-cli``.
        notif: Notification to send; title, body and (if set) URL are
            joined with newlines.

    Returns:
        True when the command exited with status 0, False otherwise
        (missing settings, non-zero exit, timeout or any other error).
    """
    cli = channel_cfg.get("cli_path", "/usr/local/bin/signal-cli")
    user = channel_cfg.get("user", "")
    recipient = channel_cfg.get("recipient", "")
    if not user or not recipient:
        logger.warning("signal: missing user or recipient")
        return False

    msg = f"{notif.title}\n{notif.body}"
    if notif.url:
        msg += f"\n{notif.url}"

    try:
        # The timeout stops a hung signal-cli from blocking this thread
        # forever; TimeoutExpired is handled by the except clause below.
        res = subprocess.run(
            [cli, "-u", user, "send", "-m", msg, recipient],
            capture_output=True,
            timeout=_TIMEOUT,
        )
        if res.returncode != 0:
            # errors="replace": undecodable stderr must not hide the failure.
            logger.error("signal failed: %s", res.stderr.decode(errors="replace"))
            return False
        return True
    except Exception as e:
        logger.exception("signal exception: %s", e)
        return False
|
||||||
|
|
||||||
|
|
||||||
|
async def _send_sms_voipms_async(channel_cfg: dict, notif: Notification) -> bool:
    """Send *notif* as an SMS through the voip.ms REST API.

    The request is a multipart form-data POST.  ``api_user``,
    ``api_password``, ``did`` and ``dst`` are required in *channel_cfg*.
    The text is "<title>: <body>", cut to 160 characters (157 plus "...").

    Returns True only for an HTTP 200 reply whose JSON ``status`` is
    ``"success"``; any error is logged and yields False.
    """
    import json
    import aiohttp

    api_user = channel_cfg.get("api_user", "")
    api_password = channel_cfg.get("api_password", "")
    did = channel_cfg.get("did", "")
    dst = channel_cfg.get("dst", "")
    if not (api_user and api_password and did and dst):
        logger.warning("sms_voipms: missing api_user, api_password, did, or dst")
        return False

    # SMS text: title + body, shortened to fit 160 characters
    sms_text = f"{notif.title}: {notif.body}"
    if len(sms_text) > 160:
        sms_text = sms_text[:157] + "..."

    # (field name, value) pairs in the order they are written to the form
    form_fields = (
        ("api_username", api_user),
        ("api_password", api_password),
        ("method", "sendSMS"),
        ("did", did),
        ("dst", dst),
        ("message", sms_text),
    )

    try:
        async with aiohttp.ClientSession() as session:
            with aiohttp.MultipartWriter("form-data") as mp:
                for field_name, field_value in form_fields:
                    part = mp.append(field_value)
                    part.set_content_disposition("form-data", name=field_name)
            async with session.post("https://voip.ms/api/v1/rest.php", data=mp) as resp:
                body = await resp.text()
                if resp.status != 200:
                    logger.error("sms_voipms HTTP %s: %s", resp.status, body)
                    return False
                api_status = json.loads(body).get("status")
                if api_status != "success":
                    logger.error("sms_voipms error: %s", api_status)
                    return False
                return True
    except Exception as exc:
        logger.error("sms_voipms exception: %s", exc)
        return False
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
async def _send_matrix_async(channel_cfg: dict, notif: Notification) -> bool:
    """Send *notif* to a Matrix room using matrix-nio.

    Args:
        channel_cfg: Channel settings; ``homeserver``, ``access_token`` and
            ``room_id`` are required.
        notif: Notification to send.  A plain-text body and an HTML
            ``formatted_body`` are both supplied; ``notif.url``, when set,
            is added as a "Plugin metrics" link.

    Returns:
        True when the reply carries an ``event_id`` attribute, False when
        matrix-nio is missing, a setting is absent, or sending fails.
    """
    from html import escape

    try:
        from nio import AsyncClient
    except ImportError:
        logger.error("matrix-nio not installed; pip install matrix-nio")
        return False

    homeserver = channel_cfg.get("homeserver", "")
    access_token = channel_cfg.get("access_token", "")
    room_id = channel_cfg.get("room_id", "")
    if not homeserver or not access_token or not room_id:
        logger.warning("matrix: missing homeserver, access_token, or room_id")
        return False

    text = f"{notif.title}\n{notif.body}"
    if notif.url:
        text += f"\n{notif.url}"

    # Notification text may contain arbitrary characters; escape it so it
    # cannot inject markup into the HTML-formatted message.
    formatted = f"<strong>{escape(str(notif.title))}</strong><br>{escape(str(notif.body))}"
    if notif.url:
        formatted += f'<br><a href="{escape(str(notif.url), quote=True)}">Plugin metrics</a>'

    client = AsyncClient(homeserver)
    client.access_token = access_token
    try:
        content = {
            "msgtype": "m.text",
            "body": text,
            "format": "org.matrix.custom.html",
            "formatted_body": formatted,
        }
        resp = await client.room_send(room_id, "m.room.message", content)
        if hasattr(resp, "event_id"):
            return True
        logger.error("matrix send failed: %s", resp)
        return False
    except Exception as e:
        logger.error("matrix exception: %s", e)
        return False
    finally:
        await client.close()
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Channel dispatcher (all async — sync drivers run in a thread executor)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Blocking (synchronous) drivers, keyed by channel type.  They are kept for
# `hbd notify` CLI usage (asyncio.run wraps them there).
_DRIVERS = dict(
    pushover=_send_pushover,
    email=_send_email,
    mattermost=_send_mattermost,
    signal=_send_signal,
)

# Upper bound, in seconds, for one send on one channel.
_TIMEOUT = 15
|
||||||
|
|
||||||
|
|
||||||
|
async def _dispatch_to_channel(channel_name: str, channel_cfg: dict, notif: Notification) -> bool:
    """Deliver *notif* to one named channel, respecting its ``min_level``.

    RECOVER notifications bypass the level filter.  A notification dropped
    by the filter counts as success (True).  ``matrix`` and ``sms_voipms``
    channels use their coroutine drivers; any other type is looked up in
    ``_DRIVERS`` and run in a worker thread.  Each send is limited to
    ``_TIMEOUT`` seconds; a timeout or an unknown type yields False.
    """
    level = notif.level.upper()
    if level != "RECOVER":
        min_level = channel_cfg.get("min_level", "WARNING").upper()
        if _level_value(level) < _level_value(min_level):
            logger.debug(
                "channel '%s': skipping level %s (min_level=%s)", channel_name, level, min_level
            )
            return True  # filtered intentionally

    ch_type = channel_cfg.get("type", "")
    try:
        if ch_type == "matrix":
            job = _send_matrix_async(channel_cfg, notif)
        elif ch_type == "sms_voipms":
            job = _send_sms_voipms_async(channel_cfg, notif)
        else:
            blocking_driver = _DRIVERS.get(ch_type)
            if blocking_driver is None:
                logger.warning("unknown channel type '%s' for channel '%s'", ch_type, channel_name)
                return False
            job = asyncio.to_thread(blocking_driver, channel_cfg, notif)
        return await asyncio.wait_for(job, timeout=_TIMEOUT)
    except asyncio.TimeoutError:
        logger.error("channel '%s' timed out after %ds", channel_name, _TIMEOUT)
        return False
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Central dispatch function
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _build_url(host_name: str) -> str:
    """Return the plugins-page link for *host_name*.

    Built from the configured ``base_url`` (trailing slashes removed);
    an empty string is returned when no ``base_url`` is configured.
    """
    root = _config.get("base_url", "").rstrip("/")
    return f"{root}/plugins#{host_name}" if root else ""
|
||||||
|
|
||||||
|
|
||||||
|
async def send_notification(host_name: str, notif: Notification) -> dict:
    """Dispatch *notif* to the owner and managers of *host_name*.

    The host's owner and managers are looked up, each user's
    ``notification_channels`` are resolved against the global
    ``notification_channels`` configuration, and the notification is sent
    once per distinct channel.  Nothing is sent when users are not enabled,
    the host is unknown, or it has no owner/managers.

    A RECOVER for a host with recorded alerts goes to exactly the channels
    that delivered those alerts.  Only alerts that were really delivered
    (not dropped by a channel's ``min_level``) are recorded for that.

    Args:
        host_name: Name of the host the notification is about.
        notif: Notification to send; ``notif.url`` is filled in from the
            configured base URL when it is empty.

    Returns:
        Dict mapping channel name to the boolean result of its dispatch.
    """
    from . import users as users_mod
    from . import hbdclass

    if not users_mod.users_enabled():
        return {}

    # Collect recipient usernames: owner + managers
    host = hbdclass.Host.hosts.get(host_name)
    if host is None:
        logger.debug("send_notification: host '%s' not found", host_name)
        return {}

    recipients: set[str] = set()
    owner = getattr(host, "owner", None)
    if owner:
        recipients.add(owner)
    # "or []" tolerates a host whose managers attribute is None.
    recipients.update(getattr(host, "managers", None) or [])

    if not recipients:
        logger.debug("send_notification: no owner/managers for '%s'", host_name)
        return {}

    # Fill url if not already set
    if not notif.url:
        notif.url = _build_url(host_name)

    global_channels: dict = _config.get("notification_channels", {})
    results: dict = {}
    level = notif.level.upper()
    is_alert = level in ("WARNING", "CRITICAL")
    is_recover = level in ("RECOVER",)

    # For RECOVER: send to every channel that previously fired an alert for
    # this host, regardless of that channel's min_level.
    if is_recover and host_name in _alerted_channels:
        # Take the record out before awaiting anything, so an alert raised
        # while the recover messages are in flight is not thrown away.
        alerted = _alerted_channels.pop(host_name)
        for channel_name in list(alerted):
            channel_cfg = global_channels.get(channel_name)
            if not channel_cfg:
                continue
            try:
                ok = await _dispatch_to_channel(channel_name, channel_cfg, notif)
                results[channel_name] = ok
                if ok:
                    logger.info("recover sent to channel '%s': %s", channel_name, notif.title)
            except Exception as e:
                logger.error("error sending recover to channel '%s': %s", channel_name, e)
                results[channel_name] = False
        return results

    for username in recipients:
        user = users_mod.get_user(username)
        if user is None:
            logger.debug("send_notification: user '%s' not found", username)
            continue
        for channel_name in user.notification_channels:
            if channel_name in results:
                continue  # already handled for another recipient
            channel_cfg = global_channels.get(channel_name)
            if not channel_cfg:
                logger.warning("channel '%s' not defined in notification_channels", channel_name)
                results[channel_name] = False
                continue
            try:
                # The dispatcher answers True for a message it dropped because
                # of min_level; detect that case so it is neither logged as
                # sent nor remembered as an alert for a later RECOVER.
                min_level = str(channel_cfg.get("min_level", "WARNING")).upper()
                filtered = (not is_recover) and _level_value(level) < _level_value(min_level)

                ok = await _dispatch_to_channel(channel_name, channel_cfg, notif)
                results[channel_name] = ok
                if not ok:
                    logger.warning("failed to send notification to channel '%s'", channel_name)
                elif filtered:
                    logger.debug(
                        "channel '%s': notification below min_level=%s, not sent",
                        channel_name,
                        min_level,
                    )
                else:
                    logger.info("notification sent to channel '%s': %s", channel_name, notif.title)
                    if is_alert:
                        _alerted_channels.setdefault(host_name, set()).add(channel_name)
            except Exception as e:
                logger.error("error sending to channel '%s': %s", channel_name, e)
                results[channel_name] = False

    return results
|
||||||
@@ -0,0 +1,328 @@
|
|||||||
|
"""Settings descriptor: maps config keys to display metadata.
|
||||||
|
|
||||||
|
``get_settings_sections(config)`` returns an ordered list of sections, each
|
||||||
|
containing a list of field descriptors. The template iterates this structure
|
||||||
|
generically, so adding editability later is a matter of:
|
||||||
|
|
||||||
|
1. Setting ``"editable": True`` on a field.
|
||||||
|
2. Adding the matching ``<input>``/``<select>`` in the template
|
||||||
|
(guided by ``"type"``).
|
||||||
|
3. Wiring a POST handler in http.py.
|
||||||
|
|
||||||
|
Field descriptor keys
|
||||||
|
---------------------
|
||||||
|
key str Config key (for future form POST matching)
|
||||||
|
label str Human-readable label
|
||||||
|
description str One-line help text shown below the value
|
||||||
|
value any Sanitized display value (secrets replaced with "•••")
|
||||||
|
type str One of: text | number | port | boolean | path | duration |
|
||||||
|
list | secret | size | select
|
||||||
|
editable bool Reserved for future use — currently always False
|
||||||
|
sensitive bool True when the raw value must never be shown
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Credential field names that should always be masked.
|
||||||
|
_SECRET_KEYS = frozenset({
|
||||||
|
"password", "token", "user_key", "api_key", "secret",
|
||||||
|
"smtp_password", "smtp_user",
|
||||||
|
})
|
||||||
|
|
||||||
|
_CHANNEL_TYPE_LABELS = {
|
||||||
|
"pushover": "Pushover",
|
||||||
|
"email": "E-mail",
|
||||||
|
"signal": "Signal",
|
||||||
|
"mattermost": "Mattermost",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _mask(value):
|
||||||
|
"""Return a masked placeholder for sensitive values."""
|
||||||
|
if not value:
|
||||||
|
return ""
|
||||||
|
return "•••"
|
||||||
|
|
||||||
|
|
||||||
|
def _fmt_size(n):
|
||||||
|
"""Format a byte count as a human-readable string."""
|
||||||
|
try:
|
||||||
|
n = int(n)
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
return str(n)
|
||||||
|
for unit in ("B", "KB", "MB", "GB"):
|
||||||
|
if n < 1024:
|
||||||
|
return f"{n} {unit}"
|
||||||
|
n //= 1024
|
||||||
|
return f"{n} TB"
|
||||||
|
|
||||||
|
|
||||||
|
def _fmt_duration(seconds):
|
||||||
|
"""Format seconds into a human-readable duration string."""
|
||||||
|
try:
|
||||||
|
s = int(seconds)
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
return str(seconds)
|
||||||
|
if s < 60:
|
||||||
|
return f"{s}s"
|
||||||
|
if s < 3600:
|
||||||
|
m, sec = divmod(s, 60)
|
||||||
|
return f"{m}m {sec}s" if sec else f"{m}m"
|
||||||
|
h, rem = divmod(s, 3600)
|
||||||
|
m = rem // 60
|
||||||
|
return f"{h}h {m}m" if m else f"{h}h"
|
||||||
|
|
||||||
|
|
||||||
|
def _sanitize_channel(name, cfg):
    """Return a new dict with the entries of *cfg*, credential values masked.

    Values whose key is in ``_SECRET_KEYS`` are replaced by ``_mask(value)``;
    every other value is carried over unchanged.  ``name`` is not used.
    """
    return {
        key: (_mask(value) if key in _SECRET_KEYS else value)
        for key, value in cfg.items()
    }
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Public API
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def get_settings_sections(config: dict) -> list:
    """Return ordered list of setting sections for the settings page.

    Each section:
        {
            "id": str,
            "title": str,
            "description": str,
            "fields": [ field_descriptor, ... ]
        }
    The "users", "channels" and "hosts" sections additionally carry a key of
    the same name holding the summarised user / channel / host entries.

    Each field_descriptor:
        {
            "key": str,
            "label": str,
            "description": str,
            "value": display_value,
            "raw": raw_config_value,  # None for sensitive
            "type": str,
            "editable": bool,
            "sensitive": bool,
        }
    """
    def as_list(value):
        # Normalise a "list"-typed setting. A bare string must stay one item
        # (list("abc") would explode it into characters) and a non-iterable
        # scalar must not raise.
        if not value:
            return []
        if isinstance(value, list):
            return value
        if isinstance(value, (str, bytes)):
            return [value]
        try:
            return list(value)
        except TypeError:
            return [value]

    def field(key, label, ftype, description="", editable=False, sensitive=False):
        # Build one field descriptor from the top-level config value at `key`.
        raw = config.get(key)
        if sensitive:
            display = _mask(raw)
            raw_out = None  # never hand the raw secret to the page
        elif ftype == "size":
            display = _fmt_size(raw)
            raw_out = raw
        elif ftype == "duration":
            display = _fmt_duration(raw)
            raw_out = raw
        elif ftype == "boolean":
            display = bool(raw)
            raw_out = raw
        elif ftype == "list":
            display = as_list(raw)
            raw_out = display
        else:
            display = raw if raw is not None else ""
            raw_out = raw
        return {
            "key": key,
            "label": label,
            "description": description,
            "value": display,
            "raw": raw_out,
            "type": ftype,
            "editable": editable,
            "sensitive": sensitive,
        }

    # ---- Notification channels (complex, built separately) ----------------
    notif_channels = []
    for ch_name, ch_cfg in (config.get("notification_channels") or {}).items():
        if not isinstance(ch_cfg, dict):
            continue
        # Coerce to str so a null / non-string "type" cannot break .title()
        # or the label lookup below.
        ch_type = str(ch_cfg.get("type") or "")
        fields = []
        for k, v in ch_cfg.items():
            if k == "type":
                continue
            sensitive = k in _SECRET_KEYS
            if sensitive:
                shown = _mask(v)
            elif isinstance(v, list):
                # str() each item: join() raises on non-string members.
                shown = ", ".join(str(item) for item in v)
            else:
                shown = str(v)
            fields.append({
                "key": k,
                "label": str(k).replace("_", " ").title(),
                "value": shown,
                "sensitive": sensitive,
            })
        type_label = _CHANNEL_TYPE_LABELS.get(ch_type)
        if type_label is None:
            type_label = ch_type.title()
        notif_channels.append({
            "name": ch_name,
            "type": ch_type,
            "type_label": type_label,
            "fields": fields,
        })

    # ---- Users (show metadata only, never password hashes) ----------------
    users_list = []
    for username, attrs in (config.get("users") or {}).items():
        if not isinstance(attrs, dict):
            continue
        users_list.append({
            "username": username,
            "full_name": attrs.get("full_name", ""),
            "admin": bool(attrs.get("admin", False)),
            "avatar": attrs.get("avatar", ""),
            "notification_channels": attrs.get("notification_channels", []),
        })

    # ---- Hosts summary ----------------------------------------------------
    hosts_list = []
    for hname, hcfg in (config.get("hosts") or {}).items():
        if not isinstance(hcfg, dict):
            continue
        hosts_list.append({
            "name": hname,
            "watch": bool(hcfg.get("watch", False)),
            "dyndns": bool(hcfg.get("dyndns", False)),
            "owner": hcfg.get("owner", ""),
            "managers": hcfg.get("managers", []),
            "monitors": hcfg.get("monitors", []),
            "threshold_config": hcfg.get("threshold_config", ""),
            "notification_channels": hcfg.get("notification_channels", []),
        })

    return [
        {
            "id": "network",
            "title": "Network",
            "description": "Ports and bind addresses for all server sockets.",
            "fields": [
                field("hb_port", "Heartbeat UDP port", "port",
                      "UDP port the server listens on for heartbeat datagrams."),
                field("hbd_host", "HTTP bind address", "text",
                      "Interface to bind the HTTP server to. Empty = all interfaces."),
                field("hbd_port", "HTTP API port", "port",
                      "TCP port for the HTTP API and web UI."),
                field("ws_port", "WebSocket port", "port",
                      "TCP port for the plain WebSocket server."),
                field("wss_port", "Secure WebSocket port", "port",
                      "TCP port for WSS (TLS WebSocket). Leave empty to disable."),
            ],
        },
        {
            "id": "tls",
            "title": "TLS / WebSocket Security",
            "description": "Certificate paths used when wss_port is set.",
            "fields": [
                field("cert_path", "Certificate directory", "path",
                      "Directory containing the TLS certificate and key files."),
                field("wss_pem", "Certificate file", "text",
                      "Filename of the TLS certificate chain (PEM format)."),
                field("wss_key", "Key file", "text",
                      "Filename of the TLS private key (PEM format)."),
            ],
        },
        {
            "id": "monitoring",
            "title": "Monitoring",
            "description": "Heartbeat timing and alert re-notification behaviour.",
            "fields": [
                field("interval", "Heartbeat interval", "duration",
                      "Expected time between heartbeat messages from each client."),
                field("grace", "Grace multiplier", "number",
                      "A host is marked overdue after interval × grace seconds of silence."),
                field("threshold_renotify_interval", "Re-notify interval", "duration",
                      "How often to re-send notifications for ongoing threshold alerts."),
                field("autosave_interval", "Autosave interval", "duration",
                      "How often the server saves its state to disk."),
            ],
        },
        {
            "id": "persistence",
            "title": "Persistence & Logging",
            "description": "State file and event log settings.",
            "fields": [
                field("pickfile", "State file", "path",
                      "Path to the pickle file used to persist host state across restarts."),
                field("logfile", "Event log", "path",
                      "Path to the event log file."),
            ],
        },
        {
            "id": "journal",
            "title": "Message Journal",
            "description": "All received heartbeat and plugin messages are journalled here.",
            "fields": [
                field("journal_enabled", "Enabled", "boolean",
                      "Turn journalling on or off."),
                field("journal_dir", "Journal directory", "path",
                      "Directory where journal files are written."),
                field("journal_file", "Journal filename", "text",
                      "Base filename for the journal (rotated copies get a numeric suffix)."),
                field("journal_max_size", "Max file size", "size",
                      "Rotate the journal when it exceeds this size."),
                field("journal_max_backups", "Backup count", "number",
                      "Number of rotated journal files to keep."),
            ],
        },
        {
            "id": "dns",
            "title": "Dynamic DNS",
            "description": "nsupdate-based DNS registration for dynamic hosts.",
            "fields": [
                field("nsupdate_bin", "nsupdate binary", "path",
                      "Full path to the nsupdate executable."),
                field("dyndomains", "Dynamic domains", "list",
                      "DNS zones managed by nsupdate for dynamic hosts."),
                field("drophosts", "Drop hosts", "list",
                      "Hostnames to silently ignore — no state, no alerts."),
            ],
        },
        {
            "id": "users",
            "title": "Users",
            "description": "Accounts defined in the config file. Password hashes are never shown.",
            "users": users_list,
            "fields": [
                field("default_owner", "Default owner", "text",
                      "Username that owns hosts with no explicit owner. "
                      "Falls back to the first admin user."),
            ],
        },
        {
            "id": "channels",
            "title": "Notification Channels",
            "description": "Named notification providers. Credentials are masked.",
            "channels": notif_channels,
            "fields": [
                field("default_notification_channels", "Default channels", "list",
                      "Channels used when a host does not specify its own."),
            ],
        },
        {
            "id": "hosts",
            "title": "Hosts",
            "description": "Host definitions loaded from the config file.",
            "hosts": hosts_list,
            "fields": [],
        },
        {
            "id": "runtime",
            "title": "Runtime",
            "description": "Flags set at startup (require restart to change).",
            "fields": [
                field("foreground", "Foreground mode", "boolean",
                      "Run in the foreground instead of daemonising."),
                field("verbose", "Verbose logging", "boolean",
                      "Enable verbose log output."),
                field("debug", "Debug level", "number",
                      "0 = off. Higher values increase log verbosity."),
            ],
        },
    ]
|
||||||
Binary file not shown.
|
After Width: | Height: | Size: 181 KiB |
@@ -139,4 +139,69 @@
|
|||||||
font-size: 9px;
|
font-size: 9px;
|
||||||
float: left;
|
float: left;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* ── Responsive / mobile ── */
|
||||||
|
|
||||||
|
/* Suppress the global transition on mobile to avoid sluggish feel */
|
||||||
|
@media (max-width: 640px) {
|
||||||
|
* { transition: none !important; }
|
||||||
|
|
||||||
|
html, body {
|
||||||
|
overflow: auto;
|
||||||
|
height: auto;
|
||||||
|
font-size: 16px; /* prevent iOS auto-zoom on inputs */
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Pages that use flex-column full-viewport layout need to relax on mobile */
|
||||||
|
body[style*="height: 100vh"],
|
||||||
|
body {
|
||||||
|
height: auto !important;
|
||||||
|
min-height: 100vh;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Containers: full width, no fixed heights */
|
||||||
|
.container {
|
||||||
|
max-width: 100% !important;
|
||||||
|
max-height: none !important;
|
||||||
|
overflow: visible !important;
|
||||||
|
padding: 8px !important;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Log section: fixed reasonable height instead of flex-grow */
|
||||||
|
.log-section {
|
||||||
|
flex: none !important;
|
||||||
|
max-height: 40vh !important;
|
||||||
|
overflow-y: auto !important;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Table section: allow vertical scroll, cap height */
|
||||||
|
.table-section {
|
||||||
|
max-height: 55vh !important;
|
||||||
|
overflow-y: auto !important;
|
||||||
|
overflow-x: auto !important;
|
||||||
|
padding: 8px !important;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Slightly larger tap targets in tables */
|
||||||
|
#ntable td, #ntable th {
|
||||||
|
padding: 4px 6px !important;
|
||||||
|
font-size: 0.82em !important;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Cards on plugin/alerts pages */
|
||||||
|
.host-card, .alert-card, .card {
|
||||||
|
padding: 10px !important;
|
||||||
|
margin-bottom: 8px !important;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Settings page tables */
|
||||||
|
table { width: 100%; }
|
||||||
|
|
||||||
|
h1 { font-size: 1.2em !important; }
|
||||||
|
h2 { font-size: 1em !important; }
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Suppress nav-username text on very narrow screens — avatar/initials is enough */
|
||||||
|
@media (max-width: 400px) {
|
||||||
|
.nav-username { display: none; }
|
||||||
|
}
|
||||||
@@ -0,0 +1,199 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
{% include 'head.html' %}
|
||||||
|
|
||||||
|
<style>
|
||||||
|
html, body { overflow: visible; }
|
||||||
|
|
||||||
|
.container {
|
||||||
|
max-width: 700px;
|
||||||
|
margin: 0 auto;
|
||||||
|
}
|
||||||
|
|
||||||
|
h1 {
|
||||||
|
color: #333;
|
||||||
|
margin-bottom: 4px;
|
||||||
|
font-size: 1.5em;
|
||||||
|
}
|
||||||
|
|
||||||
|
.subtitle {
|
||||||
|
color: #666;
|
||||||
|
margin-bottom: 24px;
|
||||||
|
font-size: 0.9em;
|
||||||
|
}
|
||||||
|
|
||||||
|
.section {
|
||||||
|
background: #fff;
|
||||||
|
border-radius: 8px;
|
||||||
|
box-shadow: 0 1px 6px rgba(0,0,0,0.1);
|
||||||
|
padding: 20px 24px;
|
||||||
|
margin-bottom: 20px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.section h2 {
|
||||||
|
font-size: 1em;
|
||||||
|
font-weight: 700;
|
||||||
|
color: #333;
|
||||||
|
margin: 0 0 16px;
|
||||||
|
padding-bottom: 10px;
|
||||||
|
border-bottom: 1px solid #eee;
|
||||||
|
text-transform: uppercase;
|
||||||
|
letter-spacing: 0.5px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.info-row {
|
||||||
|
display: flex;
|
||||||
|
align-items: baseline;
|
||||||
|
padding: 8px 0;
|
||||||
|
border-bottom: 1px solid #f5f5f5;
|
||||||
|
font-size: 0.9em;
|
||||||
|
}
|
||||||
|
.info-row:last-child { border-bottom: none; }
|
||||||
|
|
||||||
|
.info-label {
|
||||||
|
width: 160px;
|
||||||
|
flex-shrink: 0;
|
||||||
|
color: #666;
|
||||||
|
font-size: 0.88em;
|
||||||
|
}
|
||||||
|
|
||||||
|
.info-value {
|
||||||
|
color: #222;
|
||||||
|
word-break: break-all;
|
||||||
|
}
|
||||||
|
|
||||||
|
.info-value a {
|
||||||
|
color: #0066cc;
|
||||||
|
text-decoration: none;
|
||||||
|
}
|
||||||
|
.info-value a:hover { text-decoration: underline; }
|
||||||
|
|
||||||
|
.version-badge {
|
||||||
|
display: inline-block;
|
||||||
|
padding: 3px 12px;
|
||||||
|
background: #e8f0fe;
|
||||||
|
color: #1a73e8;
|
||||||
|
border-radius: 12px;
|
||||||
|
font-size: 0.85em;
|
||||||
|
font-weight: 600;
|
||||||
|
font-family: monospace;
|
||||||
|
}
|
||||||
|
|
||||||
|
.hb-logo {
|
||||||
|
font-size: 2.5em;
|
||||||
|
font-weight: 700;
|
||||||
|
color: #0066cc;
|
||||||
|
letter-spacing: -1px;
|
||||||
|
margin-bottom: 6px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.hb-tagline {
|
||||||
|
color: #555;
|
||||||
|
font-size: 0.95em;
|
||||||
|
}
|
||||||
|
|
||||||
|
.logo-section {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 20px;
|
||||||
|
padding: 8px 0 4px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.logo-text { flex: 1; }
|
||||||
|
</style>
|
||||||
|
|
||||||
|
<body>
|
||||||
|
{% include 'nav.html' %}
|
||||||
|
|
||||||
|
<div class="container">
|
||||||
|
<h1>{{ header }}</h1>
|
||||||
|
<p class="subtitle">Heartbeat monitoring system</p>
|
||||||
|
|
||||||
|
<div class="section">
|
||||||
|
<div class="logo-section">
|
||||||
|
<div class="logo-text">
|
||||||
|
<div class="hb-logo">Heartbeat</div>
|
||||||
|
<div class="hb-tagline">Lightweight host monitoring over UDP</div>
|
||||||
|
</div>
|
||||||
|
<span class="version-badge">v{{ hbd_version }}</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="section">
|
||||||
|
<h2>Version</h2>
|
||||||
|
<div class="info-row">
|
||||||
|
<span class="info-label">Server version</span>
|
||||||
|
<span class="info-value">{{ hbd_version }}</span>
|
||||||
|
</div>
|
||||||
|
<div class="info-row">
|
||||||
|
<span class="info-label">Python</span>
|
||||||
|
<span class="info-value">{{ python_version }}</span>
|
||||||
|
</div>
|
||||||
|
<div class="info-row">
|
||||||
|
<span class="info-label">License</span>
|
||||||
|
<span class="info-value">MIT</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="section">
|
||||||
|
<h2>Runtime</h2>
|
||||||
|
<div class="info-row">
|
||||||
|
<span class="info-label">Host</span>
|
||||||
|
<span class="info-value">{{ server_hostname }}</span>
|
||||||
|
</div>
|
||||||
|
<div class="info-row">
|
||||||
|
<span class="info-label">Started</span>
|
||||||
|
<span class="info-value">{{ start_time_str }}</span>
|
||||||
|
</div>
|
||||||
|
<div class="info-row">
|
||||||
|
<span class="info-label">Uptime</span>
|
||||||
|
<span class="info-value" id="uptime-value">{{ uptime_str }}</span>
|
||||||
|
</div>
|
||||||
|
<div class="info-row">
|
||||||
|
<span class="info-label">Hosts monitored</span>
|
||||||
|
<span class="info-value">{{ host_count }}</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="section">
|
||||||
|
<h2>Contact & Source</h2>
|
||||||
|
<div class="info-row">
|
||||||
|
<span class="info-label">Author</span>
|
||||||
|
<span class="info-value">Andreas Wrede</span>
|
||||||
|
</div>
|
||||||
|
<div class="info-row">
|
||||||
|
<span class="info-label">Email</span>
|
||||||
|
<span class="info-value"><a href="mailto:aew@wrede.ca">aew@wrede.ca</a></span>
|
||||||
|
</div>
|
||||||
|
<div class="info-row">
|
||||||
|
<span class="info-label">Repository</span>
|
||||||
|
<span class="info-value"><a href="https://git.wrede.ca/andreas/heartbeat" target="_blank" rel="noopener">git.wrede.ca/andreas/heartbeat</a></span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
(function() {
|
||||||
|
var startEpoch = {{ start_epoch }};
|
||||||
|
var el = document.getElementById('uptime-value');
|
||||||
|
if (!el) return;
|
||||||
|
function fmt(s) {
|
||||||
|
var d = Math.floor(s / 86400);
|
||||||
|
var h = Math.floor((s % 86400) / 3600);
|
||||||
|
var m = Math.floor((s % 3600) / 60);
|
||||||
|
var sec = s % 60;
|
||||||
|
if (d > 0) return d + 'd ' + h + 'h ' + m + 'm';
|
||||||
|
if (h > 0) return h + 'h ' + m + 'm ' + sec + 's';
|
||||||
|
return m + 'm ' + sec + 's';
|
||||||
|
}
|
||||||
|
function tick() {
|
||||||
|
var up = Math.floor(Date.now() / 1000 - startEpoch);
|
||||||
|
el.textContent = fmt(up);
|
||||||
|
}
|
||||||
|
tick();
|
||||||
|
setInterval(tick, 1000);
|
||||||
|
})();
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
@@ -0,0 +1,535 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
{% include 'head.html' %}
|
||||||
|
|
||||||
|
<style>
|
||||||
|
|
||||||
|
.container {
|
||||||
|
max-width: 1400px;
|
||||||
|
margin: 0 auto;
|
||||||
|
}
|
||||||
|
|
||||||
|
h1 { color: #333; margin-bottom: 5px; margin-top: 15px; font-size: 1.5em; }
|
||||||
|
|
||||||
|
.subtitle {
|
||||||
|
color: #666;
|
||||||
|
margin-bottom: 30px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.summary-cards {
|
||||||
|
display: flex;
|
||||||
|
flex-wrap: wrap;
|
||||||
|
gap: 10px;
|
||||||
|
margin-bottom: 16px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.summary-card {
|
||||||
|
background: white;
|
||||||
|
border-radius: 6px;
|
||||||
|
padding: 6px 14px;
|
||||||
|
box-shadow: 0 1px 4px rgba(0,0,0,0.1);
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 8px;
|
||||||
|
border-left: 4px solid #ddd;
|
||||||
|
}
|
||||||
|
|
||||||
|
.summary-card.critical { border-left-color: #ea1e0f; }
|
||||||
|
.summary-card.warning { border-left-color: #ff9800; }
|
||||||
|
.summary-card.ok { border-left-color: #4caf50; }
|
||||||
|
|
||||||
|
.summary-number {
|
||||||
|
font-size: 1.4em;
|
||||||
|
font-weight: bold;
|
||||||
|
line-height: 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
.summary-number.critical { color: #ea1e0f; }
|
||||||
|
.summary-number.warning { color: #ff9800; }
|
||||||
|
.summary-number.ok { color: #4caf50; }
|
||||||
|
|
||||||
|
.summary-label {
|
||||||
|
color: #666;
|
||||||
|
font-size: 0.85em;
|
||||||
|
}
|
||||||
|
|
||||||
|
.filters {
|
||||||
|
background: white;
|
||||||
|
border-radius: 8px;
|
||||||
|
padding: 15px;
|
||||||
|
margin-bottom: 20px;
|
||||||
|
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
|
||||||
|
display: flex;
|
||||||
|
gap: 15px;
|
||||||
|
align-items: center;
|
||||||
|
}
|
||||||
|
|
||||||
|
.filter-label {
|
||||||
|
font-weight: bold;
|
||||||
|
color: #555;
|
||||||
|
}
|
||||||
|
|
||||||
|
.filter-button {
|
||||||
|
padding: 8px 16px;
|
||||||
|
border: 2px solid #ddd;
|
||||||
|
background: white;
|
||||||
|
border-radius: 20px;
|
||||||
|
cursor: pointer;
|
||||||
|
transition: all 0.2s;
|
||||||
|
font-size: 0.9em;
|
||||||
|
}
|
||||||
|
|
||||||
|
.filter-button:hover {
|
||||||
|
border-color: #2196f3;
|
||||||
|
}
|
||||||
|
|
||||||
|
.filter-button.active {
|
||||||
|
background: #2196f3;
|
||||||
|
color: white;
|
||||||
|
border-color: #2196f3;
|
||||||
|
}
|
||||||
|
|
||||||
|
.alerts-container {
|
||||||
|
background: white;
|
||||||
|
border-radius: 8px;
|
||||||
|
padding: 20px;
|
||||||
|
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
|
||||||
|
}
|
||||||
|
|
||||||
|
.alert-item {
|
||||||
|
border-left: 5px solid #ddd;
|
||||||
|
padding: 15px;
|
||||||
|
margin-bottom: 15px;
|
||||||
|
background: #fafafa;
|
||||||
|
border-radius: 4px;
|
||||||
|
display: flex;
|
||||||
|
justify-content: space-between;
|
||||||
|
align-items: center;
|
||||||
|
transition: all 0.2s;
|
||||||
|
}
|
||||||
|
|
||||||
|
.alert-item.acknowledged {
|
||||||
|
opacity: 0.8;
|
||||||
|
background: #f0f0f0;
|
||||||
|
}
|
||||||
|
|
||||||
|
.alert-item:hover {
|
||||||
|
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
|
||||||
|
transform: translateX(5px);
|
||||||
|
}
|
||||||
|
|
||||||
|
.alert-item.critical {
|
||||||
|
border-left-color: #f44336;
|
||||||
|
background: #ffebee;
|
||||||
|
}
|
||||||
|
|
||||||
|
.alert-item.warning {
|
||||||
|
border-left-color: #ff9800;
|
||||||
|
background: #fff3e0;
|
||||||
|
}
|
||||||
|
|
||||||
|
.alert-item.unknown {
|
||||||
|
border-left-color: #9e9e9e;
|
||||||
|
background: #f5f5f5;
|
||||||
|
}
|
||||||
|
|
||||||
|
.alert-main {
|
||||||
|
flex: 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
.alert-header {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 15px;
|
||||||
|
margin-bottom: 8px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.alert-level {
|
||||||
|
padding: 4px 12px;
|
||||||
|
border-radius: 12px;
|
||||||
|
font-size: 0.75em;
|
||||||
|
font-weight: bold;
|
||||||
|
text-transform: uppercase;
|
||||||
|
letter-spacing: 0.5px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.alert-level.critical {
|
||||||
|
background: #f44336;
|
||||||
|
color: white;
|
||||||
|
}
|
||||||
|
|
||||||
|
.alert-level.warning {
|
||||||
|
background: #ff9800;
|
||||||
|
color: white;
|
||||||
|
}
|
||||||
|
|
||||||
|
.alert-level.unknown {
|
||||||
|
background: #9e9e9e;
|
||||||
|
color: white;
|
||||||
|
}
|
||||||
|
|
||||||
|
.alert-hostname {
|
||||||
|
font-weight: bold;
|
||||||
|
color: #333;
|
||||||
|
font-size: 1.1em;
|
||||||
|
}
|
||||||
|
|
||||||
|
.alert-metric {
|
||||||
|
color: #666;
|
||||||
|
font-family: 'Courier New', monospace;
|
||||||
|
font-size: 0.9em;
|
||||||
|
}
|
||||||
|
|
||||||
|
.alert-details {
|
||||||
|
display: flex;
|
||||||
|
gap: 20px;
|
||||||
|
color: #666;
|
||||||
|
font-size: 0.9em;
|
||||||
|
}
|
||||||
|
|
||||||
|
.alert-value {
|
||||||
|
font-weight: bold;
|
||||||
|
color: #333;
|
||||||
|
}
|
||||||
|
|
||||||
|
.alert-duration {
|
||||||
|
color: #999;
|
||||||
|
font-size: 0.85em;
|
||||||
|
}
|
||||||
|
|
||||||
|
.alert-actions {
|
||||||
|
display: flex;
|
||||||
|
flex-direction: column;
|
||||||
|
gap: 8px;
|
||||||
|
margin-left: 15px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.acknowledge-btn {
|
||||||
|
padding: 8px 16px;
|
||||||
|
background: #2196f3;
|
||||||
|
color: white;
|
||||||
|
border: none;
|
||||||
|
border-radius: 4px;
|
||||||
|
cursor: pointer;
|
||||||
|
font-size: 0.85em;
|
||||||
|
transition: all 0.2s;
|
||||||
|
white-space: nowrap;
|
||||||
|
}
|
||||||
|
|
||||||
|
.acknowledge-btn:hover {
|
||||||
|
background: #1976d2;
|
||||||
|
transform: scale(1.05);
|
||||||
|
}
|
||||||
|
|
||||||
|
.acknowledge-btn:disabled {
|
||||||
|
background: #ccc;
|
||||||
|
cursor: not-allowed;
|
||||||
|
transform: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
.acknowledged-badge {
|
||||||
|
padding: 4px 8px;
|
||||||
|
background: #4caf50;
|
||||||
|
color: white;
|
||||||
|
border-radius: 4px;
|
||||||
|
font-size: 0.75em;
|
||||||
|
text-align: center;
|
||||||
|
white-space: nowrap;
|
||||||
|
}
|
||||||
|
|
||||||
|
.no-alerts {
|
||||||
|
text-align: center;
|
||||||
|
padding: 60px 20px;
|
||||||
|
color: #999;
|
||||||
|
}
|
||||||
|
|
||||||
|
.no-alerts-icon {
|
||||||
|
font-size: 4em;
|
||||||
|
margin-bottom: 20px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.loading {
|
||||||
|
text-align: center;
|
||||||
|
padding: 40px;
|
||||||
|
color: #666;
|
||||||
|
}
|
||||||
|
|
||||||
|
.error {
|
||||||
|
background: #ffebee;
|
||||||
|
border-left: 4px solid #f44336;
|
||||||
|
padding: 20px;
|
||||||
|
margin: 20px 0;
|
||||||
|
border-radius: 4px;
|
||||||
|
color: #c62828;
|
||||||
|
}
|
||||||
|
|
||||||
|
.refresh-info {
|
||||||
|
text-align: center;
|
||||||
|
color: #999;
|
||||||
|
font-size: 0.85em;
|
||||||
|
margin-top: 20px;
|
||||||
|
padding-top: 20px;
|
||||||
|
border-top: 1px solid #e0e0e0;
|
||||||
|
}
|
||||||
|
|
||||||
|
.last-update {
|
||||||
|
color: #666;
|
||||||
|
font-size: 0.9em;
|
||||||
|
text-align: right;
|
||||||
|
margin-bottom: 15px;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
|
||||||
|
<body>
|
||||||
|
{% include 'nav.html' %}
|
||||||
|
|
||||||
|
<div class="container">
|
||||||
|
<h1>{{ header }}</h1>
|
||||||
|
<p class="subtitle">Real-time monitoring alerts and threshold violations</p>
|
||||||
|
|
||||||
|
<div class="summary-cards" id="summary-cards">
|
||||||
|
<div class="summary-card critical">
|
||||||
|
<div class="summary-label">Critical</div>
|
||||||
|
<div class="summary-number critical" id="critical-count">-</div>
|
||||||
|
</div>
|
||||||
|
<div class="summary-card warning">
|
||||||
|
<div class="summary-label">Warning</div>
|
||||||
|
<div class="summary-number warning" id="warning-count">-</div>
|
||||||
|
</div>
|
||||||
|
<div class="summary-card ok">
|
||||||
|
<div class="summary-label">Total Hosts</div>
|
||||||
|
<div class="summary-number ok" id="host-count">-</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="filters">
|
||||||
|
<span class="filter-label">Show:</span>
|
||||||
|
<button class="filter-button active" onclick="filterAlerts('all')">All</button>
|
||||||
|
<button class="filter-button" onclick="filterAlerts('critical')">Critical Only</button>
|
||||||
|
<button class="filter-button" onclick="filterAlerts('warning')">Warning Only</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="alerts-container">
|
||||||
|
<div class="last-update">Last updated: <span id="last-update-time">Never</span></div>
|
||||||
|
<div id="alerts-list">
|
||||||
|
<div class="loading">Loading alerts...</div>
|
||||||
|
</div>
|
||||||
|
<div class="refresh-info">
|
||||||
|
Auto-refreshing every 15 seconds
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
let currentFilter = 'all';
|
||||||
|
let allAlerts = [];
|
||||||
|
|
||||||
|
/**
 * Fetch the current alerts from /api/0/alerts, refresh the summary cards and
 * the "last updated" stamp, then re-render the list with the active filter.
 * On any failure an error box replaces the list contents.
 */
async function loadAlerts() {
    try {
        const response = await fetch('/api/0/alerts');
        if (!response.ok) {
            throw new Error(`HTTP ${response.status}`);
        }

        const data = await response.json();
        // Tolerate a payload with missing sections rather than throwing on
        // property access and discarding an otherwise usable response.
        const summary = data.summary || {};
        allAlerts = Array.isArray(data.alerts) ? data.alerts : [];

        // Update summary cards
        document.getElementById('critical-count').textContent = summary.critical || 0;
        document.getElementById('warning-count').textContent = summary.warning || 0;
        document.getElementById('host-count').textContent = data.host_count || 0;

        // Update last update time
        document.getElementById('last-update-time').textContent = new Date().toLocaleTimeString();

        // Render alerts
        renderAlerts(allAlerts);
    } catch (error) {
        // Build the error box with textContent so the message text is never
        // parsed as HTML.
        const box = document.createElement('div');
        box.className = 'error';
        box.textContent = `Failed to load alerts: ${error.message}`;
        const list = document.getElementById('alerts-list');
        list.innerHTML = '';
        list.appendChild(box);
    }
}
|
||||||
|
|
||||||
|
/**
 * Paint the given alerts into #alerts-list, honouring the module-level
 * `currentFilter` ('all' or a lower-case level name). Shows a placeholder
 * when nothing matches.
 */
function renderAlerts(alerts) {
    const listEl = document.getElementById('alerts-list');

    // Narrow to the selected level unless every alert is wanted.
    const visible = currentFilter === 'all'
        ? alerts
        : alerts.filter(item => item.level.toLowerCase() === currentFilter);

    if (visible.length > 0) {
        listEl.innerHTML = visible.map(item => renderAlert(item)).join('');
        return;
    }

    // Nothing to show: distinguish "no alerts at all" from "none at this level".
    const nothingAtAll = currentFilter === 'all' && alerts.length === 0;
    listEl.innerHTML = nothingAtAll
        ? `
            <div class="no-alerts">
                <div class="no-alerts-icon">✓</div>
                <h2>All Systems Normal</h2>
                <p>No active alerts at this time</p>
            </div>
        `
        : `
            <div class="no-alerts">
                <p>No ${currentFilter} alerts</p>
            </div>
        `;
}
|
||||||
|
|
||||||
|
/**
 * Escape a value for safe inclusion in HTML text or a double-quoted
 * attribute value.
 */
function escapeAlertHtml(value) {
    return String(value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');
}

/**
 * Build the HTML for one alert row.
 *
 * Reads level, hostname, metric_path, since, last_value, acknowledged and the
 * optional formatted_message / threshold_value / operator from `alert`.
 * All alert-supplied text is escaped before being placed in the markup, so
 * a hostname or message containing markup cannot inject HTML or script.
 *
 * @param {object} alert  One entry of the alerts array.
 * @returns {string} HTML for the row, including an Acknowledge button or an
 *                   "Acknowledged" badge.
 */
function renderAlert(alert) {
    const level = escapeAlertHtml(alert.level.toLowerCase());
    const duration = getDuration(alert.since);
    const acknowledged = alert.acknowledged || false;

    // Use formatted message if available, otherwise build from individual fields
    let valueText = `Value: <span class="alert-value">${escapeAlertHtml(formatValue(alert.last_value))}</span>`;
    if (alert.formatted_message) {
        valueText += ` <span class="threshold-info">${escapeAlertHtml(alert.formatted_message)}</span>`;
    } else if (alert.threshold_value !== undefined && alert.threshold_value !== null && alert.operator) {
        valueText += ` <span class="threshold-info">(threshold: ${escapeAlertHtml(alert.operator)} ${escapeAlertHtml(formatValue(alert.threshold_value))})</span>`;
    }

    // Build actions section
    let actionsHtml = '';
    if (acknowledged) {
        actionsHtml = `
            <div class="alert-actions">
                <div class="acknowledged-badge">✓ Acknowledged</div>
            </div>
        `;
    } else {
        // The handler arguments sit in two nested contexts: a JS string
        // literal inside an HTML attribute. JSON.stringify produces a safe
        // JS literal; escapeAlertHtml then makes it safe for the attribute.
        const hostArg = escapeAlertHtml(JSON.stringify(String(alert.hostname)));
        const metricArg = escapeAlertHtml(JSON.stringify(String(alert.metric_path)));
        actionsHtml = `
            <div class="alert-actions">
                <button class="acknowledge-btn" onclick="acknowledgeAlert(${hostArg}, ${metricArg}, event)">
                    Acknowledge
                </button>
            </div>
        `;
    }

    return `
        <div class="alert-item ${level} ${acknowledged ? 'acknowledged' : ''}">
            <div class="alert-main">
                <div class="alert-header">
                    <span class="alert-level ${level}">${escapeAlertHtml(alert.level)}</span>
                    <span class="alert-hostname">${escapeAlertHtml(alert.hostname)}</span>
                </div>
                <div class="alert-metric">${escapeAlertHtml(alert.metric_path)}</div>
                <div class="alert-details">
                    <span>${valueText}</span>
                    <span class="alert-duration">Active for ${duration}</span>
                </div>
            </div>
            ${actionsHtml}
        </div>
    `;
}
|
||||||
|
|
||||||
|
/**
 * Format a metric value for display. Numbers above 1000 use the locale's
 * digit grouping, other numbers are shown with two decimals, and anything
 * that is not a number is returned untouched.
 */
function formatValue(value) {
    if (typeof value !== 'number') {
        return value;
    }
    return value > 1000 ? value.toLocaleString() : value.toFixed(2);
}
|
||||||
|
|
||||||
|
/**
 * Describe how long ago `timestamp` (epoch seconds) was, relative to the
 * browser clock: "Ns", "Nm", "Nh Mm" or "Nd Mh" depending on magnitude.
 *
 * @param {number} timestamp  Epoch time in seconds.
 * @returns {string} Compact elapsed-time label.
 */
function getDuration(timestamp) {
    const now = Date.now() / 1000;
    // Clamp at zero: a timestamp slightly ahead of the browser clock would
    // otherwise render as a negative duration such as "-5s".
    const seconds = Math.max(0, Math.floor(now - timestamp));

    if (seconds < 60) {
        return `${seconds}s`;
    } else if (seconds < 3600) {
        return `${Math.floor(seconds / 60)}m`;
    } else if (seconds < 86400) {
        const hours = Math.floor(seconds / 3600);
        const minutes = Math.floor((seconds % 3600) / 60);
        return `${hours}h ${minutes}m`;
    } else {
        const days = Math.floor(seconds / 86400);
        const hours = Math.floor((seconds % 86400) / 3600);
        return `${days}d ${hours}h`;
    }
}
|
||||||
|
|
||||||
|
/**
 * Switch the active level filter and re-render the list.
 *
 * @param {string} filter  'all' or a lower-case level name.
 * @param {Event} [evt]    Click event; optional. When omitted, the legacy
 *                         global `window.event` is used if the browser
 *                         provides it.
 */
function filterAlerts(filter, evt) {
    currentFilter = filter;

    // Locate the clicked button without assuming an implicit global `event`
    // exists; dereferencing it blindly would throw before the re-render.
    const source = evt || (typeof window !== 'undefined' ? window.event : undefined);
    const clicked = source && source.target ? source.target : null;

    // Update active button (only when we know which one was clicked, so the
    // current highlight is not wiped for nothing).
    if (clicked) {
        document.querySelectorAll('.filter-button').forEach(btn => {
            btn.classList.remove('active');
        });
        clicked.classList.add('active');
    }

    // Re-render with new filter
    renderAlerts(allAlerts);
}
|
||||||
|
|
||||||
|
/**
 * Acknowledge one alert on the server and refresh the rendered list.
 *
 * POSTs `hostname` and `metric_path` as JSON to /api/0/alerts/acknowledge.
 * On success the matching entry in `allAlerts` is flagged as acknowledged
 * (with the `acknowledged_at` value from the response) and the list is
 * re-rendered. On failure the user is notified and the button is re-enabled.
 *
 * @param {string} hostname    Host the alert belongs to.
 * @param {string} metricPath  Metric path identifying the alert.
 * @param {Event} [event]      Click event; optional.
 */
async function acknowledgeAlert(hostname, metricPath, event) {
    // The event is optional, so the button may be unknown. The old code
    // guarded stopPropagation() but then dereferenced event.target anyway.
    const button = event && event.target ? event.target : null;

    // Prevent event bubbling
    if (event) {
        event.stopPropagation();
    }

    // Disable the button while the request is in flight
    if (button) {
        button.disabled = true;
        button.textContent = 'Acknowledging...';
    }

    try {
        const response = await fetch('/api/0/alerts/acknowledge', {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
            },
            body: JSON.stringify({
                hostname: hostname,
                metric_path: metricPath,
            }),
        });

        if (!response.ok) {
            throw new Error(`HTTP ${response.status}`);
        }

        const result = await response.json();

        // Update the alert in our local data. The local is deliberately not
        // called `alert`, so it cannot be confused with window.alert below.
        const matched = allAlerts.find(a => a.hostname === hostname && a.metric_path === metricPath);
        if (matched) {
            matched.acknowledged = true;
            matched.acknowledged_at = result.acknowledged_at;
        }

        // Re-render alerts
        renderAlerts(allAlerts);
    } catch (error) {
        alert(`Failed to acknowledge alert: ${error.message}`);
        if (button) {
            button.disabled = false;
            button.textContent = 'Acknowledge';
        }
    }
}
|
||||||
|
|
||||||
|
// Auto-refresh every 15 seconds
|
||||||
|
setInterval(loadAlerts, 15000);
|
||||||
|
|
||||||
|
// Initial load
|
||||||
|
loadAlerts();
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
@@ -1,5 +1,5 @@
|
|||||||
<footer>
|
<footer>
|
||||||
<div id="copyright">
|
<div id="copyright">
|
||||||
©2002-2021 <A HREF="mailto:andreas@wrede.ca">Andreas Wrede</A> All Rights Reserved.</p>
|
©2002-2026 <A HREF="mailto:andreas@wrede.ca">Andreas Wrede</A> All Rights Reserved.
|
||||||
</div>
|
</div>
|
||||||
</footer>
|
</footer>
|
||||||
@@ -0,0 +1,281 @@
|
|||||||
|
<head>
|
||||||
|
<meta http-equiv="content-type" content="text/html; charset=utf-8" />
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||||
|
<link rel="stylesheet" href="/static/style.css" type="text/css" />
|
||||||
|
<link rel="icon" href="/static/images/favicon.ico" sizes="32x32" />
|
||||||
|
<title>{{ title }}</title>
|
||||||
|
{% if extra_scripts %}<script src="{{ extra_scripts }}"></script>{% endif %}
|
||||||
|
<style>
|
||||||
|
/* ── Reset / shared baseline ── */
|
||||||
|
*, *::before, *::after { box-sizing: border-box; }
|
||||||
|
html {
|
||||||
|
font-family: 'Segoe UI', system-ui, -apple-system, sans-serif;
|
||||||
|
font-size: 14px;
|
||||||
|
}
|
||||||
|
body {
|
||||||
|
margin: 0;
|
||||||
|
padding: 10px;
|
||||||
|
padding-top: 60px;
|
||||||
|
background: #f5f5f5;
|
||||||
|
}
|
||||||
|
h1 { font-size: 1.5em; color: #333; margin: 0 0 5px; }
|
||||||
|
h2 { font-size: 1.1em; color: #333; margin: 0 0 8px; }
|
||||||
|
p { margin: 0; }
|
||||||
|
|
||||||
|
/* Navigation bar — shared across all pages */
|
||||||
|
.nav {
|
||||||
|
position: fixed;
|
||||||
|
top: 0;
|
||||||
|
left: 0;
|
||||||
|
right: 0;
|
||||||
|
z-index: 200;
|
||||||
|
background: #fff;
|
||||||
|
padding: 6px 12px;
|
||||||
|
box-shadow: 0 2px 4px rgba(0,0,0,.1);
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
justify-content: space-between;
|
||||||
|
flex-wrap: wrap;
|
||||||
|
gap: 8px;
|
||||||
|
}
|
||||||
|
.nav-links { display: flex; align-items: center; flex-wrap: wrap; gap: 4px; }
|
||||||
|
.nav a {
|
||||||
|
margin-right: 20px;
|
||||||
|
text-decoration: none;
|
||||||
|
color: #0066cc;
|
||||||
|
font-weight: 500;
|
||||||
|
font-size: 0.9em;
|
||||||
|
}
|
||||||
|
.nav a:hover { text-decoration: underline; }
|
||||||
|
.nav a.active { color: #333; font-weight: bold; }
|
||||||
|
.nav-user {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 8px;
|
||||||
|
text-decoration: none;
|
||||||
|
color: #333;
|
||||||
|
font-size: 0.9em;
|
||||||
|
font-weight: 500;
|
||||||
|
padding: 4px 8px;
|
||||||
|
border-radius: 20px;
|
||||||
|
transition: background 0.15s;
|
||||||
|
}
|
||||||
|
.nav-user:hover { background: #f0f4ff; text-decoration: none; }
|
||||||
|
.nav-username {
|
||||||
|
max-width: 0;
|
||||||
|
overflow: hidden;
|
||||||
|
white-space: nowrap;
|
||||||
|
opacity: 0;
|
||||||
|
transition: max-width 0.2s ease, opacity 0.2s ease;
|
||||||
|
}
|
||||||
|
.nav-user:hover .nav-username {
|
||||||
|
max-width: 160px;
|
||||||
|
opacity: 1;
|
||||||
|
}
|
||||||
|
.nav-avatar {
|
||||||
|
width: 28px; height: 28px;
|
||||||
|
border-radius: 50%;
|
||||||
|
object-fit: cover;
|
||||||
|
flex-shrink: 0;
|
||||||
|
}
|
||||||
|
.nav-initials {
|
||||||
|
width: 28px; height: 28px;
|
||||||
|
border-radius: 50%;
|
||||||
|
background: #0066cc;
|
||||||
|
color: #fff;
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
justify-content: center;
|
||||||
|
font-size: 0.75em;
|
||||||
|
font-weight: 700;
|
||||||
|
flex-shrink: 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ── Mobile nav: hamburger toggle ── */
|
||||||
|
.nav-hamburger {
|
||||||
|
display: none;
|
||||||
|
flex-direction: column;
|
||||||
|
justify-content: space-between;
|
||||||
|
width: 26px; height: 20px;
|
||||||
|
cursor: pointer;
|
||||||
|
flex-shrink: 0;
|
||||||
|
background: none;
|
||||||
|
border: none;
|
||||||
|
padding: 0;
|
||||||
|
}
|
||||||
|
.nav-hamburger span {
|
||||||
|
display: block;
|
||||||
|
height: 3px;
|
||||||
|
background: #555;
|
||||||
|
border-radius: 2px;
|
||||||
|
}
|
||||||
|
|
||||||
|
@media (max-width: 640px) {
|
||||||
|
.nav-hamburger { display: flex; }
|
||||||
|
.nav-links {
|
||||||
|
display: none;
|
||||||
|
width: 100%;
|
||||||
|
flex-direction: column;
|
||||||
|
align-items: flex-start;
|
||||||
|
padding-top: 8px;
|
||||||
|
border-top: 1px solid #eee;
|
||||||
|
order: 3;
|
||||||
|
}
|
||||||
|
.nav-links.nav-open { display: flex; }
|
||||||
|
.nav-links a { margin-right: 0; padding: 6px 0; font-size: 1em; }
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Swiss railway clock — nav */
|
||||||
|
.nav-clock {
|
||||||
|
flex-shrink: 0;
|
||||||
|
line-height: 0;
|
||||||
|
margin-left: auto;
|
||||||
|
padding: 4px 4px 4px 0;
|
||||||
|
cursor: pointer;
|
||||||
|
}
|
||||||
|
#swiss-clock { display: block; }
|
||||||
|
|
||||||
|
/* Swiss railway clock — full-page overlay */
|
||||||
|
#clock-overlay {
|
||||||
|
display: none;
|
||||||
|
position: fixed;
|
||||||
|
inset: 0;
|
||||||
|
z-index: 9999;
|
||||||
|
background: #1a1a1a;
|
||||||
|
align-items: center;
|
||||||
|
justify-content: center;
|
||||||
|
cursor: pointer;
|
||||||
|
}
|
||||||
|
#clock-overlay.visible { display: flex; }
|
||||||
|
#swiss-clock-overlay { display: block; }
|
||||||
|
</style>
|
||||||
|
<script>
|
||||||
|
/* ── Swiss Federal Railway (SBB) clock ── */
|
||||||
|
|
||||||
|
/* Paint one frame of the SBB-style clock, showing the current local time,
   onto the given canvas. Every dimension is derived from canvas.width. */
function drawSwissClock(canvas) {
    var diameter = canvas.width;
    var radius = diameter / 2;
    var g = canvas.getContext('2d');
    var TAU = Math.PI * 2;
    var QUARTER = Math.PI / 2;   /* subtracted so that angle 0 points at 12 */

    var t = new Date();
    var hours = t.getHours() % 12;
    var minutes = t.getMinutes();
    var seconds = t.getSeconds();

    /* Seconds hand completes its sweep in 58.5 s and then rests at 12
       until the minute is over (SBB behaviour). */
    var sweep = seconds + t.getMilliseconds() / 1000;
    var secondAngle = 0;
    if (sweep < 58.5) {
        secondAngle = (sweep / 58.5) * TAU;
    }

    g.clearRect(0, 0, diameter, diameter);

    /* dial: white disc with a dark rim */
    g.beginPath();
    g.arc(radius, radius, radius - 1, 0, TAU);
    g.fillStyle = '#fff';
    g.fill();
    g.strokeStyle = '#333';
    g.lineWidth = diameter * 0.018;
    g.stroke();

    /* 60 tick marks; every fifth one is longer and thicker */
    for (var tick = 0; tick < 60; tick++) {
        var theta = (tick / 60) * TAU - QUARTER;
        var major = (tick % 5 === 0);
        var innerR = major ? radius * 0.72 : radius * 0.88;
        g.beginPath();
        g.moveTo(radius + Math.cos(theta) * innerR,
                 radius + Math.sin(theta) * innerR);
        g.lineTo(radius + Math.cos(theta) * radius * 0.94,
                 radius + Math.sin(theta) * radius * 0.94);
        g.strokeStyle = '#222';
        g.lineWidth = major ? diameter * 0.027 : diameter * 0.011;
        g.lineCap = 'butt';
        g.stroke();
    }

    /* Stroke a straight hand from `tail` to `tip` along the rotated x axis. */
    var strokeHand = function (angle, tip, tail, width, color) {
        g.save();
        g.translate(radius, radius);
        g.rotate(angle);
        g.beginPath();
        g.moveTo(tail, 0);
        g.lineTo(tip, 0);
        g.strokeStyle = color;
        g.lineWidth = width;
        g.lineCap = 'square';
        g.stroke();
        g.restore();
    };

    var minuteAngle = (minutes + seconds / 60) / 60 * TAU - QUARTER;
    var hourAngle = (hours + minutes / 60) / 12 * TAU - QUARTER;
    var secondHandAngle = secondAngle - QUARTER;

    strokeHand(minuteAngle, radius * 0.88, -radius * 0.12, diameter * 0.027, '#222');      /* minute */
    strokeHand(hourAngle, radius * 0.58, -radius * 0.12, diameter * 0.039, '#222');        /* hour */
    strokeHand(secondHandAngle, radius * 0.78, -radius * 0.22, diameter * 0.013, '#e00');  /* second */

    /* red disc at the tip of the seconds hand */
    var discRadius = diameter * 0.028;
    g.save();
    g.translate(radius, radius);
    g.rotate(secondHandAngle);
    g.beginPath();
    g.arc(radius * 0.78, 0, discRadius, 0, TAU);
    g.fillStyle = '#e00';
    g.fill();
    g.restore();

    /* centre cap */
    g.beginPath();
    g.arc(radius, radius, radius * 0.04, 0, TAU);
    g.fillStyle = '#222';
    g.fill();
}
|
||||||
|
|
||||||
|
/* Make the overlay canvas a square whose side is 88% of the smaller
   viewport dimension (rounded down). Does nothing if the canvas is absent. */
function resizeOverlayClock() {
    var overlayCanvas = document.getElementById('swiss-clock-overlay');
    if (overlayCanvas) {
        var side = Math.floor(Math.min(window.innerWidth, window.innerHeight) * 0.88);
        overlayCanvas.width = side;
        overlayCanvas.height = side;
    }
}
|
||||||
|
|
||||||
|
/* Redraw loop: paints the nav clock, paints the overlay clock while the
   overlay carries the 'visible' class, then re-schedules itself for the
   next 100 ms boundary. */
function clockTick() {
    var navCanvas = document.getElementById('swiss-clock');
    if (navCanvas) {
        drawSwissClock(navCanvas);
    }

    var overlayEl = document.getElementById('clock-overlay');
    var overlayShown = overlayEl && overlayEl.classList.contains('visible');
    if (overlayShown) {
        var overlayCanvas = document.getElementById('swiss-clock-overlay');
        if (overlayCanvas) {
            drawSwissClock(overlayCanvas);
        }
    }

    setTimeout(clockTick, 100 - (Date.now() % 100));
}
|
||||||
|
|
||||||
|
document.addEventListener('DOMContentLoaded', function() {
    /* Start the shared, self-rescheduling redraw loop */
    clockTick();

    var overlay = document.getElementById('clock-overlay');
    var navClock = document.querySelector('.nav-clock');

    /* Open: size the overlay canvas first, then reveal the overlay */
    function showOverlay() {
        resizeOverlayClock();
        overlay.classList.add('visible');
    }

    /* Close: clicking anywhere on the overlay hides it again */
    function hideOverlay() {
        overlay.classList.remove('visible');
    }

    /* Keep the overlay canvas fitted to the viewport while it is shown */
    function refitOverlay() {
        if (overlay.classList.contains('visible')) resizeOverlayClock();
    }

    /* Without both elements there is nothing to wire up */
    if (!navClock || !overlay) {
        return;
    }

    navClock.addEventListener('click', showOverlay);
    overlay.addEventListener('click', hideOverlay);
    window.addEventListener('resize', refitOverlay);
});
|
||||||
|
</script>
|
||||||
|
<script src="static/sorttable.js"></script>
|
||||||
|
</head>
|
||||||
@@ -0,0 +1,566 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
{% include 'head.html' %}
|
||||||
|
|
||||||
|
<style>
|
||||||
|
body {
|
||||||
|
display: flex;
|
||||||
|
flex-direction: column;
|
||||||
|
height: 100vh;
|
||||||
|
overflow: hidden;
|
||||||
|
}
|
||||||
|
|
||||||
|
@media (max-width: 640px) {
|
||||||
|
body {
|
||||||
|
height: auto;
|
||||||
|
min-height: 100vh;
|
||||||
|
overflow: auto;
|
||||||
|
flex-direction: column;
|
||||||
|
}
|
||||||
|
.container {
|
||||||
|
max-height: none;
|
||||||
|
overflow: visible;
|
||||||
|
}
|
||||||
|
.table-section {
|
||||||
|
max-height: 55vh;
|
||||||
|
}
|
||||||
|
.log-section {
|
||||||
|
flex: none;
|
||||||
|
max-height: 40vh;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
.container {
|
||||||
|
flex: 1;
|
||||||
|
min-height: 0;
|
||||||
|
max-width: 1600px;
|
||||||
|
width: 100%;
|
||||||
|
margin: 0 auto;
|
||||||
|
display: flex;
|
||||||
|
flex-direction: column;
|
||||||
|
gap: 15px;
|
||||||
|
overflow: hidden;
|
||||||
|
}
|
||||||
|
|
||||||
|
h1 {
|
||||||
|
color: #333;
|
||||||
|
margin-bottom: 5px;
|
||||||
|
margin-top: 15px;
|
||||||
|
font-size: 1.5em;
|
||||||
|
}
|
||||||
|
|
||||||
|
h2 {
|
||||||
|
color: #333;
|
||||||
|
margin-bottom: 10px;
|
||||||
|
font-size: 1.2em;
|
||||||
|
padding: 10px 15px;
|
||||||
|
background: white;
|
||||||
|
border-radius: 6px;
|
||||||
|
box-shadow: 0 1px 4px rgba(0,0,0,0.1);
|
||||||
|
}
|
||||||
|
|
||||||
|
.subtitle {
|
||||||
|
color: #666;
|
||||||
|
margin-bottom: 15px;
|
||||||
|
font-size: 0.9em;
|
||||||
|
}
|
||||||
|
|
||||||
|
.content {
|
||||||
|
display: flex;
|
||||||
|
flex-direction: column;
|
||||||
|
gap: 15px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.table-section {
|
||||||
|
background: white;
|
||||||
|
border-radius: 6px;
|
||||||
|
padding: 15px;
|
||||||
|
box-shadow: 0 1px 4px rgba(0,0,0,0.1);
|
||||||
|
overflow-x: auto;
|
||||||
|
overflow-y: auto;
|
||||||
|
max-height: 60vh;
|
||||||
|
}
|
||||||
|
|
||||||
|
.log-section {
|
||||||
|
flex: 1;
|
||||||
|
min-height: 0;
|
||||||
|
background: white;
|
||||||
|
border-radius: 6px;
|
||||||
|
padding: 15px;
|
||||||
|
box-shadow: 0 1px 4px rgba(0,0,0,0.1);
|
||||||
|
overflow-y: auto;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ntable {
|
||||||
|
border-collapse: collapse;
|
||||||
|
width: 100%;
|
||||||
|
font-size: 0.9em;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ntable td,
|
||||||
|
#ntable th {
|
||||||
|
border: 1px solid #e0e0e0;
|
||||||
|
text-align: left;
|
||||||
|
padding: 2px 4px;
|
||||||
|
white-space: nowrap;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ntable tr:nth-child(even) {
|
||||||
|
background-color: #fafafa;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ntable tr:hover {
|
||||||
|
background-color: #e3f2fd;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ntable tbody tr.row-warning {
|
||||||
|
background-color: #fff8c5;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ntable tbody tr.row-critical {
|
||||||
|
background-color: #fde8e8;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ntable tbody tr.row-warning:hover {
|
||||||
|
background-color: #fff0a0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ntable tbody tr.row-critical:hover {
|
||||||
|
background-color: #f9c8c8;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ntable th {
|
||||||
|
padding: 6px 8px;
|
||||||
|
background-color: #2196f3;
|
||||||
|
color: white;
|
||||||
|
font-weight: 600;
|
||||||
|
position: sticky;
|
||||||
|
top: 0;
|
||||||
|
z-index: 10;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ntable
|
||||||
|
th:not(.sorttable_sorted):not(.sorttable_sorted_reverse):not(.sorttable_nosort):after {
|
||||||
|
content: " ⇅";
|
||||||
|
opacity: 0.5;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Alert count column styling */
|
||||||
|
#ntable td.alert-warning {
|
||||||
|
color: #ff9800;
|
||||||
|
font-weight: bold;
|
||||||
|
text-align: center;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ntable td.alert-critical {
|
||||||
|
color: #f44336;
|
||||||
|
font-weight: bold;
|
||||||
|
text-align: center;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Scrollbar styling */
|
||||||
|
.log-section::-webkit-scrollbar {
|
||||||
|
width: 8px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.log-section::-webkit-scrollbar-track {
|
||||||
|
background: #f1f1f1;
|
||||||
|
border-radius: 4px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.log-section::-webkit-scrollbar-thumb {
|
||||||
|
background: #888;
|
||||||
|
border-radius: 4px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.log-section::-webkit-scrollbar-thumb:hover {
|
||||||
|
background: #555;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Message styling */
|
||||||
|
#messages {
|
||||||
|
font-size: 0.85em;
|
||||||
|
line-height: 1.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#messages div {
|
||||||
|
padding: 5px 0;
|
||||||
|
border-bottom: 1px solid #f0f0f0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Modal for connection status messages */
|
||||||
|
.connection-modal {
|
||||||
|
display: none;
|
||||||
|
position: fixed;
|
||||||
|
z-index: 1000;
|
||||||
|
left: 0;
|
||||||
|
top: 0;
|
||||||
|
width: 100%;
|
||||||
|
height: 100%;
|
||||||
|
background-color: rgba(0, 0, 0, 0.5);
|
||||||
|
}
|
||||||
|
|
||||||
|
.connection-modal.show {
|
||||||
|
display: flex;
|
||||||
|
justify-content: center;
|
||||||
|
align-items: center;
|
||||||
|
}
|
||||||
|
|
||||||
|
.connection-modal-content {
|
||||||
|
background-color: white;
|
||||||
|
padding: 30px 40px;
|
||||||
|
border-radius: 8px;
|
||||||
|
text-align: center;
|
||||||
|
box-shadow: 0 4px 20px rgba(0, 0, 0, 0.3);
|
||||||
|
min-width: 300px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.connection-modal-content p {
|
||||||
|
margin: 0;
|
||||||
|
font-size: 16px;
|
||||||
|
color: #333;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* State indicators */
|
||||||
|
.state-up {
|
||||||
|
color: #4caf50;
|
||||||
|
font-weight: 600;
|
||||||
|
}
|
||||||
|
|
||||||
|
.state-down {
|
||||||
|
color: #f44336;
|
||||||
|
font-weight: 700;
|
||||||
|
}
|
||||||
|
|
||||||
|
.state-overdue {
|
||||||
|
color: #ff9800;
|
||||||
|
font-weight: 700;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
<script type="text/javascript">
|
||||||
|
var cnt = 0;
|
||||||
|
var nTable = document;
|
||||||
|
var name_idx = {};
|
||||||
|
var c = 0;
|
||||||
|
var HBD_VERSION = "{{ hbd_version }}";
|
||||||
|
|
||||||
|
/**
 * Build the HTML shown in a host's name cell.
 *
 * The name is HTML-escaped because callers assign the result to innerHTML.
 * A " 🥀" marker is appended when the host reports no hbc_version or one that
 * differs from HBD_VERSION, and the whole label is wrapped in <b> when
 * data.dyn is truthy.
 *
 * @param {Object} data  Host record; reads name, hbc_version and dyn.
 * @returns {string} HTML fragment for the name cell.
 */
function hostNameHtml(data) {
    var ESCAPES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };

    // Escape markup-significant characters so a host name can never inject HTML.
    var nameHtml = String(data.name).replace(/[&<>"']/g, function (ch) {
        return ESCAPES[ch];
    });

    if (!data.hbc_version || data.hbc_version !== HBD_VERSION) {
        nameHtml += ' 🥀';
    }
    return data.dyn ? '<b>' + nameHtml + '</b>' : nameHtml;
}
|
||||||
|
|
||||||
|
// Rebuild name_idx, the host-name -> table-row lookup, from the rows that are
// currently in #ntable. Row 0 is skipped. A row's key is its first cell's
// data-name attribute, or else the cell text with a trailing 🥀 marker removed.
function setup() {
    name_idx = {};
    nTable = document.getElementById("ntable");

    var idx = 1;
    var current = nTable.rows[0];   // a missing first row ends the scan at once
    while (current && (current = nTable.rows[idx])) {
        var firstCell = current.cells[0];
        var key = firstCell.dataset.name || firstCell.innerText.replace(/\s*🥀\s*$/, '').trim();
        name_idx[key] = current;
        idx++;
    }
}
|
||||||
|
|
||||||
|
// Re-apply the alert highlight class of a table row: 'row-critical' when the
// host has any critical alert (acknowledged or not), otherwise 'row-warning'
// when it has any warning alert, otherwise no highlight class.
function updateRowAlert(row, data) {
    // Missing or falsy counters count as zero.
    function positive(count) {
        return (count || 0) > 0;
    }

    var hasCritical = positive(data.alert_critical_unacked) || positive(data.alert_critical_acked);
    var hasWarning = positive(data.alert_warning_unacked) || positive(data.alert_warning_acked);

    row.classList.remove('row-warning', 'row-critical');

    if (hasCritical) {
        row.classList.add('row-critical');
        return;
    }
    if (hasWarning) {
        row.classList.add('row-warning');
    }
}
|
||||||
|
|
||||||
|
/**
 * Append a new 11-column row for a host to #ntablebody and register it in
 * name_idx under the host's name.
 *
 * Columns: name, warning count, critical count, then address / state /
 * latency / last-state for the first connection and again for the second.
 * Only name, counts, address and state are filled here; latency and
 * last-state cells are created empty.
 *
 * @param {Object} data  Host record; reads name, the alert_* counters and
 *     connections (array of {addr, state}; may be empty or absent).
 */
function createRow(data) {
    var row = document.createElement("tr");

    // Create a <td>, apply optional inline styles, and append it to the row.
    function addCell(styles) {
        var td = document.createElement("td");
        if (styles) {
            Object.assign(td.style, styles);
        }
        row.appendChild(td);
        return td;
    }

    // "unacked/acked" when anything is acknowledged, the bare unacked count
    // otherwise, and an empty string when there are no alerts at all.
    function countText(unacked, acked) {
        unacked = unacked || 0;
        acked = acked || 0;
        if (unacked > 0 || acked > 0) {
            return acked > 0 ? unacked + "/" + acked : String(unacked);
        }
        return "";
    }

    var rightAligned = { textAlign: "right" };

    var c_name = addCell();
    var c_warning = addCell({ textAlign: "center", color: "#ff9800", fontWeight: "bold" });
    var c_critical = addCell({ textAlign: "center", color: "#f44336", fontWeight: "bold" });

    // Two connection groups of four cells each.
    var groups = [];
    for (var g = 0; g < 2; g++) {
        groups.push({
            addr: addCell(),
            state: addCell(),
            latency: addCell(rightAligned),
            statets: addCell(rightAligned)
        });
    }

    c_name.dataset.name = data.name;
    c_name.innerHTML = hostNameHtml(data);

    // Counts, addresses and states are plain text, so textContent is enough.
    c_warning.textContent = countText(data.alert_warning_unacked, data.alert_warning_acked);
    c_critical.textContent = countText(data.alert_critical_unacked, data.alert_critical_acked);

    // A host may arrive without connections; the old code threw on [0].
    var connections = data.connections || [];
    for (var i = 0; i < connections.length && i < groups.length; i++) {
        groups[i].addr.textContent = connections[i].addr;
        groups[i].state.textContent = connections[i].state;
    }

    document.getElementById("ntablebody").appendChild(row);

    // Key by host name. The old code used the <td> element as the key, which
    // was stringified and never matched a lookup by name.
    name_idx[data.name] = row;

    updateRowAlert(row, data);
}
|
||||||
|
|
||||||
|
/**
 * Format a Unix timestamp (seconds) relative to today, in local time.
 *
 *  - same calendar day as now:        "hh:mm:ss"
 *  - fewer than 8 calendar days back: "-Nd hh:mm:ss"
 *  - anything else:                   "YYYY-MM-DD"
 */
function formatTS(ts) {
    const stamp = new Date(ts * 1000);
    const today = new Date();

    const two = (n) => String(n).padStart(2, '0');
    const midnightOf = (dt) => new Date(dt.getFullYear(), dt.getMonth(), dt.getDate());

    const clock = [two(stamp.getHours()), two(stamp.getMinutes()), two(stamp.getSeconds())].join(':');

    // Same calendar day: time only
    if (stamp.toDateString() === today.toDateString()) {
        return clock;
    }

    // Whole calendar days between the two dates (rounded, so a 23 h or 25 h
    // day still counts as one day)
    const daysAgo = Math.round((midnightOf(today) - midnightOf(stamp)) / 86400000);
    if (daysAgo < 8) {
        return `-${daysAgo}d ${clock}`;
    }

    // Older: date only
    return [stamp.getFullYear(), two(stamp.getMonth() + 1), two(stamp.getDate())].join('-');
}
|
||||||
|
|
||||||
|
/**
 * Apply one host update to the live table, creating the row first when the
 * host is not yet listed.
 *
 * Refreshes the name cell, the warning/critical counters ("unacked/acked"),
 * the four cells of each connection (address, state, latency, last state
 * change) and finally the row's alert highlight.
 *
 * @param {Object} data  Host record; reads name, the alert_* counters and
 *     connections (array of {addr, state, statetime, rtts}).
 */
function update_table(data) {
    if (!(data.name in name_idx)) {
        createRow(data);
        setup();
    }

    var row = name_idx[data.name];
    var cells = row.cells;

    // "unacked/acked" when anything is acknowledged, the bare unacked count
    // otherwise, and an empty string when there are no alerts at all.
    function countHtml(unacked, acked) {
        unacked = unacked || 0;
        acked = acked || 0;
        if (unacked > 0 || acked > 0) {
            return acked > 0 ? unacked + "/" + acked : unacked;
        }
        return "";
    }

    // Update name cell (version indicator)
    cells[0].dataset.name = data.name;
    cells[0].innerHTML = hostNameHtml(data);

    // Update warning and critical counts
    cells[1].innerHTML = countHtml(data.alert_warning_unacked, data.alert_warning_acked);
    cells[2].innerHTML = countHtml(data.alert_critical_unacked, data.alert_critical_acked);

    var connections = data.connections || [];
    for (var i = 0; i < connections.length; i++) {
        var conn = connections[i];
        // Each connection owns four cells, starting after the name and the
        // two counter columns.
        var base = 3 + i * 4;

        // Declared locally: these used to be assigned without a declaration
        // and therefore leaked into the global scope.
        var addrHtml = conn.addr;
        var stampHtml = formatTS(conn.statetime);
        var stateHtml;
        var latencyHtml = "-";

        if (conn.state == "up") {
            stateHtml = '<span class="state-up">up</span>';
            // Tolerate a missing or non-numeric RTT instead of throwing.
            var rtt = Number.parseFloat(conn.rtts ? conn.rtts[0] : undefined);
            latencyHtml = Number.isFinite(rtt) ? rtt.toFixed(2) : "-";
        } else if (conn.state == "unknown") {
            // Unknown connections are shown as four blank cells.
            addrHtml = "";
            stampHtml = "";
            stateHtml = "";
            latencyHtml = "";
        } else if (conn.state == "down") {
            stateHtml = '<span class="state-down">down</span>';
        } else if (conn.state == "overdue") {
            stateHtml = '<span class="state-overdue">overdue</span>';
        } else {
            stateHtml = "<b>" + conn.state + "</b>";
        }

        cells[base].innerHTML = addrHtml;
        cells[base + 1].innerHTML = stateHtml;
        cells[base + 2].innerHTML = latencyHtml;
        cells[base + 3].innerHTML = stampHtml;
    }

    updateRowAlert(row, data);
}
|
||||||
|
|
||||||
|
// Open the heartbeat WebSocket and wire up its handlers:
//   open    -> hide the connection modal and send "heartbeat_web"
//   message -> "host" frames update the table, "message" frames are prepended
//              to the event log
//   close   -> show the connection modal and reconnect after 3 seconds
function WS_Connect() {
    if (!("WebSocket" in window)) {
        // The browser doesn't support WebSocket
        console.log("WebSocket NOT supported by your Browser!");
        return;
    }

    // Show or hide the connection-status modal, if the page contains one.
    function setModalVisible(visible) {
        var modal = document.getElementById("connectionModal");
        if (!modal) {
            return;
        }
        if (visible) {
            modal.classList.add("show");
        } else {
            modal.classList.remove("show");
        }
    }

    //N.B: subprotocol field causes chrome to error 1006
    var socket = new WebSocket("{{heartbeat_ws_url}}");

    socket.onopen = function () {
        // Web Socket is connected, send data using send()
        console.log("ws connect {{heartbeat_ws_url}}");
        setModalVisible(false);
        socket.send("heartbeat_web");
    };

    socket.onerror = function (event) {
        console.log(event);
    };

    socket.onmessage = function (event) {
        var frame = JSON.parse(event.data);
        if (frame.type == "host") {
            update_table(frame.data);
        } else if (frame.type == "message") {
            // Newest entry goes to the top of the log
            document
                .getElementById("messages")
                .insertAdjacentHTML("afterbegin", "<div>" + frame.data + "</div>");
        }
        cnt++;
    };

    socket.onclose = function (event) {
        console.log("Connection is closed, reopening");
        setModalVisible(true);
        setTimeout(function () {
            WS_Connect();
        }, 3000);
    };
}
|
||||||
|
WS_Connect();
|
||||||
|
</script>
|
||||||
|
<body>
|
||||||
|
{% include 'nav.html' %}
|
||||||
|
|
||||||
|
{% include 'menu.html' %}
|
||||||
|
|
||||||
|
<div class="container">
|
||||||
|
<div>
|
||||||
|
<h1>{{ header }}</h1>
|
||||||
|
<p class="subtitle">Real-time host monitoring and event log</p>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="table-section">
|
||||||
|
<table id="ntable" class="sortable">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>Name</th>
|
||||||
|
<th style="text-align: center" title="Warning Alerts">⚠️</th>
|
||||||
|
<th style="text-align: center" title="Critical Alerts">🔴</th>
|
||||||
|
<th>IPv4 Addr</th>
|
||||||
|
<th>State</th>
|
||||||
|
<th style="text-align: right">Latency</th>
|
||||||
|
<th style="text-align: right">Last State</th>
|
||||||
|
<th>IPv6 Addr</th>
|
||||||
|
<th>State</th>
|
||||||
|
<th style="text-align: right">Latency</th>
|
||||||
|
<th style="text-align: right">Last State</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody id="ntablebody">
|
||||||
|
{% for host in hosts %}
|
||||||
|
<tr class="{% if host.alert_critical_unacked > 0 or host.alert_critical_acked > 0 %}row-critical{% elif host.alert_warning_unacked > 0 or host.alert_warning_acked > 0 %}row-warning{% endif %}">
|
||||||
|
<td data-name="{{ host.name }}">{{ host.name }}{% if not host.hbc_version or host.hbc_version != hbd_version %} 🥀{% endif %}</td>
|
||||||
|
<td style="text-align: center; color: #ff9800; font-weight: bold;">
|
||||||
|
{%- set warning_unacked = host.alert_warning_unacked -%}
|
||||||
|
{%- set warning_acked = host.alert_warning_acked -%}
|
||||||
|
{%- if warning_unacked > 0 or warning_acked > 0 -%}
|
||||||
|
{{ warning_unacked }}{% if warning_acked > 0 %}/{{ warning_acked }}{% endif %}
|
||||||
|
{%- endif -%}
|
||||||
|
</td>
|
||||||
|
<td style="text-align: center; color: #f44336; font-weight: bold;">
|
||||||
|
{%- set critical_unacked = host.alert_critical_unacked -%}
|
||||||
|
{%- set critical_acked = host.alert_critical_acked -%}
|
||||||
|
{%- if critical_unacked > 0 or critical_acked > 0 -%}
|
||||||
|
{{ critical_unacked }}{% if critical_acked > 0 %}/{{ critical_acked }}{% endif %}
|
||||||
|
{%- endif -%}
|
||||||
|
</td>
|
||||||
|
{% for conn in host.connections %}
|
||||||
|
<td>{{ conn.addr if conn.addr else '' }}</td>
|
||||||
|
<td>{{ conn.state if conn.state else '' }}</td>
|
||||||
|
<td style="text-align: right">{{ conn.latency if conn.latency else '' }}</td>
|
||||||
|
<td style="text-align: right">{{ conn.last_state_ts if conn.last_state_ts else '' }}</td>
|
||||||
|
{% endfor %}
|
||||||
|
{% if host.connections|length == 0 %}
|
||||||
|
<td></td><td></td><td></td><td></td>
|
||||||
|
<td></td><td></td><td></td><td></td>
|
||||||
|
{% elif host.connections|length == 1 %}
|
||||||
|
<td></td><td></td><td></td><td></td>
|
||||||
|
{% endif %}
|
||||||
|
</tr>
|
||||||
|
{% endfor %}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="log-section">
|
||||||
|
<h2>Log of Events</h2>
|
||||||
|
<div id="messages"></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{% include 'foot.html' %}
|
||||||
|
|
||||||
|
<!-- Connection status modal -->
|
||||||
|
<div id="connectionModal" class="connection-modal">
|
||||||
|
<div class="connection-modal-content">
|
||||||
|
<p>⚠️ Connection is closed, reopening...</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
setup();
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
@@ -0,0 +1,2 @@
|
|||||||
|
<!-- <label for="drawer-toggle" id="drawer-toggle-label"></label>
|
||||||
|
<header>{{ header }}</header> -->
|
||||||
@@ -0,0 +1,45 @@
|
|||||||
|
<div class="nav">
|
||||||
|
<button class="nav-hamburger" id="nav-hamburger-btn" aria-label="Menu" aria-expanded="false">
|
||||||
|
<span></span><span></span><span></span>
|
||||||
|
</button>
|
||||||
|
<div class="nav-links" id="nav-links">
|
||||||
|
<a href="/live"{% if active_page == "live" %} class="active"{% endif %}>Live Dashboard</a>
|
||||||
|
<a href="/plugins"{% if active_page == "plugins" %} class="active"{% endif %}>Host Overview</a>
|
||||||
|
<a href="/alerts"{% if active_page == "alerts" %} class="active"{% endif %}>Alerts</a>
|
||||||
|
{% if current_user and current_user.admin %}
|
||||||
|
<a href="/settings"{% if active_page == "settings" %} class="active"{% endif %}>Settings</a>
|
||||||
|
{% endif %}
|
||||||
|
<a href="/about"{% if active_page == "about" %} class="active"{% endif %}>About</a>
|
||||||
|
</div>
|
||||||
|
<div class="nav-clock" title="Click for full-screen clock">
|
||||||
|
<canvas id="swiss-clock" width="44" height="44"></canvas>
|
||||||
|
</div>
|
||||||
|
{% if current_user %}
|
||||||
|
<a href="/profile" class="nav-user{% if active_page == 'profile' %} active{% endif %}" title="{{ current_user.full_name or current_user.username }}">
|
||||||
|
{% if current_user.avatar %}
|
||||||
|
<img class="nav-avatar" src="{{ current_user.avatar_url }}" alt="{{ current_user.full_name or current_user.username }}">
|
||||||
|
{% else %}
|
||||||
|
<span class="nav-initials">{{ (current_user.full_name or current_user.username)[:1] | upper }}</span>
|
||||||
|
{% endif %}
|
||||||
|
<span class="nav-username">{{ current_user.full_name or current_user.username }}</span>
|
||||||
|
</a>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Full-page clock overlay (click anywhere to dismiss) -->
|
||||||
|
<div id="clock-overlay">
|
||||||
|
<canvas id="swiss-clock-overlay" width="400" height="400"></canvas>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
(function() {
  // Hook the hamburger button up to the collapsible link list. Both
  // elements are looked up by id; if either is missing nothing is wired.
  var toggleButton = document.getElementById('nav-hamburger-btn');
  var linkPanel = document.getElementById('nav-links');
  if (!toggleButton || !linkPanel) {
    return;
  }

  toggleButton.addEventListener('click', function() {
    // classList.toggle() reports whether the class is present afterwards;
    // mirror that state into aria-expanded as the string 'true'/'false'.
    var isOpen = linkPanel.classList.toggle('nav-open');
    toggleButton.setAttribute('aria-expanded', String(isOpen));
  });
})();
|
||||||
|
</script>
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,330 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
{% include 'head.html' %}
|
||||||
|
|
||||||
|
<style>
|
||||||
|
html, body { overflow: visible; }
|
||||||
|
|
||||||
|
.container {
|
||||||
|
max-width: 900px;
|
||||||
|
margin: 0 auto;
|
||||||
|
}
|
||||||
|
|
||||||
|
h1 {
|
||||||
|
color: #333;
|
||||||
|
margin-bottom: 4px;
|
||||||
|
font-size: 1.5em;
|
||||||
|
}
|
||||||
|
|
||||||
|
.subtitle {
|
||||||
|
color: #666;
|
||||||
|
margin-bottom: 24px;
|
||||||
|
font-size: 0.9em;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ---- Profile card ---- */
|
||||||
|
.profile-card {
|
||||||
|
background: #fff;
|
||||||
|
border-radius: 8px;
|
||||||
|
box-shadow: 0 1px 6px rgba(0,0,0,0.1);
|
||||||
|
padding: 28px 32px;
|
||||||
|
margin-bottom: 24px;
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 28px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.avatar-large {
|
||||||
|
width: 80px;
|
||||||
|
height: 80px;
|
||||||
|
border-radius: 50%;
|
||||||
|
object-fit: cover;
|
||||||
|
flex-shrink: 0;
|
||||||
|
box-shadow: 0 2px 8px rgba(0,0,0,0.15);
|
||||||
|
}
|
||||||
|
|
||||||
|
.avatar-initials-large {
|
||||||
|
width: 80px;
|
||||||
|
height: 80px;
|
||||||
|
border-radius: 50%;
|
||||||
|
background: #0066cc;
|
||||||
|
color: #fff;
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
justify-content: center;
|
||||||
|
font-size: 2em;
|
||||||
|
font-weight: 700;
|
||||||
|
flex-shrink: 0;
|
||||||
|
box-shadow: 0 2px 8px rgba(0,0,0,0.15);
|
||||||
|
}
|
||||||
|
|
||||||
|
.profile-info { flex: 1; }
|
||||||
|
|
||||||
|
.profile-name {
|
||||||
|
font-size: 1.4em;
|
||||||
|
font-weight: 700;
|
||||||
|
color: #222;
|
||||||
|
margin-bottom: 4px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.profile-username {
|
||||||
|
font-size: 0.9em;
|
||||||
|
color: #666;
|
||||||
|
margin-bottom: 10px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.badge {
|
||||||
|
display: inline-block;
|
||||||
|
padding: 2px 10px;
|
||||||
|
border-radius: 12px;
|
||||||
|
font-size: 0.78em;
|
||||||
|
font-weight: 600;
|
||||||
|
text-transform: uppercase;
|
||||||
|
letter-spacing: 0.4px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.badge-admin { background: #e8f0fe; color: #1a73e8; }
|
||||||
|
.badge-user { background: #f1f3f4; color: #555; }
|
||||||
|
|
||||||
|
.profile-logout {
|
||||||
|
margin-top: 14px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.btn-logout {
|
||||||
|
display: inline-block;
|
||||||
|
padding: 6px 16px;
|
||||||
|
border-radius: 4px;
|
||||||
|
background: #f44336;
|
||||||
|
color: #fff;
|
||||||
|
font-size: 0.85em;
|
||||||
|
font-weight: 500;
|
||||||
|
text-decoration: none;
|
||||||
|
transition: background 0.15s;
|
||||||
|
}
|
||||||
|
.btn-logout:hover { background: #d32f2f; text-decoration: none; }
|
||||||
|
|
||||||
|
/* ---- Section cards ---- */
|
||||||
|
.section {
|
||||||
|
background: #fff;
|
||||||
|
border-radius: 8px;
|
||||||
|
box-shadow: 0 1px 6px rgba(0,0,0,0.1);
|
||||||
|
padding: 20px 24px;
|
||||||
|
margin-bottom: 20px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.section h2 {
|
||||||
|
font-size: 1em;
|
||||||
|
font-weight: 700;
|
||||||
|
color: #333;
|
||||||
|
margin: 0 0 16px;
|
||||||
|
padding-bottom: 10px;
|
||||||
|
border-bottom: 1px solid #eee;
|
||||||
|
text-transform: uppercase;
|
||||||
|
letter-spacing: 0.5px;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ---- Settings rows ---- */
|
||||||
|
.settings-row {
|
||||||
|
display: flex;
|
||||||
|
align-items: baseline;
|
||||||
|
padding: 8px 0;
|
||||||
|
border-bottom: 1px solid #f5f5f5;
|
||||||
|
font-size: 0.9em;
|
||||||
|
}
|
||||||
|
.settings-row:last-child { border-bottom: none; }
|
||||||
|
|
||||||
|
.settings-label {
|
||||||
|
width: 180px;
|
||||||
|
flex-shrink: 0;
|
||||||
|
color: #666;
|
||||||
|
font-size: 0.88em;
|
||||||
|
}
|
||||||
|
|
||||||
|
.settings-value { color: #222; }
|
||||||
|
|
||||||
|
.settings-empty { color: #aaa; font-style: italic; }
|
||||||
|
|
||||||
|
/* ---- Host lists ---- */
|
||||||
|
.host-grid {
|
||||||
|
display: flex;
|
||||||
|
flex-wrap: wrap;
|
||||||
|
gap: 8px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.host-chip {
|
||||||
|
display: inline-flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 6px;
|
||||||
|
padding: 4px 12px;
|
||||||
|
border-radius: 16px;
|
||||||
|
font-size: 0.85em;
|
||||||
|
font-weight: 500;
|
||||||
|
text-decoration: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
.host-chip.owner { background: #e8f5e9; color: #2e7d32; }
|
||||||
|
.host-chip.manager { background: #e3f2fd; color: #1565c0; }
|
||||||
|
.host-chip.monitor { background: #f3e5f5; color: #6a1b9a; }
|
||||||
|
|
||||||
|
.host-chip-dot {
|
||||||
|
width: 7px; height: 7px; border-radius: 50%;
|
||||||
|
}
|
||||||
|
.owner .host-chip-dot { background: #2e7d32; }
|
||||||
|
.manager .host-chip-dot { background: #1565c0; }
|
||||||
|
.monitor .host-chip-dot { background: #6a1b9a; }
|
||||||
|
|
||||||
|
.no-hosts {
|
||||||
|
color: #aaa;
|
||||||
|
font-size: 0.9em;
|
||||||
|
font-style: italic;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ---- Notification channels ---- */
|
||||||
|
.channel-row {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 10px;
|
||||||
|
padding: 6px 0;
|
||||||
|
border-bottom: 1px solid #f5f5f5;
|
||||||
|
font-size: 0.9em;
|
||||||
|
}
|
||||||
|
.channel-row:last-child { border-bottom: none; }
|
||||||
|
|
||||||
|
.channel-type {
|
||||||
|
display: inline-block;
|
||||||
|
padding: 2px 8px;
|
||||||
|
border-radius: 10px;
|
||||||
|
font-size: 0.78em;
|
||||||
|
font-weight: 600;
|
||||||
|
text-transform: uppercase;
|
||||||
|
background: #f1f3f4;
|
||||||
|
color: #555;
|
||||||
|
min-width: 70px;
|
||||||
|
text-align: center;
|
||||||
|
}
|
||||||
|
|
||||||
|
.channel-name { color: #333; }
|
||||||
|
</style>
|
||||||
|
|
||||||
|
<body>
|
||||||
|
{% include 'nav.html' %}
|
||||||
|
|
||||||
|
<div class="container">
|
||||||
|
<h1>{{ header }}</h1>
|
||||||
|
<p class="subtitle">Your account settings and host access</p>
|
||||||
|
|
||||||
|
<!-- Profile card -->
|
||||||
|
<div class="profile-card">
|
||||||
|
{% if current_user and current_user.avatar %}
|
||||||
|
<img class="avatar-large" src="{{ current_user.avatar_url }}" alt="">
|
||||||
|
{% else %}
|
||||||
|
<div class="avatar-initials-large">
|
||||||
|
{{ ((current_user.full_name if current_user else '') or (current_user.username if current_user else '?'))[:1] | upper }}
|
||||||
|
</div>
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
<div class="profile-info">
|
||||||
|
<div class="profile-name">{{ current_user.full_name if current_user and current_user.full_name else (current_user.username if current_user else '—') }}</div>
|
||||||
|
<div class="profile-username">@{{ current_user.username if current_user else '—' }}</div>
|
||||||
|
{% if current_user and current_user.admin %}
|
||||||
|
<span class="badge badge-admin">Admin</span>
|
||||||
|
{% else %}
|
||||||
|
<span class="badge badge-user">User</span>
|
||||||
|
{% endif %}
|
||||||
|
<div class="profile-logout">
|
||||||
|
<a href="/logout" class="btn-logout">Sign out</a>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Account settings -->
|
||||||
|
<div class="section">
|
||||||
|
<h2>Account</h2>
|
||||||
|
<div class="settings-row">
|
||||||
|
<span class="settings-label">Username</span>
|
||||||
|
<span class="settings-value">{{ current_user.username if current_user else '—' }}</span>
|
||||||
|
</div>
|
||||||
|
<div class="settings-row">
|
||||||
|
<span class="settings-label">Full name</span>
|
||||||
|
{% if current_user and current_user.full_name %}
|
||||||
|
<span class="settings-value">{{ current_user.full_name }}</span>
|
||||||
|
{% else %}
|
||||||
|
<span class="settings-empty">Not set</span>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
<div class="settings-row">
|
||||||
|
<span class="settings-label">Role</span>
|
||||||
|
<span class="settings-value">{{ 'Administrator' if current_user and current_user.admin else 'User' }}</span>
|
||||||
|
</div>
|
||||||
|
<div class="settings-row">
|
||||||
|
<span class="settings-label">Avatar</span>
|
||||||
|
{% if current_user and current_user.avatar %}
|
||||||
|
<span class="settings-value" style="word-break:break-all;">{{ current_user.avatar }}</span>
|
||||||
|
{% else %}
|
||||||
|
<span class="settings-empty">Not set (initials used)</span>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Notification channels -->
|
||||||
|
<div class="section">
|
||||||
|
<h2>Notification Channels</h2>
|
||||||
|
{% if notification_channels %}
|
||||||
|
{% for ch in notification_channels %}
|
||||||
|
<div class="channel-row">
|
||||||
|
<span class="channel-type">{{ ch.type }}</span>
|
||||||
|
<span class="channel-name">{{ ch.name }}</span>
|
||||||
|
</div>
|
||||||
|
{% endfor %}
|
||||||
|
{% else %}
|
||||||
|
<span class="no-hosts">No personal notification channels configured.</span>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Host access -->
|
||||||
|
<div class="section">
|
||||||
|
<h2>Host Access</h2>
|
||||||
|
|
||||||
|
<div class="settings-row" style="align-items: flex-start; padding-bottom: 14px;">
|
||||||
|
<span class="settings-label" style="padding-top: 2px;">Owner</span>
|
||||||
|
<div class="host-grid">
|
||||||
|
{% if owned_hosts %}
|
||||||
|
{% for h in owned_hosts %}
|
||||||
|
<span class="host-chip owner"><span class="host-chip-dot"></span>{{ h }}</span>
|
||||||
|
{% endfor %}
|
||||||
|
{% else %}
|
||||||
|
<span class="no-hosts">None</span>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="settings-row" style="align-items: flex-start; padding-bottom: 14px;">
|
||||||
|
<span class="settings-label" style="padding-top: 2px;">Manager</span>
|
||||||
|
<div class="host-grid">
|
||||||
|
{% if managed_hosts %}
|
||||||
|
{% for h in managed_hosts %}
|
||||||
|
<span class="host-chip manager"><span class="host-chip-dot"></span>{{ h }}</span>
|
||||||
|
{% endfor %}
|
||||||
|
{% else %}
|
||||||
|
<span class="no-hosts">None</span>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="settings-row" style="align-items: flex-start; padding-bottom: 4px;">
|
||||||
|
<span class="settings-label" style="padding-top: 2px;">Monitor</span>
|
||||||
|
<div class="host-grid">
|
||||||
|
{% if monitored_hosts %}
|
||||||
|
{% for h in monitored_hosts %}
|
||||||
|
<span class="host-chip monitor"><span class="host-chip-dot"></span>{{ h }}</span>
|
||||||
|
{% endfor %}
|
||||||
|
{% else %}
|
||||||
|
<span class="no-hosts">None</span>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
@@ -0,0 +1,490 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
{% include 'head.html' %}
|
||||||
|
|
||||||
|
<style>
|
||||||
|
html, body { overflow: visible; }
|
||||||
|
|
||||||
|
.container {
|
||||||
|
max-width: 960px;
|
||||||
|
}
|
||||||
|
|
||||||
|
h1 { color: #333; margin-bottom: 5px; margin-top: 15px; font-size: 1.5em; }
|
||||||
|
.subtitle { color: #666; margin-bottom: 24px; font-size: 0.9em; }
|
||||||
|
|
||||||
|
/* ---- Sidebar + content layout ---- */
|
||||||
|
.settings-layout {
|
||||||
|
display: flex;
|
||||||
|
gap: 24px;
|
||||||
|
align-items: flex-start;
|
||||||
|
}
|
||||||
|
|
||||||
|
.settings-sidebar {
|
||||||
|
width: 180px;
|
||||||
|
flex-shrink: 0;
|
||||||
|
position: sticky;
|
||||||
|
top: 60px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.sidebar-nav a {
|
||||||
|
display: block;
|
||||||
|
padding: 6px 10px;
|
||||||
|
border-radius: 4px;
|
||||||
|
text-decoration: none;
|
||||||
|
font-size: 0.85em;
|
||||||
|
color: #444;
|
||||||
|
margin-bottom: 2px;
|
||||||
|
transition: background 0.1s, color 0.1s;
|
||||||
|
}
|
||||||
|
.sidebar-nav a:hover { background: #e8eaf6; color: #1a237e; }
|
||||||
|
.sidebar-nav a.active { background: #e3f2fd; color: #0066cc; font-weight: 600; }
|
||||||
|
|
||||||
|
.settings-main { flex: 1; min-width: 0; }
|
||||||
|
|
||||||
|
/* ---- Section card ---- */
|
||||||
|
.section {
|
||||||
|
background: #fff;
|
||||||
|
border-radius: 8px;
|
||||||
|
box-shadow: 0 1px 4px rgba(0,0,0,.08);
|
||||||
|
margin-bottom: 24px;
|
||||||
|
overflow: hidden;
|
||||||
|
}
|
||||||
|
|
||||||
|
.section-header {
|
||||||
|
padding: 14px 20px 12px;
|
||||||
|
border-bottom: 1px solid #eee;
|
||||||
|
}
|
||||||
|
|
||||||
|
.section-title {
|
||||||
|
font-size: 0.95em;
|
||||||
|
font-weight: 700;
|
||||||
|
color: #222;
|
||||||
|
text-transform: uppercase;
|
||||||
|
letter-spacing: 0.5px;
|
||||||
|
margin: 0 0 3px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.section-desc {
|
||||||
|
font-size: 0.82em;
|
||||||
|
color: #888;
|
||||||
|
margin: 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ---- Field rows ---- */
|
||||||
|
.field-row {
|
||||||
|
display: flex;
|
||||||
|
align-items: baseline;
|
||||||
|
padding: 10px 20px;
|
||||||
|
border-bottom: 1px solid #f5f5f5;
|
||||||
|
gap: 16px;
|
||||||
|
}
|
||||||
|
.field-row:last-child { border-bottom: none; }
|
||||||
|
|
||||||
|
.field-label {
|
||||||
|
width: 200px;
|
||||||
|
flex-shrink: 0;
|
||||||
|
font-size: 0.88em;
|
||||||
|
font-weight: 500;
|
||||||
|
color: #444;
|
||||||
|
}
|
||||||
|
|
||||||
|
.field-body { flex: 1; min-width: 0; }
|
||||||
|
|
||||||
|
.field-value {
|
||||||
|
font-size: 0.9em;
|
||||||
|
color: #222;
|
||||||
|
word-break: break-all;
|
||||||
|
}
|
||||||
|
|
||||||
|
.field-desc {
|
||||||
|
font-size: 0.78em;
|
||||||
|
color: #999;
|
||||||
|
margin-top: 2px;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ---- Value type renderers ---- */
|
||||||
|
.val-boolean {
|
||||||
|
display: inline-block;
|
||||||
|
padding: 2px 9px;
|
||||||
|
border-radius: 10px;
|
||||||
|
font-size: 0.8em;
|
||||||
|
font-weight: 600;
|
||||||
|
}
|
||||||
|
.val-boolean.on { background: #e8f5e9; color: #2e7d32; }
|
||||||
|
.val-boolean.off { background: #fce4ec; color: #c62828; }
|
||||||
|
|
||||||
|
.val-masked {
|
||||||
|
font-family: monospace;
|
||||||
|
color: #bbb;
|
||||||
|
letter-spacing: 2px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.val-list { display: flex; flex-wrap: wrap; gap: 5px; }
|
||||||
|
.val-tag {
|
||||||
|
display: inline-block;
|
||||||
|
padding: 2px 9px;
|
||||||
|
background: #e8eaf6;
|
||||||
|
color: #283593;
|
||||||
|
border-radius: 10px;
|
||||||
|
font-size: 0.8em;
|
||||||
|
}
|
||||||
|
.val-empty { color: #ccc; font-style: italic; font-size: 0.88em; }
|
||||||
|
|
||||||
|
/* ---- Users table ---- */
|
||||||
|
.mini-table {
|
||||||
|
width: 100%;
|
||||||
|
border-collapse: collapse;
|
||||||
|
font-size: 0.875em;
|
||||||
|
}
|
||||||
|
.mini-table th {
|
||||||
|
background: #f5f5f5;
|
||||||
|
padding: 7px 12px;
|
||||||
|
text-align: left;
|
||||||
|
font-weight: 600;
|
||||||
|
color: #555;
|
||||||
|
font-size: 0.82em;
|
||||||
|
text-transform: uppercase;
|
||||||
|
letter-spacing: 0.4px;
|
||||||
|
border-bottom: 1px solid #e0e0e0;
|
||||||
|
}
|
||||||
|
.mini-table td {
|
||||||
|
padding: 7px 12px;
|
||||||
|
border-bottom: 1px solid #f0f0f0;
|
||||||
|
color: #333;
|
||||||
|
vertical-align: middle;
|
||||||
|
}
|
||||||
|
.mini-table tbody tr:last-child td { border-bottom: none; }
|
||||||
|
.mini-table tbody tr:hover { background: #fafafa; }
|
||||||
|
|
||||||
|
.badge {
|
||||||
|
display: inline-block;
|
||||||
|
padding: 1px 8px;
|
||||||
|
border-radius: 10px;
|
||||||
|
font-size: 0.75em;
|
||||||
|
font-weight: 600;
|
||||||
|
}
|
||||||
|
.badge-admin { background: #e8f0fe; color: #1a73e8; }
|
||||||
|
.badge-user { background: #f1f3f4; color: #666; }
|
||||||
|
|
||||||
|
/* ---- Notification channels ---- */
|
||||||
|
.channel-card {
|
||||||
|
border: 1px solid #e8eaf6;
|
||||||
|
border-radius: 6px;
|
||||||
|
margin: 12px 20px;
|
||||||
|
overflow: hidden;
|
||||||
|
}
|
||||||
|
|
||||||
|
.channel-header {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 10px;
|
||||||
|
padding: 9px 14px;
|
||||||
|
background: #f8f9ff;
|
||||||
|
border-bottom: 1px solid #e8eaf6;
|
||||||
|
}
|
||||||
|
|
||||||
|
.channel-name-text { font-weight: 600; font-size: 0.9em; color: #222; }
|
||||||
|
|
||||||
|
.ch-type-badge {
|
||||||
|
padding: 2px 8px;
|
||||||
|
border-radius: 8px;
|
||||||
|
font-size: 0.75em;
|
||||||
|
font-weight: 600;
|
||||||
|
background: #e8eaf6;
|
||||||
|
color: #3949ab;
|
||||||
|
}
|
||||||
|
|
||||||
|
.channel-fields { padding: 6px 0; }
|
||||||
|
|
||||||
|
.channel-field {
|
||||||
|
display: flex;
|
||||||
|
padding: 5px 14px;
|
||||||
|
font-size: 0.85em;
|
||||||
|
border-bottom: 1px solid #f5f5f5;
|
||||||
|
gap: 12px;
|
||||||
|
}
|
||||||
|
.channel-field:last-child { border-bottom: none; }
|
||||||
|
.channel-field-label { width: 130px; flex-shrink: 0; color: #777; }
|
||||||
|
.channel-field-value { color: #333; word-break: break-all; }
|
||||||
|
|
||||||
|
/* ---- Hosts table ---- */
|
||||||
|
/* ---- Mobile: collapsible sidebar ---- */
|
||||||
|
.sidebar-toggle {
|
||||||
|
display: none;
|
||||||
|
width: 100%;
|
||||||
|
padding: 8px 12px;
|
||||||
|
background: #e8eaf6;
|
||||||
|
border: none;
|
||||||
|
border-radius: 6px;
|
||||||
|
font-size: 0.9em;
|
||||||
|
font-weight: 600;
|
||||||
|
color: #283593;
|
||||||
|
cursor: pointer;
|
||||||
|
text-align: left;
|
||||||
|
margin-bottom: 16px;
|
||||||
|
}
|
||||||
|
.sidebar-toggle::after { content: ' ▾'; float: right; }
|
||||||
|
.sidebar-toggle.open::after { content: ' ▴'; }
|
||||||
|
|
||||||
|
@media (max-width: 640px) {
|
||||||
|
.sidebar-toggle { display: block; }
|
||||||
|
|
||||||
|
.settings-layout { flex-direction: column; gap: 0; }
|
||||||
|
|
||||||
|
.settings-sidebar {
|
||||||
|
width: 100%;
|
||||||
|
position: static;
|
||||||
|
margin-bottom: 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
.sidebar-nav {
|
||||||
|
display: none;
|
||||||
|
background: white;
|
||||||
|
border-radius: 6px;
|
||||||
|
box-shadow: 0 1px 4px rgba(0,0,0,.1);
|
||||||
|
margin-bottom: 16px;
|
||||||
|
padding: 4px 0;
|
||||||
|
}
|
||||||
|
.sidebar-nav.open { display: block; }
|
||||||
|
.sidebar-nav a { padding: 10px 16px; font-size: 1em; }
|
||||||
|
|
||||||
|
.field-row { flex-direction: column; gap: 4px; }
|
||||||
|
.field-label { width: 100%; font-size: 0.82em; color: #888; }
|
||||||
|
}
|
||||||
|
.host-bool { text-align: center; }
|
||||||
|
.dot-yes { color: #2e7d32; font-size: 1.1em; }
|
||||||
|
.dot-no { color: #ddd; font-size: 1.1em; }
|
||||||
|
</style>
|
||||||
|
|
||||||
|
<body>
|
||||||
|
{% include 'nav.html' %}
|
||||||
|
|
||||||
|
<div class="container">
|
||||||
|
<h1>Settings</h1>
|
||||||
|
<p class="subtitle">Current server configuration — read from the config file at startup.</p>
|
||||||
|
|
||||||
|
<div class="settings-layout">
|
||||||
|
|
||||||
|
<!-- Sidebar navigation -->
|
||||||
|
<nav class="settings-sidebar">
|
||||||
|
<button class="sidebar-toggle" id="sidebar-toggle" aria-expanded="false">Sections</button>
|
||||||
|
<div class="sidebar-nav" id="sidebar-nav">
|
||||||
|
{% for section in sections %}
|
||||||
|
<a href="#{{ section.id }}" onclick="closeSidebar()">{{ section.title }}</a>
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
<!-- Main content -->
|
||||||
|
<div class="settings-main">
|
||||||
|
{% for section in sections %}
|
||||||
|
<div class="section" id="{{ section.id }}">
|
||||||
|
<div class="section-header">
|
||||||
|
<p class="section-title">{{ section.title }}</p>
|
||||||
|
{% if section.description %}<p class="section-desc">{{ section.description }}</p>{% endif %}
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{# ---- Standard field rows ---- #}
|
||||||
|
{% for f in section.fields %}
|
||||||
|
<div class="field-row">
|
||||||
|
<div class="field-label">{{ f.label }}</div>
|
||||||
|
<div class="field-body">
|
||||||
|
{% if f.sensitive %}
|
||||||
|
<div class="field-value"><span class="val-masked">••••••••</span></div>
|
||||||
|
{% elif f.type == "boolean" %}
|
||||||
|
<div class="field-value">
|
||||||
|
<span class="val-boolean {{ 'on' if f.value else 'off' }}">
|
||||||
|
{{ 'Enabled' if f.value else 'Disabled' }}
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
{% elif f.type == "list" %}
|
||||||
|
<div class="field-value">
|
||||||
|
{% if f.value %}
|
||||||
|
<span class="val-list">
|
||||||
|
{% for item in f.value %}<span class="val-tag">{{ item }}</span>{% endfor %}
|
||||||
|
</span>
|
||||||
|
{% else %}
|
||||||
|
<span class="val-empty">None</span>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
{% elif f.value is none or f.value == "" %}
|
||||||
|
<div class="field-value"><span class="val-empty">Not set</span></div>
|
||||||
|
{% else %}
|
||||||
|
<div class="field-value">{{ f.value }}</div>
|
||||||
|
{% endif %}
|
||||||
|
{% if f.description %}
|
||||||
|
<div class="field-desc">{{ f.description }}</div>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
{% endfor %}
|
||||||
|
|
||||||
|
{# ---- Users section ---- #}
|
||||||
|
{% if section.id == "users" and section.users %}
|
||||||
|
<div style="padding: 0 0 4px;">
|
||||||
|
<table class="mini-table">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>Username</th>
|
||||||
|
<th>Full Name</th>
|
||||||
|
<th>Role</th>
|
||||||
|
<th>Avatar</th>
|
||||||
|
<th>Channels</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
{% for u in section.users %}
|
||||||
|
<tr>
|
||||||
|
<td><strong>{{ u.username }}</strong></td>
|
||||||
|
<td>{{ u.full_name or '—' }}</td>
|
||||||
|
<td>
|
||||||
|
{% if u.admin %}
|
||||||
|
<span class="badge badge-admin">Admin</span>
|
||||||
|
{% else %}
|
||||||
|
<span class="badge badge-user">User</span>
|
||||||
|
{% endif %}
|
||||||
|
</td>
|
||||||
|
<td style="font-size:0.8em; color:#888;">
|
||||||
|
{% if u.avatar %}{{ u.avatar }}{% else %}—{% endif %}
|
||||||
|
</td>
|
||||||
|
<td>
|
||||||
|
{% if u.notification_channels %}
|
||||||
|
<span class="val-list">
|
||||||
|
{% for ch in u.notification_channels %}
|
||||||
|
<span class="val-tag">{{ ch }}</span>
|
||||||
|
{% endfor %}
|
||||||
|
</span>
|
||||||
|
{% else %}—{% endif %}
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
{% endfor %}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
{# ---- Notification channels section ---- #}
|
||||||
|
{% if section.id == "channels" %}
|
||||||
|
{% for ch in section.channels %}
|
||||||
|
<div class="channel-card">
|
||||||
|
<div class="channel-header">
|
||||||
|
<span class="channel-name-text">{{ ch.name }}</span>
|
||||||
|
<span class="ch-type-badge">{{ ch.type_label }}</span>
|
||||||
|
</div>
|
||||||
|
<div class="channel-fields">
|
||||||
|
{% for cf in ch.fields %}
|
||||||
|
<div class="channel-field">
|
||||||
|
<span class="channel-field-label">{{ cf.label }}</span>
|
||||||
|
<span class="channel-field-value">
|
||||||
|
{% if cf.sensitive %}
|
||||||
|
<span class="val-masked">••••••••</span>
|
||||||
|
{% elif cf.value is iterable and cf.value is not string %}
|
||||||
|
{{ cf.value | join(', ') }}
|
||||||
|
{% else %}
|
||||||
|
{{ cf.value }}
|
||||||
|
{% endif %}
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
{% endfor %}
|
||||||
|
{% if not section.channels %}
|
||||||
|
<div class="field-row"><span class="val-empty">No notification channels configured.</span></div>
|
||||||
|
{% endif %}
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
{# ---- Hosts section ---- #}
|
||||||
|
{% if section.id == "hosts" %}
|
||||||
|
{% if section.hosts %}
|
||||||
|
<div style="overflow-x: auto;">
|
||||||
|
<table class="mini-table">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>Host</th>
|
||||||
|
<th>Watch</th>
|
||||||
|
<th>DynDNS</th>
|
||||||
|
<th>Owner</th>
|
||||||
|
<th>Threshold config</th>
|
||||||
|
<th>Channels</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
{% for h in section.hosts %}
|
||||||
|
<tr>
|
||||||
|
<td><strong>{{ h.name }}</strong></td>
|
||||||
|
<td class="host-bool">
|
||||||
|
<span class="{{ 'dot-yes' if h.watch else 'dot-no' }}">●</span>
|
||||||
|
</td>
|
||||||
|
<td class="host-bool">
|
||||||
|
<span class="{{ 'dot-yes' if h.dyndns else 'dot-no' }}">●</span>
|
||||||
|
</td>
|
||||||
|
<td>{{ h.owner or '—' }}</td>
|
||||||
|
<td>{{ h.threshold_config or '—' }}</td>
|
||||||
|
<td>
|
||||||
|
{% if h.notification_channels %}
|
||||||
|
<span class="val-list">
|
||||||
|
{% for ch in h.notification_channels %}
|
||||||
|
<span class="val-tag">{{ ch }}</span>
|
||||||
|
{% endfor %}
|
||||||
|
</span>
|
||||||
|
{% else %}—{% endif %}
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
{% endfor %}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
{% else %}
|
||||||
|
<div class="field-row"><span class="val-empty">No hosts defined in config.</span></div>
|
||||||
|
{% endif %}
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
</div>{# /section #}
|
||||||
|
{% endfor %}
|
||||||
|
</div>{# /settings-main #}
|
||||||
|
</div>{# /settings-layout #}
|
||||||
|
</div>{# /container #}
|
||||||
|
|
||||||
|
<script>
|
||||||
|
// Mark the sidebar link whose target section is currently intersecting
// the viewport (at least 25% of the section visible).
const sections = document.querySelectorAll('.section');
const navLinks = document.querySelectorAll('.sidebar-nav a');

const observer = new IntersectionObserver(function (entries) {
  for (const entry of entries) {
    if (!entry.isIntersecting) {
      continue;
    }
    // Sidebar links point at "#<section id>"; exactly the matching link
    // gets the 'active' class, every other link loses it.
    const wantedHref = '#' + entry.target.id;
    for (const link of navLinks) {
      link.classList.toggle('active', link.getAttribute('href') === wantedHref);
    }
  }
}, { threshold: 0.25 });

for (const section of sections) {
  observer.observe(section);
}

// Mobile layout: the "Sections" button expands/collapses the sidebar list.
var sidebarToggle = document.getElementById('sidebar-toggle');
var sidebarNav = document.getElementById('sidebar-nav');
if (sidebarToggle && sidebarNav) {
  sidebarToggle.addEventListener('click', function() {
    // Flip the list, then force the button's class and aria-expanded
    // attribute to agree with the list's new state.
    var expanded = sidebarNav.classList.toggle('open');
    sidebarToggle.classList.toggle('open', expanded);
    sidebarToggle.setAttribute('aria-expanded', String(expanded));
  });
}
|
||||||
|
</script>
|
||||||
|
<script>
|
||||||
|
/**
 * Collapse the settings sidebar: remove the 'open' class from the link
 * list and the toggle button and reset the button's aria-expanded to
 * 'false'. Either element may be absent; whatever exists is updated.
 */
function closeSidebar() {
  var navList = document.getElementById('sidebar-nav');
  var toggleBtn = document.getElementById('sidebar-toggle');

  if (navList) {
    navList.classList.remove('open');
  }
  if (!toggleBtn) {
    return;
  }
  toggleBtn.classList.remove('open');
  toggleBtn.setAttribute('aria-expanded', 'false');
}
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,509 @@
|
|||||||
|
"""UDP listener and datagram processing."""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import socket
|
||||||
|
import struct
|
||||||
|
import time
|
||||||
|
import zlib
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from platform import system as platform_system
|
||||||
|
|
||||||
|
from ..common.proto import stodict, oldmtodict
|
||||||
|
from ..common.utils import dur
|
||||||
|
from . import notify as notify_mod
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
eventlog = notify_mod.eventlog
|
||||||
|
|
||||||
|
# SO_TIMESTAMP: the kernel attaches a struct timeval to each received
# datagram.  Python's socket module does not expose the constant on every
# platform, so the per-OS option numbers are hard-coded here.
platform = platform_system()
if platform in ("Darwin", "FreeBSD"):
    # BSD-derived <sys/socket.h>: SO_TIMESTAMP is 0x0400 (1024).  FreeBSD
    # was previously set to 32, which on FreeBSD is SO_BROADCAST, not
    # SO_TIMESTAMP.
    _SO_TIMESTAMP = 0x0400
elif platform == "Linux":
    _SO_TIMESTAMP = 29  # asm-generic Linux value (e.g. x86, ARM)
else:
    logger.warning("SO_TIMESTAMP may not be supported on this platform (%s)", platform)
    # None marks "no known option number" for this platform.
    _SO_TIMESTAMP = None
|
||||||
|
|
||||||
|
# struct timeval uses two native C longs: tv_sec and tv_usec
|
||||||
|
_TIMEVAL = struct.Struct('@ll')
|
||||||
|
|
||||||
|
|
||||||
|
def enable_kernel_timestamps(sock) -> bool:
    """Try to enable SO_TIMESTAMP on *sock*.

    Args:
        sock: Socket whose ``setsockopt`` is called at ``SOL_SOCKET`` level
            with the module's ``_SO_TIMESTAMP`` option number.

    Returns:
        True if the option was set without error.  False if no option
        number is known for this platform (``_SO_TIMESTAMP`` is None) or
        if ``setsockopt`` raises ``OSError`` (e.g. the kernel rejects the
        option).
    """
    if _SO_TIMESTAMP is None:
        # Unknown platform: handing None to setsockopt() raises TypeError,
        # which the OSError handler below would not catch.  Report
        # "unsupported" without touching the socket.
        return False
    try:
        sock.setsockopt(socket.SOL_SOCKET, _SO_TIMESTAMP, 1)
    except OSError:
        return False
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_kernel_ts(ancdata) -> float | None:
|
||||||
|
"""Parse recvmsg ancillary data and return the kernel receive time.
|
||||||
|
|
||||||
|
Returns seconds as a float, or None if no SO_TIMESTAMP cmsg is present.
|
||||||
|
"""
|
||||||
|
for cmsg_level, cmsg_type, cmsg_data in ancdata:
|
||||||
|
if cmsg_level == socket.SOL_SOCKET and cmsg_type == _SO_TIMESTAMP:
|
||||||
|
if len(cmsg_data) >= _TIMEVAL.size:
|
||||||
|
sec, usec = _TIMEVAL.unpack_from(cmsg_data)
|
||||||
|
return sec + usec * 1e-6
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
class RecvmsgTransport:
    """Minimal transport stand-in for the add_reader()/recvmsg() receive path.

    Offers the sendto() and close() calls that callers use on asyncio's
    DatagramTransport, so reply code works unchanged on either path.
    """

    def __init__(self, loop, sock):
        self._loop, self._sock = loop, sock

    def sendto(self, data, addr):
        """Send *data* to *addr*; a failure is logged at debug level, not raised."""
        try:
            self._sock.sendto(data, addr)
        except Exception as exc:
            logger.debug("sendto failed: %s", exc)

    def close(self):
        """Unregister the socket from the loop, then close it (best effort)."""
        teardown_steps = (
            lambda: self._loop.remove_reader(self._sock.fileno()),
            lambda: self._sock.close(),
        )
        # Each step is attempted independently; errors are deliberately ignored.
        for step in teardown_steps:
            try:
                step()
            except Exception:
                pass
|
||||||
|
|
||||||
|
|
||||||
|
def make_recvmsg_reader(sock, handler, transport):
    """Build a zero-argument callback for loop.add_reader().

    Every invocation reads a single datagram with recvmsg() so the ancillary
    data (and with it the kernel receive timestamp) is available. The
    datagram is parsed with parse_message(); a non-empty result is passed on as

        handler(msg, addr, transport, kernel_ts)

    where kernel_ts is None when no timestamp control message was found.
    """
    data_bufsize = 65536
    anc_bufsize = 128  # room for the timestamp control message

    def _read():
        try:
            received = sock.recvmsg(data_bufsize, anc_bufsize)
        except BlockingIOError:
            # Nothing to read right now.
            return
        except OSError as e:
            logger.warning("recvmsg error: %s", e)
            return
        data, ancdata, _, addr = received

        # A failure while handling one datagram must not kill the reader.
        try:
            kernel_ts = _extract_kernel_ts(ancdata)
            msg = parse_message(data)
            if not msg:
                return
            handler(msg, addr, transport, kernel_ts)
        except Exception:
            logger.exception("Error processing datagram from %s", addr)

    return _read
|
||||||
|
|
||||||
|
|
||||||
|
class EchoServerProtocol(asyncio.DatagramProtocol):
    """Datagram protocol that parses each packet and forwards it to a handler."""

    def __init__(self, config=None, handler=None):
        super().__init__()
        self.handler = handler
        self.config = config if config else {}

    def connection_made(self, transport):
        # Keep the transport so handlers can send replies through it.
        self.transport = transport
        logger.info("UDP Server listening...")

    def datagram_received(self, data, addr):
        logger.debug("Received from %s", addr)
        try:
            parsed = parse_message(data)
            callback = self.handler
            if callback:
                # The handler is an application-supplied callable; it receives
                # the transport so it can answer (ACKs/commands).
                callback(parsed, addr, self.transport)
        except Exception:
            logger.exception("Error while processing datagram from %s", addr)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_message(data: bytes):
    """Decode a raw datagram.

    The current decoder (stodict) is tried first; when it yields a falsy
    result the legacy decoder (oldmtodict) is used instead, for
    compatibility with older clients.
    """
    return stodict(data) or oldmtodict(data)
|
||||||
|
|
||||||
|
|
||||||
|
def dicttos(ID, d):
    """Serialise mapping *d* into a packet tagged with *ID*.

    Fields are rendered as ``key=value`` (floats with five decimals), joined
    with ``;``, zlib-compressed at level 6 and prefixed with ``!<ID>:``.
    Returns bytes.
    """
    fields = [
        ("%s=%0.5f" if isinstance(value, float) else "%s=%s") % (key, value)
        for key, value in d.items()
    ]
    compressed = zlib.compress(";".join(fields).encode(), 6)
    return ("!" + ID + ":").encode() + compressed
|
||||||
|
|
||||||
|
|
||||||
|
DROPOVERDUE = 7 * 24 * 3600 # seconds before an overdue host becomes UNKNOWN
|
||||||
|
|
||||||
|
|
||||||
|
def _set_connectivity_alert(host, afam, level_name):
    """Set or clear the ``connectivity.<afam>`` entry in host.alert_states.

    level_name is looked up on AlertLevel; a name that does not exist there
    is treated as OK. For OK the entry is removed; for any other level the
    entry is created if missing and then updated.
    """
    from .threshold import AlertState, AlertLevel

    key = f"connectivity.{afam}"
    level = getattr(AlertLevel, level_name, AlertLevel.OK)
    states = host.alert_states

    if level == AlertLevel.OK:
        # Recovered: drop the entry instead of storing an OK state.
        states.pop(key, None)
        return

    if key not in states:
        states[key] = AlertState(key)
    states[key].update(level, level_name)
|
||||||
|
|
||||||
|
|
||||||
|
def _make_timer_callbacks(uname, host, ctx):
    """Build the (on_overdue, on_unknown) coroutine callbacks for one host.

    uname, host and the values read from ctx are captured when this function
    is called, so the returned callbacks can safely be created inside loops.
    """
    push = ctx.get("msg_to_websockets")
    checker = ctx.get("threshold_checker")
    settings = ctx.get("config", {})

    async def on_unknown(connection):
        states = connection.__class__
        connection.newstate(states.UNKNOWN, connection.lastbeat)
        # The connectivity alert is intentionally left in place here.
        if push:
            push("host", host.stateinfo())

    async def on_overdue(connection):
        states = connection.__class__
        # Only an UP connection can become overdue.
        if connection.getstate() != states.UP:
            return

        connection.newstate(states.OVERDUE, time.time(), settings.get("grace", 2))

        text = f"{connection.afam} overdue"
        eventlog(uname, "CRITICAL", text)
        alert = notify_mod.Notification(
            title=f"[CRITICAL] {uname}", body=text, level="CRITICAL"
        )
        asyncio.create_task(notify_mod.send_notification(uname, alert))

        # Record the outage in alert_states.
        _set_connectivity_alert(host, connection.afam, "CRITICAL")

        if checker:
            checker.check_value(
                host_name=uname,
                metric_path="rtt",
                value=float("inf"),
                alert_states=host.alert_states,
            )
        if push:
            push("host", host.stateinfo())

        # Arm the follow-up timer that moves the connection to UNKNOWN.
        connection.reset_overdue_timer(DROPOVERDUE, on_unknown)

    return on_overdue, on_unknown
|
||||||
|
|
||||||
|
|
||||||
|
def restore_connection_timers(hbdclass, ctx):
    """Re-arm timers for every known connection after state was reloaded.

    UP connections (with a positive host interval) get an overdue timer
    derived from lastbeat; OVERDUE connections get their UNKNOWN drop timer
    back, or are marked UNKNOWN at once when the drop window has passed.
    DOWN connections are left untouched.
    """
    now = time.time()
    grace = ctx.get("config", {}).get("grace", 2)
    conn_states = hbdclass.Connection
    rearmed = 0

    for uname, host in list(hbdclass.Host.hosts.items()):
        interval = host.interval
        for afam, conn in list(host.connections.items()):
            state = conn.getstate()
            if state == conn_states.DOWN:
                continue

            on_overdue, on_unknown = _make_timer_callbacks(uname, host, ctx)

            if state == conn_states.UP and interval > 0:
                elapsed = now - conn.lastbeat
                # Allow at least one full (interval + grace) after startup so
                # a host is not flagged overdue before it can check in.
                startup_grace = interval + grace
                remaining = max(startup_grace, 2 * startup_grace - elapsed)
                conn.reset_overdue_timer(remaining, on_overdue)
                logger.debug(
                    "Restored UP timer %s/%s: %.0fs remaining (elapsed %.0fs, startup grace %.0fs)",
                    uname, afam, remaining, elapsed, startup_grace,
                )
                rearmed += 1

            elif state == conn_states.OVERDUE:
                elapsed_overdue = now - conn.statetime
                remaining = DROPOVERDUE - elapsed_overdue
                if remaining > 1:
                    conn.reset_overdue_timer(remaining, on_unknown)
                    logger.debug(
                        "Restored OVERDUE timer %s/%s: %.0fs remaining",
                        uname, afam, remaining,
                    )
                    rearmed += 1
                else:
                    # Drop window already over: go straight to UNKNOWN.
                    conn.newstate(conn_states.UNKNOWN, conn.lastbeat)
                    logger.info(
                        "Marking %s/%s UNKNOWN (overdue %.1f days)",
                        uname, afam, elapsed_overdue / 86400,
                    )

    logger.info("Restored timers for %d connection(s)", rearmed)
|
||||||
|
|
||||||
|
|
||||||
|
def handle_datagram(msg: dict, addr, transport, ctx: dict):
    """Handle a parsed datagram message.

    msg is the decoded message dict, addr the sender address, transport an
    object with ``sendto(data, addr)`` used for replies.

    ctx is a dictionary with runtime dependencies; the keys read here are:
      - recv_ts: optional receive timestamp (falls back to time.time())
      - config: dict of configuration
      - hbdclass: module providing Host/Connection classes
      - msg_to_websockets: callable(typ, data)
      - msg_journal: object with an async log_message(msg, addr, ts)
      - threshold_checker: optional checker for plugin data and RTT
      - DEBUG, verbose

    Returns None. An empty msg is ignored.
    """
    if not msg:
        return
    now = ctx.get("recv_ts") or time.time()

    # Log message to journal
    msg_journal = ctx.get("msg_journal")
    if msg_journal:
        # Create async task to log message (non-blocking).
        # NOTE: asyncio must NOT be imported inside this function. A local
        # import makes ``asyncio`` a function-local name, and every later
        # asyncio.create_task() call below would raise UnboundLocalError
        # whenever this branch is skipped.
        try:
            loop = asyncio.get_event_loop()
            loop.create_task(msg_journal.log_message(msg, addr, now))
        except Exception as e:
            logger.debug(f"Failed to log message to journal: {e}")

    cfg = ctx.get("config", {})
    hbdcls = ctx.get("hbdclass")
    msg_to_websockets = ctx.get("msg_to_websockets")
    DEBUG = ctx.get("DEBUG", 0)
    verbose = ctx.get("verbose", False)

    # normalize addr (ip, port)
    ip = addr[0] if isinstance(addr, (list, tuple)) else addr
    name = msg.get("name", "unknown")
    from ..common.utils import shortname
    from . import config as config_mod

    uname = shortname(name)

    if uname not in hbdcls.Host.hosts:
        host = hbdcls.Host(uname)
        # Use new config function to check dyndns
        dyndnshosts = config_mod.get_dyndnshosts(cfg)
        host.dyn = uname in dyndnshosts
        # Apply user-access settings from config
        access = config_mod.get_host_access(cfg, uname)
        host.apply_access(access["owner"], access["managers"], access["monitors"])
        if verbose:
            print(("XX: New host, num now %s" % (len(hbdcls.Host.hosts))))
        newh = True
    else:
        host = hbdcls.Host.hosts[uname]
        newh = False

    cid = msg.get("id", 0)
    try:
        rtt = float(msg.get("rtt"))
    except (TypeError, ValueError):
        # Missing (None) or non-numeric rtt: treat as "no measurement".
        rtt = None

    if msg.get("ID") == "HTB":
        host.doesack = msg.get("acks", -1)
        # send ACK back
        rmsg = {"time": time.time()}
        opkt = dicttos("ACK", rmsg)
        try:
            transport.sendto(opkt, addr)
        except Exception as e:
            if DEBUG > 0:
                print(("cannot send ack: %s" % e))

    elif msg.get("ID") == "PLG":
        # Handle plugin data message
        plugin_name = msg.get("plugin")
        if plugin_name:
            # Extract plugin fields, dropping protocol metadata fields
            plugin_data = {k: v for k, v in msg.items()
                           if k not in ("ID", "plugin", "id", "name")}
            # Store plugin data with timestamp
            host.add_plugin_data(plugin_name, plugin_data, timestamp=now)
            if DEBUG > 1:
                print(f"Stored plugin data for {uname}: {plugin_name}")

            # Check thresholds if checker is available
            threshold_checker = ctx.get("threshold_checker")
            if threshold_checker:
                try:
                    state_changes = threshold_checker.check_plugin_data(
                        host_name=uname,
                        plugin_name=plugin_name,
                        data=plugin_data,
                        alert_states=host.alert_states,
                    )
                    if DEBUG > 1 and state_changes:
                        print(f"Threshold state changes for {uname}: {state_changes}")
                except Exception as e:
                    logger.error(f"Error checking thresholds for {uname}.{plugin_name}: {e}")

            # Notify websockets of plugin update (best effort)
            if msg_to_websockets:
                try:
                    msg_to_websockets("plugin", {
                        "host": uname,
                        "plugin": plugin_name,
                        "data": plugin_data,
                        "timestamp": now
                    })
                except Exception:
                    pass

    try:
        conn, res = host.conndata(cid, ip, rtt, now)
    except Exception as e:
        if DEBUG > 0:
            print("conndata failed: %s" % e)
        return

    if res:
        eventlog(uname, "WARNING", res)
        asyncio.create_task(notify_mod.send_notification(
            uname,
            notify_mod.Notification(title=f"[WARNING] {uname}", body=res, level="WARNING"),
        ))

    try:
        interval = int(msg.get("interval", 0) or 0)
    except (TypeError, ValueError):
        # A malformed interval must not abort processing of the heartbeat.
        interval = 0
    shutdown = msg.get("shutdown", 0)
    service = msg.get("service", "unknown")
    message = msg.get("msg", None)
    boot = msg.get("boot", 0)

    if boot:
        eventlog(uname, "INFO", "booted")
        asyncio.create_task(notify_mod.send_notification(
            uname,
            notify_mod.Notification(title=f"[INFO] {uname}", body=f"{host.name} booted", level="INFO"),
        ))
    if message:
        eventlog(uname, "INFO", "msg: %s" % message, service=service)

    if conn.getstate() != hbdcls.Connection.UP:
        lasts = conn.state
        d = conn.newstate(hbdcls.Connection.UP, now)
        # Clear connectivity alert now that the host is back up
        _set_connectivity_alert(host, conn.afam, "OK")
        # Don't log/notify RECOVER for a brand-new host seen for the first time —
        # it was never down, it just hasn't been seen before.
        if not newh:
            if d == 0 or lasts == "unknown":
                m = "%s is up" % (conn.afam)
            else:
                m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d))
            eventlog(uname, "RECOVER", m)
            asyncio.create_task(notify_mod.send_notification(
                uname,
                notify_mod.Notification(title=f"[RECOVER] {uname}", body=m, level="RECOVER"),
            ))

    if boot or newh:
        host.upcount = host.doesack
    else:
        host.upcount += 1

    if shutdown:
        m = "%s shutdown" % conn.afam
        eventlog(uname, "INFO", m)
        asyncio.create_task(notify_mod.send_notification(
            uname,
            notify_mod.Notification(title=f"[INFO] {uname}", body=m, level="INFO"),
        ))
        conn.newstate(hbdcls.Connection.DOWN, now)
        _set_connectivity_alert(host, conn.afam, "CRITICAL")

    if interval > 0:
        host.interval = interval

    # Timer-based reachability monitoring
    # Reset overdue timer on every heartbeat
    if interval > 0 and conn.getstate() != hbdcls.Connection.DOWN:
        grace = cfg.get("grace", 2)
        timeout_seconds = interval + grace
        on_overdue, _ = _make_timer_callbacks(uname, host, ctx)
        conn.reset_overdue_timer(timeout_seconds, on_overdue)

    # Check RTT thresholds using the threshold checker
    threshold_checker = ctx.get("threshold_checker")
    if threshold_checker and rtt and rtt > 0:
        # Metric path for RTT is simply "rtt"
        metric_path = "rtt"

        # Check against configured thresholds (handles alerts, notifications, etc.)
        threshold_checker.check_value(
            host_name=uname,
            metric_path=metric_path,
            value=rtt,
            alert_states=host.alert_states
        )

    # send any commands we have queued
    while len(host.cmds):
        op, rmsg = host.cmds[0]
        # Always dequeue the entry. Previously only CMD/UPD were removed, so
        # any other op left the queue non-empty and this loop never ended.
        del host.cmds[0]
        if op == "CMD":
            eventlog(uname, "INFO", "command sent")
        elif op == "UPD":
            eventlog(uname, "INFO", "update initiated")
        else:
            logger.warning("Dropping queued entry with unknown op %r for %s", op, uname)
            continue
        opkt = dicttos(op, rmsg)
        try:
            transport.sendto(opkt, addr)
        except Exception as e:
            if DEBUG > 0:
                print(("cannot send cmd/update: %s" % e))

    if msg_to_websockets:
        try:
            msg_to_websockets("host", host.stateinfo())
        except Exception as e:
            if DEBUG > 0:
                print(("cannot send websocket message: %s" % e))
|
||||||
@@ -0,0 +1,242 @@
|
|||||||
|
"""User management: loading, authentication, and session tracking.
|
||||||
|
|
||||||
|
Users are defined in the config file under the ``users`` key:
|
||||||
|
|
||||||
|
users:
|
||||||
|
alice:
|
||||||
|
full_name: Alice Smith
|
||||||
|
avatar: /path/to/avatar.png # file path, URL, or base64 data URI
|
||||||
|
password: pbkdf2:sha256:... # generated with: hbd passwd
|
||||||
|
admin: true # optional server-level admin
|
||||||
|
notification_channels: [pushover_standard]
|
||||||
|
|
||||||
|
Roles are assigned per-host:
|
||||||
|
|
||||||
|
hosts:
|
||||||
|
webserver01:
|
||||||
|
owner: alice
|
||||||
|
managers: [bob]
|
||||||
|
monitors: [carol]
|
||||||
|
|
||||||
|
If no users are defined the server runs in unauthenticated mode (backwards
|
||||||
|
compatible). When users are defined every API call must carry a valid session
|
||||||
|
token in an ``Authorization: Bearer <token>`` or ``X-Auth-Token`` header,
|
||||||
|
obtained via ``POST /api/0/auth/login``.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import hmac
|
||||||
|
import logging
|
||||||
|
import secrets
|
||||||
|
import time
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Session lifetime in seconds (24 hours).
|
||||||
|
SESSION_TTL = 86400
|
||||||
|
|
||||||
|
# Global session store: token -> {"username": str, "expires": float, "created": float}
|
||||||
|
_sessions: dict = {}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# User class
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class User:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
username: str,
|
||||||
|
full_name: str = "",
|
||||||
|
avatar: str = "",
|
||||||
|
password_hash: str = "",
|
||||||
|
admin: bool = False,
|
||||||
|
notification_channels: list | None = None,
|
||||||
|
):
|
||||||
|
self.username = username
|
||||||
|
self.full_name = full_name
|
||||||
|
self.avatar = avatar
|
||||||
|
self.password_hash = password_hash
|
||||||
|
self.admin = admin
|
||||||
|
self.notification_channels: list = notification_channels or []
|
||||||
|
|
||||||
|
def check_password(self, password: str) -> bool:
|
||||||
|
if not self.password_hash:
|
||||||
|
return False
|
||||||
|
return _verify_password(password, self.password_hash)
|
||||||
|
|
||||||
|
def avatar_is_local(self) -> bool:
|
||||||
|
"""Return True when the avatar is a local filesystem path (starts with '/')."""
|
||||||
|
return bool(self.avatar and self.avatar.startswith("/"))
|
||||||
|
|
||||||
|
def avatar_url(self) -> str:
|
||||||
|
"""Return the URL to use as an <img src>.
|
||||||
|
|
||||||
|
Local file paths are served via the /api/0/users/{username}/avatar
|
||||||
|
endpoint. External URLs and data URIs are returned as-is.
|
||||||
|
"""
|
||||||
|
if self.avatar_is_local():
|
||||||
|
return f"/api/0/users/{self.username}/avatar"
|
||||||
|
return self.avatar
|
||||||
|
|
||||||
|
def to_dict(self) -> dict:
|
||||||
|
return {
|
||||||
|
"username": self.username,
|
||||||
|
"full_name": self.full_name,
|
||||||
|
"avatar": self.avatar,
|
||||||
|
"avatar_url": self.avatar_url(),
|
||||||
|
"admin": self.admin,
|
||||||
|
"notification_channels": self.notification_channels,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Password hashing (PBKDF2-HMAC-SHA256, stdlib only)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def hash_password(password: str) -> str:
    """Hash *password* with PBKDF2-HMAC-SHA256 and a fresh random salt.

    The result has the form ``pbkdf2:sha256:<iterations>:<salt>:<hex-digest>``
    and is what goes into the ``password`` field of a user in the config.

    Generate one from Python::

        python -c "from hbd.server.users import hash_password; print(hash_password('secret'))"

    or with the CLI::

        hbd passwd
    """
    iterations = 260_000
    salt = secrets.token_hex(16)
    digest = hashlib.pbkdf2_hmac(
        "sha256", password.encode("utf-8"), salt.encode("utf-8"), iterations
    ).hex()
    return ":".join(("pbkdf2", "sha256", str(iterations), salt, digest))
|
||||||
|
|
||||||
|
|
||||||
|
def _verify_password(password: str, stored_hash: str) -> bool:
|
||||||
|
"""Return True if *password* matches *stored_hash*."""
|
||||||
|
try:
|
||||||
|
parts = stored_hash.split(":")
|
||||||
|
if len(parts) != 5 or parts[0] != "pbkdf2" or parts[1] != "sha256":
|
||||||
|
return False
|
||||||
|
_, _, iterations_str, salt, expected_hex = parts
|
||||||
|
iterations = int(iterations_str)
|
||||||
|
dk = hashlib.pbkdf2_hmac(
|
||||||
|
"sha256", password.encode("utf-8"), salt.encode("utf-8"), iterations
|
||||||
|
)
|
||||||
|
return hmac.compare_digest(dk.hex(), expected_hex)
|
||||||
|
except Exception:
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Global user registry
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# username -> User
|
||||||
|
users: dict = {}
|
||||||
|
|
||||||
|
|
||||||
|
def load_users(config: dict) -> dict:
    """Rebuild the global user registry from the ``users`` section of *config*.

    A ``users`` value that is not a mapping empties the registry. Entries
    whose value is not a mapping are skipped with a warning.

    Returns the new ``users`` dict.
    """
    global users

    section = config.get("users", {})
    if not isinstance(section, dict):
        users = {}
        return users

    loaded: dict = {}
    for name, attrs in section.items():
        if isinstance(attrs, dict):
            loaded[name] = User(
                username=name,
                full_name=attrs.get("full_name", ""),
                avatar=attrs.get("avatar", ""),
                password_hash=attrs.get("password", ""),
                admin=bool(attrs.get("admin", False)),
                notification_channels=attrs.get("notification_channels", []),
            )
        else:
            logger.warning("Skipping user %r: expected a mapping", name)

    # Swap in the complete registry in one assignment.
    users = loaded
    logger.info("Loaded %d user(s) from config", len(users))
    return users
|
||||||
|
|
||||||
|
|
||||||
|
def users_enabled() -> bool:
    """Tell whether the user registry is non-empty (auth-required mode)."""
    return True if users else False
|
||||||
|
|
||||||
|
|
||||||
|
def get_user(username: str) -> "User | None":
    """Look up *username* in the registry; None when it is not present."""
    found = users.get(username)
    return found
|
||||||
|
|
||||||
|
|
||||||
|
def authenticate(username: str, password: str) -> "User | None":
    """Return the matching User when *password* is correct, otherwise None."""
    candidate = users.get(username)
    if not candidate:
        return None
    return candidate if candidate.check_password(password) else None
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Session management
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def create_session(username: str) -> str:
    """Register a new session for *username* and return its opaque token.

    Expired sessions are purged first. The session expires SESSION_TTL
    seconds from now.
    """
    _purge_expired_sessions()
    token = secrets.token_hex(32)
    _sessions[token] = dict(
        username=username,
        expires=time.time() + SESSION_TTL,
        created=time.time(),
    )
    return token
|
||||||
|
|
||||||
|
|
||||||
|
def get_session_user(token: str) -> "User | None":
    """Resolve *token* to its User.

    Returns None for an empty or unknown token, for an expired session
    (which is removed as a side effect), or when the session's user is no
    longer in the registry.
    """
    if not token:
        return None
    entry = _sessions.get(token)
    if not entry:
        return None
    if entry["expires"] >= time.time():
        return get_user(entry["username"])
    # Expired: forget the session.
    del _sessions[token]
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def delete_session(token: str) -> None:
    """Drop the session for *token* (logout); unknown tokens are ignored."""
    if token in _sessions:
        del _sessions[token]
|
||||||
|
|
||||||
|
|
||||||
|
def _purge_expired_sessions() -> None:
    """Remove every session whose expiry time lies in the past."""
    cutoff = time.time()
    # Collect first, delete afterwards, so the dict is not changed mid-scan.
    stale = tuple(
        token for token in list(_sessions) if _sessions[token]["expires"] < cutoff
    )
    for token in stale:
        del _sessions[token]
|
||||||
|
|
||||||
|
|
||||||
|
def save_sessions() -> dict:
    """Purge expired sessions and return a shallow copy of the rest.

    The copy is intended for pickling.
    """
    _purge_expired_sessions()
    return _sessions.copy()
|
||||||
|
|
||||||
|
|
||||||
|
def load_sessions(snapshot: dict) -> None:
    """Restore sessions from a pickled snapshot, dropping any that have expired.

    *snapshot* maps token -> session dict as produced by save_sessions().
    A snapshot that is not a dict (for example None, when no sessions were
    saved) is treated as empty, and entries that are not dicts or lack a
    ``username`` are skipped, so a damaged or older pickle cannot break
    startup or later session lookups.
    """
    global _sessions
    now = time.time()
    if not isinstance(snapshot, dict):
        snapshot = {}
    _sessions = {
        t: s
        for t, s in snapshot.items()
        if isinstance(s, dict) and "username" in s and s.get("expires", 0) > now
    }
    logger.debug("Restored %d session(s) from pickle", len(_sessions))
|
||||||
@@ -0,0 +1,111 @@
|
|||||||
|
"""WebSocket handler and broadcast helpers for hbd.
|
||||||
|
|
||||||
|
WebSocket connections are served through the regular HTTP port via the
|
||||||
|
/ws route registered in http.py (aiohttp WebSocketResponse upgrade).
|
||||||
|
The separate standalone WebSocket server on ws_port is no longer used.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from typing import Callable, Iterable, Optional
|
||||||
|
from . import data
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_connections: set = set()
|
||||||
|
_loop: Optional[asyncio.AbstractEventLoop] = None
|
||||||
|
_get_hosts: Optional[Callable[[], Iterable]] = None
|
||||||
|
_verbose: bool = False
|
||||||
|
|
||||||
|
|
||||||
|
def setup(
|
||||||
|
loop: asyncio.AbstractEventLoop,
|
||||||
|
get_hosts: Optional[Callable[[], Iterable]] = None,
|
||||||
|
verbose: bool = False,
|
||||||
|
):
|
||||||
|
"""Register the running loop and initial-state callback.
|
||||||
|
|
||||||
|
Call this once from _run_async before starting the HTTP server.
|
||||||
|
"""
|
||||||
|
global _loop, _get_hosts, _verbose
|
||||||
|
_loop = loop
|
||||||
|
_get_hosts = get_hosts
|
||||||
|
_verbose = verbose
|
||||||
|
|
||||||
|
|
||||||
|
async def handler(request):
    """aiohttp WebSocket upgrade handler — register as GET /ws.

    Upgrades the request, registers the socket in ``_connections``, sends
    the current hosts (from the ``_get_hosts`` callback) and the recent
    messages (``data.msgs``), then reads frames until the peer closes or an
    error frame arrives. Incoming text frames are only logged (when
    verbose). The socket is unregistered on exit.

    Returns the WebSocketResponse.
    """
    # Imported once here rather than on every received frame.
    from aiohttp import web, WSMsgType

    ws = web.WebSocketResponse()
    await ws.prepare(request)

    _connections.add(ws)
    remote = request.remote
    logger.info("WebSocket connected from %s", remote)

    try:
        # Send current host state to the new client
        if _get_hosts:
            try:
                for h in list(_get_hosts()):
                    await ws.send_str(json.dumps({"type": "host", "data": h}))
            except Exception as e:
                logger.error("Error sending initial hosts: %s", e)

        # Send recent messages
        if data.msgs:
            try:
                # Iterate over a snapshot: each await yields to the event
                # loop, where other code may append to data.msgs while this
                # loop is still running.
                for m in list(data.msgs):
                    await ws.send_str(json.dumps({"type": "message", "data": m}))
            except Exception as e:
                logger.error("Error sending initial messages: %s", e)

        # Keep connection open, ignore incoming frames
        async for msg in ws:
            if msg.type == WSMsgType.TEXT:
                if _verbose:
                    logger.debug("ws recv from %s: %s", remote, msg.data)
            elif msg.type in (WSMsgType.ERROR, WSMsgType.CLOSE):
                break

    except Exception as e:
        logger.exception("WebSocket handler error from %s: %s", remote, e)
    finally:
        _connections.discard(ws)
        logger.info("WebSocket disconnected from %s", remote)

    return ws
|
||||||
|
|
||||||
|
|
||||||
|
def broadcast(typ: str, payload) -> bool:
    """Queue a ``{"type": typ, "data": payload}`` message for every client.

    The message is serialised once and the sends are scheduled on the
    registered loop with asyncio.run_coroutine_threadsafe(), so the call
    does not wait for delivery. Sockets that are closed or fail to send are
    dropped from the connection set.

    Returns False when no loop has been registered, True otherwise.
    """
    if not _loop:
        return False

    jmsg = json.dumps({"type": typ, "data": payload})

    async def _send_all():
        dead = set()
        for ws in list(_connections):
            try:
                alive = not ws.closed
                if alive:
                    await ws.send_str(jmsg)
            except Exception:
                alive = False
            if not alive:
                dead.add(ws)
        _connections.difference_update(dead)

    asyncio.run_coroutine_threadsafe(_send_all(), _loop)
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def connection_count() -> int:
    """Return how many WebSocket connections are currently registered."""
    registered = len(_connections)
    return registered
|
||||||
Binary file not shown.
|
Before Width: | Height: | Size: 5.3 KiB |
@@ -1,7 +0,0 @@
|
|||||||
<head>
|
|
||||||
<meta http-equiv="content-type" content="text/html; charset=utf-8" />
|
|
||||||
<link rel="stylesheet" href="/static/style.css" type="text/css" />
|
|
||||||
<link rel="icon" href="/static/images/favicon.ico" sizes="32x32" />
|
|
||||||
<title>{{ title }}</title>
|
|
||||||
<script src="{{ extra_scripts }}"></script>
|
|
||||||
</head>
|
|
||||||
@@ -1,281 +0,0 @@
|
|||||||
<!DOCTYPE html>
|
|
||||||
<html>
|
|
||||||
{% include 'head.html' %}
|
|
||||||
|
|
||||||
<style>
|
|
||||||
.content {
|
|
||||||
display: flex;
|
|
||||||
flex-direction: column;
|
|
||||||
}
|
|
||||||
|
|
||||||
.table {
  /* flex: 1; */
  /* "none" is not a valid flex-grow value (a number is required), so the
     declaration was ignored; 0 states the intent (do not grow) explicitly. */
  flex-grow: 0;
}
|
|
||||||
|
|
||||||
.log {
|
|
||||||
flex: 2;
|
|
||||||
flex-grow: 1;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
#ntable {
|
|
||||||
border-collapse: collapse;
|
|
||||||
font-size: 95%;
|
|
||||||
/* width: 100%; */
|
|
||||||
}
|
|
||||||
|
|
||||||
#ntable td,
|
|
||||||
#ntable th {
|
|
||||||
border: 1px solid #ddd;
|
|
||||||
text-align: left;
|
|
||||||
padding: 0px;
|
|
||||||
}
|
|
||||||
|
|
||||||
#ntable tr:nth-child(even) {
|
|
||||||
background-color: #f2f2f2;
|
|
||||||
}
|
|
||||||
|
|
||||||
#ntable tr:hover {
|
|
||||||
background-color: #ddd;
|
|
||||||
}
|
|
||||||
|
|
||||||
#ntable th {
|
|
||||||
padding-top: 12px;
|
|
||||||
padding-bottom: 12px;
|
|
||||||
background-color: #9d9d9d;
|
|
||||||
color: white;
|
|
||||||
}
|
|
||||||
|
|
||||||
#ntable
|
|
||||||
th:not(.sorttable_sorted):not(.sorttable_sorted_reverse):not(.sorttable_nosort):after {
|
|
||||||
content: " \2195";
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Modal for connection status messages */
|
|
||||||
.connection-modal {
|
|
||||||
display: none;
|
|
||||||
position: fixed;
|
|
||||||
z-index: 1000;
|
|
||||||
left: 0;
|
|
||||||
top: 0;
|
|
||||||
width: 100%;
|
|
||||||
height: 100%;
|
|
||||||
background-color: rgba(0, 0, 0, 0.4);
|
|
||||||
}
|
|
||||||
|
|
||||||
.connection-modal.show {
|
|
||||||
display: flex;
|
|
||||||
justify-content: center;
|
|
||||||
align-items: center;
|
|
||||||
}
|
|
||||||
|
|
||||||
.connection-modal-content {
|
|
||||||
background-color: #f9f9f9;
|
|
||||||
padding: 20px;
|
|
||||||
border: 1px solid #888;
|
|
||||||
border-radius: 5px;
|
|
||||||
text-align: center;
|
|
||||||
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
|
|
||||||
min-width: 300px;
|
|
||||||
}
|
|
||||||
|
|
||||||
.connection-modal-content p {
|
|
||||||
margin: 10px 0;
|
|
||||||
font-size: 16px;
|
|
||||||
color: #333;
|
|
||||||
}
|
|
||||||
</style>
|
|
||||||
<script type="text/javascript">
|
|
||||||
// Page-level state shared by the script functions below.
var cnt = 0,            // incremented once per websocket message handled
    nTable = document,  // replaced by the #ntable element when setup() runs
    name_idx = {},      // host name -> table row
    c = 0;
|
|
||||||
|
|
||||||
/**
 * Rebuild the name -> row index (`name_idx`) from the current rows of #ntable.
 *
 * The header row (index 0) is skipped; every other row is keyed by the text
 * of its first cell.
 */
function setup() {
  name_idx = {};
  nTable = document.getElementById("ntable");
  // Start at 1: row 0 is the header.
  for (var i = 1; i < nTable.rows.length; i++) {
    var row = nTable.rows[i];
    // Keep the key in a local; the previous code assigned to an undeclared
    // `name`, which in a browser is the global window.name property.
    var rowName = row.cells[0].innerText;
    name_idx[rowName] = row;
    /* console.log("name_Id[" + rowName + "]: " + name_idx[rowName].innerText); */
  }
}
|
|
||||||
|
|
||||||
/**
 * Create a table row for one host and append it to #ntablebody.
 *
 * Cell layout: 0 name, 1 version, then for the first connection
 * 2 addr / 3 state / 4 latency / 5 last-state, and the same four columns
 * (6-9) for the second connection. Only name, version, addr and state are
 * filled in here; latency and last-state are left empty.
 *
 * @param {Object} data host record; reads `name`, `dyn`, `cver` and
 *                      `connections[i].addr` / `connections[i].state`.
 */
function createRow(data) {
  var row = document.createElement("tr");
  var cells = [];
  for (var i = 0; i < 10; i++) {
    var td = document.createElement("td");
    // Latency (4, 8) and last-state (5, 9) columns are right-aligned.
    if (i === 4 || i === 5 || i === 8 || i === 9) {
      td.style.textAlign = "right";
    }
    row.appendChild(td);
    cells.push(td);
  }

  // Values arrive over the websocket; write them as text so they are never
  // parsed as HTML.
  if (data.dyn) {
    var bold = document.createElement("b");
    bold.textContent = data.name;
    cells[0].appendChild(bold);
  } else {
    cells[0].textContent = data.name;
  }
  cells[1].textContent = data.cver;

  // Tolerate a host record without connections instead of throwing.
  var conns = data.connections || [];
  for (var k = 0; k < conns.length && k < 2; k++) {
    cells[2 + k * 4].textContent = conns[k].addr;
    cells[3 + k * 4].textContent = conns[k].state;
  }

  var table = document.getElementById("ntablebody"); // find table to append to
  table.appendChild(row); // append row to table
  // Index by host name; the previous code used the <td> element as the key.
  name_idx[data.name] = row;
}
|
|
||||||
|
|
||||||
/**
 * Format a timestamp given in seconds as a "de-DE" locale date/time string.
 *
 * @param {number} ts seconds; multiplied by 1000 to get the Date milliseconds.
 * @returns {string}
 */
function formatTS(ts) {
  return new Date(ts * 1000).toLocaleString("de-DE");
}
|
|
||||||
|
|
||||||
/**
 * Refresh the table row of one host from a host record, creating the row
 * first when the host is not yet in `name_idx`.
 *
 * For connection i the cells are: 2+4i addr, 3+4i state, 4+4i latency,
 * 5+4i last state change.
 *
 * @param {Object} data host record; reads `name` and, per connection,
 *                      `addr`, `state`, `statetime` and `rtts`.
 */
function update_table(data) {
  if (!(data.name in name_idx)) {
    createRow(data);
    setup();
  }

  var row = name_idx[data.name];
  if (!row) {
    // The index is rebuilt from cell text; if that does not match data.name
    // there is no row to update, so skip instead of throwing.
    return;
  }

  for (var i = 0; i < data.connections.length; i++) {
    var conn = data.connections[i];
    var base = i * 4;
    // Declared locally; these used to leak as implicit globals.
    var state;
    var latency;

    row.cells[2 + base].innerHTML = conn.addr;
    row.cells[5 + base].innerHTML = formatTS(conn.statetime);

    if (conn.state == "up") {
      state = "up";
      // NOTE(review): rtts[0] is the first entry of the list; confirm whether
      // the most recent sample (last entry) was intended.
      latency = Number.parseFloat(conn.rtts[0]).toFixed(2);
    } else if (conn.state == "unknown") {
      // Unknown connections are shown as a blank group of cells.
      state = "";
      latency = "";
      row.cells[2 + base].innerHTML = "";
      row.cells[5 + base].innerHTML = "";
    } else {
      state = "<b>" + conn.state + "</b>";
      latency = "-";
    }
    row.cells[3 + base].innerHTML = state;
    row.cells[4 + base].innerHTML = latency;
  }
}
|
|
||||||
|
|
||||||
/**
 * Open the heartbeat websocket and wire up its handlers.
 *
 * "host" messages update the table, "message" messages are prepended to the
 * #messages element. When the socket closes, the connection modal is shown
 * and a reconnect is scheduled after 3 seconds.
 */
function WS_Connect() {
  if (!("WebSocket" in window)) {
    // The browser doesn't support WebSocket
    console.log("WebSocket NOT supported by your Browser!");
    return;
  }

  // Show or hide the connection modal, if the element exists.
  function setModalShown(shown) {
    var overlay = document.getElementById("connectionModal");
    if (!overlay) {
      return;
    }
    if (shown) {
      overlay.classList.add("show");
    } else {
      overlay.classList.remove("show");
    }
  }

  //N.B: subprotocol field causes chrome to error 1006
  var ws_hbd = new WebSocket("{{heartbeat_ws_url}}" /*, "hdb" */);

  ws_hbd.onopen = function () {
    // Web Socket is connected, send data using send()
    console.log("ws connect");
    setModalShown(false);
    ws_hbd.send("heartbeat_web");
  };

  ws_hbd.onerror = function (event) {
    console.log(event);
  };

  ws_hbd.onmessage = function (event) {
    var state = JSON.parse(event.data);
    if (state.type == "host") {
      update_table(state.data);
    } else if (state.type == "message") {
      document
        .getElementById("messages")
        .insertAdjacentHTML("afterbegin", state.data + "<br>");
    }
    cnt++;
  };

  ws_hbd.onclose = function (event) {
    console.log("Connection is closed, reopening");
    setModalShown(true);
    // Retry after three seconds.
    setTimeout(WS_Connect, 3000);
  };
}
|
|
||||||
WS_Connect();
|
|
||||||
</script>
|
|
||||||
<body>
|
|
||||||
{% include 'menu.html' %}
|
|
||||||
|
|
||||||
<div id="content" class="content" style="overflow: hidden">
|
|
||||||
<div id="table" class="table" style="overflow: hidden">
|
|
||||||
<!-- <h2>{{title}}</h2> -->
|
|
||||||
<table id="ntable" class="sortable">
|
|
||||||
<thead>
|
|
||||||
<tr>
|
|
||||||
<th>Name</th>
|
|
||||||
<th>Ver</th>
|
|
||||||
<th>IPv4 Addr</th>
|
|
||||||
<th>State</th>
|
|
||||||
<th style="text-align: right">Latency</th>
|
|
||||||
<th style="text-align: right">Last State</th>
|
|
||||||
<th>IPv6 Addr</th>
|
|
||||||
<th>State</th>
|
|
||||||
<th style="text-align: right">Latency</th>
|
|
||||||
<th style="text-align: right">Last State</th>
|
|
||||||
</tr>
|
|
||||||
</thead>
|
|
||||||
<tbody id="ntablebody"></tbody>
|
|
||||||
</table>
|
|
||||||
</div>
|
|
||||||
<div id="log" class="log" style="overflow: auto;">
|
|
||||||
<h2>Log of Events</h2>
|
|
||||||
<div id="messages">
|
|
||||||
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
{% include 'foot.html' %}
|
|
||||||
|
|
||||||
<!-- Connection status modal -->
|
|
||||||
<div id="connectionModal" class="connection-modal">
|
|
||||||
<div class="connection-modal-content">
|
|
||||||
<p>⚠️ Connection is closed, reopening...</p>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<script>
|
|
||||||
setup();
|
|
||||||
</script>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
@@ -1,3 +0,0 @@
|
|||||||
<label for="drawer-toggle" id="drawer-toggle-label"></label>
|
|
||||||
<header>{{ header }}</header>
|
|
||||||
|
|
||||||
-233
@@ -1,233 +0,0 @@
|
|||||||
"""UDP listener and datagram processing."""
|
|
||||||
import asyncio
|
|
||||||
import zlib
|
|
||||||
import logging
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
from .proto import stodict, oldmtodict
|
|
||||||
from hbd.utils import dur
|
|
||||||
|
|
||||||
|
|
||||||
class EchoServerProtocol(asyncio.DatagramProtocol):
    """Datagram protocol that parses each packet and passes it to a handler.

    ``handler`` is an optional callable invoked as
    ``handler(msg, addr, transport)`` for every received datagram.
    """

    def __init__(self, config=None, handler=None):
        super().__init__()
        # Fall back to an empty dict for any falsy config.
        self.config = config if config else {}
        self.handler = handler

    def connection_made(self, transport):
        """Keep the transport so handlers can send replies through it."""
        self.transport = transport
        logger.info("UDP Server listening...")

    def datagram_received(self, data, addr):
        """Parse ``data`` and dispatch it; any error is logged, never raised."""
        logger.debug("Received from %s", addr)
        try:
            parsed = parse_message(data)
            if not self.handler:
                return
            # The transport is passed along so the handler can answer
            # (ACKs/commands).
            self.handler(parsed, addr, self.transport)
        except Exception:
            logger.exception("Error while processing datagram from %s", addr)
|
|
||||||
|
|
||||||
|
|
||||||
def parse_message(data: bytes):
    """Decode a raw datagram into a message dict.

    ``stodict`` is tried first; when it yields a falsy result (e.g. an empty
    dict) the datagram is decoded with ``oldmtodict`` instead, for
    compatibility with older clients.
    """
    return stodict(data) or oldmtodict(data)
|
|
||||||
|
|
||||||
def dicttos(ID, d, compress=False):
    """Serialize ``d`` as ``key=value`` pairs joined by ``;`` behind an ID prefix.

    Floats are written with five decimals. Without compression the result is
    the ``str`` ``"<ID>:<pairs>"``; with ``compress`` it is ``bytes``:
    ``b"!<ID>:"`` followed by the zlib-compressed (level 6) pairs.
    """
    fields = [
        ("%s=%0.5f" if isinstance(value, float) else "%s=%s") % (key, value)
        for key, value in d.items()
    ]
    payload = ";".join(fields)
    if not compress:
        return ID + ":" + payload
    header = "!" + ID + ":"
    return header.encode() + zlib.compress(payload.encode(), 6)
|
|
||||||
|
|
||||||
def handle_datagram(msg: dict, addr, transport, ctx: dict):
    """Handle a parsed datagram message.

    Looks up (or creates) the sending host, records the connection data,
    emits log/email/push notifications for state changes, replies with an
    ACK, flushes any commands queued for the host and finally publishes the
    host state to the websockets.

    ctx is a dictionary with runtime dependencies:
      - config: dict of configuration
      - hbdclass: module providing Host/Connection classes
      - log: callable(loghost, message)
      - email: callable(subject, message)
      - pushmsg: callable(message)
      - msg_to_websockets: callable(typ, data)
      - DEBUG, verbose

    Returns None. Send failures are reported (when DEBUG > 0) but not raised.
    """
    if not msg:
        return

    import time  # function scope: this module has no top-level time import

    now = time.time()
    cfg = ctx.get("config", {})
    hbdcls = ctx.get("hbdclass")
    log = ctx.get("log")
    email = ctx.get("email")
    pushmsg = ctx.get("pushmsg")
    msg_to_websockets = ctx.get("msg_to_websockets")
    DEBUG = ctx.get("DEBUG", 0)
    verbose = ctx.get("verbose", False)

    # normalize addr (ip, port)
    ip = addr[0] if isinstance(addr, (list, tuple)) else addr
    name = msg.get("name", "unknown")
    from hbd.utils import shortname

    uname = shortname(name)
    watched = uname in cfg.get("watchhosts", [])

    if uname not in hbdcls.Host.hosts:
        host = hbdcls.Host(uname)
        host.dyn = uname in cfg.get("dyndnshosts", [])
        if verbose:
            print(("XX: New host, num now %s" % (len(hbdcls.Host.hosts))))
        newh = True
    else:
        host = hbdcls.Host.hosts[uname]
        newh = False

    cid = msg.get("id", 0)
    try:
        rtt = float(msg.get("rtt", None))
    except Exception:
        # missing or non-numeric rtt
        rtt = None

    if msg.get("ID") == "HTB":
        host.doesack = msg.get("acks", -1)
        host.setcver(msg.get("ver", 0))

    try:
        conn, res = host.conndata(cid, ip, rtt, now)
    except Exception as e:
        if DEBUG > 0:
            print("conndata failed: %s" % e)
        return

    if res:
        # conndata reported a change (e.g. a new address)
        if log:
            log(uname, res)
        if watched:
            if email:
                email("address change", "%s %s" % (host.name, res))
            if pushmsg:
                pushmsg("%s %s" % (host.name, res))

    interval = int(msg.get("interval", 0) or 0)
    shutdown = msg.get("shutdown", 0)
    service = msg.get("service", "unknown")
    message = msg.get("msg", None)
    boot = msg.get("boot", 0)

    if boot:
        if log:
            log(uname, "booted")
        if watched:
            m = "%s booted" % (host.name)
            if email:
                email("booted", m)
            if pushmsg:
                pushmsg(m)
    if message:
        if log:
            log(uname, "msg: %s" % message, service=service)
        if watched:
            if email:
                email("msg", message)
            if pushmsg:
                pushmsg(message)

    if conn.getstate() != hbdcls.Connection.UP:
        lasts = conn.state
        d = conn.newstate(hbdcls.Connection.UP, now)
        m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d))
        if log:
            log(uname, m)
        if watched:
            if email:
                email("%s back" % conn.afam, uname)
            if pushmsg:
                pushmsg("%s %s is back" % (uname, conn.afam))

    if boot or newh:
        host.upcount = host.doesack
    else:
        host.upcount += 1

    if shutdown:
        if log:
            log(uname, "%s shutdown" % conn.afam)
        if watched:
            if email:
                email("shutdown", "%s %s shutdown" % (uname, conn.afam))
            if pushmsg:
                pushmsg("%s %s shutdown" % (uname, conn.afam))
        conn.newstate(hbdcls.Connection.DOWN, now)

    if interval > 0:
        host.interval = interval

    # send ACK back
    rmsg = {"time": time.time()}
    if host.cver < 1:
        opkt = b"ACK"
    else:
        opkt = dicttos("ACK", rmsg, host.cver > 1)
        # dicttos returns str when not compressing; sendto needs bytes
        if isinstance(opkt, str):
            opkt = opkt.encode()
    try:
        transport.sendto(opkt, addr)
    except Exception as e:
        if DEBUG > 0:
            print(("cannot send ack: %s" % e))

    # send any commands we have queued
    while len(host.cmds):
        op, rmsg = host.cmds[0]
        if op == "CMD":
            if email:
                email("%s cmd exec" % uname, "command '%s' sent" % rmsg)
            del host.cmds[0]
            if log:
                log(uname, "command sent")
            if host.cver < 1:
                rmsg = rmsg["cmd"]
        elif op == "UPD":
            del host.cmds[0]
            if log:
                log(uname, "update initiated")
            if host.cver < 1:
                if log:
                    log(uname, " ver 0 does not support UPD")
                continue
        else:
            # Any other op is still sent below, but it must be dequeued too;
            # otherwise this loop would never terminate.
            del host.cmds[0]
        if host.cver < 1:
            opkt = rmsg if isinstance(rmsg, (bytes, str)) else str(rmsg)
            if isinstance(opkt, str):
                opkt = opkt.encode()
        else:
            opkt = dicttos(op, rmsg, True)
        try:
            transport.sendto(opkt, addr)
        except Exception as e:
            if DEBUG > 0:
                print(("cannot send cmd/update: %s" % e))

    if msg_to_websockets:
        try:
            msg_to_websockets("host", host.stateinfo())
        except Exception:
            # Best effort: a websocket problem must not break UDP handling,
            # but leave a trace for debugging.
            logger.debug("msg_to_websockets failed", exc_info=True)
|
|
||||||
|
|
||||||
|
|
||||||
@@ -1,143 +0,0 @@
|
|||||||
"""WebSocket server and broadcast helpers for hbd.
|
|
||||||
|
|
||||||
Provides an asyncio-based WebSocket server and a thread-safe broadcast
|
|
||||||
function that other threads or synchronous code can call.
|
|
||||||
"""
|
|
||||||
import asyncio
|
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
from typing import Callable, Iterable, Optional
|
|
||||||
|
|
||||||
import websockets
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
_connections = set()
|
|
||||||
_loop: Optional[asyncio.AbstractEventLoop] = None
|
|
||||||
_get_hosts: Optional[Callable[[], Iterable]] = None
|
|
||||||
_get_msgs: Optional[Callable[[], Iterable]] = None
|
|
||||||
_verbose = False
|
|
||||||
|
|
||||||
|
|
||||||
async def _handler(websocket, path=None):
    """Serve one websocket client until it disconnects.

    The connection is tracked in ``_connections`` for the duration of the
    call. On connect the client receives every host from ``_get_hosts`` and
    the last 100 entries from ``_get_msgs``; incoming frames are only logged
    (when verbose).
    """
    # Some versions of the websockets library call handler(connection) only;
    # accept optional path and fall back to websocket.path when missing.
    _connections.add(websocket)
    peer = getattr(websocket, "remote_address", None)
    path = getattr(websocket, "path", None) if path is None else path
    if _verbose:
        logger.info("DBG ws_serve: %s: %s", peer, path)

    try:
        # initial snapshot: hosts first, then recent messages
        if _get_hosts:
            for host_state in _get_hosts():
                await websocket.send(json.dumps({"type": "host", "data": host_state}))
        if _get_msgs:
            for recent in list(_get_msgs())[-100:]:
                await websocket.send(json.dumps({"type": "message", "data": recent}))

        # Stay in the receive loop until the client goes away. Nothing
        # meaningful is expected besides an initial client 'hello'.
        async for incoming in websocket:
            if _verbose:
                logger.debug("received ws data: %s", incoming)
    except (
        websockets.exceptions.ConnectionClosedOK,
        websockets.exceptions.ConnectionClosedError,
    ) as closed:
        if _verbose:
            logger.info("ws closed: %r", closed)
    except Exception as err:
        logger.exception("ws handler exception: %s", err)
    finally:
        # discard() ignores a connection that was already removed
        _connections.discard(websocket)
        await websocket.wait_closed()
|
|
||||||
|
|
||||||
|
|
||||||
async def start(host: str, ws_port: int, wss_port: Optional[int] = None, ssl_context=None, get_hosts: Optional[Callable] = None, get_msgs: Optional[Callable] = None, verbose: bool = False):
    """Start the WebSocket server(s) and block until cancelled.

    Must be awaited inside the running asyncio event loop. A plain server is
    always started on ``ws_port``; a second, TLS-enabled server is started on
    ``wss_port`` when both ``wss_port`` and ``ssl_context`` are given.
    ``get_hosts`` / ``get_msgs`` provide the initial data sent to new clients.

    On cancellation all tracked connections are closed and the
    ``CancelledError`` is re-raised.
    """
    global _loop, _get_hosts, _get_msgs, _verbose
    _loop = asyncio.get_running_loop()
    _get_hosts, _get_msgs, _verbose = get_hosts, get_msgs, verbose

    # plain WebSocket
    pending = [websockets.serve(_handler, host, ws_port)]  # , subprotocols=["hbd"])
    logging.getLogger("websockets.server").setLevel(logging.INFO)

    # secure WebSocket (optional)
    if wss_port and ssl_context:
        pending.append(
            websockets.serve(_handler, host, wss_port, ssl=ssl_context)
        )  # , subprotocols=["hbd"])

    try:
        # await starting of all servers
        for server in pending:
            await server

        if _verbose:
            logger.info("WebSocket server started on port %s (wss %s)", ws_port, wss_port)

        # block forever (until loop is stopped or cancelled)
        await asyncio.Future()
    except asyncio.CancelledError:
        logger.info("WebSocket server shutting down...")
        # Close all active connections; individual failures are ignored.
        for client in list(_connections):
            try:
                await client.close()
            except Exception:
                pass
        _connections.clear()
        raise
|
|
||||||
|
|
||||||
|
|
||||||
def broadcast(typ: str, data) -> bool:
    """Thread-safe broadcast helper.

    Serializes ``{"type": typ, "data": data}`` once and schedules a send on
    the server loop for every tracked connection that is in the OPEN state.
    Connections that are not open, or whose send could not be scheduled, are
    dropped from ``_connections``.

    Returns False if the server loop is not set, True otherwise.
    """
    if not _loop:
        return False

    jmsg = json.dumps({"type": typ, "data": data})
    stale = []
    for ws in list(_connections):
        if ws.state == websockets.protocol.State.OPEN:
            try:
                asyncio.run_coroutine_threadsafe(ws.send(jmsg), _loop)
            except Exception:
                stale.append(ws)
                logger.debug("ws.send exception: closed")
        else:
            stale.append(ws)

    for ws in stale:
        try:
            asyncio.run_coroutine_threadsafe(ws.wait_closed(), _loop)
        except Exception:
            pass
        _connections.discard(ws)
    return True
|
|
||||||
|
|
||||||
|
|
||||||
def connection_count() -> int:
    """Return the number of websocket connections currently in ``_connections``."""
    tracked = _connections
    return len(tracked)
|
|
||||||
-380
@@ -1,380 +0,0 @@
|
|||||||
"""
|
|
||||||
host and connection class shared between hbd and
|
|
||||||
the website's heartbeat.py
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
import time
|
|
||||||
import json
|
|
||||||
import copy
|
|
||||||
import queue
|
|
||||||
|
|
||||||
num = 0
|
|
||||||
|
|
||||||
MAXRTTS = 10
|
|
||||||
|
|
||||||
DEBUG = 2
|
|
||||||
|
|
||||||
|
|
||||||
def log(host, m):
    """Print a ``class log:`` line for ``host`` when the module DEBUG flag is truthy."""
    if not DEBUG:
        return
    print("class log: %s %s" % (host, m))
|
|
||||||
|
|
||||||
|
|
||||||
class Connection:
    """One connection (address family) of a host, with its state and RTT history."""

    # map of addrs to names
    htab = {}

    UNKNOWN = "unknown"
    UP = "up"
    DOWN = "down"
    OVERDUE = "overdue"

    def __init__(self, host, cid, addr, afam):
        """Create a connection in UNKNOWN state.

        A leading ``::ffff:`` prefix is stripped from ``addr``. When ``host``
        is truthy the address is registered in ``htab`` and, for dynamic-DNS
        hosts, queued on ``Host.dnsQ``.
        """
        self.host = host
        self.cid = cid
        if addr[0:7] == "::ffff:":
            addr = addr[7:]
        self.addr = addr
        self.afam = afam
        self.rtts = [0]
        self.lastbeat = time.time()
        self.statetime = self.lastbeat
        self.deltastatetime = "computed"
        self.state = Connection.UNKNOWN

        if host:
            Connection.htab[addr] = self.host.name
            if self.host.isDynDns():
                log(self.host.name, "dns update %s" % self.addr)
                Host.dnsQ.put((self.host.name, self.addr))

    def registerDns(self):
        """Queue ``(host name, address)`` on ``Host.dnsQ``."""
        Host.dnsQ.put((self.host.name, self.addr))

    def clearstate(self):
        """Return a state dict with every field set to the empty string."""
        return {
            "addr": "",
            "rtt": "",
            "lastbeat": "",
            "state": "",
            "statetime": "",
            "deltastatetime": "",
            "rttstate": "",
        }

    def statedict(self, Null=False):
        """Return the display fields of this connection as a dict.

        With ``Null`` true the cleared (all-empty) dict is returned. An
        UNKNOWN connection whose last beat is more than ten days old is also
        reported as cleared.
        """
        d = self.clearstate()
        now = time.time()
        if not Null:
            d["addr"] = self.addr
            if self.rtts[-1]:
                d["rtt"] = "%0.1f" % self.rtts[-1]
            elif self.state == Connection.UNKNOWN:
                d["rtt"] = ""
            else:
                d["rtt"] = "?"
            d["lastbeat"] = self.lastbeat
            if self.state == Connection.OVERDUE:
                d["state"] = "<b>%s</b>" % self.state
            else:
                d["state"] = self.state
            if self.state == Connection.UP:
                d["rttstate"] = d["rtt"]
            elif self.state == Connection.OVERDUE:
                d["rttstate"] = ""
            else:
                d["rttstate"] = d["state"]
            d["statetime"] = time.strftime(
                "%Y-%m-%d %H:%M:%S", time.localtime(self.statetime)
            )
            delta = now - self.statetime

            if self.state == Connection.UNKNOWN:
                d["deltastatetime"] = ""
            elif delta > 86400:
                d["deltastatetime"] = "%0.1f days" % (delta / 86400.0)
            elif delta > 3600:
                # Same text as strftime("%k:%M hrs") (space-padded hour), but
                # %k is a platform extension, so format the fields directly.
                parts = time.gmtime(delta)
                d["deltastatetime"] = "%2d:%02d hrs" % (parts.tm_hour, parts.tm_min)
            elif delta > 60:
                d["deltastatetime"] = time.strftime("%M:%S mins", time.gmtime(delta))
            else:
                d["deltastatetime"] = "%i secs" % (delta)
            if self.state == Connection.UNKNOWN and now - self.lastbeat > 86400 * 10:
                d = self.clearstate()

        return d

    def headerdict(self, afam):
        """Return the column titles matching the keys of ``statedict``."""
        return {
            "addr": "%s Addr" % afam,
            "rtt": "Latencey",
            "lastbeat": "Last Contact",
            "state": "State",
            "statetime": "Last State",
            "rttstate": "Reach",
            "deltastatetime": "Last State",
        }

    def jsons(self):
        """Return the connection attributes as a JSON string.

        The ``host`` back-pointer is an object and cannot be serialized, so it
        is replaced by the host's name in the dumped copy.
        """
        data = dict(self.__dict__)
        if hasattr(data.get("host"), "name"):
            data["host"] = data["host"].name
        return json.dumps(data)

    # set new state, return number of secs in previous state
    def newstate(self, state, now, when=0):
        """Switch to ``state`` at time ``now - when``; return seconds spent in the old state."""
        self.state = state
        delta = now - when
        s = delta - self.statetime
        self.statetime = delta
        return s

    def getstate(self):
        """Return the current state string."""
        return self.state

    def newaddr(self, addr, rtt, now):
        """Record a heartbeat and detect an address change.

        Appends ``rtt`` to the history (bounded by ``MAXRTTS``) and updates
        ``lastbeat``. Returns None when ``addr`` is unchanged, otherwise a
        ``"changed from A to B"`` string after re-registering the address.
        """
        self.lastbeat = now
        self.rtts.append(rtt)
        if len(self.rtts) > MAXRTTS:
            del self.rtts[0]

        if self.addr == addr:
            return None

        r = "changed from %s to %s" % (self.addr, addr)
        # The old address may already be gone from the table.
        Connection.htab.pop(self.addr, None)
        self.addr = addr
        Connection.htab[addr] = self.host.name
        if self.host.isDynDns():
            Host.dnsQ.put((self.host.name, self.addr))
        return r
|
|
||||||
|
|
||||||
|
|
||||||
#
|
|
||||||
class Host:
|
|
||||||
# Table of Hosts
|
|
||||||
hosts = {}
|
|
||||||
dnsQ = queue.Queue()
|
|
||||||
|
|
||||||
def __init__(self, name):
    """Create a host; a truthy ``name`` also registers it in ``Host.hosts``.

    The module-level counter ``num`` is incremented for named hosts and its
    current value is stored as ``self.num``.
    """
    global num
    self.name = name
    if name:
        num = num + 1
        Host.hosts[name] = self
    self.num = num
    self.dyn = self.watched = False
    self.upcount = self.interval = 0
    self.doesack = -1
    self.cmds = []
    self.cver = 0
    self.connections = {}
    self.hdwcounts = [[0, 0] for _ in range(3)]
|
|
||||||
|
|
||||||
def statedict(self):
    """Return the host's display fields, including per-family connection fields.

    The name gets a ``*`` suffix for dynamic hosts and is wrapped in ``<b>``
    for watched hosts. Connection fields are stored as ``"<family>.<field>"``;
    a missing family is filled from ``ubConnection.statedict(True)``.
    """
    label = self.name
    if self.dyn:
        label += "*"
    if self.watched:
        label = "<b>%s</b>" % label
    d = {
        "name": label,
        "dyn": str(self.dyn),
        "ver": str(self.cver),
        "num": self.num,
    }
    for family in ("IPv4", "IPv6"):
        if family in self.connections:
            fields = self.connections[family].statedict()
        else:
            fields = ubConnection.statedict(True)
        for key, value in fields.items():
            d["%s.%s" % (family, key)] = value
    return d
|
|
||||||
|
|
||||||
def headerdict(self):
    """Return the column titles for the keys produced by ``statedict``."""
    d = {"name": "Name", "dyn": "Dyn", "ver": "Ver", "num": "??"}
    for family in ("IPv4", "IPv6"):
        for key, title in ubConnection.headerdict(family).items():
            d["%s.%s" % (family, key)] = title
    return d
|
|
||||||
|
|
||||||
def registerDns(self):
    """Call ``registerDns()`` on every connection of this host."""
    for conn in self.connections.values():
        conn.registerDns()
|
|
||||||
|
|
||||||
def stateinfo(self):
    """Return the host attributes as a plain dict.

    ``connections`` becomes a list with one dict per connection in which the
    ``host`` back-pointer is replaced by the host's name. All other
    attributes are passed through unchanged.
    """
    info = {}
    for attr, value in self.__dict__.items():
        if attr != "connections":
            info[attr] = value
            continue
        conns = []
        for conn in self.connections.values():
            # A shallow copy is enough here: deep-copying the attribute dict
            # would also copy the whole Host (and all its connections) through
            # the back-pointer, only for it to be replaced by the name below.
            # Lists are copied so the snapshot does not alias live state.
            snapshot = {
                key: (list(val) if isinstance(val, list) else val)
                for key, val in conn.__dict__.items()
            }
            snapshot["host"] = conn.host.name
            conns.append(snapshot)
        info[attr] = conns
    return info
|
|
||||||
|
|
||||||
def jsons(self):
    """Return ``stateinfo()`` encoded as a JSON string."""
    state = self.stateinfo()
    return json.dumps(state)
|
|
||||||
|
|
||||||
def setcver(self, cver):
    """Store ``cver`` as this host's version value."""
    setattr(self, "cver", cver)
|
|
||||||
|
|
||||||
def isDynDns(self):
    """Return the host's ``dyn`` flag."""
    flag = self.dyn
    return flag
|
|
||||||
|
|
||||||
def isIPv4(self, addr):
    """Return True when the address text contains a ``.`` after its first character.

    ``addr`` may be a string or a tuple whose first element is the address.
    """
    text = addr[0] if isinstance(addr, tuple) else addr
    return text.find(".") > 0
|
|
||||||
|
|
||||||
def conndata(self, cid, addr, rtt, now):
    """Record a heartbeat on the connection matching the address family of ``addr``.

    A leading ``::ffff:`` prefix is stripped first. The connection for the
    family ("IPv4" or "IPv6") is created on first use. Returns
    ``(connection, result of connection.newaddr(...))``.
    """
    if addr[0:7] == "::ffff:":
        addr = addr[7:]
    afam = "IPv4" if self.isIPv4(addr) else "IPv6"

    if afam not in self.connections:
        self.connections[afam] = Connection(self, cid, addr, afam)

    conn = self.connections[afam]
    return conn, conn.newaddr(addr, rtt, now)
|
|
||||||
|
|
||||||
# called when reloading class from pickle, add new fields here
|
|
||||||
def fixup(self):
    """Strip a leading ``::ffff:`` prefix from the stored connection addresses."""
    for family in ("IPv4", "IPv6"):
        if family not in self.connections:
            continue
        conn = self.connections[family]
        cleaned = conn.addr
        if cleaned[0:7] == "::ffff:":
            cleaned = cleaned[7:]
        conn.addr = cleaned
|
|
||||||
|
|
||||||
# def dispstate(self):
|
|
||||||
# if self.state in ["down", "overdue"]:
|
|
||||||
# state = "<b>%s</b>" % self.state
|
|
||||||
# elif self.state in ["up", "UP"]:
|
|
||||||
# state = ""
|
|
||||||
# for x in list(self.connections.keys()):
|
|
||||||
# try:
|
|
||||||
# state += " %5.1f" % (self.connections[x].rtts[-1])
|
|
||||||
# except:
|
|
||||||
# state += " %5s" % (self.connections[x].rtts[-1])
|
|
||||||
# elif self.state in ["unknown", "UNKNOWN"]:
|
|
||||||
# state = ""
|
|
||||||
# else:
|
|
||||||
# state = "%s" % self.state
|
|
||||||
# return state
|
|
||||||
|
|
||||||
def dispstats(self):
    """Render the acknowledgement statistics as three HTML ``<td>`` cells.

    Behaviour, by case:

    * ``self.doesack == -1``: an "N/A" cell followed by two empty cells.
    * ``self.upcount`` is not positive: ``self.doesack`` in parentheses
      followed by two empty cells.
    * otherwise: one right-aligned cell for each of the first three
      ``(a, u)`` pairs in ``self.hdwcounts``, holding
      ``100 - (doesack - a) * 100 / (upcount - u)`` rounded to no
      decimals. A value of "0" is shown as an empty cell, and "-" is
      shown when ``upcount - u`` is zero (avoids a division by zero).

    Returns the concatenated HTML string.
    """
    if self.doesack == -1:
        # Fixed: the original literal was '<td></td<td></td>>', which is
        # malformed HTML and corrupts the surrounding table row.
        return '<td align="right">N/A</td><td></td><td></td>'

    if self.upcount <= 0:
        return "<td>(%s)</td><td></td><td></td>" % (self.doesack)

    cells = []
    for v in range(3):
        a, u = self.hdwcounts[v]
        span = self.upcount - u
        if span != 0:
            vs = "%0.0f" % (100.0 - (((self.doesack - a) * 100.0) / span))
            if vs == "0":
                vs = ""
        else:
            vs = "-"
        cells.append('<td align="right">%s</td>' % vs)
    return "".join(cells)
|
|
||||||
|
|
||||||
hostfields_long = [
|
|
||||||
"name",
|
|
||||||
"IPv4.addr",
|
|
||||||
"IPv4.state",
|
|
||||||
("IPv4.rtt", 'style="text-align: right;"'),
|
|
||||||
("IPv4.statetime", 'style="text-align: right;"'),
|
|
||||||
"IPv6.addr",
|
|
||||||
"IPv6.state",
|
|
||||||
("IPv6.rtt", 'style="text-align: right;"'),
|
|
||||||
("IPv6.statetime", 'style="text-align: right;"'),
|
|
||||||
"ver",
|
|
||||||
]
|
|
||||||
|
|
||||||
hostfields_short = [
|
|
||||||
"name",
|
|
||||||
("IPv4.rttstate", 'style="text-align: right;"'),
|
|
||||||
("IPv4.deltastatetime", 'style="text-align: right;"'),
|
|
||||||
("IPv6.rttstate", 'style="text-align: right;"'),
|
|
||||||
("IPv6.deltastatetime", 'style="text-align: right;"'),
|
|
||||||
]
|
|
||||||
|
|
||||||
def gene(self, tag, v, attrib=None):
    """Wrap ``v`` in an HTML element named ``tag``.

    When ``attrib`` is truthy it is inserted, preceded by a space, into
    the opening tag; otherwise the opening tag carries no attributes.
    """
    extra = (" %s" % attrib) if attrib else ""
    return "<%s%s>%s</%s>" % (tag, extra, v, tag)
|
|
||||||
|
|
||||||
def htmltable(self, tag, hd, short):
    """Build one HTML table row (``<tr>``) from the mapping ``hd``.

    The column list is ``Host.hostfields_short`` when ``short`` is truthy,
    else ``Host.hostfields_long``. Each entry is either a key into ``hd``
    or a ``(key, attribute-string)`` tuple; every value is wrapped in a
    ``tag`` element via ``gene`` and the cells are joined with newlines.
    """
    hostfields = Host.hostfields_short if short else Host.hostfields_long

    cells = []
    for field in hostfields:
        if isinstance(field, tuple):
            key = field[0]
            cell = self.gene(tag, hd[key], field[1])
        else:
            cell = self.gene(tag, hd[field])
        cells.append(cell)

    return self.gene("tr", "\n".join(cells))
|
|
||||||
|
|
||||||
def buildhosttable(self, short=False):
    """Return the host status table as a list of HTML fragments.

    The list holds the opening ``<table>`` tag, a header row built from
    ``ubHost.headerdict()``, one row per entry of ``Host.hosts`` in sorted
    key order (built from that host's ``statedict()``), and the closing
    tag. ``short`` is passed through to ``htmltable``.
    """
    if DEBUG > 1:
        print("DBG buildhosttable: start")

    res = [
        '<table id="ntable" class="sortable">',
        ubHost.htmltable("th", ubHost.headerdict(), short),
    ]
    for name in sorted(Host.hosts.keys()):
        row = ubHost.htmltable("td", Host.hosts[name].statedict(), short)
        res.append(row)
    res.append("</table>")

    if DEBUG > 1:
        print("DBG buildhosttable: %s" % res)
    return res
|
|
||||||
|
|
||||||
def buildmsgtable(self, msgs):
    """Return the event log as a list of HTML fragments.

    The list starts with an ``<h4>`` heading, followed by the trailing
    entries of ``msgs``, each suffixed with ``<BR>``. The number of
    entries kept is ``40 - len(Host.hosts)``, but never fewer than 3.
    """
    le = max(40 - len(Host.hosts), 3)
    res = ["<h4>Log of Events</h4>"]
    res.extend("%s<BR>" % m for m in msgs[len(msgs) - le:])
    return res
|
|
||||||
|
|
||||||
|
|
||||||
# create fake "unbound objects", remove in Python 3.0
|
|
||||||
ubHost = Host(None)
|
|
||||||
ubConnection = Connection(None, "", "", "")
|
|
||||||
@@ -1,194 +0,0 @@
|
|||||||
Metadata-Version: 2.4
|
|
||||||
Name: heartbeat
|
|
||||||
Version: 0.1.0
|
|
||||||
Summary: Heartbeat daemon (hbd) — receive heartbeats and act on them
|
|
||||||
Author: heartbeat contributors
|
|
||||||
License: MIT
|
|
||||||
Keywords: heartbeat,monitoring,dns,websocket
|
|
||||||
Requires-Python: >=3.10
|
|
||||||
Description-Content-Type: text/markdown
|
|
||||||
Requires-Dist: websockets>=13.2
|
|
||||||
Requires-Dist: mattermostdriver>=7.3.0
|
|
||||||
Requires-Dist: PyYAML>=6.0
|
|
||||||
Requires-Dist: aiohttp>=3.8
|
|
||||||
Requires-Dist: Jinja2>=3.1.0
|
|
||||||
Provides-Extra: dev
|
|
||||||
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
||||||
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
||||||
Requires-Dist: flake8>=5.0; extra == "dev"
|
|
||||||
Requires-Dist: mypy>=1.10; extra == "dev"
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Heartbeat Daemon (hbd) ✅
|
|
||||||
|
|
||||||
A lightweight daemon that listens for UDP heartbeat messages and acts on them: keeps host state, optionally updates DNS records via `nsupdate`, forwards messages to WebSocket clients, and sends notifications (email, Pushover, Mattermost, Signal). It is a refactor of a previously monolithic script into a modular Python package (`hbd`).
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📌 Features
|
|
||||||
|
|
||||||
- Receive and parse heartbeat datagrams (text or zlib-compressed) ✅
|
|
||||||
- Maintain host state and detect up/down transitions ✅
|
|
||||||
- Queue DNS updates via `nsupdate` and run them in a background thread ✅
|
|
||||||
- WebSocket API for live updates (hosts & messages) ✅
|
|
||||||
- Notification pipeline (email, Pushover, Mattermost, Signal) ✅
|
|
||||||
- Modular codebase suitable for unit testing and CI ✅
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## ⚙️ Quickstart
|
|
||||||
|
|
||||||
Prerequisites:
|
|
||||||
- Python 3.10+ (project uses language features from recent Python)
|
|
||||||
- `nsupdate` (for DNS updates) if using dynamic DNS
|
|
||||||
|
|
||||||
Install dependencies (recommended into a venv):
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python3 -m venv .venv
|
|
||||||
source .venv/bin/activate
|
|
||||||
python -m pip install --upgrade pip
|
|
||||||
python -m pip install -r requirements.txt
|
|
||||||
# for development/testing tools
|
|
||||||
python -m pip install -r requirements-dev.txt
|
|
||||||
```
|
|
||||||
|
|
||||||
Run the daemon (example):
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# run with an explicit config file (when -c is omitted, ~/.hb.yaml is looked up)
|
|
||||||
PYTHONPATH=. hbd -c .hb.yaml -f -v
|
|
||||||
```
|
|
||||||
|
|
||||||
You can also run it as a Python module after installation:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python -m hbd.cli -c /path/to/config.yaml
|
|
||||||
```
|
|
||||||
|
|
||||||
## 🐞 Debugging in VS Code
|
|
||||||
|
|
||||||
This repository includes a ready-to-use `.vscode/launch.json` with configurations to run or attach the VS Code debugger to `hbd`.
|
|
||||||
|
|
||||||
- Ensure the **Python** extension is installed and select the project `.venv` as the interpreter (bottom-left of VS Code).
|
|
||||||
- Use **F5** and pick one of these configurations from the Run view:
|
|
||||||
- **Python: Run hbd (module)** — runs `hbd.cli` as a module and sets `PYTHONPATH` to the workspace root (recommended).
|
|
||||||
- **Python: Run hbd with debugpy (listen)** — launches `debugpy` and `hbd` together; useful when you want the process to listen for a debugger.
|
|
||||||
- **Python: Attach (localhost:5678)** — attach the debugger to a running process started with `debugpy`.
|
|
||||||
|
|
||||||
To start `hbd` manually and wait for the debugger to attach, run:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
PYTHONPATH=. python -m debugpy --listen 5678 --wait-for-client -m hbd.cli -c .hb.yaml -f -v
|
|
||||||
```
|
|
||||||
|
|
||||||
Set breakpoints in modules such as `hbd/udp.py`, `hbd/dns.py`, or `hbd/server.py`, and use the **Attach** configuration to connect. Use `justMyCode: false` if you need to step into third-party code.
|
|
||||||
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🛠 Configuration
|
|
||||||
|
|
||||||
`hbd` reads YAML configuration (optional). If `PyYAML` is not installed, built-in defaults are used. Example configuration keys (see `hbd/config.py`):
|
|
||||||
|
|
||||||
- `hb_port`: UDP port to listen for heartbeats (default: 50003)
|
|
||||||
- `hbd_port`: internal control port (default: 50004)
|
|
||||||
- `hbd_host`: bind address for HTTP/WSS
|
|
||||||
- `pickfile`: path for persisted state
|
|
||||||
- `logfile`: path to log file
|
|
||||||
- `logfmt`: `text` or `msg`
|
|
||||||
- `pushsrv`: push service (`pushover`|`mattermost`|`all`)
|
|
||||||
- `interval` / `grace`: heartbeat timing configuration
|
|
||||||
- `dyndomains`: list of dyndomains to update via `nsupdate`
|
|
||||||
- `nsupdate_bin`: path to nsupdate binary
|
|
||||||
|
|
||||||
Example `.hb.yaml` (minimal):
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
hbd_host: 0.0.0.0
|
|
||||||
hbd_port: 50004
|
|
||||||
dyndomains:
|
|
||||||
- example.com
|
|
||||||
nsupdate_bin: /usr/bin/nsupdate
|
|
||||||
pushsrv: pushover
|
|
||||||
```
|
|
||||||
|
|
||||||
> Tip: `config.DEFAULTS` in `hbd/config.py` contains the canonical defaults and accepted configuration keys.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🔧 Architecture & Modules
|
|
||||||
|
|
||||||
- `hbd.proto` — serialization/deserialization of heartbeat messages (supports compressed payloads)
|
|
||||||
- `hbd.udp` — UDP parsing and `handle_datagram` implementation (main state machine)
|
|
||||||
- `hbd.dns` — `create_nsupdate_payload`, `nsupdate`, and a background DNS thread (`start_dns_thread`)
|
|
||||||
- `hbd.notify` — email and push notification helpers
|
|
||||||
- `hbd.ws` — WebSocket server and thread-safe broadcast helpers
|
|
||||||
- `hbd.http` — HTTP handler factory for the status UI/API
|
|
||||||
- `hbd.utils` — small utility helpers (`shortname`, `dur`, `initlog`)
|
|
||||||
- `hbd.cli` — CLI entrypoint and argument parsing
|
|
||||||
- `hbd.server` — async orchestration to run UDP/HTTP/WSS components
|
|
||||||
|
|
||||||
This modular layout makes the code easier to test and maintain.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🧪 Testing & Dev
|
|
||||||
|
|
||||||
Tests are written with `unittest`; they can also be run with `pytest` if you prefer. To run the tests locally without installing anything beyond the dev requirements:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# with project root on PYTHONPATH
|
|
||||||
PYTHONPATH=. python -m unittest discover -v
|
|
||||||
# or with pytest if installed
|
|
||||||
pytest -q
|
|
||||||
```
|
|
||||||
|
|
||||||
Developer tooling included:
|
|
||||||
- `pyproject.toml` — project metadata and dependencies
|
|
||||||
- `requirements-dev.txt` — dev/test dependencies
|
|
||||||
- `tox.ini` — convenience wrappers for running tests, lint, and mypy
|
|
||||||
|
|
||||||
To run linters and type checks locally:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# after installing dev deps
|
|
||||||
tox -e lint
|
|
||||||
tox -e mypy
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🚀 Running in production
|
|
||||||
|
|
||||||
- Use your system service manager (systemd, launchd, etc.) to run `hbd` in the background.
|
|
||||||
- Ensure `nsupdate` and necessary credentials are available for dynamic DNS updates.
|
|
||||||
- Configure TLS for WSS if you enable secure websockets.
|
|
||||||
|
|
||||||
> Note: The project previously contained a small example for obtaining DNS-verified certs (certbot with RFC2136) — see the earlier commit history; the example can be re-added to this README if it should be documented here.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🤝 Contributing
|
|
||||||
|
|
||||||
Contributions welcome! Please:
|
|
||||||
1. Open an issue to discuss larger changes.
|
|
||||||
2. Create a topic branch and a clear PR.
|
|
||||||
3. Add tests for new features and run linters.
|
|
||||||
4. Keep changes focused and documented.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📜 License
|
|
||||||
|
|
||||||
This repository is licensed under the MIT license. See `LICENSE` for details.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
Possible follow-ups:
|
|
||||||
- add a **GitHub Actions** workflow that runs tests and lint on push/PR 🔁
|
|
||||||
- add a `CONTRIBUTING.md` template for PRs and code style 💬
|
|
||||||
|
|
||||||
Contributions for either of these are welcome. ✨
|
|
||||||
|
|
||||||
@@ -1,25 +0,0 @@
|
|||||||
README.md
|
|
||||||
pyproject.toml
|
|
||||||
hbd/__init__.py
|
|
||||||
hbd/cli.py
|
|
||||||
hbd/config.py
|
|
||||||
hbd/dns.py
|
|
||||||
hbd/hbdclass.py
|
|
||||||
hbd/http.py
|
|
||||||
hbd/monitor.py
|
|
||||||
hbd/notify.py
|
|
||||||
hbd/proto.py
|
|
||||||
hbd/server.py
|
|
||||||
hbd/udp.py
|
|
||||||
hbd/utils.py
|
|
||||||
hbd/ws.py
|
|
||||||
heartbeat.egg-info/PKG-INFO
|
|
||||||
heartbeat.egg-info/SOURCES.txt
|
|
||||||
heartbeat.egg-info/dependency_links.txt
|
|
||||||
heartbeat.egg-info/entry_points.txt
|
|
||||||
heartbeat.egg-info/requires.txt
|
|
||||||
heartbeat.egg-info/top_level.txt
|
|
||||||
tests/test_dns.py
|
|
||||||
tests/test_handle_datagram.py
|
|
||||||
tests/test_proto.py
|
|
||||||
tests/test_udp.py
|
|
||||||
@@ -1 +0,0 @@
|
|||||||
|
|
||||||
@@ -1,2 +0,0 @@
|
|||||||
[console_scripts]
|
|
||||||
hbd = hbd.cli:main
|
|
||||||
@@ -1,11 +0,0 @@
|
|||||||
websockets>=13.2
|
|
||||||
mattermostdriver>=7.3.0
|
|
||||||
PyYAML>=6.0
|
|
||||||
aiohttp>=3.8
|
|
||||||
Jinja2>=3.1.0
|
|
||||||
|
|
||||||
[dev]
|
|
||||||
pytest>=7.0
|
|
||||||
pytest-cov>=4.0
|
|
||||||
flake8>=5.0
|
|
||||||
mypy>=1.10
|
|
||||||
@@ -1 +0,0 @@
|
|||||||
hbd
|
|
||||||
-15
@@ -1,15 +0,0 @@
|
|||||||
#!/bin/sh

# Install hbd/hbc from the project's git repository into the virtual
# environment ~/venvs/hbd (created on first use) and symlink the hbd and
# hbc commands into ~/bin.

set -e

venv_dir="$HOME/venvs/hbd"
bin_dir="$HOME/bin"

if [ ! -d "$venv_dir" ]; then
  mkdir -p "$HOME/venvs"
  python3 -m venv "$venv_dir"
fi
. "$venv_dir/bin/activate"
pip install 'git+ssh://git@git.wrede.ca/andreas/heartbeat.git'

# ~/bin may not exist on a fresh account; ln would fail without it.
mkdir -p "$bin_dir"

for cmd in hbd hbc; do
  # command -v is the POSIX way to resolve a command; `which` is not
  # guaranteed to exist under /bin/sh.
  target=$(command -v "$cmd") || {
    echo "error: $cmd was not found after installation" >&2
    exit 1
  }
  # Remove any stale link first so ln never follows an old symlink.
  rm -f "$bin_dir/$cmd"
  ln -sf "$target" "$bin_dir/$cmd"
done
|
|
||||||
Executable
+4
@@ -0,0 +1,4 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
|
||||||
|
#echo "OK - all is well"
|
||||||
|
echo "WARNING - 12 apps require update: calendar->6.2.2 ,call_summary_bot->3.3.0 ,collectives->4.2.0 ,contacts->8.3.7 ,forum->0.36.0 ,mail->5.7.6 ,news->28.1.0 ,notes->4.13.1 ,notify_push->1.3.1 ,ownershiptransfer->1.4.0 ,richdocuments->9.0.5 ,spreed->22.0.10"
|
||||||
+46
-14
@@ -4,26 +4,45 @@ build-backend = "setuptools.build_meta"
|
|||||||
|
|
||||||
[project]
|
[project]
|
||||||
name = "hbd"
|
name = "hbd"
|
||||||
version = "5.0"
|
version = "5.1.11"
|
||||||
description = "Heartbeat daemon (hbd) — receive heartbeats and act on them"
|
description = "Heartbeat monitoring system — client (hbc) and server (hbd)"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
requires-python = ">=3.10"
|
requires-python = ">=3.11"
|
||||||
license = { text = "MIT" }
|
license = "MIT"
|
||||||
keywords = ["heartbeat", "monitoring", "dns", "websocket"]
|
keywords = ["heartbeat", "monitoring", "dns", "websocket", "system-monitoring"]
|
||||||
authors = [
|
authors = [
|
||||||
{ name = "heartbeat contributors" }
|
{ name = "heartbeat contributors" }
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Core dependencies (required for both client and server)
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"websockets>=13.2",
|
|
||||||
"mattermostdriver>=7.3.0",
|
|
||||||
"PyYAML>=6.0",
|
"PyYAML>=6.0",
|
||||||
"aiohttp>=3.8",
|
|
||||||
"Jinja2>=3.1.0",
|
|
||||||
"fastapi>=0.95.0",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.optional-dependencies]
|
[project.optional-dependencies]
|
||||||
|
# Client-only dependencies (hbc - system monitoring client)
|
||||||
|
client = [
|
||||||
|
"psutil>=5.9.0",
|
||||||
|
]
|
||||||
|
|
||||||
|
# Server-only dependencies (hbd - heartbeat daemon/server)
|
||||||
|
server = [
|
||||||
|
"websockets>=13.2",
|
||||||
|
"mattermostdriver>=7.3.0",
|
||||||
|
"aiohttp>=3.11",
|
||||||
|
"Jinja2>=3.1.6",
|
||||||
|
"matrix-nio>=0.24",
|
||||||
|
]
|
||||||
|
|
||||||
|
# Minimal client — hbc_mini only, no external dependencies
|
||||||
|
mini = []
|
||||||
|
|
||||||
|
# Install both client and server
|
||||||
|
all = [
|
||||||
|
"hbd[client,server]",
|
||||||
|
]
|
||||||
|
|
||||||
|
# Development dependencies
|
||||||
dev = [
|
dev = [
|
||||||
"pytest>=7.0",
|
"pytest>=7.0",
|
||||||
"pytest-cov>=4.0",
|
"pytest-cov>=4.0",
|
||||||
@@ -31,17 +50,30 @@ dev = [
|
|||||||
"mypy>=1.10",
|
"mypy>=1.10",
|
||||||
"black>=23.0",
|
"black>=23.0",
|
||||||
"isort>=5.0",
|
"isort>=5.0",
|
||||||
"re-commit>=3.0",
|
|
||||||
"tox>=4.0",
|
"tox>=4.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.scripts]
|
[project.scripts]
|
||||||
hbd = "hbd.cli:main"
|
hbd = "hbd.server.cli:main"
|
||||||
hbc = "hbd.hbc:main"
|
hbc = "hbd.client.main:main"
|
||||||
|
|
||||||
|
[tool.setuptools]
|
||||||
|
script-files = ["scripts/hb_install.sh", "scripts/hbc_mini.py"]
|
||||||
|
|
||||||
[tool.setuptools.packages.find]
|
[tool.setuptools.packages.find]
|
||||||
where = ["."]
|
where = ["."]
|
||||||
include = ["hbd*"]
|
include = ["hbd*"]
|
||||||
|
|
||||||
[tool.setuptools.package-data]
|
[tool.setuptools.package-data]
|
||||||
"hbd" = ["*.yaml", "static/*", "static/*/*", "templates/*"]
|
"hbd.server" = ["*.yaml", "static/*", "static/*/*", "templates/*"]
|
||||||
|
"hbd.client" = ["*.yaml"]
|
||||||
|
|
||||||
|
|
||||||
|
[tool.black]
|
||||||
|
line-length = 111
|
||||||
|
|
||||||
|
[tool.flake8]
|
||||||
|
max-line-length = 111
|
||||||
|
|
||||||
|
[tool.pylint.format]
|
||||||
|
max-line-length = 111
|
||||||
|
|||||||
@@ -1,12 +1,17 @@
|
|||||||
#!/bin/sh
|
#!/bin/sh
|
||||||
|
|
||||||
|
set -e
|
||||||
uv version --bump patch
|
uv version --bump patch
|
||||||
VER=$(uv version --short)
|
VER=$(uv version --short)
|
||||||
sed -i "s/__version__ = \"[0-9.]*\"\(.*\)$/__version__ = \"$VER\"\1/" moninbox/const.py
|
sed -i".bak" "s/__version__ = \"[0-9.]*\"\(.*\)$/__version__ = \"$VER\"\1/" hbd/__init__.py
|
||||||
|
sed -i".bak" "s/__version__ = \"[0-9.]*\"\(.*\)$/__version__ = \"$VER\"\1/" scripts/hbc_mini.py
|
||||||
|
|
||||||
# commit pyproject.toml
|
# commit pyproject.toml
|
||||||
git commit -m "version $VER" pyproject.toml moninbox/const.py
|
git commit -m "version $VER" pyproject.toml hbd/__init__.py scripts/hbc_mini.py
|
||||||
git push
|
git push
|
||||||
# tag version
|
# tag version
|
||||||
git tag -a v$VER -m "Version $VER"
|
git tag -a v$VER -m "Version $VER"
|
||||||
git push --tags
|
git push --tags
|
||||||
|
|
||||||
|
rm hbd/__init__.py.bak
|
||||||
|
rm scripts/hbc_mini.py.bak
|
||||||
|
|||||||
@@ -0,0 +1,390 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Demo script for HTTP API endpoints.
|
||||||
|
Tests and demonstrates the plugin data and alert APIs.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
from datetime import datetime
|
||||||
|
from time import sleep
|
||||||
|
|
||||||
|
BASE_URL = "http://localhost:50004"
|
||||||
|
|
||||||
|
def print_section(title):
    """Print ``title`` framed by two 70-character ``=`` rules, preceded by a blank line."""
    rule = '=' * 70
    print(f"\n{rule}")
    print(f" {title}")
    print(rule)
|
||||||
|
|
||||||
|
def format_timestamp(timestamp):
    """Render a Unix timestamp as ``YYYY-MM-DD HH:MM:SS`` via ``datetime.fromtimestamp``."""
    moment = datetime.fromtimestamp(timestamp)
    return moment.strftime('%Y-%m-%d %H:%M:%S')
|
||||||
|
|
||||||
|
def format_duration(seconds):
    """Format a duration in seconds using its two largest applicable units.

    Below one minute the result is ``"<s>s"``; below one hour
    ``"<m>m <s>s"``; below one day ``"<h>h <m>m"``; otherwise
    ``"<d>d <h>h"``. All components are truncated to integers.
    """
    if seconds < 60:
        return f"{int(seconds)}s"
    if seconds < 3600:
        return f"{int(seconds / 60)}m {int(seconds % 60)}s"
    if seconds < 86400:
        return f"{int(seconds / 3600)}h {int((seconds % 3600) / 60)}m"
    return f"{int(seconds / 86400)}d {int((seconds % 86400) / 3600)}h"
|
||||||
|
|
||||||
|
def test_hosts_api():
    """Test GET /api/0/hosts endpoint.

    Prints a short summary (name, protocol, dynamic flag, connection
    count) for every host in the response.

    Returns the decoded host list, or an empty list when the request
    fails with a ``requests.RequestException``.
    """
    print_section("1. List All Monitored Hosts")

    try:
        response = requests.get(f"{BASE_URL}/api/0/hosts", timeout=5)
        response.raise_for_status()
        hosts = response.json()

        print(f"Found {len(hosts)} hosts:\n")
        for host in hosts:
            name = host.get('name', 'unknown')
            # Fixed: `ver` was referenced below without ever being assigned,
            # which raised NameError on the first host.
            # NOTE(review): the 'ver' key is an assumption about the host
            # JSON schema — confirm against the server's /api/0/hosts output.
            ver = host.get('ver', '?')
            dyn = host.get('dyn', False)
            conn_count = len(host.get('connections', []))

            print(f" • {name}")
            print(f" - Protocol: IPv{ver}")
            print(f" - Dynamic: {dyn}")
            print(f" - Connections: {conn_count}")

        return hosts

    except requests.RequestException as e:
        print(f"❌ Error: {e}")
        return []
|
||||||
|
|
||||||
|
def test_host_plugins_api(hostname):
    """Test GET /api/0/hosts/{hostname}/plugins endpoint.

    Prints, for each plugin, its last update time, sample count, metric
    count and up to three of its metrics. Returns the list of plugin
    names, or an empty list when the request fails.
    """
    print_section(f"2. Get All Plugins for Host: {hostname}")

    try:
        response = requests.get(f"{BASE_URL}/api/0/hosts/{hostname}/plugins", timeout=5)
        response.raise_for_status()
        payload = response.json()

        plugins = payload.get('plugins', {})
        print(f"Found {len(plugins)} plugins:\n")

        for plugin_name, info in plugins.items():
            last_seen = info.get('timestamp', 0)
            samples = info.get('sample_count', 0)
            metrics = info.get('data', {})

            print(f" 📦 {plugin_name}")
            print(f" Last update: {format_timestamp(last_seen)}")
            print(f" Samples: {samples}")
            print(f" Metrics: {len(metrics)}")

            # Only the first three metrics are shown in detail.
            for metric, value in list(metrics.items())[:3]:
                if isinstance(value, float):
                    print(f" - {metric}: {value:.2f}")
                elif isinstance(value, dict):
                    print(f" - {metric}: [nested data, {len(value)} keys]")
                else:
                    print(f" - {metric}: {value}")

            hidden = len(metrics) - 3
            if hidden > 0:
                print(f" ... and {hidden} more")
            print()

        return list(plugins.keys())

    except requests.RequestException as e:
        print(f"❌ Error: {e}")
        return []
|
||||||
|
|
||||||
|
def test_plugin_detail_api(hostname, plugin_name, limit=5):
    """Test GET /api/0/hosts/{hostname}/plugins/{plugin_name} endpoint.

    Requests at most ``limit`` samples and prints each one with its
    timestamp and its first five metrics in sorted key order. Returns the
    sample list, or an empty list when the request fails.
    """
    print_section(f"3. Get Detailed Data: {hostname}/{plugin_name}")

    try:
        response = requests.get(
            f"{BASE_URL}/api/0/hosts/{hostname}/plugins/{plugin_name}",
            params={'limit': limit},
            timeout=5,
        )
        response.raise_for_status()
        payload = response.json()

        samples = payload.get('samples', [])
        print(f"Retrieved {len(samples)} samples (limit={limit}):\n")

        for number, sample in enumerate(samples, start=1):
            taken_at = sample.get('timestamp', 0)
            metrics = sample.get('data', {})

            print(f" [{number}] {format_timestamp(taken_at)}")
            shown = sorted(metrics.items())[:5]  # first 5 metrics only
            for metric, value in shown:
                if isinstance(value, float):
                    print(f" {metric}: {value:.2f}")
                elif isinstance(value, dict):
                    print(f" {metric}: [nested: {len(value)} keys]")
                else:
                    print(f" {metric}: {value}")
            print()

        return samples

    except requests.RequestException as e:
        print(f"❌ Error: {e}")
        return []
|
||||||
|
|
||||||
|
def test_host_alerts_api(hostname):
    """Test GET /api/0/hosts/{hostname}/alerts endpoint.

    Prints the per-level alert summary followed by every alert whose
    level is not 'OK'. Returns the decoded response, or an empty dict
    when the request fails.
    """
    print_section(f"4. Get Alerts for Host: {hostname}")

    try:
        response = requests.get(f"{BASE_URL}/api/0/hosts/{hostname}/alerts", timeout=5)
        response.raise_for_status()
        data = response.json()

        alerts = data.get('alerts', [])
        summary = data.get('summary', {})

        print("Summary:")
        print(f" ✓ OK: {summary.get('ok', 0)}")
        print(f" ⚠️ Warning: {summary.get('warning', 0)}")
        print(f" 🔴 Critical: {summary.get('critical', 0)}")
        print(f" ❓ Unknown: {summary.get('unknown', 0)}")
        print()

        # Everything that is not explicitly OK counts as active.
        active_alerts = [entry for entry in alerts if entry.get('level') != 'OK']
        if not active_alerts:
            print("✓ No active alerts - all systems normal!")
            return data

        print(f"Active Alerts ({len(active_alerts)}):")
        for alert in active_alerts:
            metric = alert.get('metric_path', 'unknown')
            level = alert.get('level', 'UNKNOWN')
            value = alert.get('last_value', 0)
            since = alert.get('since', 0)
            duration = datetime.now().timestamp() - since

            icon = '⚠️' if level == 'WARNING' else '🔴'
            print(f" {icon} {metric}")
            print(f" Level: {level}")
            if isinstance(value, float):
                print(f" Value: {value:.2f}")
            else:
                print(f" Value: {value}")
            print(f" Duration: {format_duration(duration)}")
            print()

        return data

    except requests.RequestException as e:
        print(f"❌ Error: {e}")
        return {}
|
||||||
|
|
||||||
|
def test_all_alerts_api():
    """Test GET /api/0/alerts endpoint.

    Prints the global alert summary and the details of every alert in the
    response. Returns the decoded response, or an empty dict when the
    request fails.
    """
    print_section("5. Get All Active Alerts Across All Hosts")

    try:
        response = requests.get(f"{BASE_URL}/api/0/alerts", timeout=5)
        response.raise_for_status()
        data = response.json()

        alerts = data.get('alerts', [])
        summary = data.get('summary', {})
        host_count = data.get('host_count', 0)

        print(f"Monitoring {host_count} hosts")
        print(f"Active Alerts: {summary.get('total', 0)}")
        print(f" 🔴 Critical: {summary.get('critical', 0)}")
        print(f" ⚠️ Warning: {summary.get('warning', 0)}")
        print()

        if not alerts:
            print("✅ All systems normal - no active alerts!")
            return data

        print("Alert Details:")
        for alert in alerts:
            hostname = alert.get('hostname', 'unknown')
            metric = alert.get('metric_path', 'unknown')
            level = alert.get('level', 'UNKNOWN')
            value = alert.get('last_value', 0)
            since = alert.get('since', 0)
            duration = datetime.now().timestamp() - since
            notification_count = alert.get('notification_count', 0)

            icon = '⚠️' if level == 'WARNING' else '🔴'
            print(f" {icon} {hostname} / {metric}")
            print(f" Level: {level}")
            if isinstance(value, float):
                print(f" Value: {value:.2f}")
            else:
                print(f" Value: {value}")
            print(f" Duration: {format_duration(duration)}")
            print(f" Notifications: {notification_count}")
            print()

        return data

    except requests.RequestException as e:
        print(f"❌ Error: {e}")
        return {}
|
||||||
|
|
||||||
|
def test_messages_api():
    """Test GET /api/0/messages endpoint.

    Prints the total number of messages returned and then the last five
    of them. Returns the full message list, or an empty list when the
    request fails.
    """
    print_section("6. Get Recent Messages")

    try:
        response = requests.get(f"{BASE_URL}/api/0/messages", timeout=5)
        response.raise_for_status()
        messages = response.json()

        print(f"Last {len(messages)} messages:\n")
        recent = messages[-5:]  # show last 5
        for entry in recent:
            when = entry.get('time', 0)
            host = entry.get('host', 'unknown')
            text = entry.get('msg', '')
            print(f" [{format_timestamp(when)}] {host}: {text}")

        return messages

    except requests.RequestException as e:
        print(f"❌ Error: {e}")
        return []
|
||||||
|
|
||||||
|
def test_error_handling():
    """Test API error handling.

    Requests the plugins of a non-existent host, then a non-existent
    plugin on the first host returned by /api/0/hosts, and reports
    whether each request answered with HTTP 404.
    """

    def _report(resp):
        # Print the outcome of a request that is expected to yield 404.
        if resp.status_code == 404:
            error_data = resp.json()
            print(f" ✓ Correctly returned 404: {error_data.get('error', 'No error message')}")
        else:
            print(f" ⚠️ Unexpected status code: {resp.status_code}")

    print_section("7. Error Handling Tests")

    # Non-existent host
    print("Testing non-existent host...")
    try:
        _report(requests.get(f"{BASE_URL}/api/0/hosts/nonexistenthost/plugins", timeout=5))
    except Exception as e:
        print(f" ❌ Error: {e}")

    # Non-existent plugin on the first known host
    print("\nTesting non-existent plugin...")
    try:
        hosts = requests.get(f"{BASE_URL}/api/0/hosts", timeout=5).json()
        if hosts:
            hostname = hosts[0]['name']
            url = f"{BASE_URL}/api/0/hosts/{hostname}/plugins/nonexistentplugin"
            _report(requests.get(url, timeout=5))
    except Exception as e:
        print(f" ❌ Error: {e}")
|
||||||
|
|
||||||
|
def demo_monitoring_loop():
    """Demonstrate continuous monitoring.

    Polls /api/0/alerts five times, three seconds apart, and prints one
    status line per poll. Stops early on Ctrl+C or on any other error.
    """
    print_section("8. Continuous Monitoring Demo (5 iterations)")

    print("Monitoring alerts every 3 seconds (Ctrl+C to stop)...\n")

    try:
        for i in range(5):
            response = requests.get(f"{BASE_URL}/api/0/alerts", timeout=5)
            response.raise_for_status()
            summary = response.json().get('summary', {})

            critical = summary.get('critical', 0)
            warning = summary.get('warning', 0)
            timestamp = datetime.now().strftime('%H:%M:%S')

            if critical > 0:
                status = "🔴 CRITICAL"
            elif warning > 0:
                status = "⚠️ WARNING"
            else:
                status = "✅ OK"

            print(f"[{timestamp}] {status} - Critical: {critical}, Warning: {warning}")

            # No pause after the final poll.
            if i < 4:
                sleep(3)

    except KeyboardInterrupt:
        print("\n\nMonitoring stopped by user")
    except Exception as e:
        print(f"\n❌ Error: {e}")
|
||||||
|
|
||||||
|
def main():
    """Run all API tests.

    Verifies that the API answers at ``BASE_URL`` (exits with status 1
    otherwise), then runs the host, plugin, alert, message and
    error-handling checks followed by the monitoring demo. Returns early
    when no hosts are reported.
    """
    print("""
╔══════════════════════════════════════════════════════════════╗
║ Heartbeat Daemon HTTP API Demo & Test Suite ║
╚══════════════════════════════════════════════════════════════╝
""")

    print(f"Testing API at: {BASE_URL}")
    print("Ensure the heartbeat daemon is running!")

    # Basic connectivity check before running anything else
    try:
        probe = requests.get(f"{BASE_URL}/api/0/hosts", timeout=2)
        probe.raise_for_status()
        print("✅ API is reachable\n")
    except Exception as e:
        print(f"❌ Cannot connect to API: {e}")
        print("\nPlease ensure:")
        print(" 1. Heartbeat daemon is running")
        print(" 2. HTTP server is enabled in configuration")
        print(f" 3. Server is listening on port {BASE_URL.split(':')[-1]}")
        sys.exit(1)

    # Test suite proper
    hosts = test_hosts_api()
    if not hosts:
        print("\n⚠️ No hosts found. Ensure clients are sending heartbeats.")
        return

    # The first host is used for the detailed, per-host checks
    hostname = hosts[0].get('name', '')
    if hostname:
        plugins = test_host_plugins_api(hostname)
        if plugins:
            # Detailed plugin data
            test_plugin_detail_api(hostname, plugins[0], limit=3)

        # Per-host alert endpoint
        test_host_alerts_api(hostname)

    # Global endpoints
    test_all_alerts_api()
    test_messages_api()

    # Error handling
    test_error_handling()

    # Continuous monitoring demo
    demo_monitoring_loop()

    print_section("Test Suite Complete")
    print("""
Next Steps:
• View the web UI at http://localhost:50004/live
• Check plugin metrics at http://localhost:50004/plugins
• Monitor alerts at http://localhost:50004/alerts
• Read API documentation: docs/HTTP_API.md
""")
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
try:
|
||||||
|
main()
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
print("\n\nDemo interrupted by user")
|
||||||
|
sys.exit(0)
|
||||||
Executable
+115
@@ -0,0 +1,115 @@
|
|||||||
|
#!/bin/sh

# Helper script to install the heartbeat tools.
#
# Usage: hb_install.sh [client|server|mini] [HA]
#
#   client  (default) install the heartbeat client, hbc
#   server  install the heartbeat server, hbd
#   mini    only download the standalone hbc_mini script
#   HA      internal marker; passed as the second argument when this script
#           re-runs itself inside the Home Assistant container
#
# The tools are installed from the wheel into a python virtual environment
# in ~/venvs/hbd and the hbd / hbc commands are symlinked into ~/bin or
# ~/.local/bin, whichever is listed in PATH. If the virtual environment
# already exists, it is reused. Existing symlinks for the tools are replaced.
# On Home Assistant, /config/venvs and /config/bin are used instead.

set -e

# Re-run this installer inside the "homeassistant" container.
# Reads:   $what (install target, already defaulted)
# Returns: 0 when the in-container install succeeded, 1 otherwise.
#
# The target is passed explicitly instead of forwarding "$@": with no
# arguments the marker "HA" would otherwise become $1 inside the container
# and be mistaken for the install target.
reexec_in_ha_container() {
  echo "HA, running \"docker exec homeassistant /config/bin/hb_install.sh $what HA\""
  # Test the command directly: under 'set -e' a separate 'rc=$?' check would
  # never be reached after a failing command.
  if ! docker exec homeassistant /config/bin/hb_install.sh "$what" HA; then
    echo "Failed to install heartbeat in HA, please check the logs for more details"
    return 1
  fi
  return 0
}

what=${1:-client} # install target, defaults to the client
on_ha=0           # 1 when started by the Home Assistant re-exec above
where=""          # directory for executables, chosen further down
venv=""           # parent directory of the virtual environment, chosen further down
if [ "${2:-}" = "HA" ]; then
  on_ha=1
fi

if [ -d /homeassistant ]; then # if running from HA command line
  reexec_in_ha_container || exit 1
  exit 0
fi
|
||||||
|
|
||||||
|
# Choose the install locations:
#   where - directory that receives the executables / symlinks
#   venv  - parent directory of the virtual environment ("" for a mini
#           install outside Home Assistant)
#
# '&&' and '||' have equal precedence in sh and group left to right, so the
# grouping is spelled out: (HA marker OR running in docker) AND /config/bin.
if { [ "$on_ha" -eq 1 ] || [ -r /.dockerenv ]; } && [ -d /config/bin ]; then
  # Installing under docker on Home Assistant OS, using /config/bin for
  # executables and /config/venvs for virtual environments
  echo "Home Assistant OS detected, installing under docker"
  where="/config/bin"
  venv="/config/venvs"
else
  # Prefer ~/bin, then ~/.local/bin. A candidate must exist as a directory
  # AND be listed in PATH; the PATH test is a literal match (no regex), so
  # the '.' in ".local" cannot match other characters.
  where=""
  for candidate in "$HOME/bin" "$HOME/.local/bin"; do
    [ -d "$candidate" ] || continue
    case ":$PATH:" in
      *":$candidate:"*)
        where=$candidate
        break
        ;;
    esac
  done
  if [ -z "$where" ]; then
    echo "No suitable bin directory found in PATH, please add either $HOME/.local/bin or $HOME/bin to your PATH"
    exit 1
  fi
  if [ "$what" = "mini" ]; then
    # the mini client is a single downloaded script, no virtualenv needed
    venv=""
  else
    venv="$HOME/venvs"
  fi
fi
|
||||||
|
echo "Installing $what to $where"
if [ -n "$venv" ]; then
  echo "Using virtual environment at $venv/hbd"
fi

# Succeed when python3 can import module $1, fail otherwise (output hidden).
# Uses the POSIX redirection '>/dev/null 2>&1'; the bash-only '&>' would be
# parsed by a plain /bin/sh as "run in background", hiding the real result.
python_has_module() {
  python3 -c "import $1" >/dev/null 2>&1
}

# Create the virtual environment the first time round; an existing one is reused.
if [ -n "$venv" ] && [ ! -d "$venv/hbd" ]; then
  venv_opts=""
  have_pip=1
  if ! python_has_module pip; then
    # some systems do not have pip installed by default, so we need to fetch
    # get-pip.py and install pip
    echo "pip is not installed, fetching get-pip.py and installing pip"
    have_pip=0
    venv_opts="--without-pip"
  fi
  mkdir -p "$venv"
  if python_has_module venv; then
    python3 -m venv "$venv/hbd" --system-site-packages ${venv_opts:+"$venv_opts"}
  else
    if [ "$have_pip" -eq 0 ]; then
      echo "python has no venv, and no pip to install virtualenv, cannot continue"
      exit 1
    fi
    echo "python venv module not found, installing virtualenv"
    python3 -m pip install --user virtualenv
    # pip is known to be available on this path, so no extra option is needed
    python3 -m virtualenv "$venv/hbd" --system-site-packages
  fi
  . "$venv/hbd/bin/activate"
  if [ -n "$venv_opts" ]; then
    # Bootstrap pip inside the new environment. The download goes to a
    # temporary file that is removed afterwards, and every step is checked
    # explicitly because commands inside an '&&' list are exempt from 'set -e'.
    get_pip=$(mktemp)
    if ! curl -fsSL -o "$get_pip" https://bootstrap.pypa.io/get-pip.py ||
      ! python3 "$get_pip"; then
      rm -f "$get_pip"
      # drop the half-built environment so that the next run starts over
      rm -rf -- "${venv:?}/hbd"
      echo "Failed to install pip into $venv/hbd, cannot continue"
      exit 1
    fi
    rm -f "$get_pip"
  fi
  deactivate
fi
|
||||||
|
|
||||||
|
# Enter the virtual environment (if one is used) so that python3 and pip
# below operate on it.
if [ -n "$venv" ]; then
  . "$venv/hbd/bin/activate"
fi

if [ "$what" = "mini" ]; then
  # -f makes curl fail on an HTTP error instead of saving the error page as
  # the "script"; -S still shows the error message despite -s.
  curl -fsS -o "$where/hbc_mini" https://git.wrede.ca/andreas/heartbeat/raw/branch/master/scripts/hbc_mini.py
  chmod +x "$where/hbc_mini"
else
  # The requirement is quoted so the shell cannot treat hbd[...] as a glob
  # pattern and replace it with a matching file name from the current directory.
  python3 -m pip install --upgrade \
    --index-url https://git.wrede.ca/api/packages/andreas/pypi/simple/ \
    --extra-index-url https://pypi.org/simple \
    "hbd[$what]"
fi
|
||||||
|
|
||||||
|
# Symlink the executable named $1, as found in PATH, to $where/$1.
# Reads: $where (destination directory)
#
# The target is resolved BEFORE the old link is removed. If nothing is found,
# or the only hit is the existing $where/$1 itself, the current file is left
# untouched; this avoids deleting a working copy, creating a self-referencing
# link, or letting 'ln' drop a stray link into the current directory.
link_into_bin() {
  link_src=$(command -v "$1" 2>/dev/null) || link_src=""
  if [ -z "$link_src" ] || [ "$link_src" = "$where/$1" ]; then
    echo "Warning: no $1 executable found to link, leaving $where/$1 unchanged" >&2
    return 0
  fi
  rm -f "$where/$1"
  ln -sf "$link_src" "$where/$1"
}

if [ -n "$venv" ]; then
  echo "linking executables to $where"
  case "$what" in
    server) link_into_bin hbd ;;
    client) link_into_bin hbc ;;
  esac
  link_into_bin hb_install.sh
fi

echo "Installation complete. To upgrade, run the following:"
echo " $where/hb_install.sh $what"
echo "To install on another machine, obtain the install script"
echo "from https://git.wrede.ca/andreas/heartbeat/raw/branch/master/scripts/hb_install.sh"
echo "and then run sh hb_install.sh [mini|client|server]"
|
||||||
Executable
+1147
File diff suppressed because it is too large
Load Diff
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user