Compare commits
28 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 79bf00abfd | |||
| d77277857f | |||
| 3232239a85 | |||
| 014781de5e | |||
| 68b1c65384 | |||
| e8bb553349 | |||
| e4ecb8723f | |||
| 5edbaacf81 | |||
| 8421f472f2 | |||
| 51f9bdc2b5 | |||
| 02bc42fbf0 | |||
| 832a8b0bda | |||
| 57c4b86430 | |||
| 43fad7beed | |||
| 8dd002d159 | |||
| 2373b55d8b | |||
| 81530636ec | |||
| 190199b36d | |||
| 73aa89f8f4 | |||
| 941f3ea4b0 | |||
| c5770006f7 | |||
| 84c1aef51f | |||
| 460d2be9e9 | |||
| 090d341244 | |||
| 079e84f729 | |||
| dd23d9d163 | |||
| ad7178ebcb | |||
| 0543266c92 |
@@ -0,0 +1,51 @@
|
||||
name: Release
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- 'v*'
|
||||
|
||||
jobs:
|
||||
release:
|
||||
runs-on: FreeBSD
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
# - name: Set up Python
|
||||
# uses: actions/setup-python@v5
|
||||
# with:
|
||||
# python-version: '3.11'
|
||||
- name: Set up Python
|
||||
# Use a generic run step for FreeBSD if actions/setup-python
|
||||
# fails in restricted environments.
|
||||
run: |
|
||||
python3 --version
|
||||
python3 -m ensurepip --upgrade
|
||||
|
||||
- name: Install build tools
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install build twine
|
||||
|
||||
- name: Build package
|
||||
run: python -m build
|
||||
|
||||
- name: Extract version from tag
|
||||
id: get_version
|
||||
run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Upload to Gitea PyPI registry
|
||||
env:
|
||||
TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
|
||||
TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
|
||||
run: |
|
||||
python -m twine upload --repository-url https://git.wrede.ca/api/packages/andreas/pypi dist/*
|
||||
|
||||
- name: Create release
|
||||
uses: actions/gitea-release-action@v1
|
||||
with:
|
||||
files: |
|
||||
dist/*.whl
|
||||
dist/*.tar.gz
|
||||
title: "Release ${{ steps.get_version.outputs.VERSION }}"
|
||||
body: "Release version ${{ steps.get_version.outputs.VERSION }}"
|
||||
@@ -2,43 +2,278 @@
|
||||
hb_port: 50003
|
||||
hbd_host: ''
|
||||
#logfile: "/home/andreas/public_html/messages/andreas"
|
||||
logfile: "/Users/andreas/public_html/messages/andreas"
|
||||
logfile: "/home/andreas/logs/heartbeat/heartbeat.log"
|
||||
#logfile: "/Users/andreas/public_html/messages/andreas"
|
||||
logfmt: "msg"
|
||||
grace: 40
|
||||
interval: 10
|
||||
watchhosts:
|
||||
# "localhost":
|
||||
# "haschloss" :
|
||||
# "cotgate":
|
||||
"wentworth":
|
||||
notify: +4915123456789
|
||||
src: "signal"
|
||||
"y":
|
||||
notify: +4915123456789
|
||||
src: "signal"
|
||||
"winter":
|
||||
notify: +14168226179
|
||||
src: "signal"
|
||||
dyndnshosts: {"haschloss", "wayback", "wertvoll", "weekend", "cotgate", "rvgate", "draper", "eris"}
|
||||
autosave_interval: 300 # Autosave interval in seconds (default: 5 minutes)
|
||||
|
||||
|
||||
users:
|
||||
andreas:
|
||||
full_name: Andreas Wrede
|
||||
password: pbkdf2:sha256:260000:eece9cdaebc22247566f78983bf5b2a3:f8c74cc057c5590943c115a60bac62f9458e9ba0d2e7e7421b6f0fe5d860e18f # hbd passwd andreas
|
||||
avatar: /home/andreas/.avatar/Andreas-avatar3-small.png
|
||||
admin: true
|
||||
ops:
|
||||
full_name: Operations Team
|
||||
password: pbkdf2:sha256:260000:... # hbd passwd ops
|
||||
admin: false
|
||||
readonly:
|
||||
full_name: Read-Only User
|
||||
password: pbkdf2:sha256:260000:... # hbd
|
||||
|
||||
default_owner: andreas
|
||||
|
||||
hosts:
|
||||
weekend:
|
||||
owner: andreas
|
||||
managers: [ops]
|
||||
monitors: [readonly]
|
||||
|
||||
|
||||
# Notification Channels - Define notification providers centrally
|
||||
# Each channel has a type (pushover, email, signal, mattermost) and type-specific configuration
|
||||
notification_channels:
|
||||
|
||||
pushover_standard:
|
||||
type: pushover
|
||||
token: ac7NLX2rPjXFareeDgLpXNoDf4iFmf
|
||||
user: uDhH33UjQQDYtNzJb1ThRiWb9ingGK
|
||||
|
||||
signal_andreas:
|
||||
type: signal
|
||||
cli_path: /usr/local/bin/signal-cli
|
||||
user: +14168226179
|
||||
recipient: +14168226179
|
||||
|
||||
email_andreas:
|
||||
type: email
|
||||
recipients: [aew.hbd.notify@wrede.ca]
|
||||
sender: aew.hbd@wrede.ca
|
||||
smtp_server: smtp.fastmail.com
|
||||
smtp_port: 587
|
||||
smtp_user: andreas@wrede.ca
|
||||
smtp_password: pvtvefyp5gbhnch2
|
||||
|
||||
# Example additional channels (commented out)
|
||||
# pushover_urgent:
|
||||
# type: pushover
|
||||
# token: your-app-token
|
||||
# user: your-user-key
|
||||
#
|
||||
mattermost_devops:
|
||||
type: mattermost
|
||||
host: mattermost.example.com
|
||||
token: webhook-token
|
||||
channel: devops-alerts
|
||||
username: heartbeat-bot
|
||||
icon: https://example.com/heartbeat-icon.png
|
||||
|
||||
# Default notification channels (used if host doesn't specify channels)
|
||||
default_notification_channels: [pushover_standard]
|
||||
|
||||
# Host definitions - combines threshold mapping, watch status, DNS updates, and notifications
|
||||
hosts:
|
||||
wentworth:
|
||||
threshold_config: default
|
||||
watch: true
|
||||
notification_channels: [pushover_standard]
|
||||
dyndns: false
|
||||
|
||||
y:
|
||||
threshold_config: default
|
||||
watch: true
|
||||
notification_channels: [pushover_standard]
|
||||
dyndns: false
|
||||
|
||||
winter:
|
||||
threshold_config: default
|
||||
watch: true
|
||||
notification_channels: [pushover_standard]
|
||||
dyndns: false
|
||||
|
||||
wally:
|
||||
threshold_config: freebsd_server
|
||||
watch: false
|
||||
notification_channels: [pushover_standard]
|
||||
dyndns: false
|
||||
|
||||
eris:
|
||||
threshold_config: truenas_server
|
||||
watch: false
|
||||
notification_channels: [pushover_standard]
|
||||
dyndns: false
|
||||
|
||||
haschloss:
|
||||
threshold_config: default
|
||||
watch: false
|
||||
dyndns: true
|
||||
|
||||
wayback:
|
||||
threshold_config: default
|
||||
watch: false
|
||||
notification_channels: [pushover_standard]
|
||||
dyndns: true
|
||||
|
||||
wertvoll:
|
||||
threshold_config: default
|
||||
watch: false
|
||||
notification_channels: [pushover_standard]
|
||||
dyndns: true
|
||||
|
||||
weekend:
|
||||
threshold_config: freebsd_server
|
||||
watch: false
|
||||
notification_channels: [pushover_standard]
|
||||
dyndns: true
|
||||
|
||||
cotgate:
|
||||
threshold_config: default
|
||||
watch: false
|
||||
dyndns: true
|
||||
|
||||
rvgate:
|
||||
threshold_config: default
|
||||
watch: false
|
||||
dyndns: true
|
||||
|
||||
draper:
|
||||
threshold_config: default
|
||||
watch: false
|
||||
notification_channels: [pushover_standard]
|
||||
dyndns: true
|
||||
|
||||
# Hosts to drop/ignore
|
||||
drophosts: {"unknown", "wookie15", "wort"}
|
||||
|
||||
nsupdate_bin: "/usr/local/bin/nsupdate"
|
||||
pushover_token: "ac7NLX2rPjXFareeDgLpXNoDf4iFmf"
|
||||
pushover_user: "uDhH33UjQQDYtNzJb1ThRiWb9ingGK"
|
||||
pushsrv: "pushover"
|
||||
|
||||
dyndomains: {"wrede.org"}
|
||||
toemail: ["aew.hbd.notify@wrede.ca"]
|
||||
fromemail: "aew.hbd@wrede.ca"
|
||||
smtpserver: "smtp.fastmail.com"
|
||||
smtpuser: "andreas@wrede.ca"
|
||||
smtppassword: "r8psra6wj6gcakkp"
|
||||
smtpport: 587
|
||||
|
||||
ws_port: 50005
|
||||
wss_port: 50006
|
||||
cert_path: "/usr/local/etc/letsencrypt/live/hbd.wrede.ca/"
|
||||
cert_path: "ssl/"
|
||||
# wss_port: 50006 # Commented out - use plain WebSocket instead of secure WSS
|
||||
# cert_path: "/usr/local/etc/letsencrypt/live/hbd.wrede.ca/"
|
||||
# cert_path: "test/"
|
||||
# CERT_PATH = "./test/"
|
||||
wss_pem: "fullchain.pem"
|
||||
wss_key: "privkey.pem"
|
||||
# wss_pem: "fullchain.pem"
|
||||
# wss_key: "privkey.pem"
|
||||
|
||||
journal_enabled: true # Enable/disable journaling
|
||||
journal_dir: /home/andreas/logs/heartbeat # Journal directory
|
||||
journal_file: messages.journal # Base filename
|
||||
journal_max_size: 104857600 # Max size (100MB default)
|
||||
journal_max_backups: 10 # Number of backups to keep
|
||||
|
||||
threshold_configs:
|
||||
default:
|
||||
thresholds:
|
||||
cpu_monitor:
|
||||
cpu_percent:
|
||||
warning: 80.0
|
||||
critical: 90.0
|
||||
memory_monitor:
|
||||
percent:
|
||||
warning: 85.0
|
||||
critical: 95.0
|
||||
disk_monitor:
|
||||
partitions:
|
||||
/:
|
||||
percent:
|
||||
warning: 85.0
|
||||
critical: 90.0
|
||||
rtt:
|
||||
warning: 200
|
||||
critical: 250.0
|
||||
|
||||
|
||||
freebsd_server:
|
||||
thresholds:
|
||||
cpu_monitor:
|
||||
cpu_percent:
|
||||
warning: 80.0
|
||||
critical: 90.0
|
||||
memory_monitor:
|
||||
memory_percent:
|
||||
warning: 97.0
|
||||
critical: 100.0
|
||||
disk_monitor:
|
||||
partitions:
|
||||
/:
|
||||
percent:
|
||||
warning: 85.0
|
||||
critical: 90.0
|
||||
nagios_runner:
|
||||
# overall_status_code:
|
||||
# warning: 1
|
||||
# critical: 2
|
||||
# operator: ">="
|
||||
load_status:
|
||||
warning: WARNING
|
||||
critical: CRITICAL
|
||||
operator: "=="
|
||||
ups_load:
|
||||
display: "load too high: {ups_output}"
|
||||
warning: 70
|
||||
critical: 80
|
||||
operator: ">="
|
||||
ups_status_code:
|
||||
display: "{ups_output}"
|
||||
warning: 1
|
||||
critical: 2
|
||||
operator: ">="
|
||||
nextcloud_apps_status_code:
|
||||
display: "{nextcloud_apps_output}"
|
||||
warning: 1
|
||||
critical: 2
|
||||
operator: ">="
|
||||
rtt:
|
||||
warning: 200
|
||||
critical: 250.0
|
||||
|
||||
truenas_server:
|
||||
thresholds:
|
||||
cpu_monitor:
|
||||
cpu_percent:
|
||||
warning: 80.0
|
||||
critical: 90.0
|
||||
memory_monitor:
|
||||
percent:
|
||||
warning: 3.0
|
||||
critical: 95.0
|
||||
disk_monitor:
|
||||
partitions:
|
||||
/:
|
||||
percent:
|
||||
warning: 85.0
|
||||
critical: 90.0
|
||||
nagios_runner:
|
||||
# overall_status_code:
|
||||
# warning: 1
|
||||
# critical: 2
|
||||
# operator: ">="
|
||||
load_status:
|
||||
warning: WARNING
|
||||
critical: CRITICAL
|
||||
operator: "=="
|
||||
ups_load:
|
||||
display: "load too high: {ups_output}"
|
||||
WARNING: 70
|
||||
CRITICAL: 80
|
||||
OPERATOR: ">="
|
||||
ups_status_code:
|
||||
DISPLAY: "{ups_output}"
|
||||
warning: 1
|
||||
critical: 2
|
||||
operator: ">="
|
||||
nextcloud_apps_status_code:
|
||||
display: "{nextcloud_apps_output}"
|
||||
warning: 1
|
||||
critical: 2
|
||||
operator: ">="
|
||||
rtt:
|
||||
warning: 120
|
||||
critical: 250.0
|
||||
|
||||
|
||||
|
||||
Vendored
+3
-3
@@ -8,8 +8,8 @@
|
||||
"name": "Python: Run hbd (module)",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "hbd.cli",
|
||||
"args": ["-c", ".hb.yaml", "-f", "-v", "-x", "-x", "-x"],
|
||||
"module": "hbd.server.cli",
|
||||
"args": ["-c", "/home/andreas/git/heartbeat/.hb.yaml", "-f", "-v", "-x"],
|
||||
"cwd": "${workspaceFolder}",
|
||||
"env": {
|
||||
"PYTHONPATH": "${workspaceFolder}"
|
||||
@@ -32,7 +32,7 @@
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "debugpy",
|
||||
"args": ["--listen", "5678", "--wait-for-client", "-m", "hbd.cli", "-c", ".hb.yaml", "-f", "-v"],
|
||||
"args": ["--listen", "5678", "--wait-for-client", "-m", "hbd.server.cli", "-c", ".hb.yaml", "-f", "-v"],
|
||||
"env": { "PYTHONPATH": "${workspaceFolder}" },
|
||||
"console": "integratedTerminal",
|
||||
"justMyCode": false
|
||||
|
||||
Vendored
+4
-1
@@ -2,5 +2,8 @@
|
||||
"python.pythonPath": "/usr/bin/python3",
|
||||
"python.linting.enabled": true,
|
||||
"python.formatting.provider": "black",
|
||||
"python.linting.flake8Enabled": true
|
||||
"python.linting.flake8Enabled": true,
|
||||
"chat.tools.terminal.autoApprove": {
|
||||
"mv": true
|
||||
}
|
||||
}
|
||||
@@ -11,10 +11,359 @@ A lightweight daemon that listens for UDP heartbeat messages and acts on them: k
|
||||
- Queue DNS updates via `nsupdate` and run them in a background thread ✅
|
||||
- WebSocket API for live updates (hosts & messages) ✅
|
||||
- Notification pipeline (email, Pushover, Mattermost, Signal) ✅
|
||||
- **User management & access control** ✅
|
||||
- Optional user accounts with PBKDF2-SHA256 password hashing (stdlib only)
|
||||
- Per-host roles: owner, manager, monitor
|
||||
- Session-based auth with cookie support (browser login page included)
|
||||
- Backwards compatible: no auth required when no users are configured
|
||||
- **HTTP API & Web UI** ✅
|
||||
- REST API for plugin data, alerts, host information, and user management
|
||||
- Live dashboard with WebSocket updates
|
||||
- Interactive plugin metrics visualization
|
||||
- Alerts dashboard with filtering and summaries
|
||||
- **Message journal with automatic log rotation** ✅
|
||||
- Logs all received messages in JSON format
|
||||
- Size-based automatic rotation
|
||||
- Configurable retention and backup management
|
||||
- **Plugin system for extensible monitoring** ✅
|
||||
- Collect system metrics (CPU, memory, disk, network)
|
||||
- Execute existing Nagios monitoring plugins
|
||||
- Create custom plugins with simple Python classes
|
||||
- **Threshold alerting system** ✅
|
||||
- Monitor metrics against configurable WARNING/CRITICAL thresholds
|
||||
- Hysteresis to prevent alert flapping
|
||||
- Automatic notifications on state changes
|
||||
- Re-notification for ongoing alerts
|
||||
- Modular codebase suitable for unit testing and CI ✅
|
||||
|
||||
---
|
||||
|
||||
## 🔌 Plugin System
|
||||
|
||||
Heartbeat includes a comprehensive plugin architecture that extends monitoring beyond simple heartbeats. The plugin system allows you to:
|
||||
|
||||
- **Collect system information**: OS details, hardware info, system configuration
|
||||
- **Monitor resources**: CPU usage, memory, disk space, network statistics
|
||||
- **Run Nagios plugins**: Execute thousands of existing Nagios monitoring plugins without modification
|
||||
- **Create custom plugins**: Build your own monitoring logic with simple Python classes
|
||||
|
||||
### Plugin Types
|
||||
|
||||
- **InfoPlugin**: Collects static information once (e.g., OS version, hardware specs)
|
||||
- **MonitorPlugin**: Collects metrics periodically (e.g., CPU usage every 30 seconds)
|
||||
|
||||
### Built-in Plugins
|
||||
|
||||
- `os_info`: Collects OS, kernel, distribution, and architecture information
|
||||
- `cpu_monitor`: Monitors CPU usage, load average, frequency, and process counts
|
||||
- `memory_monitor`: Monitors RAM and swap usage, available memory
|
||||
- `disk_monitor`: Monitors disk usage, I/O statistics, and filesystem metrics
|
||||
- `network_monitor`: Monitors network interface statistics, bandwidth, and connections
|
||||
- `filesystem_info`: Collects mounted filesystem information (physical filesystems only by default)
|
||||
- `nagios_runner`: Executes Nagios monitoring plugins (check_disk, check_load, check_http, etc.)
|
||||
|
||||
### Nagios Integration
|
||||
|
||||
The `nagios_runner` plugin provides seamless integration with the vast Nagios plugin ecosystem. You can run any Nagios-compatible plugin and have the results automatically parsed and stored:
|
||||
|
||||
- Executes plugins via subprocess with timeout protection
|
||||
- Parses exit codes (OK/WARNING/CRITICAL/UNKNOWN)
|
||||
- Extracts performance data with thresholds
|
||||
- Reports aggregated status across all configured checks
|
||||
|
||||
See [docs/NAGIOS_INTEGRATION.md](docs/NAGIOS_INTEGRATION.md) for the complete integration guide, including configuration examples and custom plugin development.
|
||||
|
||||
### Creating Custom Plugins
|
||||
|
||||
```python
|
||||
from hbd.plugin import MonitorPlugin
|
||||
|
||||
class DiskMonitorPlugin(MonitorPlugin):
|
||||
name = "disk_monitor"
|
||||
interval = 60 # Run every 60 seconds
|
||||
|
||||
async def collect(self):
|
||||
return {
|
||||
"disk_usage": get_disk_usage(),
|
||||
"timestamp": time.time()
|
||||
}
|
||||
```
|
||||
|
||||
Place plugins in `hbd/plugins/` and they'll be automatically discovered and loaded by the client.
|
||||
|
||||
---
|
||||
|
||||
## 📝 Message Journal
|
||||
|
||||
Heartbeat includes a message journal that logs all received messages with automatic rotation.
|
||||
|
||||
### Features
|
||||
|
||||
- **JSON Format**: All messages logged in JSONL (JSON Lines) format for easy parsing
|
||||
- **Automatic Rotation**: Size-based rotation with configurable thresholds
|
||||
- **Backup Management**: Keeps a configurable number of rotated log files
|
||||
- **Non-blocking**: Async logging with minimal performance impact
|
||||
|
||||
### Configuration
|
||||
|
||||
```yaml
|
||||
# Message journal settings
|
||||
journal_enabled: true # Enable/disable journaling
|
||||
journal_dir: /var/log/heartbeat # Journal directory
|
||||
journal_file: messages.journal # Base filename
|
||||
journal_max_size: 104857600 # Max size (100MB default)
|
||||
journal_max_backups: 10 # Number of backups to keep
|
||||
```
|
||||
|
||||
### Example Journal Entry
|
||||
|
||||
```json
|
||||
{"timestamp":1711234567.123,"datetime":"2024-03-23T22:56:07","source_ip":"192.168.1.100","source_port":50003,"message":{"ID":"HTB","name":"webserver1","interval":30}}
|
||||
```
|
||||
|
||||
### Analyzing Journal Files
|
||||
|
||||
```bash
|
||||
# View recent messages
|
||||
tail -100 /var/log/heartbeat/messages.journal | jq .
|
||||
|
||||
# Count messages by type
|
||||
cat /var/log/heartbeat/messages.journal | jq -r '.message.ID' | sort | uniq -c
|
||||
|
||||
# Filter by hostname
|
||||
cat /var/log/heartbeat/messages.journal | jq 'select(.message.name == "webserver1")'
|
||||
```
|
||||
|
||||
See [docs/MESSAGE_JOURNAL.md](docs/MESSAGE_JOURNAL.md) for complete documentation including rotation behavior, integration with log management systems, and analysis examples.
|
||||
|
||||
---
|
||||
|
||||
## 🚨 Threshold Alerting
|
||||
|
||||
Heartbeat includes a sophisticated threshold alerting system that monitors plugin metrics and triggers notifications when values exceed configured limits.
|
||||
|
||||
### Features
|
||||
|
||||
- **Multi-level alerts**: WARNING and CRITICAL severity levels
|
||||
- **Flexible operators**: Support for >, >=, <, <=, ==, != comparisons
|
||||
- **Hysteresis**: Prevents alert flapping with configurable recovery thresholds
|
||||
- **Smart notifications**: Alerts only on state changes, not every check
|
||||
- **Re-notifications**: Periodic reminders for ongoing alerts
|
||||
- **Journal integration**: All threshold events logged for audit trail
|
||||
|
||||
### Configuration
|
||||
|
||||
```yaml
|
||||
thresholds:
|
||||
# RTT (Round-Trip Time) thresholds for heartbeat monitoring
|
||||
# These are checked on every HTB message arrival
|
||||
rtt:
|
||||
webserver01:
|
||||
warning: 100.0 # Warn when RTT > 100ms
|
||||
critical: 500.0 # Critical when RTT > 500ms
|
||||
|
||||
database01:
|
||||
warning: 50.0
|
||||
critical: 200.0
|
||||
|
||||
# Plugin metric thresholds
|
||||
cpu_monitor:
|
||||
cpu_percent:
|
||||
warning: 80.0 # Warn when CPU > 80%
|
||||
critical: 90.0 # Critical when CPU > 90%
|
||||
operator: ">"
|
||||
hysteresis: 0.1 # 10% hysteresis to prevent flapping
|
||||
|
||||
memory_monitor:
|
||||
percent:
|
||||
warning: 85.0
|
||||
critical: 95.0
|
||||
|
||||
disk_monitor:
|
||||
partitions:
|
||||
/:
|
||||
percent:
|
||||
warning: 80.0
|
||||
critical: 90.0
|
||||
free_gb:
|
||||
warning: 10.0 # Alert when < 10GB free
|
||||
critical: 5.0
|
||||
operator: "<" # Inverse threshold
|
||||
|
||||
# Global settings
|
||||
threshold_renotify_interval: 3600 # Re-notify every hour for ongoing alerts
|
||||
```
|
||||
|
||||
### RTT Monitoring
|
||||
|
||||
Heartbeat monitors network latency (Round-Trip Time) for each host's heartbeat messages. RTT thresholds are **fully integrated with the threshold alerting system**:
|
||||
|
||||
- **Per-host configuration**: Set different thresholds for each monitored host
|
||||
- **Real-time checking**: Thresholds evaluated on every HTB message arrival
|
||||
- **Alert state tracking**: RTT alerts use the same state management as plugin metrics
|
||||
- **Hysteresis support**: Configurable hysteresis prevents rapid state transitions
|
||||
- **Alerts dashboard**: RTT alerts visible on the `/alerts` web page alongside plugin alerts
|
||||
- **Smart notifications**: Only triggers on state changes (OK → WARNING → CRITICAL)
|
||||
- **Re-notification**: Periodic reminders for ongoing RTT issues
|
||||
- **Event & journal logging**: All RTT events logged for audit trail
|
||||
|
||||
**Configuration format:**
|
||||
```yaml
|
||||
thresholds:
|
||||
rtt:
|
||||
<hostname>:
|
||||
warning: <milliseconds> # Warn when RTT > this value
|
||||
critical: <milliseconds> # Critical when RTT > this value
|
||||
hysteresis: 0.1 # Optional: 10% hysteresis (default)
|
||||
```
|
||||
|
||||
**Example alerts:**
|
||||
```
|
||||
WARNING: webserver01 - rtt.webserver01 = 125.3
|
||||
CRITICAL: database01 - rtt.database01 = 520.1
|
||||
RECOVERED: webserver01 - rtt.webserver01 = 45.2 (WARNING -> OK)
|
||||
```
|
||||
|
||||
RTT alerts appear on the Alerts dashboard and can be filtered by severity level. The `metric_path` format is `rtt.<hostname>`, making it easy to distinguish from plugin metrics.
|
||||
|
||||
### Alert Behavior
|
||||
|
||||
1. **State Changes**: Notifications sent when crossing thresholds
|
||||
- OK → WARNING: Early notification
|
||||
- WARNING → CRITICAL: Escalation
|
||||
- CRITICAL → OK: Recovery
|
||||
|
||||
2. **Hysteresis**: Prevents rapid state transitions
|
||||
```
|
||||
Critical threshold: 90%
|
||||
Hysteresis: 10%
|
||||
Recovery threshold: 81% (90 - 10% of 90)
|
||||
|
||||
Value 91% → CRITICAL (threshold crossed)
|
||||
Value 85% → CRITICAL (still above 81%)
|
||||
Value 79% → OK (below recovery threshold)
|
||||
```
|
||||
|
||||
3. **Re-notifications**: Periodic reminders for ongoing alerts
|
||||
- Default: Every 60 minutes
|
||||
- Configurable via `threshold_renotify_interval`
|
||||
|
||||
### Example Notifications
|
||||
|
||||
```
|
||||
WARNING: webserver01 - cpu_monitor.cpu_percent = 85.0
|
||||
CRITICAL: webserver01 - memory_monitor.percent = 96.0
|
||||
RECOVERED: database01 - disk_monitor./.percent = 75.0 (WARNING -> OK)
|
||||
REMINDER (CRITICAL): mailserver - cpu_monitor.load_1min = 12.5 (ongoing for 3600s)
|
||||
```
|
||||
|
||||
### Supported Metrics
|
||||
|
||||
All plugin metrics can be thresholded:
|
||||
|
||||
- **CPU**: cpu_percent, load_1min, load_5min, load_15min
|
||||
- **Memory**: percent, available_mb, swap_percent
|
||||
- **Disk**: Per-partition percent, free_gb, free_mb
|
||||
- **Network**: errors_total, dropped packets, connection counts
|
||||
- **Nagios**: exit_code mapping (0=OK, 1=WARNING, 2=CRITICAL)
|
||||
|
||||
See [docs/THRESHOLD_ALERTING.md](docs/THRESHOLD_ALERTING.md) for comprehensive documentation including best practices, troubleshooting, and advanced configuration.
|
||||
|
||||
---
|
||||
|
||||
## 👥 User Management
|
||||
|
||||
Heartbeat supports optional user accounts with role-based access control per host.
|
||||
|
||||
### Roles
|
||||
|
||||
- **monitor** — view status, plugin data, alerts
|
||||
- **manager** — monitor + queue commands, trigger DNS, queue upgrades
|
||||
- **owner** — manager + drop host, transfer ownership, update access
|
||||
- **admin** (user flag) — owner-level access on every host
|
||||
|
||||
When no users are configured the server runs in **unauthenticated mode** — all existing behaviour is unchanged.
|
||||
|
||||
### Quick setup
|
||||
|
||||
```yaml
|
||||
users:
|
||||
alice:
|
||||
full_name: Alice Smith
|
||||
password: pbkdf2:sha256:... # hbd passwd alice
|
||||
admin: true
|
||||
|
||||
default_owner: alice
|
||||
|
||||
hosts:
|
||||
webserver01:
|
||||
owner: alice
|
||||
managers: [bob]
|
||||
monitors: [carol]
|
||||
```
|
||||
|
||||
```bash
|
||||
# Generate a password hash
|
||||
hbd passwd alice
|
||||
```
|
||||
|
||||
Browser users are redirected to `/login` automatically. The session cookie is set on login, so `fetch()` calls from dashboards work without any JavaScript changes.
|
||||
|
||||
See [docs/USERS.md](docs/USERS.md) for complete user management documentation.
|
||||
|
||||
---
|
||||
|
||||
## 🌐 HTTP API & Web UI
|
||||
|
||||
Heartbeat includes a built-in HTTP/WebSocket server that provides both a REST API and web-based dashboards for monitoring and visualization.
|
||||
|
||||
### Features
|
||||
|
||||
- **User auth**: Optional session-based authentication with per-host role enforcement
|
||||
- **REST API**: JSON endpoints for accessing plugin data, alerts, host information, and user management
|
||||
- **Live Dashboard**: Real-time WebSocket-powered host status view
|
||||
- **Plugin Metrics**: Interactive visualization of all plugin data with auto-refresh
|
||||
- **Alerts Dashboard**: Comprehensive alert monitoring with filtering and summaries
|
||||
|
||||
### Web Dashboards
|
||||
|
||||
- **Login** (`/login`): Browser login form (shown automatically when auth is configured)
|
||||
- **Live View** (`/live`): Real-time host connectivity, latency, and messages
|
||||
- **Plugin Metrics** (`/plugins`): Browse and visualize metrics from all plugins
|
||||
- **Alerts Dashboard** (`/alerts`): Monitor active alerts with severity filtering
|
||||
|
||||
### API Endpoints
|
||||
|
||||
```bash
|
||||
# Log in (when auth is configured)
|
||||
TOKEN=$(curl -s -X POST http://localhost:50004/api/0/auth/login \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{"username":"alice","password":"secret"}' | jq -r .token)
|
||||
AUTH=(-H "Authorization: Bearer $TOKEN")
|
||||
|
||||
# List all monitored hosts
|
||||
curl "${AUTH[@]}" http://localhost:50004/api/0/hosts
|
||||
|
||||
# Get all plugin data for a host
|
||||
curl "${AUTH[@]}" http://localhost:50004/api/0/hosts/webserver01/plugins
|
||||
|
||||
# Get detailed plugin history (last 50 samples)
|
||||
curl "${AUTH[@]}" "http://localhost:50004/api/0/hosts/webserver01/plugins/cpu_monitor?limit=50"
|
||||
|
||||
# Get alert states for a specific host
|
||||
curl "${AUTH[@]}" http://localhost:50004/api/0/hosts/webserver01/alerts
|
||||
|
||||
# Get all active alerts across all hosts
|
||||
curl "${AUTH[@]}" http://localhost:50004/api/0/alerts
|
||||
|
||||
# View/update host access roles
|
||||
curl "${AUTH[@]}" http://localhost:50004/api/0/hosts/webserver01/access
|
||||
```
|
||||
|
||||
See [docs/HTTP_API.md](docs/HTTP_API.md) for complete API documentation including response formats, error handling, and integration examples.
|
||||
|
||||
---
|
||||
|
||||
## ⚙️ Quickstart
|
||||
|
||||
Prerequisites:
|
||||
@@ -43,6 +392,46 @@ You can also run it directly via the package entrypoint after installation:
|
||||
python -m hbd.cli -c /path/to/config.yaml
|
||||
```
|
||||
|
||||
### Running the Client
|
||||
|
||||
The heartbeat client (`hbc`) sends periodic heartbeats and plugin data to the server:
|
||||
|
||||
```bash
|
||||
# Basic usage pointing to server
|
||||
python -m hbd.hbc --server your-server.example.com
|
||||
|
||||
# With custom configuration
|
||||
python -m hbd.hbc --server 192.168.1.100 --port 50003 --interval 30
|
||||
|
||||
# Run with specific plugins enabled/disabled
|
||||
python -m hbd.hbc --server hbd.local --disable-plugin os_info
|
||||
```
|
||||
|
||||
Client configuration can also be specified in YAML:
|
||||
|
||||
```yaml
|
||||
server: hbd.example.com
|
||||
port: 50003
|
||||
interval: 30
|
||||
plugins:
|
||||
cpu_monitor:
|
||||
interval: 300 # Check every 5 minutes (default)
|
||||
per_core: true
|
||||
memory_monitor:
|
||||
interval: 300 # Check every 5 minutes (default)
|
||||
disk_monitor:
|
||||
interval: 300 # Check every 5 minutes (default)
|
||||
network_monitor:
|
||||
interval: 300 # Check every 5 minutes (default)
|
||||
nagios_runner:
|
||||
interval: 300 # Check every 5 minutes (default)
|
||||
commands:
|
||||
- /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
|
||||
- /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
|
||||
```
|
||||
|
||||
All monitoring plugins default to 5-minute (300 second) intervals, but can be customized as needed.
|
||||
|
||||
## 🐞 Debugging in VS Code
|
||||
|
||||
This repository includes a ready-to-use `.vscode/launch.json` with configurations to run or attach the VS Code debugger to `hbd`.
|
||||
@@ -84,6 +473,8 @@ Set breakpoints in modules such as `hbd/udp.py`, `hbd/dns.py`, or `hbd/server.py
|
||||
- `cert_path`: directory where TLS certificate and key are looked up (default: /usr/local/etc/ssl/)
|
||||
- `wss_pem`: filename for the certificate chain (default: fullchain.pem)
|
||||
- `wss_key`: filename for the private key (default: privkey.pem)
|
||||
- `users`: mapping of username → user attributes (full_name, avatar, password, admin, notification_channels)
|
||||
- `default_owner`: username that owns hosts with no explicit owner (falls back to first admin user)
|
||||
|
||||
Example `.hb.yaml` (minimal):
|
||||
|
||||
@@ -102,7 +493,7 @@ pushsrv: pushover
|
||||
|
||||
## 🔧 Architecture & Modules
|
||||
|
||||
- `hbd.proto` — serialization/deserialization of heartbeat messages (supports compressed payloads)
|
||||
- `hbd.proto` — serialization/deserialization of heartbeat messages (supports compressed payloads and plugin data)
|
||||
- `hbd.udp` — UDP parsing and `handle_datagram` implementation (main state machine)
|
||||
- `hbd.dns` — `create_nsupdate_payload`, `nsupdate`, and an asyncio DNS worker (`start_dns_worker`).
|
||||
The DNS worker now runs as an `asyncio` task and the package exposes a
|
||||
@@ -112,6 +503,10 @@ pushsrv: pushover
|
||||
- `hbd.notify` — email and push notification helpers
|
||||
- `hbd.ws` — WebSocket server and thread-safe broadcast helpers
|
||||
- `hbd.http` — HTTP handler factory for the status UI/API
|
||||
- `hbd.journal` — message journal with size-based log rotation and backup management
|
||||
- `hbd.plugin` — plugin framework with base classes, registry, and dynamic loader
|
||||
- `hbd.plugins/` — built-in plugins (os_info, cpu_monitor, memory_monitor, disk_monitor, network_monitor, filesystem_info, nagios_runner)
|
||||
- `hbd.hbc` — heartbeat client that sends heartbeats and plugin data to server
|
||||
- `hbd.utils` — small utility helpers (`shortname`, `dur`, `initlog`)
|
||||
- `hbd.cli` — CLI entrypoint and argument parsing
|
||||
- `hbd.server` — async orchestration to run UDP/HTTP/WSS components
|
||||
|
||||
+234
@@ -0,0 +1,234 @@
|
||||
# HBD/HBC Separation Refactoring
|
||||
|
||||
## Overview
|
||||
|
||||
The heartbeat monitoring system has been refactored into a modular package structure with separate client and server components. This allows users to install only what they need and provides clear separation of concerns.
|
||||
|
||||
## New Package Structure
|
||||
|
||||
```
|
||||
hbd/
|
||||
├── __init__.py # Main package (minimal)
|
||||
├── client/ # HBC - System monitoring client
|
||||
│ ├── __init__.py
|
||||
│ ├── main.py # Entry point (was hbc.py)
|
||||
│ ├── config.py # Client-specific configuration
|
||||
│ ├── plugin.py # Plugin framework
|
||||
│ ├── threshold.py # Threshold checking
|
||||
│ └── plugins/ # Monitoring plugins
|
||||
│ ├── cpu_monitor.py
|
||||
│ ├── disk_monitor.py
|
||||
│ ├── memory_monitor.py
|
||||
│ ├── network_monitor.py
|
||||
│ ├── filesystem_info.py
|
||||
│ ├── os_info.py
|
||||
│ └── nagios_runner.py
|
||||
├── server/ # HBD - Heartbeat daemon/server
|
||||
│ ├── __init__.py
|
||||
│ ├── main.py # Server runtime (was server.py)
|
||||
│ ├── cli.py # Command-line interface
|
||||
│ ├── config.py # Server-specific configuration
|
||||
│ ├── http.py # HTTP/REST API
|
||||
│ ├── ws.py # WebSocket server
|
||||
│ ├── udp.py # UDP heartbeat listener
|
||||
│ ├── dns.py # DNS update functionality
|
||||
│ ├── notify.py # Notification handlers
|
||||
│ ├── monitor.py # Host monitoring
|
||||
│ ├── hbdclass.py # Host class definitions
|
||||
│ ├── journal.py # Message journaling
|
||||
│ ├── templates/ # Jinja2 web templates
|
||||
│ └── static/ # Web UI assets
|
||||
└── common/ # Shared utilities
|
||||
├── __init__.py
|
||||
├── proto.py # Protocol encoding/decoding
|
||||
└── utils.py # Common utilities
```
|
||||
|
||||
## Configuration Files
|
||||
|
||||
### Client Configuration (hbd/client/config.py)
|
||||
|
||||
Client-specific defaults:
|
||||
- `hb_port`: Port where hbd servers listen (default: 50003)
|
||||
- `interval`: Heartbeat interval in seconds (default: 10)
|
||||
- `plugins`: Per-plugin configuration
|
||||
- `thresholds`: Threshold configuration for monitoring
|
||||
|
||||
### Server Configuration (hbd/server/config.py)
|
||||
|
||||
Server-specific defaults:
|
||||
- `hb_port`: Port to listen for heartbeats (default: 50003)
|
||||
- `hbd_port`: HTTP API port (default: 50004)
|
||||
- `ws_port`: WebSocket port (default: 50005)
|
||||
- `logfile`, `logfmt`: Logging configuration
|
||||
- `pushsrv`, `pushover_token`, etc.: Notification settings
|
||||
- `watchhosts`, `dyndnshosts`: Host monitoring
|
||||
- `smtpserver`, etc.: Email settings
|
||||
- `journal_*`: Message journaling settings
|
||||
|
||||
## Installation Options
|
||||
|
||||
### Install Core Only (minimal, PyYAML only)
|
||||
```bash
|
||||
pip install hbd
|
||||
```
|
||||
|
||||
### Install Client Only (for monitoring)
|
||||
```bash
|
||||
pip install hbd[client]
|
||||
# Installs: PyYAML, psutil
|
||||
```
|
||||
|
||||
### Install Server Only (for daemon)
|
||||
```bash
|
||||
pip install hbd[server]
|
||||
# Installs: PyYAML, websockets, mattermostdriver, aiohttp, Jinja2
|
||||
```
|
||||
|
||||
### Install Everything
|
||||
```bash
|
||||
pip install hbd[all]
|
||||
# Installs all dependencies for both client and server
|
||||
```
|
||||
|
||||
### Development Installation
|
||||
```bash
|
||||
pip install -e ".[dev]"
|
||||
# Includes all dependencies plus testing/linting tools
|
||||
```
|
||||
|
||||
## Command-Line Interfaces
|
||||
|
||||
### HBC (Client)
|
||||
```bash
|
||||
hbc [options] host1 [host2 ...]
|
||||
|
||||
# Entry point: hbd.client.main:main
|
||||
# Location: hbd/client/main.py
|
||||
```
|
||||
|
||||
### HBD (Server)
|
||||
```bash
|
||||
hbd [options]
|
||||
|
||||
# Entry point: hbd.server.cli:main
|
||||
# Location: hbd/server/cli.py → hbd/server/main.py
|
||||
```
|
||||
|
||||
## Import Changes
|
||||
|
||||
### Client Code
|
||||
```python
|
||||
# Old imports
|
||||
from .config import load_config
|
||||
from .proto import dicttos, stodict
|
||||
from .plugin import PluginRegistry
|
||||
|
||||
# New imports
|
||||
from .config import load_config # Still in client/
|
||||
from ..common.proto import dicttos # Moved to common/
|
||||
from .plugin import PluginRegistry # Still in client/
|
||||
```
|
||||
|
||||
### Server Code
|
||||
```python
|
||||
# Old imports
|
||||
from .config import load_config
|
||||
from .proto import stodict
|
||||
from .threshold import AlertLevel
|
||||
|
||||
# New imports
|
||||
from .config import load_config # Server-specific config
|
||||
from ..common.proto import stodict # Moved to common/
|
||||
from ..client.threshold import AlertLevel # Client module
|
||||
```
|
||||
|
||||
### Plugin Code
|
||||
```python
|
||||
# Old import
|
||||
from hbd.plugin import MonitorPlugin
|
||||
|
||||
# New import
|
||||
from hbd.client.plugin import MonitorPlugin
|
||||
```
|
||||
|
||||
## Benefits
|
||||
|
||||
1. **Modular Installation**: Install only what you need
|
||||
- Client-only systems don't need web server dependencies
|
||||
- Server-only systems don't need psutil
|
||||
|
||||
2. **Clearer Architecture**: Explicit separation of concerns
|
||||
- Client: System monitoring and data collection
|
||||
- Server: Heartbeat reception, web UI, notifications
|
||||
- Common: Shared protocol and utilities
|
||||
|
||||
3. **Independent Evolution**: Client and server can evolve separately
|
||||
- Different release cycles possible
|
||||
- Clear API boundaries via common/
|
||||
|
||||
4. **Smaller Footprint**: Reduced dependency installation
|
||||
- Client: 2 dependencies (PyYAML, psutil)
|
||||
- Server: 5 dependencies (PyYAML, websockets, mattermostdriver, aiohttp, Jinja2)
|
||||
|
||||
## Migration Guide
|
||||
|
||||
### For Existing Installations
|
||||
|
||||
1. **Reinstall the package**:
|
||||
```bash
|
||||
pip install -e ".[all]" # For development
|
||||
# or
|
||||
pip install hbd[all] # For production
|
||||
```
|
||||
|
||||
2. **Configuration files remain unchanged**:
|
||||
- Both client and server read from `~/.hb.yaml`
|
||||
- All existing config keys are supported in both configs
|
||||
- Server has additional keys (journal, websocket, email, etc.)
|
||||
- Client has minimal keys (interval, plugins, thresholds)
|
||||
|
||||
3. **Commands remain the same**:
|
||||
- `hbc` command works identically
|
||||
- `hbd` command works identically
|
||||
|
||||
### For New Deployments
|
||||
|
||||
1. **Client-only system** (monitoring host):
|
||||
```bash
|
||||
pip install hbd[client]
|
||||
hbc server1.example.com server2.example.com
|
||||
```
|
||||
|
||||
2. **Server-only system** (monitoring daemon):
|
||||
```bash
|
||||
pip install hbd[server]
|
||||
hbd -c /etc/hbd.yaml -f
|
||||
```
|
||||
|
||||
3. **Combined system** (dev/test):
|
||||
```bash
|
||||
pip install hbd[all]
|
||||
```
|
||||
|
||||
## Testing
|
||||
|
||||
All imports and entry points have been tested and validated:
|
||||
- ✅ Package imports work correctly
|
||||
- ✅ `hbc` command entry point functional
|
||||
- ✅ `hbd` command entry point functional
|
||||
- ✅ Optional dependencies properly configured
|
||||
- ✅ All internal imports updated
|
||||
|
||||
## Files Archived
|
||||
|
||||
The following files were renamed to avoid conflicts:
|
||||
- `hbd/config.py` → `hbd/config.py.old` (split into client/server configs)
|
||||
- `hbd/hbc_old.py` → `hbd/hbc_old.py.bak` (backup file)
|
||||
|
||||
## Next Steps
|
||||
|
||||
1. Test client functionality with a monitoring host
|
||||
2. Test server functionality with web UI and notifications
|
||||
3. Update documentation (README.md) with new structure
|
||||
4. Consider publishing to PyPI with new structure
|
||||
5. Update any deployment scripts/Dockerfiles to use optional dependencies
|
||||
@@ -0,0 +1,320 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Demonstration of the threshold alerting system.
|
||||
|
||||
This script shows how thresholds work by simulating plugin data
|
||||
with values that cross various threshold boundaries.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from hbd.threshold import ThresholdChecker, AlertLevel
|
||||
|
||||
|
||||
def demo_basic_thresholds():
    """Walk a single CPU metric up through WARNING/CRITICAL and back down.

    A fixed series of ``cpu_percent`` readings is handed to a
    ``ThresholdChecker``. For each reading the demo prints the state found
    in ``alert_states`` (when one exists) and every state change the
    checker returned. Messages passed to the notification callback are
    echoed and counted for the closing summary.
    """
    rule = "=" * 70
    print(rule)
    print("DEMO 1: Basic Threshold Checking")
    print(rule)

    cpu_rule = {
        "warning": 80.0,
        "critical": 90.0,
        "operator": ">",
        "hysteresis": 0.1,
    }
    config = {"thresholds": {"cpu_monitor": {"cpu_percent": cpu_rule}}}

    notifications = []

    def notifier(msg):
        # Keep every message so the summary line can report a total.
        notifications.append(msg)
        print(f" 📧 NOTIFICATION: {msg}")

    checker = ThresholdChecker(config, notification_callback=notifier)
    alert_states = {}

    # (reading, narration) pairs replayed in order.
    readings = [
        (50.0, "Normal operation"),
        (85.0, "Crosses WARNING threshold"),
        (87.0, "Still in WARNING"),
        (95.0, "Escalates to CRITICAL"),
        (92.0, "Still CRITICAL (in hysteresis)"),
        (85.0, "Still CRITICAL (above recovery threshold of 81)"),
        (79.0, "Recovers to OK"),
        (50.0, "Back to normal"),
    ]

    print("\nSimulating CPU usage over time:")
    print("-" * 70)

    for cpu, note in readings:
        print(f"\n📊 CPU: {cpu}% - {note}")

        transitions = checker.check_plugin_data(
            host_name="testhost",
            plugin_name="cpu_monitor",
            data={"cpu_percent": cpu},
            alert_states=alert_states,
        )

        state = alert_states.get("cpu_monitor.cpu_percent")
        if state:
            print(f" Current state: {state.level.name}")

        # Only the old and new levels are shown; metric and value are unused.
        for _metric, before, after, _value in transitions or ():
            print(f" ⚠️ State change: {before.name} → {after.name}")

    print(f"\n📈 Summary: {len(notifications)} notifications sent")
    print(rule)
|
||||
|
||||
|
||||
def demo_multiple_metrics():
    """Check CPU and memory metrics together and print alert summaries.

    Five scenarios are replayed in order. For each one the CPU and memory
    readings are passed to the same ``ThresholdChecker``, then the demo
    prints the counts from ``get_alert_summary`` and the entries returned
    by ``get_active_alerts``.
    """
    rule = "=" * 70
    print("\n\n" + rule)
    print("DEMO 2: Multiple Metrics and Alert Summary")
    print(rule)

    cpu_rules = {
        "cpu_percent": {"warning": 80.0, "critical": 90.0},
        "load_1min": {"warning": 4.0, "critical": 8.0},
    }
    memory_rules = {
        "percent": {"warning": 85.0, "critical": 95.0},
        "available_mb": {
            "warning": 1000,
            "critical": 500,
            "operator": "<",
        },
    }
    config = {
        "thresholds": {
            "cpu_monitor": cpu_rules,
            "memory_monitor": memory_rules,
        }
    }

    notifications = []

    def _record(m):
        # Messages are only counted here, never printed.
        notifications.append(m)

    checker = ThresholdChecker(config, notification_callback=_record)
    alert_states = {}

    # Simulate problematic system state
    print("\nSimulating a system under load:")
    print("-" * 70)

    scenarios = [
        {
            "name": "Initial state - all OK",
            "cpu_monitor": {"cpu_percent": 50.0, "load_1min": 2.0},
            "memory_monitor": {"percent": 60.0, "available_mb": 2000},
        },
        {
            "name": "CPU spikes to WARNING",
            "cpu_monitor": {"cpu_percent": 85.0, "load_1min": 2.0},
            "memory_monitor": {"percent": 60.0, "available_mb": 2000},
        },
        {
            "name": "Memory also reaches WARNING",
            "cpu_monitor": {"cpu_percent": 85.0, "load_1min": 2.0},
            "memory_monitor": {"percent": 88.0, "available_mb": 800},
        },
        {
            "name": "CPU escalates to CRITICAL",
            "cpu_monitor": {"cpu_percent": 95.0, "load_1min": 5.0},
            "memory_monitor": {"percent": 88.0, "available_mb": 800},
        },
        {
            "name": "System recovering",
            "cpu_monitor": {"cpu_percent": 70.0, "load_1min": 2.0},
            "memory_monitor": {"percent": 65.0, "available_mb": 1500},
        },
    ]

    for step in scenarios:
        print(f"\n📍 {step['name']}")

        # CPU first, then memory: the same order for every scenario.
        for plugin in ("cpu_monitor", "memory_monitor"):
            checker.check_plugin_data(
                "testhost",
                plugin,
                step[plugin],
                alert_states,
            )

        # Show alert summary
        summary = checker.get_alert_summary(alert_states)
        ok_count = summary['ok']
        warning_count = summary['warning']
        critical_count = summary['critical']
        print(f" Alerts: OK={ok_count}, WARNING={warning_count}, CRITICAL={critical_count}")

        # Show active alerts
        active = checker.get_active_alerts(alert_states)
        if not active:
            continue
        print(" Active alerts:")
        for alert in active:
            print(f" - {alert.metric_path}: {alert.level.name} (value={alert.last_value})")

    print(f"\n📈 Total notifications sent: {len(notifications)}")
    print(rule)
|
||||
|
||||
|
||||
def demo_hysteresis():
    """Show hysteresis suppressing repeated alerts near a threshold.

    CPU readings that hover around the CRITICAL limit are passed to a
    ``ThresholdChecker`` configured with 10% hysteresis. For each reading
    the demo prints the state found in ``alert_states`` and whether the
    checker reported a state change.
    """
    rule = "=" * 70
    print("\n\n" + rule)
    print("DEMO 3: Hysteresis Prevents Flapping")
    print(rule)

    config = {
        "thresholds": {
            "cpu_monitor": {
                "cpu_percent": {
                    "warning": 80.0,
                    "critical": 90.0,
                    "hysteresis": 0.1,  # 10% hysteresis
                }
            }
        }
    }

    notifications = []
    checker = ThresholdChecker(config, notification_callback=lambda m: notifications.append(m))
    alert_states = {}

    print("\nCritical threshold: 90%")
    print("Hysteresis: 10%")
    print("Recovery threshold: 81% (90 - 10% of 90)")
    print("\nSimulating CPU fluctuating near CRITICAL threshold:")
    print("-" * 70)

    # Simulate fluctuating values
    test_values = [
        (75.0, "Normal"),
        (92.0, "Crosses CRITICAL"),
        (88.0, "Drops but still above 81% (stays CRITICAL)"),
        (86.0, "Still above 81% (stays CRITICAL)"),
        (83.0, "Still above 81% (stays CRITICAL)"),
        (80.0, "Below 81% - recovers to OK"),
        (88.0, "Rises again but below 90% (stays OK)"),
        (91.0, "Crosses CRITICAL again"),
    ]

    for value, description in test_values:
        print(f"\n📊 CPU: {value:5.1f}% - {description}")

        plugin_data = {"cpu_percent": value}
        state_changes = checker.check_plugin_data(
            "testhost",
            "cpu_monitor",
            plugin_data,
            alert_states,
        )

        current_state = alert_states.get("cpu_monitor.cpu_percent")
        # dict.get() returns None when the key is absent, so skip the state
        # line rather than crash on ``None.level`` (demo_basic_thresholds
        # guards the same lookup).
        if current_state is not None:
            print(f" State: {current_state.level.name}")

        if state_changes:
            print(" 📧 Notification sent (state changed)")
        else:
            print(" ✓ No notification (state unchanged - hysteresis working)")

    print(f"\n📈 Notifications sent: {len(notifications)} (without hysteresis would be ≥6)")
    print(rule)
|
||||
|
||||
|
||||
def demo_inverse_threshold():
    """Show a "less than" threshold that alerts when a value gets too low.

    Available-memory readings are passed to a ``ThresholdChecker``
    configured with ``operator: "<"``. For each reading the demo prints
    the state found in ``alert_states`` and every level transition the
    checker returned.
    """
    rule = "=" * 70
    print("\n\n" + rule)
    print("DEMO 4: Inverse Thresholds (Alert When Low)")
    print(rule)

    config = {
        "thresholds": {
            "memory_monitor": {
                "available_mb": {
                    "warning": 1000,  # Warn when < 1000 MB
                    "critical": 500,  # Critical when < 500 MB
                    "operator": "<",
                    "hysteresis": 0.1,
                }
            }
        }
    }

    notifications = []
    checker = ThresholdChecker(config, notification_callback=lambda m: notifications.append(m))
    alert_states = {}

    print("\nMonitoring available memory (alert when LOW):")
    print("WARNING when < 1000 MB, CRITICAL when < 500 MB")
    print("-" * 70)

    test_values = [
        (2000, "Plenty of memory"),
        (800, "Drops below 1000 MB - WARNING"),
        (450, "Drops below 500 MB - CRITICAL"),
        (520, "Rises but still in hysteresis zone - stays CRITICAL"),
        (600, "Enough recovery - back to WARNING"),
        (1200, "Fully recovered - OK"),
    ]

    for value, description in test_values:
        print(f"\n💾 Available: {value} MB - {description}")

        plugin_data = {"available_mb": value}
        state_changes = checker.check_plugin_data(
            "testhost",
            "memory_monitor",
            plugin_data,
            alert_states,
        )

        current_state = alert_states.get("memory_monitor.available_mb")
        # dict.get() returns None when the key is absent, so skip the state
        # line rather than crash on ``None.level`` (demo_basic_thresholds
        # guards the same lookup).
        if current_state is not None:
            print(f" State: {current_state.level.name}")

        if state_changes:
            # Only the old and new levels are shown; metric and value are unused.
            for _metric, old_level, new_level, _val in state_changes:
                print(f" 📧 {old_level.name} → {new_level.name}")

    print(f"\n📈 Notifications sent: {len(notifications)}")
    print(rule)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: print a boxed title, run each demo in order,
    # then print the closing summary.
    rule = "=" * 70
    frame = "═" * 68

    print("\n")
    print("╔" + frame + "╗")
    print("║" + " " * 15 + "THRESHOLD ALERTING DEMONSTRATION" + " " * 21 + "║")
    print("╚" + frame + "╝")

    for run_demo in (
        demo_basic_thresholds,
        demo_multiple_metrics,
        demo_hysteresis,
        demo_inverse_threshold,
    ):
        run_demo()

    print("\n\n" + rule)
    print("DEMONSTRATION COMPLETE")
    print(rule)
    print("\nKey takeaways:")
    for takeaway in (
        "Thresholds detect when metrics exceed configured limits",
        "Notifications sent only on state changes, not every check",
        "Hysteresis prevents alert flapping",
        "Supports both 'greater than' and 'less than' thresholds",
        "Multiple metrics can be monitored simultaneously",
    ):
        print(f" • {takeaway}")
    print("\nFor full documentation, see docs/THRESHOLD_ALERTING.md")
    print(rule)
    print()
|
||||
@@ -0,0 +1,292 @@
|
||||
# Configuration Reload
|
||||
|
||||
The heartbeat daemon (hbd) supports runtime configuration reloading without requiring a full restart. This allows you to update certain configuration settings while the service continues running.
|
||||
|
||||
## How to Reload Configuration
|
||||
|
||||
Send a SIGHUP signal to the running hbd process:
|
||||
|
||||
```bash
|
||||
# Find the process ID
|
||||
ps aux | grep hbd
|
||||
|
||||
# Or use pidof/pgrep
|
||||
pidof hbd
|
||||
pgrep -f hbd
|
||||
|
||||
# Send SIGHUP signal
|
||||
kill -HUP <pid>
|
||||
|
||||
# Or if using systemd
|
||||
systemctl reload heartbeat
|
||||
```
|
||||
|
||||
## What Can Be Reloaded
|
||||
|
||||
The following configuration sections can be reloaded without restarting:
|
||||
|
||||
### ✅ Fully Reloadable
|
||||
|
||||
- **Notification Channels** (`notification_channels`)
|
||||
- Add, remove, or modify notification channel definitions
|
||||
- Update tokens, API keys, SMTP credentials
|
||||
- Change recipient lists
|
||||
|
||||
- **Threshold Configurations** (`threshold_configs`)
|
||||
- Modify warning and critical thresholds
|
||||
- Add or remove threshold rules
|
||||
- Change operators and hysteresis values
|
||||
- Update display formats
|
||||
|
||||
- **Host Configuration** (`hosts`)
|
||||
- Change watch status
|
||||
- Update notification channel assignments
|
||||
- Modify threshold config assignments
|
||||
- Change dyndns status
|
||||
|
||||
- **Host Lists**
|
||||
- `watchhosts` - hosts to monitor
|
||||
- `dyndnshosts` - hosts with dynamic DNS
|
||||
- `drophosts` - hosts to ignore
|
||||
|
||||
- **Runtime Settings**
|
||||
- `grace` - grace period multiplier
|
||||
- `interval` - expected heartbeat interval
|
||||
- `threshold_renotify_interval` - re-notification interval
|
||||
- `debug` - debug level
|
||||
- `verbose` - verbose output
|
||||
|
||||
- **DNS Settings**
|
||||
- `dyndomains` - dynamic DNS domains
|
||||
- `nsupdate_bin` - nsupdate binary path
|
||||
- `rndc_key` - RNDC key path
|
||||
|
||||
### ⚠️ Requires Restart
|
||||
|
||||
The following settings **cannot** be reloaded and require a service restart:
|
||||
|
||||
- **Network Ports**
|
||||
- `hb_port` - UDP heartbeat port
|
||||
- `hbd_port` - HTTP API port
|
||||
- `ws_port` - WebSocket port
|
||||
- `wss_port` - Secure WebSocket port
|
||||
|
||||
- **SSL/TLS Settings**
|
||||
- `cert_path` - SSL certificate path
|
||||
- `wss_pem` - SSL certificate file
|
||||
- `wss_key` - SSL key file
|
||||
|
||||
- **Persistence**
|
||||
- `pickfile` - Pickle file path
|
||||
|
||||
- **Logging**
|
||||
- `logfile` - Log file path
|
||||
- `logfmt` - Log format
|
||||
|
||||
- **Journal Settings**
|
||||
- `journal_enabled` - Enable/disable journaling
|
||||
- `journal_dir` - Journal directory
|
||||
- `journal_file` - Journal filename
|
||||
- `journal_max_size` - Maximum journal size
|
||||
- `journal_max_backups` - Number of backup files
|
||||
|
||||
## Reload Process
|
||||
|
||||
When a SIGHUP signal is received:
|
||||
|
||||
1. **Configuration File Loading**
|
||||
- The config file is re-read from disk
|
||||
- YAML parsing is performed
|
||||
- Validation checks are run
|
||||
|
||||
2. **Component Updates**
|
||||
- Notification system is updated with new channel definitions
|
||||
- Threshold checker reloads all threshold configurations
|
||||
- Alert states are preserved to maintain hysteresis
|
||||
|
||||
3. **Error Handling**
|
||||
- If reload fails, the previous configuration is kept
|
||||
- Error messages are logged
|
||||
- Service continues running with old configuration
|
||||
|
||||
4. **Logging**
|
||||
- Reload start and completion are logged
|
||||
- Each component reports its reload status
|
||||
- Total number of thresholds is reported
|
||||
|
||||
## Example Reload Session
|
||||
|
||||
```bash
|
||||
# Terminal 1: Watch the logs
|
||||
tail -f /var/log/heartbeat.log
|
||||
|
||||
# Terminal 2: Edit configuration
|
||||
vim /path/to/.hb.yaml
|
||||
|
||||
# Make changes to notification channels or thresholds
|
||||
# Save the file
|
||||
|
||||
# Terminal 3: Trigger reload
|
||||
kill -HUP $(pgrep -f hbd)
|
||||
|
||||
# Terminal 1: See reload messages
|
||||
2026-04-01 12:34:56 INFO: Received SIGHUP, initiating config reload...
|
||||
2026-04-01 12:34:56 INFO: ============================================================
|
||||
2026-04-01 12:34:56 INFO: Starting configuration reload...
|
||||
2026-04-01 12:34:56 INFO: ============================================================
|
||||
2026-04-01 12:34:56 INFO: Configuration reloaded from /path/to/.hb.yaml
|
||||
2026-04-01 12:34:56 INFO: Notification configuration reloaded
|
||||
2026-04-01 12:34:56 INFO: Reloading threshold configuration...
|
||||
2026-04-01 12:34:56 INFO: Threshold configuration reloaded: 42 total thresholds
|
||||
2026-04-01 12:34:56 INFO: ============================================================
|
||||
2026-04-01 12:34:56 INFO: Configuration reload completed successfully
|
||||
2026-04-01 12:34:56 INFO: ============================================================
|
||||
```
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
### 1. Update Notification Credentials
|
||||
|
||||
If you need to rotate API keys or update SMTP passwords:
|
||||
|
||||
```yaml
|
||||
notification_channels:
|
||||
pushover_standard:
|
||||
type: pushover
|
||||
token: new-token-here # Updated
|
||||
user: new-user-key-here # Updated
|
||||
```
|
||||
|
||||
Just edit the config file and send SIGHUP - no restart needed.
|
||||
|
||||
### 2. Adjust Threshold Values
|
||||
|
||||
Fine-tune alerting thresholds based on observed behavior:
|
||||
|
||||
```yaml
|
||||
threshold_configs:
|
||||
default:
|
||||
thresholds:
|
||||
cpu_monitor:
|
||||
cpu_percent:
|
||||
warning: 85.0 # Increased from 80.0
|
||||
critical: 95.0 # Increased from 90.0
|
||||
```
|
||||
|
||||
Send SIGHUP to apply the new thresholds immediately.
|
||||
|
||||
### 3. Add New Notification Channels
|
||||
|
||||
Add a new notification destination:
|
||||
|
||||
```yaml
|
||||
notification_channels:
|
||||
email_oncall:
|
||||
type: email
|
||||
recipients: [oncall@example.com]
|
||||
sender: alerts@example.com
|
||||
smtp_server: smtp.example.com
|
||||
|
||||
hosts:
|
||||
critical_server:
|
||||
threshold_config: default
|
||||
watch: true
|
||||
notification_channels: [pushover_standard, email_oncall] # Added
|
||||
```
|
||||
|
||||
The new channel becomes active immediately after SIGHUP.
|
||||
|
||||
### 4. Update Watch List
|
||||
|
||||
Start or stop monitoring hosts without restart:
|
||||
|
||||
```yaml
|
||||
hosts:
|
||||
new_server:
|
||||
threshold_config: default
|
||||
watch: true # Start watching
|
||||
notification_channels: [pushover_standard]
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Test Configuration Before Reload**
|
||||
- Validate YAML syntax before sending SIGHUP
|
||||
- Check for typos in channel names
|
||||
- Verify threshold values are reasonable
|
||||
|
||||
2. **Monitor Reload Logs**
|
||||
- Always check logs after reload to confirm success
|
||||
- Look for error messages if reload fails
|
||||
- Verify expected number of thresholds loaded
|
||||
|
||||
3. **Backup Before Changes**
|
||||
- Keep a backup of working configuration
|
||||
- Use version control (git) for config files
|
||||
- Document why changes were made
|
||||
|
||||
4. **Gradual Rollout**
|
||||
- Test changes on development server first
|
||||
- Apply to one production server at a time
|
||||
- Verify behavior before applying everywhere
|
||||
|
||||
5. **Plan for Restart-Required Changes**
|
||||
- Schedule downtime for port or SSL changes
|
||||
- Use blue-green deployment if possible
|
||||
- Keep service downtime minimal
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Reload Doesn't Apply Changes
|
||||
|
||||
**Check:**
|
||||
- Is the config file path correct?
|
||||
- Did you save the file after editing?
|
||||
- Are there YAML syntax errors?
|
||||
- Check the logs for error messages
|
||||
|
||||
**Solution:**
|
||||
```bash
|
||||
# Validate YAML syntax
|
||||
python -c "import yaml; yaml.safe_load(open('.hb.yaml'))"
|
||||
|
||||
# Check file modification time
|
||||
ls -l .hb.yaml
|
||||
|
||||
# View logs
|
||||
journalctl -u heartbeat -f
|
||||
```
|
||||
|
||||
### Partial Configuration Applied
|
||||
|
||||
**Cause:** Some sections reloaded, others didn't.
|
||||
|
||||
**Solution:** Check logs to see which components failed. Common issues:
|
||||
- Invalid channel type
|
||||
- Missing required threshold fields
|
||||
- Invalid host references
|
||||
|
||||
### Service Becomes Unresponsive
|
||||
|
||||
**Cause:** Malformed configuration caused an exception.
|
||||
|
||||
**Solution:**
|
||||
1. Revert to backup configuration
|
||||
2. Send SIGHUP again to reload the good config
|
||||
3. If service is completely stuck, restart it
|
||||
|
||||
## Implementation Details
|
||||
|
||||
The reload mechanism uses:
|
||||
|
||||
- **Signal Handling**: SIGHUP triggers reload event
|
||||
- **Async-Safe Reloading**: Configuration is loaded asynchronously
|
||||
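- **Component Coordination**: All affected components are updated together in a single reload pass; if one of them fails, check the logs as described under "Partial Configuration Applied"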
|
||||
- **State Preservation**: Alert states and hysteresis information are maintained
|
||||
- **Error Recovery**: Failed reloads don't affect running configuration
|
||||
|
||||
## See Also
|
||||
|
||||
- [NOTIFICATIONS.md](NOTIFICATIONS.md) - Notification channel configuration
|
||||
- [THRESHOLD_ALERTING.md](THRESHOLD_ALERTING.md) - Threshold configuration details
|
||||
- Configuration examples in `hbd/config_*.yaml`
|
||||
@@ -0,0 +1,632 @@
|
||||
# HTTP API and Web UI Documentation
|
||||
|
||||
## Overview
|
||||
|
||||
The Heartbeat Daemon provides a comprehensive HTTP API and web-based UI for monitoring plugin data and alert states. The API follows RESTful conventions and returns JSON responses.
|
||||
|
||||
## Base URL
|
||||
|
||||
All API endpoints are relative to the server base URL:
|
||||
```
|
||||
http://your-server:50004
|
||||
```
|
||||
|
||||
Default port is `50004` (configurable via `hbd_port` in configuration).
|
||||
|
||||
---
|
||||
|
||||
## Authentication
|
||||
|
||||
When [user accounts are configured](USERS.md), every request must be authenticated.
|
||||
|
||||
- **Browser requests** to HTML pages are redirected to `/login` automatically. JavaScript `fetch()` calls on the dashboards send the session cookie automatically — no JS changes are needed.
|
||||
- **API / programmatic requests** must include the token in an `Authorization: Bearer <token>` header or an `X-Auth-Token` header.
|
||||
|
||||
Unauthenticated API requests receive `401 Unauthorized`. When no users are configured the server runs in unauthenticated mode and all endpoints are open.
|
||||
|
||||
### Login
|
||||
|
||||
```bash
|
||||
TOKEN=$(curl -s -X POST http://localhost:50004/api/0/auth/login \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{"username":"alice","password":"secret"}' | jq -r .token)
|
||||
|
||||
curl -H "Authorization: Bearer $TOKEN" http://localhost:50004/api/0/hosts
|
||||
```
|
||||
|
||||
See [User Management](USERS.md) for full authentication documentation.
|
||||
|
||||
---
|
||||
|
||||
## API Endpoints
|
||||
|
||||
### Authentication
|
||||
|
||||
| Method | Path | Description | Auth required |
|
||||
|--------|------|-------------|---------------|
|
||||
| `POST` | `/api/0/auth/login` | Obtain session token | No |
|
||||
| `POST` | `/api/0/auth/logout` | Invalidate session | Token |
|
||||
|
||||
### Users
|
||||
|
||||
| Method | Path | Description | Role |
|
||||
|--------|------|-------------|------|
|
||||
| `GET` | `/api/0/users` | List all users | Admin |
|
||||
| `GET` | `/api/0/users/me` | Own profile | Authenticated |
|
||||
|
||||
### Host Management
|
||||
|
||||
#### GET /api/0/hosts
|
||||
Get list of all monitored hosts with their state information. When auth is enabled, only hosts the caller has at least **monitor** access to are returned.
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
[
|
||||
{
|
||||
"name": "webserver01",
|
||||
"dyn": false,
|
||||
"owner": "alice",
|
||||
"managers": ["bob"],
|
||||
"monitors": ["carol"],
|
||||
"connections": [...]
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
#### GET /api/0/messages
|
||||
Get recent heartbeat messages (last 30).
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
[
|
||||
{
|
||||
"time": 1711234567.123,
|
||||
"host": "webserver01",
|
||||
"msg": "heartbeat received"
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Plugin Data Endpoints
|
||||
|
||||
#### GET /api/0/hosts/{hostname}/plugins
|
||||
Get all plugin data for a specific host.
|
||||
|
||||
**Parameters:**
|
||||
- `hostname` (path): Name of the host
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"hostname": "webserver01",
|
||||
"plugins": {
|
||||
"cpu_monitor": {
|
||||
"timestamp": 1711234567.123,
|
||||
"data": {
|
||||
"cpu_percent": 45.2,
|
||||
"load_1min": 2.5,
|
||||
"load_5min": 2.1,
|
||||
"load_15min": 1.8
|
||||
},
|
||||
"sample_count": 100
|
||||
},
|
||||
"memory_monitor": {
|
||||
"timestamp": 1711234568.456,
|
||||
"data": {
|
||||
"percent": 65.4,
|
||||
"available_mb": 4096,
|
||||
"total_mb": 16384
|
||||
},
|
||||
"sample_count": 100
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Example:**
|
||||
```bash
|
||||
curl http://localhost:50004/api/0/hosts/webserver01/plugins
|
||||
```
|
||||
|
||||
#### GET /api/0/hosts/{hostname}/plugins/{plugin_name}
|
||||
Get detailed historical data for a specific plugin.
|
||||
|
||||
**Parameters:**
|
||||
- `hostname` (path): Name of the host
|
||||
- `plugin_name` (path): Name of the plugin
|
||||
- `limit` (query, optional): Number of recent samples to return (default: 10)
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"hostname": "webserver01",
|
||||
"plugin": "cpu_monitor",
|
||||
"samples": [
|
||||
{
|
||||
"timestamp": 1711234567.123,
|
||||
"data": {
|
||||
"cpu_percent": 45.2,
|
||||
"load_1min": 2.5
|
||||
}
|
||||
},
|
||||
{
|
||||
"timestamp": 1711234267.123,
|
||||
"data": {
|
||||
"cpu_percent": 42.1,
|
||||
"load_1min": 2.3
|
||||
}
|
||||
}
|
||||
],
|
||||
"sample_count": 2
|
||||
}
|
||||
```
|
||||
|
||||
**Examples:**
|
||||
```bash
|
||||
# Get last 1 sample (most recent)
|
||||
curl 'http://localhost:50004/api/0/hosts/webserver01/plugins/cpu_monitor?limit=1'
|
||||
|
||||
# Get last 50 samples
|
||||
curl 'http://localhost:50004/api/0/hosts/webserver01/plugins/memory_monitor?limit=50'
|
||||
|
||||
# Get disk monitor data
|
||||
curl http://localhost:50004/api/0/hosts/database01/plugins/disk_monitor
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Host Access
|
||||
|
||||
#### GET /api/0/hosts/{hostname}/access
|
||||
Get owner/managers/monitors for a host. Requires **monitor** role or higher.
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"owner": "alice",
|
||||
"managers": ["bob"],
|
||||
"monitors": ["carol"]
|
||||
}
|
||||
```
|
||||
|
||||
#### PUT /api/0/hosts/{hostname}/access
|
||||
Update owner/managers/monitors. Requires **owner** role or admin.
|
||||
|
||||
**Request body** (all fields optional):
|
||||
```json
|
||||
{ "owner": "bob", "managers": ["carol"], "monitors": [] }
|
||||
```
|
||||
|
||||
Changes take effect immediately but are not written back to the config file. Update the config file and send `SIGHUP` to make them permanent.
|
||||
|
||||
---
|
||||
|
||||
### Alert Endpoints
|
||||
|
||||
#### GET /api/0/hosts/{hostname}/alerts
|
||||
Get alert states for a specific host.
|
||||
|
||||
**Parameters:**
|
||||
- `hostname` (path): Name of the host
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"hostname": "webserver01",
|
||||
"alerts": [
|
||||
{
|
||||
"metric_path": "cpu_monitor.cpu_percent",
|
||||
"level": "WARNING",
|
||||
"since": 1711234000.0,
|
||||
"last_value": 85.5,
|
||||
"last_check": 1711234567.123,
|
||||
"notification_count": 2
|
||||
},
|
||||
{
|
||||
"metric_path": "disk_monitor./.percent",
|
||||
"level": "OK",
|
||||
"since": 1711230000.0,
|
||||
"last_value": 65.0,
|
||||
"last_check": 1711234567.123,
|
||||
"notification_count": 0
|
||||
}
|
||||
],
|
||||
"summary": {
|
||||
"ok": 15,
|
||||
"warning": 1,
|
||||
"critical": 0,
|
||||
"unknown": 0
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Example:**
|
||||
```bash
|
||||
curl http://localhost:50004/api/0/hosts/webserver01/alerts
|
||||
```
|
||||
|
||||
#### GET /api/0/alerts
|
||||
Get all active alerts across all monitored hosts.
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"alerts": [
|
||||
{
|
||||
"hostname": "webserver01",
|
||||
"metric_path": "cpu_monitor.cpu_percent",
|
||||
"level": "CRITICAL",
|
||||
"since": 1711234000.0,
|
||||
"last_value": 95.5,
|
||||
"last_check": 1711234567.123,
|
||||
"notification_count": 3
|
||||
},
|
||||
{
|
||||
"hostname": "database01",
|
||||
"metric_path": "memory_monitor.percent",
|
||||
"level": "WARNING",
|
||||
"since": 1711233000.0,
|
||||
"last_value": 88.2,
|
||||
"last_check": 1711234567.123,
|
||||
"notification_count": 1
|
||||
}
|
||||
],
|
||||
"summary": {
|
||||
"critical": 1,
|
||||
"warning": 1,
|
||||
"unknown": 0,
|
||||
"total": 2
|
||||
},
|
||||
"host_count": 5
|
||||
}
|
||||
```
|
||||
|
||||
**Example:**
|
||||
```bash
|
||||
curl http://localhost:50004/api/0/alerts | jq .
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Web UI Pages
|
||||
|
||||
### Login
|
||||
**URL:** `/login`
|
||||
|
||||
Shown automatically when a browser request is made without a valid session (when users are configured). After successful login the browser is redirected to the originally requested page.
|
||||
|
||||
### Logout
|
||||
**URL:** `/logout`
|
||||
|
||||
Clears the session cookie and redirects to `/login`.
|
||||
|
||||
### Live Dashboard
|
||||
**URL:** `/live`
|
||||
|
||||
Real-time dashboard showing:
|
||||
- Host connection states
|
||||
- IPv4/IPv6 connectivity
|
||||
- Latency metrics
|
||||
- Recent messages
|
||||
|
||||
**Features:**
|
||||
- WebSocket-powered live updates
|
||||
- Sortable columns
|
||||
- Color-coded status indicators
|
||||
|
||||
### Plugin Metrics
|
||||
**URL:** `/plugins`
|
||||
|
||||
Interactive visualization of plugin metrics:
|
||||
- Select host and plugin from dropdown
|
||||
- View current metric values
|
||||
- Automatic refresh every 30 seconds
|
||||
- Support for nested metrics (e.g., per-partition disk stats)
|
||||
|
||||
**Features:**
|
||||
- Card-based metric display
|
||||
- Unit formatting (%, MB, GB)
|
||||
- Nested object visualization
|
||||
- Auto-refresh
|
||||
|
||||
**Examples of available data:**
|
||||
- CPU usage, load average, frequency
|
||||
- Memory usage, available memory, swap
|
||||
- Disk usage per partition, I/O statistics
|
||||
- Network interface statistics, connection counts
|
||||
- Custom plugin data
|
||||
|
||||
### Alerts Dashboard
|
||||
**URL:** `/alerts`
|
||||
|
||||
Comprehensive alert monitoring:
|
||||
- Summary cards (Critical, Warning, Total Hosts)
|
||||
- Filter by severity (All, Critical, Warning)
|
||||
- Alert details with duration
|
||||
- Auto-refresh every 15 seconds
|
||||
|
||||
**Features:**
|
||||
- Color-coded alert levels
|
||||
- Duration tracking
|
||||
- Filterable list
|
||||
- Real-time updates
|
||||
- Summary statistics
|
||||
|
||||
---
|
||||
|
||||
## Integration Examples
|
||||
|
||||
### Monitoring Script
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# Check for critical alerts and send notification
|
||||
|
||||
# Log in first (when auth is configured)
|
||||
TOKEN=$(curl -s -X POST http://localhost:50004/api/0/auth/login \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{"username":"monitor","password":"secret"}' | jq -r .token)
|
||||
AUTH=(-H "Authorization: Bearer $TOKEN")
|
||||
|
||||
RESPONSE=$(curl -s "${AUTH[@]}" http://localhost:50004/api/0/alerts)
|
||||
CRITICAL_COUNT=$(echo "$RESPONSE" | jq '.summary.critical')
|
||||
|
||||
if [ "$CRITICAL_COUNT" -gt 0 ]; then
|
||||
echo "CRITICAL: $CRITICAL_COUNT critical alerts detected!"
|
||||
echo "$RESPONSE" | jq '.alerts[] | select(.level=="CRITICAL")'
|
||||
# Send notification
|
||||
# mail -s "Critical Alerts" admin@example.com < alert_details.txt
|
||||
fi
|
||||
```
|
||||
|
||||
### Python Client
|
||||
|
||||
```python
|
||||
import requests
|
||||
import json
|
||||
|
||||
BASE = 'http://localhost:50004'
|
||||
|
||||
# Log in (skip if auth not configured)
|
||||
resp = requests.post(f'{BASE}/api/0/auth/login',
|
||||
json={"username": "alice", "password": "secret"})
|
||||
token = resp.json().get("token")
|
||||
headers = {"Authorization": f"Bearer {token}"} if token else {}
|
||||
|
||||
# Get all plugin data for a host
|
||||
response = requests.get(f'{BASE}/api/0/hosts/webserver01/plugins', headers=headers)
|
||||
data = response.json()
|
||||
|
||||
print(f"Host: {data['hostname']}")
|
||||
print(f"Plugins: {', '.join(data['plugins'].keys())}")
|
||||
|
||||
for plugin, info in data['plugins'].items():
|
||||
print(f"\n{plugin}:")
|
||||
for metric, value in info['data'].items():
|
||||
print(f" {metric}: {value}")
|
||||
|
||||
# Check for alerts
|
||||
response = requests.get(f'{BASE}/api/0/alerts', headers=headers)
|
||||
alerts = response.json()
|
||||
|
||||
if alerts['summary']['critical'] > 0:
|
||||
print(f"\n⚠️ {alerts['summary']['critical']} CRITICAL ALERTS!")
|
||||
for alert in alerts['alerts']:
|
||||
if alert['level'] == 'CRITICAL':
|
||||
print(f" - {alert['hostname']}: {alert['metric_path']} = {alert['last_value']}")
|
||||
```
|
||||
|
||||
### Grafana Integration
|
||||
|
||||
The API endpoints can be used with Grafana's JSON datasource plugin:
|
||||
|
||||
1. Install the SimpleJSON datasource plugin
|
||||
2. Configure datasource URL: `http://your-server:50004`
|
||||
3. Create queries:
|
||||
- Metrics: `/api/0/hosts/webserver01/plugins/cpu_monitor?limit=100`
|
||||
- Alerts: `/api/0/alerts`
|
||||
|
||||
### Prometheus Integration
|
||||
|
||||
Export metrics in Prometheus format (future enhancement):
|
||||
|
||||
```python
|
||||
# Example prometheus exporter
|
||||
from prometheus_client import Gauge, generate_latest
|
||||
import requests
|
||||
|
||||
cpu_usage = Gauge('heartbeat_cpu_percent', 'CPU usage percentage', ['hostname'])
|
||||
memory_usage = Gauge('heartbeat_memory_percent', 'Memory usage percentage', ['hostname'])
|
||||
|
||||
def collect_metrics():
|
||||
hosts = requests.get('http://localhost:50004/api/0/hosts').json()
|
||||
for host in hosts:
|
||||
hostname = host['name']
|
||||
plugins = requests.get(f'http://localhost:50004/api/0/hosts/{hostname}/plugins').json()
|
||||
|
||||
if 'cpu_monitor' in plugins['plugins']:
|
||||
cpu_data = plugins['plugins']['cpu_monitor']['data']
|
||||
cpu_usage.labels(hostname=hostname).set(cpu_data.get('cpu_percent', 0))
|
||||
|
||||
if 'memory_monitor' in plugins['plugins']:
|
||||
mem_data = plugins['plugins']['memory_monitor']['data']
|
||||
memory_usage.labels(hostname=hostname).set(mem_data.get('percent', 0))
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Response Formats
|
||||
|
||||
### Success Response
|
||||
All successful API calls return HTTP 200 with a JSON body (an object or, for list endpoints such as `/api/0/hosts`, an array):
|
||||
```json
|
||||
{
|
||||
"field": "value",
|
||||
...
|
||||
}
|
||||
```
|
||||
|
||||
### Error Response
|
||||
API errors return appropriate HTTP status codes with JSON:
|
||||
```json
|
||||
{
|
||||
"error": "Host 'unknown-host' not found"
|
||||
}
|
||||
```
|
||||
|
||||
**Common Status Codes:**
|
||||
- `200 OK` - Success
|
||||
- `400 Bad Request` - Invalid parameters
|
||||
- `401 Unauthorized` - Missing or invalid session token
|
||||
- `403 Forbidden` - Authenticated but insufficient role
|
||||
- `404 Not Found` - Resource not found
|
||||
- `500 Internal Server Error` - Server error
|
||||
|
||||
---
|
||||
|
||||
## WebSocket API
|
||||
|
||||
For real-time updates, connect to the WebSocket endpoint:
|
||||
|
||||
**URL:** `ws://your-server:50005/hbd` (or `wss://` on the configured `wss_port` for secure connections)
|
||||
|
||||
**Messages:**
|
||||
```json
|
||||
{
|
||||
"type": "host",
|
||||
"data": {
|
||||
"name": "webserver01",
|
||||
"state": "UP"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "plugin",
|
||||
"data": {
|
||||
"host": "webserver01",
|
||||
"plugin": "cpu_monitor",
|
||||
"data": {...},
|
||||
"timestamp": 1711234567.123
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Configuration
|
||||
|
||||
### Enable HTTP Server
|
||||
|
||||
```yaml
|
||||
# In your hbd configuration file
|
||||
hbd_host: "" # Listen on all interfaces
|
||||
hbd_port: 50004 # HTTP port
|
||||
ws_port: 50005 # WebSocket port (optional)
|
||||
# wss_port: 50006 # Secure WebSocket (requires SSL)
|
||||
```
|
||||
|
||||
### SSL/TLS Configuration
|
||||
|
||||
For secure WebSocket connections:
|
||||
|
||||
```yaml
|
||||
wss_port: 50006
|
||||
cert_path: /etc/heartbeat/certs/
|
||||
wss_pem: server.pem
|
||||
wss_key: server.key
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Rate Limiting
|
||||
|
||||
The API currently does not implement rate limiting. For production use, consider:
|
||||
|
||||
- Placing behind a reverse proxy (nginx, Apache)
|
||||
- Using API gateway for rate limiting
|
||||
- Implementing caching for frequently accessed endpoints
|
||||
|
||||
---
|
||||
|
||||
## CORS Support
|
||||
|
||||
By default, CORS is not enabled. To enable for web applications:
|
||||
|
||||
```python
|
||||
# In http.py, add CORS middleware
|
||||
import aiohttp_cors
|
||||
|
||||
app = web.Application()
|
||||
cors = aiohttp_cors.setup(app)
|
||||
|
||||
# Configure CORS for all routes
|
||||
for route in list(app.router.routes()):
|
||||
cors.add(route, {
|
||||
"*": aiohttp_cors.ResourceOptions(
|
||||
allow_credentials=True,
|
||||
expose_headers="*",
|
||||
allow_headers="*",
|
||||
)
|
||||
})
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Performance Considerations
|
||||
|
||||
### Caching
|
||||
- Plugin data is cached in memory (last 100 samples per plugin)
|
||||
- No database queries required
|
||||
- Responses are fast (<10ms typical)
|
||||
|
||||
### Scalability
|
||||
- Each host stores its own data independently
|
||||
- Memory usage: ~1KB per host + ~1KB per plugin sample
|
||||
- For 100 hosts with 5 plugins: ~50MB memory
|
||||
|
||||
### Best Practices
|
||||
1. Use `limit` parameter to control response size
|
||||
2. Cache responses on client side when appropriate
|
||||
3. Use WebSocket for real-time updates instead of polling
|
||||
4. Consider pagination for large deployments (future enhancement)
|
||||
|
||||
---
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### API Returns 401
|
||||
- Auth is configured — include `Authorization: Bearer <token>` header
|
||||
- Token may have expired (24 h TTL) — log in again
|
||||
|
||||
### API Returns 403
|
||||
- Authenticated user lacks the required role for this host/action
|
||||
- Check host's `owner`, `managers`, `monitors` config
|
||||
|
||||
### API Returns 404
|
||||
- Verify hostname in URL matches actual host name
|
||||
- Check host is sending heartbeats: `curl http://localhost:50004/api/0/hosts`
|
||||
|
||||
### No Plugin Data
|
||||
- Verify client is configured with plugins
|
||||
- Check client logs for plugin errors
|
||||
- Ensure plugins are sending data (check journal logs)
|
||||
|
||||
### Empty Alerts
|
||||
- Verify thresholds are configured
|
||||
- Check host is in `watchhosts` list
|
||||
- Ensure plugins are collecting metrics
|
||||
- Review server logs for threshold checker errors
|
||||
|
||||
---
|
||||
|
||||
## See Also
|
||||
|
||||
- [User Management](USERS.md)
|
||||
- [Plugin Development Guide](PLUGIN_DEVELOPMENT.md)
|
||||
- [Threshold Alerting Documentation](THRESHOLD_ALERTING.md)
|
||||
- [Message Journal Documentation](MESSAGE_JOURNAL.md)
|
||||
- Configuration examples: `hbd/config_example.yaml`
|
||||
@@ -0,0 +1,413 @@
|
||||
# Message Journal
|
||||
|
||||
The message journal provides persistent logging of all received heartbeat messages with automatic size-based log rotation.
|
||||
|
||||
## Overview
|
||||
|
||||
The journal logs every message received by the heartbeat daemon (hbd) in JSON format, making it easy to:
|
||||
- Audit message history
|
||||
- Debug connection issues
|
||||
- Analyze traffic patterns
|
||||
- Replay messages for testing
|
||||
- Create historical reports
|
||||
|
||||
## Features
|
||||
|
||||
- **JSON Format**: Each message is logged as a single JSON line for easy parsing
|
||||
- **Size-Based Rotation**: Automatically rotates logs when size threshold is reached
|
||||
- **Automatic Cleanup**: Keeps only a configurable number of backup files
|
||||
- **Thread-Safe**: Safe for concurrent access from multiple async tasks
|
||||
- **Configurable**: All settings controllable via configuration file
|
||||
- **Performance**: Non-blocking async operation with minimal overhead
|
||||
|
||||
## Configuration
|
||||
|
||||
Add these settings to your hbd configuration file (e.g., `.hb.yaml`):
|
||||
|
||||
```yaml
|
||||
# Message journal configuration
|
||||
journal_enabled: true # Enable/disable journaling
|
||||
journal_dir: /var/log/heartbeat # Directory for journal files
|
||||
journal_file: messages.journal # Base filename
|
||||
journal_max_size: 104857600 # Max size in bytes (100MB default)
|
||||
journal_max_backups: 10 # Number of backup files to keep
|
||||
```
|
||||
|
||||
### Configuration Options
|
||||
|
||||
| Option | Default | Description |
|
||||
|--------|---------|-------------|
|
||||
| `journal_enabled` | `true` | Enable or disable message journaling |
|
||||
| `journal_dir` | `/var/log/heartbeat` | Directory where journal files are stored |
|
||||
| `journal_file` | `messages.journal` | Base filename for the journal |
|
||||
| `journal_max_size` | `104857600` (100MB) | Maximum file size before rotation |
|
||||
| `journal_max_backups` | `10` | Number of rotated backup files to keep |
|
||||
|
||||
## File Format
|
||||
|
||||
Messages are logged in JSONL (JSON Lines) format - one JSON object per line:
|
||||
|
||||
```json
|
||||
{"timestamp":1711234567.123,"datetime":"2026-03-28T12:34:56","source_ip":"192.168.1.100","source_port":50003,"message":{"ID":"HTB","name":"webserver1","interval":30}}
|
||||
{"timestamp":1711234597.456,"datetime":"2026-03-28T12:35:37","source_ip":"192.168.1.101","source_port":50003,"message":{"ID":"PLG","plugin":"cpu_monitor","cpu_percent":45.2,"load_1min":1.5}}
|
||||
```
|
||||
|
||||
### Entry Structure
|
||||
|
||||
Each journal entry contains:
|
||||
|
||||
| Field | Type | Description |
|
||||
|-------|------|-------------|
|
||||
| `timestamp` | float | Unix timestamp (seconds since epoch) |
|
||||
| `datetime` | string | ISO 8601 formatted datetime |
|
||||
| `source_ip` | string | Source IP address |
|
||||
| `source_port` | integer | Source UDP port |
|
||||
| `message` | object | Complete parsed message dictionary |
|
||||
|
||||
## Log Rotation
|
||||
|
||||
### How Rotation Works
|
||||
|
||||
1. Journal writes messages to the current file
|
||||
2. When file size exceeds `journal_max_size`, rotation is triggered
|
||||
3. Current file is renamed with timestamp: `messages.journal.YYYYMMDD-HHMMSS`
|
||||
4. New empty file is created as the current journal
|
||||
5. Old backup files exceeding `journal_max_backups` are deleted
|
||||
|
||||
### Example File Structure
|
||||
|
||||
```
|
||||
/var/log/heartbeat/
|
||||
├── messages.journal # Current active journal
|
||||
├── messages.journal.20260328-120000 # Rotated backup
|
||||
├── messages.journal.20260328-140000 # Rotated backup
|
||||
└── messages.journal.20260328-160000 # Rotated backup (newest)
|
||||
```
|
||||
|
||||
### Rotation Behavior
|
||||
|
||||
- Rotation is triggered when the next message would exceed the size limit
|
||||
- Rotation is automatic and requires no manual intervention
|
||||
- Old backups are deleted in FIFO order (oldest first)
|
||||
- Rotation is thread-safe and won't lose messages
|
||||
|
||||
## Usage Examples
|
||||
|
||||
### Reading Journal Files
|
||||
|
||||
#### Using Python
|
||||
|
||||
```python
|
||||
import json
|
||||
|
||||
# Read all entries from current journal
|
||||
with open('/var/log/heartbeat/messages.journal', 'r') as f:
|
||||
for line in f:
|
||||
entry = json.loads(line)
|
||||
print(f"{entry['datetime']} - {entry['source_ip']} - {entry['message']['ID']}")
|
||||
```
|
||||
|
||||
#### Using jq (command line)
|
||||
|
||||
```bash
|
||||
# View all messages
|
||||
cat /var/log/heartbeat/messages.journal | jq .
|
||||
|
||||
# Filter by message type
|
||||
cat /var/log/heartbeat/messages.journal | jq 'select(.message.ID == "HTB")'
|
||||
|
||||
# Filter by hostname
|
||||
cat /var/log/heartbeat/messages.journal | jq 'select(.message.name == "webserver1")'
|
||||
|
||||
# Count messages by type
|
||||
cat /var/log/heartbeat/messages.journal | jq -r '.message.ID' | sort | uniq -c
|
||||
|
||||
# Extract timestamps and source IPs
|
||||
cat /var/log/heartbeat/messages.journal | jq -r '[.datetime, .source_ip, .message.ID] | @tsv'
|
||||
```
|
||||
|
||||
#### Using shell tools
|
||||
|
||||
```bash
|
||||
# Count total messages
|
||||
wc -l /var/log/heartbeat/messages.journal
|
||||
|
||||
# View recent messages
|
||||
tail -n 100 /var/log/heartbeat/messages.journal | jq .
|
||||
|
||||
# Search for specific host
|
||||
grep -F '"name":"webserver1"' /var/log/heartbeat/messages.journal
|
||||
|
||||
# Check journal file size
|
||||
du -h /var/log/heartbeat/messages.journal
|
||||
```
|
||||
|
||||
### Analyzing Historical Data
|
||||
|
||||
```bash
|
||||
# Combine all journal files (current + backups)
|
||||
cat /var/log/heartbeat/messages.journal* | jq . > all_messages.json
|
||||
|
||||
# Count messages per host
|
||||
cat /var/log/heartbeat/messages.journal* | jq -r '.message.name // "unknown"' | sort | uniq -c
|
||||
|
||||
# Find all plugin messages
|
||||
cat /var/log/heartbeat/messages.journal* | jq 'select(.message.ID == "PLG")'
|
||||
|
||||
# Extract CPU metrics from plugin messages
|
||||
cat /var/log/heartbeat/messages.journal* | \
|
||||
jq 'select(.message.plugin == "cpu_monitor") | {time: .datetime, host: .message.name, cpu: .message.cpu_percent}'
|
||||
```
|
||||
|
||||
## Integration with Log Management
|
||||
|
||||
### Logrotate
|
||||
|
||||
While the journal has built-in rotation, you can also use logrotate for additional management:
|
||||
|
||||
```
|
||||
/var/log/heartbeat/messages.journal.* {
|
||||
daily
|
||||
rotate 30
|
||||
compress
|
||||
delaycompress
|
||||
missingok
|
||||
notifempty
|
||||
}
|
||||
```
|
||||
|
||||
### Elasticsearch/OpenSearch
|
||||
|
||||
Import journal data into Elasticsearch for advanced analysis:
|
||||
|
||||
```python
|
||||
from elasticsearch import Elasticsearch
|
||||
import json
|
||||
|
||||
es = Elasticsearch(['http://localhost:9200'])
|
||||
|
||||
with open('/var/log/heartbeat/messages.journal', 'r') as f:
|
||||
for line in f:
|
||||
entry = json.loads(line)
|
||||
es.index(index='heartbeat-messages', body=entry)
|
||||
```
|
||||
|
||||
### Splunk
|
||||
|
||||
Create a Splunk input for the journal:
|
||||
|
||||
```ini
|
||||
[monitor:///var/log/heartbeat/messages.journal*]
|
||||
sourcetype = heartbeat_json
|
||||
index = heartbeat
|
||||
```
|
||||
|
||||
## Performance Considerations
|
||||
|
||||
### Overhead
|
||||
|
||||
- Journal writing is async and non-blocking
|
||||
- Typical overhead: < 1ms per message
|
||||
- Minimal impact on heartbeat processing
|
||||
|
||||
### Disk Usage
|
||||
|
||||
Calculate expected disk usage:
|
||||
|
||||
```
|
||||
Messages per day = (86400 seconds / interval) * number_of_hosts
|
||||
Average message size ≈ 200-500 bytes
|
||||
Daily disk usage = Messages per day * Average message size
|
||||
|
||||
Example:
|
||||
- 100 hosts
|
||||
- 30 second interval
|
||||
- 2880 messages/day per host
|
||||
- 288,000 messages/day total
|
||||
- ~60-140 MB/day
|
||||
```
|
||||
|
||||
### Recommendations
|
||||
|
||||
- **Small deployments** (< 50 hosts): Default settings work well
|
||||
- **Medium deployments** (50-500 hosts): Increase `journal_max_size` to 500MB, `journal_max_backups` to 20
|
||||
- **Large deployments** (> 500 hosts): Consider 1GB+ journal files, 30+ backups, or external log aggregation
|
||||
|
||||
## Monitoring
|
||||
|
||||
### Check Journal Status
|
||||
|
||||
The journal exposes statistics that can be queried:
|
||||
|
||||
```python
|
||||
from hbd.journal import get_journal
|
||||
|
||||
journal = get_journal()
|
||||
stats = journal.get_stats()
|
||||
print(f"Current size: {stats['current_size']:,} bytes")
|
||||
print(f"Rotation threshold: {stats['rotation_threshold']}")
|
||||
```
|
||||
|
||||
### Log Messages
|
||||
|
||||
Journal operations are logged at appropriate levels:
|
||||
|
||||
- `INFO`: Initialization, rotation events, cleanup
|
||||
- `DEBUG`: Individual message logging
|
||||
- `WARNING`: Non-critical issues
|
||||
- `ERROR`: Critical failures
|
||||
|
||||
Check hbd logs for journal-related messages:
|
||||
|
||||
```bash
|
||||
grep journal /var/log/heartbeat.log
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Journal Files Not Created
|
||||
|
||||
**Problem**: No journal files appear in the configured directory.
|
||||
|
||||
**Solutions**:
|
||||
- Check `journal_enabled: true` in configuration
|
||||
- Verify directory exists and hbd has write permissions
|
||||
- Check hbd logs for initialization errors
|
||||
- Verify disk space is available
|
||||
|
||||
### Rotation Not Working
|
||||
|
||||
**Problem**: Journal file grows beyond `journal_max_size`.
|
||||
|
||||
**Solutions**:
|
||||
- Check that `journal_max_size` is properly configured
|
||||
- Verify hbd has permission to rename/create files
|
||||
- Check for filesystem issues
|
||||
- Review hbd logs for rotation errors
|
||||
|
||||
### Missing Messages
|
||||
|
||||
**Problem**: Some messages don't appear in journal.
|
||||
|
||||
**Solutions**:
|
||||
- Verify `journal_enabled: true`
|
||||
- Check for write errors in hbd logs
|
||||
- Verify sufficient disk space
|
||||
- Check if filesystem is read-only
|
||||
|
||||
### Performance Issues
|
||||
|
||||
**Problem**: Journal causing slow message processing.
|
||||
|
||||
**Solutions**:
|
||||
- Use faster storage (SSD) for journal directory
|
||||
- Increase `journal_max_size` to reduce rotation frequency
|
||||
- Disable journal if not needed: `journal_enabled: false`
|
||||
- Consider async syslog forwarding instead
|
||||
|
||||
## Security Considerations
|
||||
|
||||
### File Permissions
|
||||
|
||||
Ensure proper permissions on journal files:
|
||||
|
||||
```bash
|
||||
# Journal directory
|
||||
chmod 750 /var/log/heartbeat
|
||||
chown hbd:hbd /var/log/heartbeat
|
||||
|
||||
# Journal files
|
||||
chmod 640 /var/log/heartbeat/messages.journal*
|
||||
```
|
||||
|
||||
### Sensitive Data
|
||||
|
||||
Journal files may contain:
|
||||
- Hostnames and IP addresses
|
||||
- System metrics
|
||||
- Custom message content
|
||||
|
||||
**Recommendations**:
|
||||
- Restrict read access to authorized users only
|
||||
- Consider encryption for archived journals
|
||||
- Implement log retention policies
|
||||
- Sanitize data if sharing for debugging
|
||||
|
||||
## API Reference
|
||||
|
||||
### MessageJournal Class
|
||||
|
||||
```python
|
||||
class MessageJournal:
|
||||
def __init__(self, config: Dict[str, Any])
|
||||
async def initialize(self) -> bool
|
||||
async def log_message(self, msg: Dict, addr: tuple, timestamp: float)
|
||||
async def close(self)
|
||||
def get_stats(self) -> Dict[str, Any]
|
||||
```
|
||||
|
||||
### Module Functions
|
||||
|
||||
```python
|
||||
def get_journal(config: Dict = None) -> MessageJournal
|
||||
async def log_message(msg: Dict, addr: tuple, timestamp: float = None)
|
||||
```
|
||||
|
||||
## Example: Custom Message Processing
|
||||
|
||||
Process journal messages in real-time:
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
async def tail_journal(journal_path):
|
||||
"""Follow journal file and process new messages."""
|
||||
path = Path(journal_path)
|
||||
|
||||
with open(path, 'r') as f:
|
||||
# Jump to end
|
||||
f.seek(0, 2)
|
||||
|
||||
while True:
|
||||
line = f.readline()
|
||||
if line:
|
||||
entry = json.loads(line)
|
||||
await process_message(entry)
|
||||
else:
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
async def process_message(entry):
|
||||
"""Process a journal entry."""
|
||||
msg = entry['message']
|
||||
|
||||
# Alert on boot messages
|
||||
if msg.get('boot'):
|
||||
print(f"ALERT: {msg['name']} rebooted at {entry['datetime']}")
|
||||
|
||||
# Track CPU usage
|
||||
if msg.get('ID') == 'PLG' and msg.get('plugin') == 'cpu_monitor':
|
||||
cpu = msg.get('cpu_percent', 0)
|
||||
if cpu > 90:
|
||||
print(f"WARNING: {entry['source_ip']} CPU usage: {cpu}%")
|
||||
```
|
||||
|
||||
## Future Enhancements
|
||||
|
||||
Potential improvements for future versions:
|
||||
|
||||
- Compression of rotated logs (gzip)
|
||||
- Time-based rotation in addition to size-based
|
||||
- Filtering to exclude certain message types
|
||||
- Structured logging output formats (CEF, GELF)
|
||||
- Remote syslog forwarding
|
||||
- Message deduplication
|
||||
- Journal file encryption
|
||||
- Signed journal entries
|
||||
|
||||
## See Also
|
||||
|
||||
- [Configuration Guide](../hbd/config.py) - Full configuration options
|
||||
- [UDP Protocol](../hbd/udp.py) - Message handling
|
||||
- [Server Architecture](../hbd/server.py) - Server initialization
|
||||
@@ -0,0 +1,331 @@
|
||||
# Nagios Plugin Integration Guide
|
||||
|
||||
The Heartbeat monitoring system now supports running existing Nagios-compatible monitoring plugins through the `nagios_runner` plugin. This allows you to leverage the thousands of existing Nagios plugins without modification.
|
||||
|
||||
## Quick Start
|
||||
|
||||
### 1. Install Nagios Plugins
|
||||
|
||||
**Debian/Ubuntu:**
|
||||
```bash
|
||||
sudo apt-get install nagios-plugins
|
||||
```
|
||||
|
||||
**RHEL/CentOS/Fedora:**
|
||||
```bash
|
||||
sudo yum install nagios-plugins-all
|
||||
# or
|
||||
sudo dnf install nagios-plugins-all
|
||||
```
|
||||
|
||||
**Arch Linux:**
|
||||
```bash
|
||||
sudo pacman -S monitoring-plugins
|
||||
```
|
||||
|
||||
### 2. Configure Heartbeat
|
||||
|
||||
Add the `nagios_runner` section to your `~/.hb.yaml` config:
|
||||
|
||||
```yaml
|
||||
nagios_runner:
|
||||
interval: 60 # Run plugins every 60 seconds
|
||||
timeout: 30 # Command timeout in seconds
|
||||
commands:
|
||||
- name: check_disk_root
|
||||
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
|
||||
|
||||
- name: check_load
|
||||
command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
|
||||
|
||||
- name: check_procs
|
||||
command: /usr/lib/nagios/plugins/check_procs -w 250 -c 400
|
||||
```
|
||||
|
||||
### 3. Start Heartbeat Client
|
||||
|
||||
```bash
|
||||
hbc -v localhost
|
||||
```
|
||||
|
||||
The client will now execute the configured Nagios plugins and send their results to the server.
|
||||
|
||||
## How It Works
|
||||
|
||||
### Nagios Plugin Standard
|
||||
|
||||
Nagios plugins follow a simple interface:
|
||||
|
||||
1. **Exit Codes:**
|
||||
- `0` = OK
|
||||
- `1` = WARNING
|
||||
- `2` = CRITICAL
|
||||
- `3` = UNKNOWN
|
||||
|
||||
2. **Output Format:**
|
||||
```
|
||||
STATUS - Message | performance_data
|
||||
```
|
||||
|
||||
3. **Performance Data Format:**
|
||||
```
|
||||
'label'=value[UOM];[warn];[crit];[min];[max]
|
||||
```
|
||||
|
||||
### Example Plugin Output
|
||||
|
||||
```bash
|
||||
$ /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
|
||||
DISK OK - free space: / 156 GB (78%); | /=44GB;127;142;0;159
|
||||
```
|
||||
|
||||
This output includes:
|
||||
- **Status:** `DISK OK`
|
||||
- **Message:** `free space: / 156 GB (78%)`
|
||||
- **Performance Data:** `/=44GB;127;142;0;159`
|
||||
- Current value: 44GB
|
||||
- Warning threshold: 127GB
|
||||
- Critical threshold: 142GB
|
||||
- Min: 0GB
|
||||
- Max: 159GB
|
||||
|
||||
### Data Collected
|
||||
|
||||
The `nagios_runner` plugin collects:
|
||||
|
||||
**For each configured command:**
|
||||
- `{name}_status` - Status string (OK, WARNING, CRITICAL, UNKNOWN)
|
||||
- `{name}_status_code` - Numeric exit code (0-3)
|
||||
- `{name}_output` - Status message
|
||||
- `{name}_{metric}` - Each performance metric value
|
||||
- `{name}_{metric}_uom` - Unit of measurement (if present)
|
||||
- `{name}_{metric}_warn` - Warning threshold (if present)
|
||||
- `{name}_{metric}_crit` - Critical threshold (if present)
|
||||
- `{name}_{metric}_min` - Minimum value (if present)
|
||||
- `{name}_{metric}_max` - Maximum value (if present)
|
||||
|
||||
**Overall:**
|
||||
- `overall_status` - Worst status from all commands
|
||||
- `overall_status_code` - Worst status code
|
||||
- `plugin_count` - Number of Nagios plugins executed
|
||||
|
||||
## Configuration Options
|
||||
|
||||
```yaml
|
||||
nagios_runner:
|
||||
# Collection interval in seconds (default: 60)
|
||||
interval: 60
|
||||
|
||||
# Command execution timeout in seconds (default: 30)
|
||||
timeout: 30
|
||||
|
||||
# Execute commands via shell (default: true)
|
||||
# Set to false for direct execution (more secure but less flexible)
|
||||
shell: true
|
||||
|
||||
# List of Nagios plugins to run
|
||||
commands:
|
||||
- name: unique_name # Required: unique identifier
|
||||
command: /path/to/plugin [args] # Required: full command to execute
|
||||
```
|
||||
|
||||
## Common Nagios Plugins
|
||||
|
||||
### System Resources
|
||||
|
||||
**Disk Space:**
|
||||
```yaml
|
||||
- name: check_disk_root
|
||||
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
|
||||
```
|
||||
|
||||
**Load Average:**
|
||||
```yaml
|
||||
- name: check_load
|
||||
command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
|
||||
```
|
||||
|
||||
**Swap Usage:**
|
||||
```yaml
|
||||
- name: check_swap
|
||||
command: /usr/lib/nagios/plugins/check_swap -w 20% -c 10%
|
||||
```
|
||||
|
||||
**Process Count:**
|
||||
```yaml
|
||||
- name: check_procs
|
||||
command: /usr/lib/nagios/plugins/check_procs -w 250 -c 400
|
||||
```
|
||||
|
||||
**Users Logged In:**
|
||||
```yaml
|
||||
- name: check_users
|
||||
command: /usr/lib/nagios/plugins/check_users -w 5 -c 10
|
||||
```
|
||||
|
||||
### Network Services
|
||||
|
||||
**SSH:**
|
||||
```yaml
|
||||
- name: check_ssh
|
||||
command: /usr/lib/nagios/plugins/check_ssh localhost
|
||||
```
|
||||
|
||||
**HTTP:**
|
||||
```yaml
|
||||
- name: check_http_local
|
||||
command: /usr/lib/nagios/plugins/check_http -H localhost
|
||||
|
||||
- name: check_http_ssl
|
||||
command: /usr/lib/nagios/plugins/check_http -H example.com --ssl
|
||||
```
|
||||
|
||||
**DNS:**
|
||||
```yaml
|
||||
- name: check_dns
|
||||
command: /usr/lib/nagios/plugins/check_dns -H google.com
|
||||
```
|
||||
|
||||
**Ping:**
|
||||
```yaml
|
||||
- name: check_ping_gateway
|
||||
command: /usr/lib/nagios/plugins/check_ping -H 192.168.1.1 -w 100,20% -c 500,60%
|
||||
```
|
||||
|
||||
### Databases
|
||||
|
||||
**MySQL:**
|
||||
```yaml
|
||||
- name: check_mysql
|
||||
command: /usr/lib/nagios/plugins/check_mysql -H localhost -u user -p password
|
||||
```
|
||||
|
||||
**PostgreSQL:**
|
||||
```yaml
|
||||
- name: check_pgsql
|
||||
command: /usr/lib/nagios/plugins/check_pgsql -H localhost -d database
|
||||
```
|
||||
|
||||
## Writing Custom Nagios Plugins
|
||||
|
||||
You can write your own Nagios-compatible plugins in any language. Here are simple examples in Bash and Python:
|
||||
|
||||
**Bash:**
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# /usr/local/bin/check_example.sh
|
||||
|
||||
# Get the value to check
|
||||
value=$(some_command)
|
||||
|
||||
# Define thresholds
|
||||
warn=80
|
||||
crit=90
|
||||
|
||||
# Check and output result
|
||||
if [ $value -ge $crit ]; then
|
||||
echo "CRITICAL - Value is $value | value=${value};${warn};${crit};0;100"
|
||||
exit 2
|
||||
elif [ $value -ge $warn ]; then
|
||||
echo "WARNING - Value is $value | value=${value};${warn};${crit};0;100"
|
||||
exit 1
|
||||
else
|
||||
echo "OK - Value is $value | value=${value};${warn};${crit};0;100"
|
||||
exit 0
|
||||
fi
|
||||
```
|
||||
|
||||
**Python:**
|
||||
```python
|
||||
#!/usr/bin/env python3
|
||||
# /usr/local/bin/check_example.py
|
||||
|
||||
import sys
|
||||
|
||||
def check_something():
|
||||
value = get_value() # Your check logic here
|
||||
warn = 80
|
||||
crit = 90
|
||||
|
||||
perfdata = f"value={value};{warn};{crit};0;100"
|
||||
|
||||
if value >= crit:
|
||||
print(f"CRITICAL - Value is {value} | {perfdata}")
|
||||
sys.exit(2)
|
||||
elif value >= warn:
|
||||
print(f"WARNING - Value is {value} | {perfdata}")
|
||||
sys.exit(1)
|
||||
else:
|
||||
print(f"OK - Value is {value} | {perfdata}")
|
||||
sys.exit(0)
|
||||
|
||||
if __name__ == "__main__":
|
||||
check_something()
|
||||
```
|
||||
|
||||
Then configure in Heartbeat:
|
||||
```yaml
|
||||
nagios_runner:
|
||||
commands:
|
||||
- name: my_custom_check
|
||||
command: /usr/local/bin/check_example.sh
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Plugin not found
|
||||
```
|
||||
Error: Command not found
|
||||
```
|
||||
**Solution:** Use the full path to the plugin. Common locations:
|
||||
- `/usr/lib/nagios/plugins/`
|
||||
- `/usr/lib64/nagios/plugins/`
|
||||
- `/usr/local/nagios/libexec/`
|
||||
|
||||
### Permission denied
|
||||
```
|
||||
Error: Permission denied
|
||||
```
|
||||
**Solution:** Ensure the plugin is executable:
|
||||
```bash
|
||||
chmod +x /path/to/plugin
|
||||
```
|
||||
|
||||
### Timeout errors
|
||||
```
|
||||
Command timed out after 30s
|
||||
```
|
||||
**Solution:** Increase the timeout in config:
|
||||
```yaml
|
||||
nagios_runner:
|
||||
timeout: 60 # Increase timeout
|
||||
```
|
||||
|
||||
### No performance data
|
||||
If performance data is not being parsed:
|
||||
1. Check plugin output includes `|` separator
|
||||
2. Verify performance data format: `'label'=value[UOM];...`
|
||||
3. Enable debug logging: `hbc -v -x localhost`
|
||||
|
||||
## Benefits
|
||||
|
||||
1. **Massive Plugin Library:** Thousands of existing Nagios plugins available
|
||||
2. **No Rewriting:** Use plugins as-is without modification
|
||||
3. **Community Support:** Well-documented and maintained plugins
|
||||
4. **Flexibility:** Mix Nagios plugins with native Heartbeat plugins
|
||||
5. **Standard Interface:** Consistent exit codes and output format
|
||||
6. **Performance Data:** Automatic extraction of metrics
|
||||
|
||||
## Resources
|
||||
|
||||
- [Nagios Plugin Development Guidelines](https://nagios-plugins.org/doc/guidelines.html)
|
||||
- [Monitoring Plugins Project](https://www.monitoring-plugins.org/)
|
||||
- [Nagios Exchange](https://exchange.nagios.org/) - Plugin repository
|
||||
- [Check_MK Local Checks](https://docs.checkmk.com/latest/en/localchecks.html) - Compatible format
|
||||
|
||||
## Next Steps
|
||||
|
||||
- Configure threshold alerts based on Nagios plugin status codes
|
||||
- View plugin data in the Heartbeat web UI
|
||||
- Create custom plugins for your specific monitoring needs
|
||||
- Integrate with existing Nagios/Icinga configurations
|
||||
@@ -0,0 +1,533 @@
|
||||
# Notification System
|
||||
|
||||
## Overview
|
||||
|
||||
The Heartbeat Monitoring System includes a flexible notification system that can send alerts through multiple channels including Email, Pushover, Signal, and Mattermost. The system supports centralized channel definitions with per-host routing, allowing fine-grained control over notification delivery.
|
||||
|
||||
## Architecture
|
||||
|
||||
### Components
|
||||
|
||||
1. **Notification Channels** (`notification_channels` in config)
|
||||
- Centralized definitions of notification providers
|
||||
- Each channel has a type and type-specific credentials
|
||||
- Reusable across multiple hosts
|
||||
|
||||
2. **Channel Dispatcher** (`hbd/server/notify.py`)
|
||||
- `pushmsg_for_host(hostname, message)`: Main entry point for host-specific notifications
|
||||
- `_dispatch_to_channel(channel_name, channel_config, message)`: Routes to specific provider
|
||||
- Provider functions: `pushover()`, `pushsignal()`, `pushmattermost()`, `send_email()`
|
||||
|
||||
3. **Configuration Utilities** (`hbd/server/config.py`)
|
||||
- `get_notification_channels_for_host(config, hostname)`: Retrieves channel names for a host
|
||||
- `get_notification_channels_config(config, hostname)`: Retrieves full channel configurations
|
||||
- `get_channel_config(config, channel_name)`: Gets configuration for a specific channel
|
||||
|
||||
4. **Integration Points**
|
||||
- **Threshold alerts**: `threshold.py` calls `notify_mod.pushmsg_for_host()`
|
||||
- **Heartbeat events**: `udp.py` calls `notify_mod.pushmsg_for_host()` for boot/shutdown/overdue
|
||||
- **Custom alerts**: Any code can call `notify_mod.pushmsg_for_host(hostname, message)`
|
||||
|
||||
## Configuration
|
||||
|
||||
### Centralized Channel Definitions
|
||||
|
||||
Define notification channels once in your configuration file:
|
||||
|
||||
```yaml
|
||||
notification_channels:
|
||||
# Signal notifications
|
||||
signal_ops:
|
||||
type: signal
|
||||
cli_path: /usr/local/bin/signal-cli
|
||||
user: +1234567890 # Your Signal number
|
||||
recipient: +1234567890 # Recipient number
|
||||
|
||||
signal_oncall:
|
||||
type: signal
|
||||
cli_path: /usr/local/bin/signal-cli
|
||||
user: +1234567890
|
||||
recipient: +0987654321 # Different recipient
|
||||
|
||||
# Email notifications
|
||||
email_ops:
|
||||
type: email
|
||||
recipients:
|
||||
- ops@example.com
|
||||
- alerts@example.com
|
||||
sender: heartbeat@example.com
|
||||
smtp_server: smtp.example.com
|
||||
smtp_port: 587
|
||||
smtp_user: heartbeat@example.com
|
||||
smtp_password: your-smtp-password
|
||||
|
||||
email_devteam:
|
||||
type: email
|
||||
recipients: [dev-alerts@example.com]
|
||||
sender: heartbeat-dev@example.com
|
||||
smtp_server: smtp.example.com
|
||||
smtp_port: 587
|
||||
smtp_user: heartbeat-dev@example.com
|
||||
smtp_password: your-smtp-password
|
||||
|
||||
# Pushover notifications
|
||||
pushover_urgent:
|
||||
type: pushover
|
||||
token: your-pushover-app-token
|
||||
user: your-pushover-user-key
|
||||
|
||||
pushover_normal:
|
||||
type: pushover
|
||||
token: your-pushover-app-token
|
||||
user: another-user-key
|
||||
|
||||
# Mattermost notifications
|
||||
mattermost_devops:
|
||||
type: mattermost
|
||||
host: mattermost.example.com
|
||||
token: your-webhook-token
|
||||
channel: devops-alerts
|
||||
username: heartbeat-bot
|
||||
icon: https://example.com/heartbeat-icon.png
|
||||
```
|
||||
|
||||
### Default Notification Channels
|
||||
|
||||
Specify default channels for hosts that don't have specific channel assignments:
|
||||
|
||||
```yaml
|
||||
default_notification_channels:
|
||||
- email_ops
|
||||
- mattermost_devops
|
||||
```
|
||||
|
||||
Hosts without `notification_channels` defined will use these defaults.
|
||||
|
||||
### Per-Host Channel Assignment
|
||||
|
||||
Assign specific channels to each host in the `hosts` section:
|
||||
|
||||
```yaml
|
||||
hosts:
|
||||
# Critical production web server - multiple channels for redundancy
|
||||
prod-web-01:
|
||||
threshold_config: high_sensitivity
|
||||
watch: true
|
||||
notification_channels:
|
||||
- signal_oncall # Immediate mobile notification
|
||||
- pushover_urgent # Secondary mobile notification
|
||||
- email_ops # Email for record keeping
|
||||
dyndns: false
|
||||
|
||||
# Database server - ops team notifications only
|
||||
prod-db-01:
|
||||
threshold_config: database
|
||||
watch: true
|
||||
notification_channels:
|
||||
- signal_ops
|
||||
- email_ops
|
||||
dyndns: false
|
||||
|
||||
# Development server - email only, no urgent notifications
|
||||
dev-server-01:
|
||||
threshold_config: low_sensitivity
|
||||
watch: false
|
||||
notification_channels:
|
||||
- email_devteam
|
||||
dyndns: false
|
||||
|
||||
# Test server - uses default_notification_channels
|
||||
test-server-01:
|
||||
threshold_config: default
|
||||
watch: false
|
||||
dyndns: false
|
||||
# No notification_channels specified = uses default_notification_channels
|
||||
```
|
||||
|
||||
## Channel Types
|
||||
|
||||
### Email
|
||||
|
||||
Sends notifications via SMTP.
|
||||
|
||||
**Configuration fields:**
|
||||
```yaml
|
||||
type: email
|
||||
recipients: [email1@example.com, email2@example.com] # Required: List of recipients
|
||||
sender: heartbeat@example.com # Required: From address
|
||||
smtp_server: smtp.example.com # Required: SMTP server hostname
|
||||
smtp_port: 587 # Optional: Default 587
|
||||
smtp_user: heartbeat@example.com # Optional: For authenticated SMTP
|
||||
smtp_password: your-password # Optional: For authenticated SMTP
|
||||
```
|
||||
|
||||
**Features:**
|
||||
- Supports multiple recipients
|
||||
- TLS/STARTTLS support on port 587
|
||||
- Authenticated and unauthenticated SMTP
|
||||
|
||||
**Example:**
|
||||
```yaml
|
||||
notification_channels:
|
||||
email_critical:
|
||||
type: email
|
||||
recipients: [admin@example.com, oncall@example.com]
|
||||
sender: alerts@example.com
|
||||
smtp_server: smtp.fastmail.com
|
||||
smtp_port: 587
|
||||
smtp_user: alerts@example.com
|
||||
smtp_password: app-specific-password
|
||||
```
|
||||
|
||||
### Pushover
|
||||
|
||||
Sends push notifications to mobile devices via Pushover API.
|
||||
|
||||
**Configuration fields:**
|
||||
```yaml
|
||||
type: pushover
|
||||
token: your-application-token # Required: Your Pushover app token
|
||||
user: your-user-key # Required: Recipient's user key
|
||||
```
|
||||
|
||||
**Features:**
|
||||
- Instant mobile push notifications
|
||||
- Works on iOS and Android
|
||||
- Supports delivery confirmations
|
||||
|
||||
**Setup:**
|
||||
1. Create a Pushover account at https://pushover.net
|
||||
2. Create an application to get your app token
|
||||
3. Note your user key from your account dashboard
|
||||
|
||||
**Example:**
|
||||
```yaml
|
||||
notification_channels:
|
||||
pushover_admin:
|
||||
type: pushover
|
||||
token: azGDORePK8gMaC0QOYAMyEEuzJnyUi
|
||||
user: uQiRzpo4DXghDmr9QzzfQu27cmVRsG
|
||||
```
|
||||
|
||||
### Signal
|
||||
|
||||
Sends notifications via Signal messenger using signal-cli.
|
||||
|
||||
**Configuration fields:**
|
||||
```yaml
|
||||
type: signal
|
||||
cli_path: /usr/local/bin/signal-cli # Optional: Path to signal-cli binary
|
||||
user: +1234567890 # Required: Your Signal phone number
|
||||
recipient: +0987654321 # Required: Recipient phone number
|
||||
```
|
||||
|
||||
**Prerequisites:**
|
||||
1. Install signal-cli: https://github.com/AsamK/signal-cli
|
||||
2. Register signal-cli with your phone number:
|
||||
```bash
|
||||
signal-cli -u +1234567890 register
|
||||
signal-cli -u +1234567890 verify CODE
|
||||
```
|
||||
3. Ensure signal-cli is in PATH or specify full path in config
|
||||
|
||||
**Features:**
|
||||
- End-to-end encrypted messaging
|
||||
- Works without phone being online
|
||||
- No API fees or rate limits
|
||||
|
||||
**Example:**
|
||||
```yaml
|
||||
notification_channels:
|
||||
signal_admin:
|
||||
type: signal
|
||||
cli_path: /usr/local/bin/signal-cli
|
||||
user: +12025551234
|
||||
recipient: +12025559999
|
||||
```
|
||||
|
||||
### Mattermost
|
||||
|
||||
Sends notifications to Mattermost team chat via incoming webhooks.
|
||||
|
||||
**Configuration fields:**
|
||||
```yaml
|
||||
type: mattermost
|
||||
host: mattermost.example.com # Required: Mattermost server hostname
|
||||
token: your-webhook-token # Required: Incoming webhook token
|
||||
channel: channel-name # Required: Target channel name
|
||||
username: heartbeat-bot # Optional: Bot display name
|
||||
icon: https://example.com/icon.png # Optional: Bot icon URL
|
||||
```
|
||||
|
||||
**Prerequisites:**
|
||||
1. Enable incoming webhooks in Mattermost
|
||||
2. Create an incoming webhook for your team
|
||||
3. Note the webhook token from the webhook URL
|
||||
|
||||
**Features:**
|
||||
- Team-wide visibility
|
||||
- Rich formatting support
|
||||
- Message threading
|
||||
|
||||
**Example:**
|
||||
```yaml
|
||||
notification_channels:
|
||||
mattermost_ops:
|
||||
type: mattermost
|
||||
host: chat.example.com
|
||||
token: abc123def456ghi789
|
||||
channel: infrastructure-alerts
|
||||
username: heartbeat-monitor
|
||||
icon: https://example.com/heartbeat-icon.png
|
||||
```
|
||||
|
||||
## Notification Events
|
||||
|
||||
The system sends notifications for various events:
|
||||
|
||||
### Threshold Alerts
|
||||
|
||||
When monitored metrics exceed configured thresholds:
|
||||
|
||||
- **State changes**: OK → WARNING, WARNING → CRITICAL, CRITICAL → OK
|
||||
- **Format**: `{LEVEL}: {hostname} - {metric_path} = {value} {threshold_info}`
|
||||
- **Example**: `CRITICAL: prod-web-01 - cpu_monitor.cpu_percent = 95.2 (threshold: > 90.0)`
|
||||
- **Re-notifications**: Periodic reminders for ongoing alerts (default: hourly)
|
||||
|
||||
### Heartbeat Events
|
||||
|
||||
Host lifecycle events:
|
||||
|
||||
- **Host boot**: `{hostname} booted`
|
||||
- **Host shutdown**: `{hostname} {connection_type} shutdown`
|
||||
- **Host recovery**: `{hostname} {connection_type} is back`
|
||||
- **Connection issues**: `{hostname} {message}`
|
||||
- **Host overdue**: `{hostname} {connection_type} overdue`
|
||||
|
||||
Heartbeat event notifications are only sent for hosts with `watch: true`.
|
||||
|
||||
### Custom Alerts
|
||||
|
||||
Application code can send custom notifications:
|
||||
|
||||
```python
|
||||
from hbd.server import notify as notify_mod
|
||||
|
||||
# Send to host-specific channels
|
||||
notify_mod.pushmsg_for_host("prod-web-01", "Custom alert message")
|
||||
|
||||
# Send using global config
|
||||
notify_mod.pushmsg_from_config("Global notification")
|
||||
|
||||
# Send to specific config
|
||||
notify_mod.pushmsg(custom_config_dict, "Targeted notification")
|
||||
```
|
||||
|
||||
## Design Principles
|
||||
|
||||
The notification system follows these core principles:
|
||||
|
||||
- **Centralization**: Define notification providers once, reference them by name
|
||||
- **Flexibility**: Each host can use different channels for different notification needs
|
||||
- **Redundancy**: Critical hosts can list multiple channels; every listed channel is notified, so an alert still gets through if one provider fails
|
||||
- **Clarity**: Clean separation between channel definition and channel assignment
|
||||
- **Type Safety**: Provider-specific validation at configuration time
|
||||
|
||||
## Best Practices
|
||||
|
||||
### Channel Organization
|
||||
|
||||
- **Create purpose-specific channels**: `email_ops`, `signal_oncall`, `pushover_urgent`
|
||||
- **Separate by team/role**: `email_devteam`, `signal_dbateam`, `mattermost_security`
|
||||
- **Use descriptive names**: Channel names appear in logs and debugging
|
||||
|
||||
### Redundancy
|
||||
|
||||
For critical hosts, use multiple notification channels:
|
||||
|
||||
```yaml
|
||||
hosts:
|
||||
critical-db:
|
||||
notification_channels:
|
||||
- signal_oncall # Primary: Mobile alert
|
||||
- pushover_urgent # Backup: Different mobile platform
|
||||
- email_ops # Tertiary: Email for record-keeping
|
||||
```
|
||||
|
||||
### Notification Fatigue Prevention
|
||||
|
||||
- **Use `watch: false`** for non-critical hosts
|
||||
- **Configure appropriate thresholds** to avoid false positives
|
||||
- **Set different channels for different severities**
|
||||
- **Use `default_notification_channels`** for baseline, add more for critical systems
|
||||
|
||||
### Security
|
||||
|
||||
- **Protect credentials**: Use file permissions to protect config files with passwords/tokens
|
||||
- **Rotate tokens**: Periodically rotate API tokens and passwords
|
||||
- **Use app-specific passwords**: For email, use app-specific passwords instead of main account password
|
||||
- **Separate accounts**: Consider separate notification accounts for different environments (prod vs dev)
|
||||
|
||||
### Testing
|
||||
|
||||
Test notification channels before relying on them:
|
||||
|
||||
```bash
|
||||
# Test signal-cli directly
|
||||
signal-cli -u +1234567890 send -m "Test message" +0987654321
|
||||
|
||||
# Test SMTP
|
||||
echo "Test" | mail -s "Test Subject" admin@example.com
|
||||
|
||||
# Test through the heartbeat system (run the lines below in a Python REPL, not in the shell)
|
||||
from hbd.server import notify as notify_mod, config as config_mod
|
||||
cfg = config_mod.load_config(".hb.yaml")
|
||||
notify_mod.setup(cfg)
|
||||
notify_mod.pushmsg_for_host("test-host", "Test notification")
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Notifications Not Sending
|
||||
|
||||
1. **Check logs**: Look for "Failed to send notification" errors
|
||||
2. **Verify host is watched**: Ensure `watch: true` in host definition
|
||||
3. **Check channel configuration**: Verify credentials and settings
|
||||
4. **Test channel directly**: Use command-line tools to test provider
|
||||
5. **Check network**: Ensure server can reach notification endpoints
|
||||
|
||||
### Signal Issues
|
||||
|
||||
- **signal-cli not found**: Specify full path in `cli_path`
|
||||
- **Not registered**: Run `signal-cli -u +NUMBER register`, then `signal-cli -u +NUMBER verify CODE`
|
||||
- **Trust issues**: Run `signal-cli -u +NUMBER receive` to sync trust store
|
||||
- **Recipient not found**: Ensure recipient is in your Signal contacts
|
||||
|
||||
### Email Issues
|
||||
|
||||
- **Authentication failed**: Check SMTP username/password
|
||||
- **TLS errors**: Verify SMTP port (587 for STARTTLS, 465 for SSL)
|
||||
- **Relay denied**: Ensure SMTP server allows relay from your IP
|
||||
- **Timeout**: Check firewall rules for SMTP ports
|
||||
|
||||
### Pushover Issues
|
||||
|
||||
- **Invalid token/user**: Verify token and user key from Pushover dashboard
|
||||
- **API rate limits**: Pushover has monthly message limits on the free tier
|
||||
- **HTTP errors**: Check Pushover API status page
|
||||
|
||||
### Mattermost Issues
|
||||
|
||||
- **Webhook not found**: Verify webhook token and ensure webhook is enabled
|
||||
- **Channel not found**: Check channel name spelling and permissions
|
||||
- **Driver import error**: Install mattermostdriver: `pip install mattermostdriver`
|
||||
|
||||
## API Reference
|
||||
|
||||
### Main Functions
|
||||
|
||||
#### `pushmsg_for_host(hostname: str, msg: str, debug: int = 0) -> dict`
|
||||
|
||||
Send notification to host-specific channels.
|
||||
|
||||
**Parameters:**
|
||||
- `hostname`: Name of the host (used to look up notification channels)
|
||||
- `msg`: Message to send
|
||||
- `debug`: Debug level (0=no debug, 1+=debug output)
|
||||
|
||||
**Returns:** Dictionary of results per channel: `{"signal_ops": True, "email_ops": False}`
|
||||
|
||||
**Example:**
|
||||
```python
|
||||
from hbd.server import notify as notify_mod
|
||||
|
||||
notify_mod.pushmsg_for_host("prod-web-01", "Server CPU at 95%")
|
||||
```
|
||||
|
||||
**Behavior:**
|
||||
1. Looks up notification channels configured for the host
|
||||
2. If no host-specific channels, uses `default_notification_channels`
|
||||
3. Dispatches to each channel in parallel
|
||||
4. Returns dict of results keyed by channel name
|
||||
5. Logs success/failure for each channel
|
||||
|
||||
## Examples
|
||||
|
||||
### Complete Configuration Example
|
||||
|
||||
```yaml
|
||||
# Notification channel definitions
|
||||
notification_channels:
|
||||
signal_oncall:
|
||||
type: signal
|
||||
cli_path: /usr/local/bin/signal-cli
|
||||
user: +12025551234
|
||||
recipient: +12025555678
|
||||
|
||||
email_ops:
|
||||
type: email
|
||||
recipients: [ops@example.com, alerts@example.com]
|
||||
sender: heartbeat@example.com
|
||||
smtp_server: smtp.fastmail.com
|
||||
smtp_port: 587
|
||||
smtp_user: heartbeat@example.com
|
||||
smtp_password: app-password-here
|
||||
|
||||
# Default channels
|
||||
default_notification_channels: [email_ops]
|
||||
|
||||
# Host definitions with channel assignments
|
||||
hosts:
|
||||
prod-web-01:
|
||||
threshold_config: high_sensitivity
|
||||
watch: true
|
||||
notification_channels: [signal_oncall, email_ops]
|
||||
dyndns: false
|
||||
|
||||
dev-server-01:
|
||||
threshold_config: low_sensitivity
|
||||
watch: false
|
||||
notification_channels: [email_ops]
|
||||
dyndns: false
|
||||
```
|
||||
|
||||
### Multiple Environments Example
|
||||
|
||||
```yaml
|
||||
notification_channels:
|
||||
# Production channels
|
||||
signal_prod_oncall:
|
||||
type: signal
|
||||
user: +12025551234
|
||||
recipient: +12025551111 # On-call phone
|
||||
|
||||
email_prod_ops:
|
||||
type: email
|
||||
recipients: [prod-ops@example.com]
|
||||
sender: prod-heartbeat@example.com
|
||||
smtp_server: smtp.example.com
|
||||
|
||||
# Staging channels
|
||||
email_staging:
|
||||
type: email
|
||||
recipients: [staging-alerts@example.com]
|
||||
sender: staging-heartbeat@example.com
|
||||
smtp_server: smtp.example.com
|
||||
|
||||
# Development channels
|
||||
mattermost_dev:
|
||||
type: mattermost
|
||||
host: chat.example.com
|
||||
token: dev-webhook-token
|
||||
channel: dev-alerts
|
||||
|
||||
hosts:
|
||||
prod-api-01:
|
||||
notification_channels: [signal_prod_oncall, email_prod_ops]
|
||||
|
||||
staging-api-01:
|
||||
notification_channels: [email_staging]
|
||||
|
||||
dev-api-01:
|
||||
notification_channels: [mattermost_dev]
|
||||
```
|
||||
@@ -0,0 +1,544 @@
|
||||
# Plugin Development Guide
|
||||
|
||||
This guide explains how to create custom plugins for the Heartbeat monitoring system.
|
||||
|
||||
## Table of Contents
|
||||
|
||||
- [Plugin Architecture](#plugin-architecture)
|
||||
- [Plugin Types](#plugin-types)
|
||||
- [Creating a Plugin](#creating-a-plugin)
|
||||
- [Plugin Lifecycle](#plugin-lifecycle)
|
||||
- [Configuration](#configuration)
|
||||
- [Best Practices](#best-practices)
|
||||
- [Examples](#examples)
|
||||
- [Testing](#testing)
|
||||
|
||||
## Plugin Architecture
|
||||
|
||||
Heartbeat's plugin system is designed to be simple yet powerful. Plugins are Python classes that inherit from one of the base plugin types and implement a few key methods.
|
||||
|
||||
### Key Concepts
|
||||
|
||||
- **Plugin Registry**: Central registry that manages all loaded plugins
|
||||
- **Plugin Loader**: Automatically discovers and loads plugins from the `hbd/plugins/` directory
|
||||
- **Plugin Types**: InfoPlugin (static data) and MonitorPlugin (periodic metrics)
|
||||
- **Async/Await**: All plugin methods are async for non-blocking operation
|
||||
|
||||
## Plugin Types
|
||||
|
||||
### InfoPlugin
|
||||
|
||||
InfoPlugins collect static information that doesn't change frequently (OS version, hardware specs, etc.).
|
||||
|
||||
- **Runs once** at startup (interval = 0)
|
||||
- **Cached** - data is collected once and reused
|
||||
- **Lightweight** - no periodic overhead
|
||||
|
||||
**Use InfoPlugin for:**
|
||||
- Operating system details
|
||||
- Hardware information
|
||||
- Software versions
|
||||
- Configuration data
|
||||
- Static inventory
|
||||
|
||||
### MonitorPlugin
|
||||
|
||||
MonitorPlugins collect metrics that change over time (CPU usage, memory, network traffic).
|
||||
|
||||
- **Runs periodically** based on configured interval
|
||||
- **Scheduled** - collected at regular intervals
|
||||
- **Dynamic** - captures changing system state
|
||||
|
||||
**Use MonitorPlugin for:**
|
||||
- Resource usage (CPU, memory, disk, network)
|
||||
- Performance metrics
|
||||
- Counters and gauges
|
||||
- Time-series data
|
||||
|
||||
## Creating a Plugin
|
||||
|
||||
### Step 1: Choose Plugin Type
|
||||
|
||||
Decide whether your plugin collects static information (InfoPlugin) or dynamic metrics (MonitorPlugin).
|
||||
|
||||
### Step 2: Create Plugin File
|
||||
|
||||
Create a new Python file in `hbd/plugins/` directory:
|
||||
|
||||
```python
|
||||
"""
|
||||
My awesome plugin for Heartbeat.
|
||||
|
||||
Brief description of what this plugin does.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
# Import psutil or other dependencies if needed
|
||||
try:
|
||||
import psutil
|
||||
except ImportError:
|
||||
psutil = None
|
||||
|
||||
from hbd.plugin import MonitorPlugin # or InfoPlugin
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MyAwesomePlugin(MonitorPlugin): # or InfoPlugin
|
||||
"""
|
||||
One-line description of the plugin.
|
||||
|
||||
Collects:
|
||||
- List of metrics/data collected
|
||||
- Another metric
|
||||
|
||||
Configuration:
|
||||
interval: Collection interval in seconds (default: 60)
|
||||
option1: Description of option1 (default: value)
|
||||
option2: Description of option2 (default: value)
|
||||
"""
|
||||
|
||||
name = "my_awesome_plugin" # Unique plugin name
|
||||
interval = 60 # For MonitorPlugin, use 0 for InfoPlugin
|
||||
|
||||
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
||||
"""Initialize the plugin with optional configuration."""
|
||||
super().__init__(config)
|
||||
|
||||
# Extract configuration options
|
||||
self.option1 = self.config.get('option1', 'default_value')
|
||||
self.option2 = self.config.get('option2', True)
|
||||
|
||||
# Check dependencies
|
||||
if psutil is None:
|
||||
raise ImportError("psutil is required for my_awesome_plugin")
|
||||
|
||||
async def initialize(self):
|
||||
"""
|
||||
Initialize the plugin.
|
||||
|
||||
This is called once when the plugin is loaded.
|
||||
Use this to verify dependencies, establish connections, etc.
|
||||
|
||||
Returns:
|
||||
True if initialization successful, False otherwise
|
||||
"""
|
||||
logger.info(f"My awesome plugin initialized (option1: {self.option1})")
|
||||
return True
|
||||
|
||||
async def collect(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Collect data.
|
||||
|
||||
This is called periodically (MonitorPlugin) or once (InfoPlugin).
|
||||
|
||||
Returns:
|
||||
Dictionary of collected data (will be sent to server)
|
||||
"""
|
||||
try:
|
||||
data = await self._collect_metrics()
|
||||
logger.debug(f"Collected {len(data)} metrics")
|
||||
return data
|
||||
except Exception as e:
|
||||
logger.error(f"Error collecting data: {e}")
|
||||
return {"error": str(e)}
|
||||
|
||||
async def _collect_metrics(self) -> Dict[str, Any]:
|
||||
"""Internal method to collect actual metrics."""
|
||||
metrics = {}
|
||||
|
||||
# Collect your data here
|
||||
metrics['metric1'] = self._get_metric1()
|
||||
metrics['metric2'] = self._get_metric2()
|
||||
|
||||
return metrics
|
||||
|
||||
def _get_metric1(self):
|
||||
"""Helper method for metric collection."""
|
||||
# Implementation here
|
||||
return 42
|
||||
|
||||
def _get_metric2(self):
|
||||
"""Helper method for metric collection."""
|
||||
# Implementation here
|
||||
return "hello"
|
||||
|
||||
async def cleanup(self):
|
||||
"""
|
||||
Cleanup resources.
|
||||
|
||||
This is called when the plugin is unloaded or the client shuts down.
|
||||
Use this to close connections, release resources, etc.
|
||||
"""
|
||||
logger.info("My awesome plugin cleanup")
|
||||
|
||||
|
||||
# Plugin instance for automatic discovery
|
||||
plugin = MyAwesomePlugin
|
||||
```
|
||||
|
||||
### Step 3: Test Your Plugin
|
||||
|
||||
Create a test script to verify your plugin works:
|
||||
|
||||
```python
|
||||
#!/usr/bin/env python3
|
||||
import asyncio
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add parent directory to path
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
from hbd.plugins.my_awesome_plugin import MyAwesomePlugin
|
||||
|
||||
async def test():
|
||||
# Create plugin instance
|
||||
plugin = MyAwesomePlugin({'option1': 'test_value'})
|
||||
|
||||
# Initialize
|
||||
if not await plugin.initialize():
|
||||
print("Failed to initialize")
|
||||
return False
|
||||
|
||||
# Collect data
|
||||
data = await plugin.collect()
|
||||
print(f"Collected data: {data}")
|
||||
|
||||
# Cleanup
|
||||
await plugin.cleanup()
|
||||
|
||||
return True
|
||||
|
||||
if __name__ == '__main__':
|
||||
success = asyncio.run(test())
|
||||
sys.exit(0 if success else 1)
|
||||
```
|
||||
|
||||
## Plugin Lifecycle
|
||||
|
||||
Understanding the plugin lifecycle helps you implement plugins correctly:
|
||||
|
||||
```
|
||||
1. Plugin Discovery
|
||||
└─> Loader scans hbd/plugins/ directory
|
||||
└─> Finds Python files (except those starting with _)
|
||||
└─> Imports modules
|
||||
|
||||
2. Plugin Instantiation
|
||||
└─> Creates instance with configuration
|
||||
└─> __init__() is called
|
||||
|
||||
3. Plugin Initialization
|
||||
└─> initialize() is called
|
||||
└─> Plugin verifies dependencies, establishes connections
|
||||
└─> Returns True/False for success/failure
|
||||
|
||||
4. Plugin Registration
|
||||
└─> If initialization succeeds, plugin is registered
|
||||
└─> Plugin becomes active
|
||||
|
||||
5. Data Collection
|
||||
└─> For InfoPlugin: collect() called once after initialization
|
||||
└─> For MonitorPlugin: collect() called periodically based on interval
|
||||
└─> Data is sent to server via PLG message
|
||||
|
||||
6. Plugin Shutdown
|
||||
└─> cleanup() is called
|
||||
└─> Plugin releases resources, closes connections
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
### Plugin-Specific Configuration
|
||||
|
||||
Plugins receive configuration through the `config` parameter in `__init__`:
|
||||
|
||||
```python
|
||||
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
||||
super().__init__(config)
|
||||
|
||||
# Access configuration with defaults
|
||||
self.interval = self.config.get('interval', 60)
|
||||
self.threshold = self.config.get('threshold', 80)
|
||||
self.enabled_features = self.config.get('features', ['feature1', 'feature2'])
|
||||
```
|
||||
|
||||
### Client Configuration File
|
||||
|
||||
Users configure plugins in the client configuration YAML:
|
||||
|
||||
```yaml
|
||||
plugins:
|
||||
my_awesome_plugin:
|
||||
enabled: true
|
||||
interval: 120
|
||||
option1: custom_value
|
||||
option2: false
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
### 1. Error Handling
|
||||
|
||||
Always handle errors gracefully:
|
||||
|
||||
```python
|
||||
async def collect(self) -> Dict[str, Any]:
|
||||
try:
|
||||
return await self._collect_metrics()
|
||||
except Exception as e:
|
||||
logger.error(f"Error collecting metrics: {e}")
|
||||
return {"error": str(e)}
|
||||
```
|
||||
|
||||
### 2. Logging
|
||||
|
||||
Use appropriate log levels:
|
||||
|
||||
```python
|
||||
logger.debug("Detailed information for debugging")
|
||||
logger.info("Normal operation messages")
|
||||
logger.warning("Warning messages for unusual but handled situations")
|
||||
logger.error("Error messages for failures")
|
||||
```
|
||||
|
||||
### 3. Dependencies
|
||||
|
||||
Check for optional dependencies:
|
||||
|
||||
```python
|
||||
try:
|
||||
import some_optional_library
|
||||
except ImportError:
|
||||
some_optional_library = None
|
||||
|
||||
# Later in __init__:
|
||||
if some_optional_library is None:
|
||||
raise ImportError("some_optional_library is required")
|
||||
```
|
||||
|
||||
### 4. Performance
|
||||
|
||||
- Keep collection methods fast (< 1 second)
|
||||
- Use async/await for I/O operations
|
||||
- Cache expensive computations
|
||||
- Don't block the event loop
|
||||
|
||||
### 5. Data Structure
|
||||
|
||||
Return clean, structured data:
|
||||
|
||||
```python
|
||||
{
|
||||
'metric_name': value,
|
||||
'nested_data': {
|
||||
'sub_metric': value
|
||||
},
|
||||
'list_data': [item1, item2],
|
||||
'timestamp': time.time() # Optional timestamp
|
||||
}
|
||||
```
|
||||
|
||||
### 6. Documentation
|
||||
|
||||
Document your plugin thoroughly:
|
||||
|
||||
- Class docstring with description and configuration
|
||||
- Method docstrings explaining purpose and return values
|
||||
- Inline comments for complex logic
|
||||
|
||||
## Examples
|
||||
|
||||
### Example 1: Simple InfoPlugin
|
||||
|
||||
```python
|
||||
from hbd.plugin import InfoPlugin
|
||||
import platform
|
||||
from typing import Any, Dict
|
||||
|
||||
class SimpleInfoPlugin(InfoPlugin):
|
||||
"""Collect basic system information."""
|
||||
|
||||
name = "simple_info"
|
||||
interval = 0 # InfoPlugin
|
||||
|
||||
async def initialize(self):
|
||||
return True
|
||||
|
||||
async def collect(self) -> Dict[str, Any]:
|
||||
return {
|
||||
'hostname': platform.node(),
|
||||
'system': platform.system(),
|
||||
'python_version': platform.python_version()
|
||||
}
|
||||
|
||||
async def cleanup(self):
|
||||
pass
|
||||
|
||||
plugin = SimpleInfoPlugin
|
||||
```
|
||||
|
||||
### Example 2: MonitorPlugin with State
|
||||
|
||||
```python
|
||||
from hbd.plugin import MonitorPlugin
|
||||
import time
|
||||
from typing import Any, Dict
|
||||
|
||||
class CounterPlugin(MonitorPlugin):
|
||||
"""Track a counter over time."""
|
||||
|
||||
name = "counter"
|
||||
interval = 30
|
||||
|
||||
def __init__(self, config=None):
|
||||
super().__init__(config)
|
||||
self._counter = 0
|
||||
self._start_time = time.time()
|
||||
|
||||
async def initialize(self):
|
||||
return True
|
||||
|
||||
async def collect(self) -> Dict[str, Any]:
|
||||
self._counter += 1
|
||||
uptime = time.time() - self._start_time
|
||||
|
||||
return {
|
||||
'count': self._counter,
|
||||
'uptime': uptime,
|
||||
'rate': self._counter / uptime
|
||||
}
|
||||
|
||||
async def cleanup(self):
|
||||
pass
|
||||
|
||||
plugin = CounterPlugin
|
||||
```
|
||||
|
||||
### Example 3: Plugin with External Command
|
||||
|
||||
```python
|
||||
from hbd.plugin import MonitorPlugin
|
||||
import asyncio
|
||||
from typing import Any, Dict
|
||||
|
||||
class CommandPlugin(MonitorPlugin):
|
||||
"""Execute external command and capture output."""
|
||||
|
||||
name = "command_executor"
|
||||
interval = 60
|
||||
|
||||
def __init__(self, config=None):
|
||||
super().__init__(config)
|
||||
self.command = self.config.get('command', 'echo "no command"')
|
||||
|
||||
async def initialize(self):
|
||||
return True
|
||||
|
||||
async def collect(self) -> Dict[str, Any]:
|
||||
try:
|
||||
process = await asyncio.create_subprocess_shell(
|
||||
self.command,
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.PIPE
|
||||
)
|
||||
stdout, stderr = await asyncio.wait_for(
|
||||
process.communicate(),
|
||||
timeout=30
|
||||
)
|
||||
|
||||
return {
|
||||
'exit_code': process.returncode,
|
||||
'stdout': stdout.decode('utf-8'),
|
||||
'stderr': stderr.decode('utf-8')
|
||||
}
|
||||
except Exception as e:
|
||||
return {'error': str(e)}
|
||||
|
||||
async def cleanup(self):
|
||||
pass
|
||||
|
||||
plugin = CommandPlugin
|
||||
```
|
||||
|
||||
## Testing
|
||||
|
||||
### Unit Testing
|
||||
|
||||
Create unit tests for your plugins:
|
||||
|
||||
```python
|
||||
import unittest
|
||||
import asyncio
|
||||
|
||||
class TestMyPlugin(unittest.TestCase):
|
||||
def setUp(self):
|
||||
self.plugin = MyAwesomePlugin({'option1': 'test'})
|
||||
|
||||
def test_initialization(self):
|
||||
result = asyncio.run(self.plugin.initialize())
|
||||
self.assertTrue(result)
|
||||
|
||||
def test_collection(self):
|
||||
asyncio.run(self.plugin.initialize())
|
||||
data = asyncio.run(self.plugin.collect())
|
||||
|
||||
self.assertIsInstance(data, dict)
|
||||
self.assertIn('metric1', data)
|
||||
self.assertGreater(data['metric1'], 0)
|
||||
|
||||
def tearDown(self):
|
||||
asyncio.run(self.plugin.cleanup())
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
```
|
||||
|
||||
### Integration Testing
|
||||
|
||||
Test your plugin with the actual client:
|
||||
|
||||
```bash
|
||||
# Create test configuration
|
||||
cat > test_config.yaml <<EOF
|
||||
server: localhost
|
||||
plugins:
|
||||
my_awesome_plugin:
|
||||
enabled: true
|
||||
interval: 10
|
||||
option1: test_value
|
||||
EOF
|
||||
|
||||
# Run client in test mode
|
||||
python -m hbd.hbc -c test_config.yaml --verbose
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### My plugin isn't loading
|
||||
|
||||
1. Check filename doesn't start with underscore
|
||||
2. Verify plugin class inherits from InfoPlugin or MonitorPlugin
|
||||
3. Check `initialize()` returns True
|
||||
4. Look for import errors in logs
|
||||
|
||||
### Plugin loads but doesn't collect data
|
||||
|
||||
1. Check `interval` is set correctly (0 for InfoPlugin, > 0 for MonitorPlugin)
|
||||
2. Verify `collect()` returns a dictionary
|
||||
3. Check for exceptions in `collect()` method
|
||||
4. Enable DEBUG logging to see detailed errors
|
||||
|
||||
### Data isn't appearing on server
|
||||
|
||||
1. Verify client is connected to server
|
||||
2. Check server logs for PLG message handling
|
||||
3. Verify returned data is JSON-serializable
|
||||
4. Check for large data sizes (may exceed UDP packet size)
|
||||
|
||||
## Further Reading
|
||||
|
||||
- [Plugin Framework Source](../hbd/plugin.py) - Core plugin implementation
|
||||
- [Built-in Plugins](../hbd/plugins/) - Examples of working plugins
|
||||
- [Nagios Integration](NAGIOS_INTEGRATION.md) - Running external plugins
|
||||
- [Configuration Guide](../hbd/config_example.yaml) - Full configuration reference
|
||||
File diff suppressed because it is too large
Load Diff
+242
@@ -0,0 +1,242 @@
|
||||
# User Management
|
||||
|
||||
Heartbeat supports optional user accounts with role-based access control per host. When no users are configured the server runs in **unauthenticated mode** — all existing behaviour is unchanged.
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
Users are defined in the server config file. Each host can have an **owner**, zero or more **managers**, and zero or more **monitors**. A **default owner** catches any host that does not name an explicit owner.
|
||||
|
||||
### Roles
|
||||
|
||||
| Role | Inherits | Permissions |
|
||||
|------|----------|-------------|
|
||||
| **monitor** | — | View host status, plugin data, alerts; acknowledge alerts they were notified for |
|
||||
| **manager** | monitor | + Queue commands (`/c`), trigger DNS re-registration (`/n`), queue upgrades (`/u`); add/remove monitors |
|
||||
| **owner** | manager | + Drop host (`/d`); add/remove managers; transfer ownership; update host access |
|
||||
| **admin** *(flag)* | owner on all hosts | Full access to every host and the user list |
|
||||
|
||||
`admin` is a flag on the user, not a per-host role. An admin user has owner-level access on every host without being listed as owner/manager/monitor.
|
||||
|
||||
---
|
||||
|
||||
## Configuration
|
||||
|
||||
### Defining users
|
||||
|
||||
```yaml
|
||||
users:
|
||||
andreas:
|
||||
full_name: Andreas Wrede
|
||||
avatar: /path/to/avatar.png # file path, URL, or base64 data URI (optional)
|
||||
password: pbkdf2:sha256:... # generated with: hbd passwd andreas
|
||||
admin: true # optional — grants server-wide owner access
|
||||
|
||||
bob:
|
||||
full_name: Bob Smith
|
||||
password: pbkdf2:sha256:...
|
||||
notification_channels: [pushover_standard]
|
||||
|
||||
carol:
|
||||
full_name: Carol Jones
|
||||
password: pbkdf2:sha256:...
|
||||
|
||||
default_owner: andreas # owns hosts with no explicit owner
|
||||
# falls back to the first admin user if omitted
|
||||
```
|
||||
|
||||
### Assigning roles to hosts
|
||||
|
||||
```yaml
|
||||
hosts:
|
||||
webserver01:
|
||||
owner: andreas
|
||||
managers: [bob]
|
||||
monitors: [carol]
|
||||
threshold_config: default
|
||||
watch: true
|
||||
notification_channels: [pushover_standard]
|
||||
|
||||
unattended-host: # no owner → owned by default_owner
|
||||
threshold_config: default
|
||||
watch: true
|
||||
```
|
||||
|
||||
### Generating a password hash
|
||||
|
||||
```bash
|
||||
hbd passwd andreas
|
||||
```
|
||||
|
||||
Enter and confirm the password when prompted. Paste the printed hash into the config file under the user's `password` key.
|
||||
|
||||
You can also generate a hash non-interactively from Python:
|
||||
|
||||
```python
|
||||
from hbd.server.users import hash_password
|
||||
print(hash_password("mysecret"))
|
||||
```
|
||||
|
||||
Passwords are stored as PBKDF2-HMAC-SHA256 hashes (260 000 iterations). No third-party libraries are required — only Python's standard `hashlib`.
|
||||
|
||||
---
|
||||
|
||||
## Authentication
|
||||
|
||||
When at least one user is defined, every request must be authenticated. Unauthenticated requests to HTML pages are redirected to `/login`; unauthenticated API requests receive `401 Unauthorized`.
|
||||
|
||||
### Browser login
|
||||
|
||||
Navigate to any page — you will be redirected to `/login` automatically. After submitting valid credentials the server sets an `hbd_session` cookie (HttpOnly, SameSite=Lax, 24 h lifetime). All subsequent requests, including JavaScript `fetch()` calls on the dashboards, carry the cookie automatically.
|
||||
|
||||
To log out, visit `/logout`.
|
||||
|
||||
### API / programmatic login
|
||||
|
||||
```bash
|
||||
# Log in and capture the token
|
||||
TOKEN=$(curl -s -X POST http://localhost:50004/api/0/auth/login \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{"username":"andreas","password":"mysecret"}' | jq -r .token)
|
||||
|
||||
# Use the token in subsequent requests
|
||||
curl -H "Authorization: Bearer $TOKEN" http://localhost:50004/api/0/hosts
|
||||
```
|
||||
|
||||
The token is identical to the session cookie value — both mechanisms work simultaneously.
|
||||
|
||||
```bash
|
||||
# Log out
|
||||
curl -s -X POST http://localhost:50004/api/0/auth/logout \
|
||||
-H "Authorization: Bearer $TOKEN"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## API Endpoints
|
||||
|
||||
### Authentication
|
||||
|
||||
#### POST /api/0/auth/login
|
||||
Obtain a session token.
|
||||
|
||||
**Request body:**
|
||||
```json
|
||||
{ "username": "andreas", "password": "mysecret" }
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{ "token": "<opaque-hex-token>", "username": "andreas" }
|
||||
```
|
||||
Also sets the `hbd_session` cookie for browser clients.
|
||||
|
||||
**Status codes:** `200 OK`, `401 Unauthorized`, `404` (auth not configured)
|
||||
|
||||
---
|
||||
|
||||
#### POST /api/0/auth/logout
|
||||
Invalidate the current session.
|
||||
|
||||
**Headers:** `Authorization: Bearer <token>` or cookie
|
||||
|
||||
**Response:** `{ "success": true }`
|
||||
|
||||
---
|
||||
|
||||
### Users
|
||||
|
||||
#### GET /api/0/users
|
||||
List all users. **Admin only.**
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
[
|
||||
{ "username": "andreas", "full_name": "Andreas Wrede", "avatar": "", "admin": true, "notification_channels": [] },
|
||||
{ "username": "bob", "full_name": "Bob Smith", "avatar": "", "admin": false, "notification_channels": ["pushover_standard"] }
|
||||
]
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
#### GET /api/0/users/me
|
||||
Return the currently authenticated user's profile.
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{ "username": "carol", "full_name": "Carol Jones", "avatar": "", "admin": false, "notification_channels": [] }
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Host Access
|
||||
|
||||
#### GET /api/0/hosts/{hostname}/access
|
||||
Return owner/managers/monitors for a host. Requires at least **monitor** role.
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"owner": "andreas",
|
||||
"managers": ["bob"],
|
||||
"monitors": ["carol"]
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
#### PUT /api/0/hosts/{hostname}/access
|
||||
Update owner/managers/monitors. Requires **owner** role or admin.
|
||||
|
||||
**Request body** (all fields optional):
|
||||
```json
|
||||
{
|
||||
"owner": "bob",
|
||||
"managers": ["carol"],
|
||||
"monitors": []
|
||||
}
|
||||
```
|
||||
|
||||
Changes take effect immediately in memory. They are not written back to the config file — reload (`SIGHUP`) will re-apply config values. To make changes permanent, update the config file.
|
||||
|
||||
---
|
||||
|
||||
## Host visibility
|
||||
|
||||
When users are configured, `GET /api/0/hosts` only returns hosts the authenticated user has at least monitor access to. Admins see all hosts.
|
||||
|
||||
---
|
||||
|
||||
## Config reload
|
||||
|
||||
On `SIGHUP`, the server reloads the config file, re-loads the user registry, and re-applies `owner`/`managers`/`monitors` from config to all known hosts. Existing sessions remain valid after a reload.
|
||||
|
||||
---
|
||||
|
||||
## No-auth mode
|
||||
|
||||
If `users:` is absent or empty, the server starts in **unauthenticated mode**:
|
||||
|
||||
- No login required — all pages and API endpoints are accessible without credentials.
|
||||
- All permission checks pass unconditionally.
|
||||
- `/login`, `/logout`, and the auth/user API endpoints return `404`.
|
||||
|
||||
This preserves full backwards compatibility with existing deployments.
|
||||
|
||||
---
|
||||
|
||||
## Security notes
|
||||
|
||||
- Session tokens are 64-character cryptographically random hex strings (`secrets.token_hex(32)`).
|
||||
- Sessions expire after 24 hours (configurable via `users_mod.SESSION_TTL`).
|
||||
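- Cookies are `HttpOnly` and `SameSite=Lax` — they are not accessible to JavaScript and are not sent on cross-site subrequests (cross-site `POST`s, images, `fetch()` calls); they are still sent on top-level cross-site navigations.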
|
||||
- The HTTP API does not yet enforce TLS. For production use, place hbd behind a TLS-terminating reverse proxy (nginx, Caddy, etc.) or enable WSS.
|
||||
|
||||
---
|
||||
|
||||
## See Also
|
||||
|
||||
- [HTTP API Documentation](HTTP_API.md)
|
||||
- [Notifications](NOTIFICATIONS.md)
|
||||
- Configuration example: `hbd/config_example.yaml`
|
||||
@@ -0,0 +1,9 @@
|
||||
Plan
|
||||
|
||||
Heartbeat is a client/server-based network monitor and host observer. hbd, the server portion, receives heartbeat and state messages from clients and maintains the state and history of the information it receives.
|
||||
|
||||
hbc, the client portion gathers information on various aspects of the
|
||||
system it is running on, and sends it to hbd. Initially this info is basic, like OS make and version, hardware info (CPU type, memory and disks), filesystem info and some resource info. hbc/hbd support a plugin system to extend the info gathered and stored.
|
||||
|
||||
hbd can also send notifications based on missed hbc updates and on violations of pre-set limits for various state parameters.
|
||||
|
||||
+14
-8
@@ -1,11 +1,17 @@
|
||||
"""hbd package - scaffolding for heartbeat daemon
|
||||
"""hbd package - heartbeat monitoring system
|
||||
|
||||
This package contains the refactored modules for the original monolithic
|
||||
`hbd` script. The initial implementation contains small scaffolds so you can
|
||||
start moving functionality into the package.
|
||||
This package contains both the heartbeat client (hbc) and server (hbd) components,
|
||||
organized into separate subpackages:
|
||||
|
||||
- hbd.client: Client component with system monitoring plugins
|
||||
- hbd.server: Server/daemon component with web UI and notifications
|
||||
- hbd.common: Shared utilities and protocol definitions
|
||||
|
||||
Install options:
|
||||
- pip install hbd[client] # Client only
|
||||
- pip install hbd[server] # Server only
|
||||
- pip install hbd[all] # Both client and server
|
||||
"""
|
||||
|
||||
__all__ = ["main", "__version__"]
|
||||
__version__ = "5.0.5"
|
||||
|
||||
from .cli import main
|
||||
__all__ = ["__version__"]
|
||||
__version__ = "5.0.12"
|
||||
|
||||
-54
@@ -1,54 +0,0 @@
|
||||
"""Command line interface for hbd package."""
|
||||
|
||||
import argparse
|
||||
|
||||
from .config import load_config
|
||||
from .server import run as run_server
|
||||
|
||||
PUSHSRVS = ["all", "pushover", "mattermost"]
|
||||
|
||||
|
||||
def build_parser():
    """Construct the argparse parser for the ``hbd`` command line.

    Returns:
        An ``argparse.ArgumentParser`` exposing the config-file, foreground,
        verbose, push-service and debug options.
    """
    parser = argparse.ArgumentParser(
        prog="hbd",
        description="HeartBeatDaemon - Wait for heartbeat messages and act on them (or their absence)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # (flags, keyword arguments) in the order they appear in --help.
    option_specs = (
        (("-c", "--config"),
         {"dest": "configfile", "help": "Config file path (YAML)"}),
        (("-f", "--foreground"),
         {"action": "store_true", "help": "Run in foreground"}),
        (("-v", "--verbose"),
         {"action": "store_true", "help": "Verbose output"}),
        (("-p", "--pushsrv"),
         {"dest": "pushsrv", "choices": PUSHSRVS, "help": "Push service to use"}),
        (("-x", "--debug"),
         {"action": "count", "default": 0, "help": "Increase debug level"}),
    )
    for flags, options in option_specs:
        parser.add_argument(*flags, **options)

    return parser
|
||||
|
||||
|
||||
def main(argv=None):
    """Parse the command line, load the config and start the server.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv``.
    """
    args = build_parser().parse_args(argv)
    config = load_config(args.configfile)

    # Command-line switches override whatever the config file says.
    for flag in ("foreground", "verbose"):
        if getattr(args, flag):
            config[flag] = True
    if args.pushsrv:
        config["pushsrv"] = args.pushsrv
    if args.debug:
        # -x is additive on top of any debug level from the config file.
        config["debug"] = config.setdefault("debug", 0) + args.debug

    run_server(config)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,3 @@
|
||||
"""HeartBeat Client (hbc) - System monitoring client."""
|
||||
|
||||
from hbd import __version__
|
||||
@@ -0,0 +1,54 @@
|
||||
"""Configuration loader and defaults for hbc (HeartBeat Client)."""
|
||||
|
||||
import logging
|
||||
import os
|
||||
|
||||
try:
|
||||
import yaml
|
||||
except Exception:
|
||||
yaml = None
|
||||
|
||||
CLIENT_DEFAULTS = {
|
||||
# Network settings
|
||||
"hb_port": 50003, # Port where hbd servers listen
|
||||
"interval": 10, # Heartbeat interval in seconds
|
||||
|
||||
# Runtime flags
|
||||
"foreground": False,
|
||||
"verbose": False,
|
||||
"debug": 0,
|
||||
|
||||
# Plugin configuration
|
||||
"plugins": {}, # Per-plugin configuration
|
||||
"thresholds": {}, # Threshold configuration for monitoring
|
||||
}
|
||||
|
||||
|
||||
def load_config(path=None):
    """Load configuration from a YAML file and merge it over the client defaults.

    Top-level keys from the file replace the corresponding defaults; unknown
    keys are kept so plugin configs and future extensions pass through.

    If the file does not exist, is empty, or PyYAML is not installed, the
    defaults are returned unchanged.

    Args:
        path: Path to YAML config file (default: ~/.hb.yaml)

    Returns:
        A new dictionary with the merged configuration. It never shares
        nested objects with ``CLIENT_DEFAULTS``.

    Raises:
        ValueError: If the file's top-level YAML value is not a mapping.
    """
    import copy

    # Deep copy: the defaults contain nested dicts ("plugins", "thresholds")
    # that a shallow copy would share between every returned config.
    cfg = copy.deepcopy(CLIENT_DEFAULTS)

    if not path:
        # default path (~/.hb.yaml)
        path = os.path.join(os.path.expanduser("~"), ".hb.yaml")

    if not os.path.exists(path):
        return cfg

    if not yaml:
        # Still best-effort, but tell the user their file is being ignored.
        logging.getLogger(__name__).warning(
            "PyYAML is not installed; ignoring config file %s", path
        )
        return cfg

    with open(path, encoding="utf-8") as fh:
        data = yaml.safe_load(fh)

    if data is None:
        # Empty (or comment-only) file: safe_load returns None.
        return cfg
    if not isinstance(data, dict):
        raise ValueError(
            f"Config file {path} must contain a YAML mapping at top level, "
            f"got {type(data).__name__}"
        )

    cfg.update(data)
    return cfg
|
||||
@@ -0,0 +1,696 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
HeartBeat Client (hbc) - Async version with plugin support.
|
||||
|
||||
Sends heartbeat messages to HeartBeat Daemon (hbd) servers and collects
|
||||
system information via plugins.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import signal
|
||||
import socket
|
||||
import sys
|
||||
import time
|
||||
from hashlib import md5
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
# Import protocol and config
|
||||
from .config import load_config
|
||||
from ..common.proto import dicttos, stodict
|
||||
|
||||
# Import plugin system
|
||||
from .plugin import PluginRegistry, PluginLoader, InfoPlugin, MonitorPlugin
|
||||
|
||||
# Constants
|
||||
PORT = 50003
|
||||
INTERVAL = 10
|
||||
MAXRECV = 32767
|
||||
|
||||
# Global state
|
||||
running = True
|
||||
dorestart = False
|
||||
shutdown_event: Optional[asyncio.Event] = None
|
||||
active_tasks: List[asyncio.Task] = []
|
||||
|
||||
|
||||
class AsyncConnection:
    """Async UDP connection to a heartbeat server.

    Holds the asyncio datagram transport for one server together with the
    send/ACK counters and a short history of round-trip times.
    """

    def __init__(self, conn_id: int, addr: str, port: int, af: int, name: str):
        """Record the target server and reset all counters.

        Args:
            conn_id: Value sent as the ``id`` field of every message.
            addr: Server address datagrams are sent to.
            port: Server UDP port.
            af: Address family passed to ``create_datagram_endpoint``.
            name: Host name; passed through ``shortname()`` for the ``name``
                field of every message.
        """
        self.conn_id = conn_id
        self.addr = addr
        self.port = port
        self.af = af
        self.name = name

        self.ackcount = 0      # number of ACKs handled
        self.lastack = 0.0     # receive time of the latest ACK
        self.send_count = 0    # number of datagrams sent
        self.lastsend = 0.0    # time.time() of the latest send
        self.rtts = [0.0]      # most recent RTTs in ms, newest last (max 10)

        self.transport: Optional[asyncio.DatagramTransport] = None
        self.protocol: Optional[asyncio.DatagramProtocol] = None

        self.logger = logging.getLogger(f"hbc.conn.{addr}")

    async def open(self) -> bool:
        """Open the UDP connection.

        Returns:
            True if successful, False otherwise
        """
        try:
            # Inside a coroutine get_running_loop() is the preferred way to
            # reach the loop; it never creates one implicitly.
            loop = asyncio.get_running_loop()

            # Create datagram endpoint
            self.transport, self.protocol = await loop.create_datagram_endpoint(
                lambda: HeartbeatProtocol(self),
                family=self.af
            )
            self.logger.debug(f"Opened connection to {self.addr}:{self.port}")
            return True
        except Exception as e:
            self.logger.error(f"Failed to open connection: {e}")
            return False

    def close(self):
        """Close the connection and drop the transport/protocol references."""
        if self.transport:
            self.transport.close()
            self.transport = None
            self.protocol = None

    async def sendto(self, msg: dict, msg_id: str = "HTB"):
        """Send a message to the server.

        Opens the connection first if no transport exists. If that fails the
        message is dropped and an error is logged.

        Args:
            msg: Message dictionary; the ``name``, ``id`` and ``time`` fields
                are written into it in place before encoding.
            msg_id: Message ID (HTB, PLG, etc.)
        """
        if not self.transport:
            await self.open()

        if not self.transport:
            self.logger.error("Cannot send - no transport")
            return

        # Add standard fields
        msg["name"] = shortname(self.name)
        msg["id"] = self.conn_id
        msg["time"] = time.time()

        # Encode message
        data = dicttos(msg_id, msg)

        # Send
        self.transport.sendto(data, (self.addr, self.port))
        self.send_count += 1
        self.lastsend = time.time()

        self.logger.debug(f"Sent {msg_id} message ({len(data)} bytes)")

    def handle_ack(self, msg: dict, now: float):
        """Handle ACK message from server.

        RTT is calculated as: (time ACK received) - (time of the latest send).

        Args:
            msg: Decoded ACK message (not inspected here).
            now: Time the ACK was received, in ``time.time()`` seconds.
        """
        self.lastack = now

        # lastsend is updated by every sendto() call, whatever the message ID.
        rtt = (now - self.lastsend) * 1000.0  # Convert to ms

        self.rtts.append(rtt)
        if len(self.rtts) > 10:
            # Keep only the ten most recent samples.
            self.rtts.pop(0)

        self.ackcount += 1
        self.logger.debug(f"ACK received, RTT: {rtt:.1f}ms")
|
||||
|
||||
|
||||
class HeartbeatProtocol(asyncio.DatagramProtocol):
    """Protocol handler for incoming UDP messages.

    Decodes each datagram with ``stodict`` and dispatches on its ``ID``
    field: ACKs are handled inline, CMD and UPD are handed to background
    tasks.
    """

    def __init__(self, connection: AsyncConnection):
        """Bind the protocol to the connection whose replies it handles."""
        self.connection = connection
        self.logger = logging.getLogger("hbc.protocol")
        # The event loop keeps only weak references to tasks, so a handler
        # task with no other reference can be garbage-collected mid-run.
        self._tasks = set()

    def _task_done(self, task):
        """Forget a finished handler task and log any exception it raised."""
        self._tasks.discard(task)
        if task.cancelled():
            return
        exc = task.exception()
        if exc is not None:
            self.logger.error(f"Handler task failed: {exc}", exc_info=exc)

    def _spawn(self, coro):
        """Schedule *coro* as a task and hold a reference until it finishes."""
        task = asyncio.create_task(coro)
        self._tasks.add(task)
        task.add_done_callback(self._task_done)
        return task

    def datagram_received(self, data: bytes, addr):
        """Handle incoming datagram."""
        try:
            msg = stodict(data)
            if not msg:
                self.logger.warning(f"Failed to parse message from {addr}")
                return

            now = time.time()
            msg_id = msg.get("ID")

            if msg_id == "ACK":
                self.connection.handle_ack(msg, now)
            elif msg_id == "CMD":
                # Command from server
                self._spawn(handle_command(self.connection, msg))
            elif msg_id == "UPD":
                # Update from server
                self._spawn(handle_update(self.connection, msg))
            else:
                self.logger.warning(f"Unknown message type: {msg_id}")

        except Exception as e:
            # A bad datagram must never take the protocol down.
            self.logger.error(f"Error processing datagram: {e}", exc_info=True)

    def error_received(self, exc):
        """Handle protocol errors."""
        self.logger.error(f"Protocol error: {exc}")
|
||||
|
||||
|
||||
async def handle_command(conn: AsyncConnection, msg: dict):
    """Execute a command received from server and send back the result.

    The command runs in a worker thread so the event loop (and with it the
    periodic heartbeats) keeps running while the command executes.

    Args:
        conn: Connection the response is sent on.
        msg: Decoded CMD message; the shell command is read from ``"cmd"``.
            Nothing happens when it is missing or empty.
    """
    import subprocess

    cmd = msg.get("cmd", "")
    if not cmd:
        return

    logger = logging.getLogger("hbc.command")
    logger.info(f"Executing command: {cmd}")

    def _run() -> bytes:
        # NOTE(review): shell=True runs the server-supplied text verbatim;
        # this relies on CMD messages coming from a trusted server — confirm.
        return subprocess.check_output(
            cmd, shell=True, stderr=subprocess.STDOUT, timeout=30
        )

    try:
        # check_output blocks for up to 30 s; keep it off the event loop.
        raw = await asyncio.get_running_loop().run_in_executor(None, _run)
        # errors="replace": non-UTF-8 output must not turn success into Error.
        result = raw.decode(errors="replace")
        status = "OK"
    except subprocess.CalledProcessError as e:
        result = str(e)
        status = "CalledProcessError"
    except subprocess.TimeoutExpired:
        result = "Command timed out"
        status = "Timeout"
    except Exception as e:
        result = str(e)
        status = "Error"

    # Send response
    response = {
        "service": "command",
        "msg": f"{status} {result}"
    }
    await conn.sendto(response)
|
||||
|
||||
|
||||
async def handle_update(conn: AsyncConnection, msg: dict):
    """Handle self-update from server.

    Decodes the base64 payload in ``msg["code"]``, checks it against the MD5
    hex digest in ``msg["csum"]``, backs up the running script
    (``sys.argv[0]``) to ``<script>.sav`` and overwrites the script with the
    new code. Every outcome is reported to the server as an ``update``
    service message. On success ``dorestart`` is set and ``stop()`` is
    called.

    Args:
        conn: Connection the status reports are sent on.
        msg: Decoded UPD message with ``code`` and ``csum`` fields.
    """
    import codecs
    import shutil

    global dorestart

    logger = logging.getLogger("hbc.update")

    async def _fail(error: str) -> None:
        """Log *error* and report it to the server."""
        logger.error(error)
        await conn.sendto({"service": "update", "msg": error})

    try:
        code = codecs.decode(msg["code"], "base64").decode()
        csum = msg["csum"]
    except Exception as e:
        await _fail(f"Missing code/csum: {e}")
        return

    # Verify checksum
    # NOTE(review): MD5 only detects accidental corruption; it does not
    # authenticate the sender of the update.
    payload = code.encode()
    if md5(payload).hexdigest() != csum:
        await _fail("Checksum mismatch")
        return

    # Backup current file
    fn = sys.argv[0]
    ofn = f"{fn}.sav"
    try:
        shutil.copy2(fn, ofn)
    except Exception as e:
        await _fail(f"Backup failed: {e}")
        return

    # Write new code. Binary mode stores exactly the bytes that were
    # checksummed, independent of locale encoding or newline translation.
    try:
        with open(fn, "wb") as fh:
            fh.write(payload)
    except Exception as e:
        # The script may now be truncated; put the backup back.
        try:
            shutil.copy2(ofn, fn)
        except Exception as restore_err:
            logger.error(f"Restore from {ofn} failed: {restore_err}")
        await _fail(f"Write failed: {e}")
        return

    logger.info("Update successful, restart required")
    await conn.sendto({"service": "update", "msg": "OK"})

    # Trigger restart
    dorestart = True
    stop()
|
||||
|
||||
|
||||
async def heartbeat_sender(conn: AsyncConnection, interval: int):
    """Send an ``HTB`` heartbeat on ``conn`` every ``interval`` seconds.

    Each heartbeat carries ``conn.ackcount``, the latest entry of
    ``conn.rtts`` and the interval.  The loop runs while the module-level
    ``running`` flag is true and ends as soon as ``shutdown_event`` is set.
    Send errors are logged and the loop continues; cancellation is logged
    and re-raised.

    Args:
        conn: Connection to send on
        interval: Heartbeat interval in seconds
    """
    logger = logging.getLogger("hbc.heartbeat")

    while running:
        try:
            msg = {
                "acks": conn.ackcount,
                "rtt": conn.rtts[-1],
                "interval": interval,
            }
            await conn.sendto(msg, "HTB")

        # CancelledError is handled before the generic handler: on Python
        # 3.7 it still derives from Exception and would otherwise be
        # swallowed, leaving the task impossible to cancel.
        except asyncio.CancelledError:
            logger.debug("Heartbeat sender cancelled")
            raise
        except Exception as e:
            logger.error(f"Error sending heartbeat: {e}", exc_info=True)

        # Wait for next interval or shutdown event
        try:
            if shutdown_event:
                await asyncio.wait_for(
                    shutdown_event.wait(),
                    timeout=interval,
                )
                # The event was set before the timeout: shut down.
                break
            else:
                await asyncio.sleep(interval)
        except asyncio.TimeoutError:
            pass  # Normal timeout, continue loop
        except asyncio.CancelledError:
            logger.debug("Heartbeat sender cancelled during sleep")
            raise
|
||||
|
||||
|
||||
async def plugin_collector(conn: AsyncConnection, registry: PluginRegistry):
    """Collect plugin data and send it as ``PLG`` messages.

    Enabled ``InfoPlugin`` instances are collected once, right away.
    Enabled ``MonitorPlugin`` instances are grouped by their ``interval``
    and one ``plugin_collector_interval`` task is started per group; this
    coroutine then waits for those tasks.  If it is cancelled, the
    sub-tasks are cancelled too and the cancellation is re-raised.

    Args:
        conn: Connection to send on
        registry: Plugin registry
    """
    from collections import defaultdict

    logger = logging.getLogger("hbc.plugins")

    # Collect InfoPlugins once at startup.  Plugins whose ``enabled`` flag
    # is false are skipped.
    info_plugins = [p for p in registry.get_by_type(InfoPlugin) if p.enabled]
    for plugin in info_plugins:
        try:
            data = await plugin.collect()
            if data:
                # Create PLG message with plugin name
                plugin_msg = {"plugin": plugin.name, **data}
                await conn.sendto(plugin_msg, "PLG")
                logger.info(f"Sent {plugin.name} data")
        except Exception as e:
            logger.error(f"Error collecting {plugin.name}: {e}", exc_info=True)

    # Schedule MonitorPlugins, grouped by interval
    by_interval = defaultdict(list)
    for plugin in registry.get_by_type(MonitorPlugin):
        if not plugin.enabled:
            continue
        if plugin.interval <= 0:
            # A non-positive interval would make the periodic loop spin
            # without ever sleeping.
            logger.warning(
                f"Skipping {plugin.name}: invalid interval {plugin.interval}"
            )
            continue
        by_interval[plugin.interval].append(plugin)

    # Create tasks for each interval
    tasks = [
        asyncio.create_task(plugin_collector_interval(conn, plugins, interval))
        for interval, plugins in by_interval.items()
    ]

    # Wait for all tasks
    if tasks:
        try:
            await asyncio.gather(*tasks, return_exceptions=True)
        except asyncio.CancelledError:
            logger.debug("Plugin collector cancelled, cancelling sub-tasks")
            for task in tasks:
                if not task.done():
                    task.cancel()
            raise
|
||||
|
||||
|
||||
async def plugin_collector_interval(
    conn: AsyncConnection,
    plugins: List,
    interval: int
):
    """Periodically collect a group of plugins that share one interval.

    While the module-level ``running`` flag is true, every plugin is
    collected in turn and non-empty results are sent as ``PLG`` messages.
    The coroutine then waits ``interval`` seconds, returning early when
    ``shutdown_event`` is set.  Collection errors are logged and skipped;
    cancellation is logged and re-raised.

    Args:
        conn: Connection to send on
        plugins: List of plugins to collect
        interval: Collection interval in seconds
    """
    log = logging.getLogger(f"hbc.plugins.{interval}s")

    while running:
        for current in plugins:
            try:
                payload = await current.collect()
                if not payload:
                    continue
                # The message is a plain dict: plugin name plus its data.
                await conn.sendto({"plugin": current.name, **payload}, "PLG")
                log.debug(f"Sent {current.name} data")
            except asyncio.CancelledError:
                log.debug("Plugin collector cancelled")
                raise
            except Exception as e:
                log.error(
                    f"Error collecting {current.name}: {e}",
                    exc_info=True
                )

        # Sleep until the next round, or stop when shutdown is signalled.
        try:
            if not shutdown_event:
                await asyncio.sleep(interval)
                continue
            await asyncio.wait_for(shutdown_event.wait(), timeout=interval)
        except asyncio.TimeoutError:
            continue  # Interval elapsed without shutdown: next round
        except asyncio.CancelledError:
            log.debug("Plugin collector cancelled during sleep")
            raise
        # wait_for returned normally, i.e. the shutdown event was set.
        break
|
||||
|
||||
|
||||
def shortname(name: str) -> str:
    """Return the part of ``name`` before the first dot."""
    head, _, _ = name.partition(".")
    return head
|
||||
|
||||
|
||||
def stop():
    """Request shutdown of the client.

    Clears the global ``running`` flag, sets ``shutdown_event`` (when one
    exists) so sleeping tasks wake up, and cancels every task in
    ``active_tasks`` that has not finished yet.
    """
    global running
    running = False

    # Wake up tasks that are waiting out their interval.
    if shutdown_event:
        shutdown_event.set()

    # Cancel whatever is still running.
    for pending in (t for t in active_tasks if not t.done()):
        pending.cancel()
|
||||
|
||||
|
||||
async def cleanup(connections: List[AsyncConnection]):
    """Send a shutdown message on every connection, then close it.

    A failure to send is logged and does not prevent the connection from
    being closed.  After all connections are handled the coroutine sleeps
    for half a second before returning.

    Args:
        connections: Connections to notify and close.
    """
    log = logging.getLogger("hbc.cleanup")
    log.info("Cleaning up connections")

    for link in connections:
        try:
            await link.sendto({"shutdown": 1, "acks": link.ackcount})
        except Exception as e:
            log.error(f"Error sending shutdown: {e}")

        link.close()

    # Give messages time to send
    await asyncio.sleep(0.5)
|
||||
|
||||
|
||||
async def async_main(args, config):
    """Run the heartbeat client until it is stopped.

    Resolves every host in ``args.hosts``, opens one ``AsyncConnection``
    per resolved address, optionally sends a boot/service message, loads
    plugins from the ``plugins`` directory next to this file, installs
    SIGTERM/SIGINT handlers that call ``stop`` and then runs one heartbeat
    task per connection plus (if any plugin is enabled) a plugin collector
    on the first connection.  When the tasks end, connections are cleaned
    up and plugins unloaded.

    Args:
        args: Parsed command line (reads ``name``, ``hosts``, ``boot``,
            ``message`` and ``daemon``).
        config: Configuration mapping (reads ``hb_port`` and ``interval``;
            also passed to the plugin loader).

    Returns:
        1 when no connection could be opened, otherwise 0.
    """
    global running, shutdown_event, active_tasks

    # Create shutdown event
    shutdown_event = asyncio.Event()
    active_tasks = []

    logger = logging.getLogger("hbc.main")
    # Inside a coroutine the running loop is the one to use;
    # asyncio.get_event_loop() is deprecated for this purpose.
    loop = asyncio.get_running_loop()

    # Setup
    iam = socket.gethostname()
    if args.name:
        iam = args.name

    hb_hosts = args.hosts
    hb_port = config.get("hb_port", PORT)
    interval = config.get("interval", INTERVAL)

    logger.info(f"Starting hbc for {iam} -> {hb_hosts}")
    logger.info(f"Port: {hb_port}, Interval: {interval}s")

    # Create connections
    connections = []
    conn_id = 1

    for host in hb_hosts:
        try:
            # Resolve through the loop so a slow DNS lookup does not block
            # it.  IPPROTO_UDP is the protocol number getaddrinfo expects;
            # it has the same value as the SOL_UDP constant used before.
            addrs = await loop.getaddrinfo(
                host, hb_port, proto=socket.IPPROTO_UDP
            )
        except socket.gaierror as e:
            logger.error(f"Cannot resolve {host}: {e}")
            continue

        for addr_info in addrs:
            af = addr_info[0]
            addr = addr_info[4][0]

            conn = AsyncConnection(conn_id, addr, hb_port, af, iam)
            if await conn.open():
                connections.append(conn)
                conn_id += 1

    if not connections:
        logger.error("No connections established")
        return 1

    logger.info(f"Created {len(connections)} connections")

    # Send boot/message if requested
    if args.boot or args.message:
        boot_msg = {}
        if args.boot:
            boot_msg["boot"] = 1
        if args.message:
            boot_msg["service"] = "service"
            boot_msg["msg"] = args.message

        boot_msg["acks"] = 0
        for conn in connections:
            await conn.sendto(boot_msg)

    if args.message and not args.daemon:
        # Message-only mode: deliver the message and exit.
        await cleanup(connections)
        return 0

    # Load plugins
    registry = PluginRegistry()
    loader = PluginLoader(registry)

    plugin_dir = Path(__file__).parent / "plugins"
    if plugin_dir.exists():
        count = await loader.load_from_directory(plugin_dir, config)
        logger.info(f"Loaded {count} plugins")
    else:
        logger.warning(f"Plugin directory not found: {plugin_dir}")

    # Setup signal handlers
    for sig in (signal.SIGTERM, signal.SIGINT):
        loop.add_signal_handler(sig, stop)

    # Heartbeat senders (one per connection)
    for conn in connections:
        task = asyncio.create_task(heartbeat_sender(conn, interval))
        active_tasks.append(task)

    # Plugin data is sent over the first connection only.
    if connections and registry.get_enabled():
        task = asyncio.create_task(plugin_collector(connections[0], registry))
        active_tasks.append(task)

    # Wait for stop or tasks to complete
    try:
        await asyncio.gather(*active_tasks, return_exceptions=True)
    except asyncio.CancelledError:
        logger.info("Tasks cancelled")

    # Cleanup
    logger.info("Shutting down...")
    await cleanup(connections)
    await loader.unload_all()

    return 0
|
||||
|
||||
|
||||
def daemonize(
    working_dir="/",
    stdin="/dev/zero",
    stdout="/dev/null",
    stderr="/dev/null"
):
    """Detach the process using the UNIX double-fork sequence.

    The first fork lets the original parent exit; the child changes to
    ``working_dir``, starts a new session and resets the umask to 0.  The
    second fork lets the session leader exit.  Finally the standard
    streams are redirected to the given paths.

    Args:
        working_dir: Directory to change into after the first fork.
        stdin: Path opened for reading and duplicated onto stdin.
        stdout: Path opened in append mode and duplicated onto stdout.
        stderr: Path opened in append mode and duplicated onto stderr.
    """
    try:
        pid = os.fork()
        if pid > 0:
            # Original parent: leave without running exit handlers.
            os._exit(0)
    except OSError as e:
        sys.stderr.write(f"fork #1 failed: {e}\n")
        os._exit(1)

    os.chdir(working_dir)
    os.setsid()
    os.umask(0)

    try:
        pid = os.fork()
        if pid > 0:
            # Session leader: leave, only the grandchild keeps running.
            os._exit(0)
    except OSError as e:
        sys.stderr.write(f"fork #2 failed: {e}\n")
        sys.exit(1)

    sys.stdout.flush()
    sys.stderr.flush()

    # The opened files are only needed as a source for dup2(); the standard
    # descriptors keep the redirection alive, so close the originals
    # explicitly instead of leaving that to the garbage collector.
    with open(stdin, "r") as si, open(stdout, "a+") as so, open(stderr, "a+") as se:
        os.dup2(si.fileno(), sys.stdin.fileno())
        os.dup2(so.fileno(), sys.stdout.fileno())
        os.dup2(se.fileno(), sys.stderr.fileno())
|
||||
|
||||
|
||||
def build_parser():
    """Create the ``hbc`` command-line parser.

    Returns:
        An ``argparse.ArgumentParser`` with the options ``-b``, ``-c``,
        ``-m``, ``-n``, ``-d``, ``-v``, ``-x`` and one or more positional
        ``hosts``.
    """
    parser = argparse.ArgumentParser(
        prog="hbc",
        description="HeartBeatClient - send heartbeat messages to HeartBeatDaemon",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # (flags, keyword arguments) for every argument, in help-output order.
    argument_table = (
        (("-b", "--boot"), {"action": "store_true", "help": "Send a boot message"}),
        (("-c", "--config"), {"dest": "configfile", "help": "Config file path (YAML)"}),
        (("-m", "--message"), {"dest": "message", "help": "Send a message"}),
        (("-n", "--name"), {"dest": "name", "help": "Name to use in heartbeat message"}),
        (("-d", "--daemon"), {"action": "store_true", "help": "Run in daemon mode"}),
        (("-v", "--verbose"), {"action": "store_true", "help": "Verbose output"}),
        (("-x", "--debug"), {"action": "count", "default": 0, "help": "Increase debug level"}),
        (("hosts",), {"nargs": "+", "help": "Heartbeat daemon hosts to send to"}),
    )
    for flags, settings in argument_table:
        parser.add_argument(*flags, **settings)

    return parser
|
||||
|
||||
|
||||
def _log_to_syslog(log_level):
    """Replace the root logger's handlers with one that writes to syslog."""
    import syslog

    class _SyslogHandler(logging.Handler):
        """Forward log records through ``syslog.syslog``."""

        def emit(self, record):
            try:
                if record.levelno >= logging.CRITICAL:
                    priority = syslog.LOG_CRIT
                elif record.levelno >= logging.ERROR:
                    priority = syslog.LOG_ERR
                elif record.levelno >= logging.WARNING:
                    priority = syslog.LOG_WARNING
                elif record.levelno >= logging.INFO:
                    priority = syslog.LOG_INFO
                else:
                    priority = syslog.LOG_DEBUG
                syslog.syslog(priority, self.format(record))
            except Exception:
                self.handleError(record)

    root = logging.getLogger()
    for old_handler in list(root.handlers):
        root.removeHandler(old_handler)
        old_handler.close()

    handler = _SyslogHandler()
    # openlog("hbc", LOG_PID) already prefixes ident and pid.
    handler.setFormatter(
        logging.Formatter("%(name)s %(levelname)s: %(message)s")
    )
    root.addHandler(handler)
    root.setLevel(log_level)


def main(argv=None):
    """Command-line entry point of the heartbeat client.

    Parses arguments, loads the configuration, configures logging,
    optionally daemonizes, runs ``async_main`` and finally either
    re-executes the script (when an update requested a restart) or exits
    with the resulting code.

    Args:
        argv: Argument list to parse; ``None`` makes argparse use
            ``sys.argv[1:]``.
    """
    global running, dorestart

    parser = build_parser()
    args = parser.parse_args(argv)

    # Load config
    config = load_config(args.configfile)

    # Setup logging: -v and -x both switch to DEBUG.
    log_level = logging.INFO
    if args.verbose or args.debug:
        log_level = logging.DEBUG

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s %(name)s %(levelname)s: %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S"
    )

    # Daemonize if requested
    if args.daemon:
        print("Daemonizing...")
        import syslog
        syslog.openlog("hbc", syslog.LOG_PID, syslog.LOG_DAEMON)
        syslog.syslog(syslog.LOG_INFO, f"Starting heartbeat to {', '.join(args.hosts)}")
        daemonize()

        # After daemonize() stderr points at /dev/null, and calling
        # logging.basicConfig() a second time is a no-op because the root
        # logger already has a handler -- every log line would be lost.
        # Route logging through syslog instead.
        _log_to_syslog(log_level)

    # Run async main
    try:
        exit_code = asyncio.run(async_main(args, config))
    except KeyboardInterrupt:
        logging.info("Interrupted by user")
        exit_code = 0
    except Exception as e:
        logging.error(f"Fatal error: {e}", exc_info=True)
        exit_code = 1

    # Handle restart: replace this process with a fresh copy of the script.
    if dorestart:
        logging.info("Restarting...")
        os.execv(sys.argv[0], sys.argv)

    sys.exit(exit_code)
|
||||
|
||||
|
||||
# Run the client only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,410 @@
|
||||
"""Plugin system for extending Heartbeat data collection and monitoring.
|
||||
|
||||
This module provides the base classes and infrastructure for the plugin system
|
||||
that enables extending hbc (client) data collection and hbd (server) processing.
|
||||
|
||||
Plugin Types:
|
||||
- InfoPlugin: Collects static or rarely-changing information (OS, hardware)
|
||||
- MonitorPlugin: Collects periodic monitoring data (CPU, memory, disk usage)
|
||||
|
||||
Plugins run on the client (hbc) to gather data, which is then sent to the server
|
||||
(hbd) for storage, threshold checking, and display.
|
||||
"""
|
||||
|
||||
import importlib.util
|
||||
import inspect
|
||||
import logging
|
||||
import sys
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Type
|
||||
|
||||
|
||||
class Plugin(ABC):
    """Abstract base class shared by every plugin.

    Subclasses must implement ``initialize`` and ``collect``.

    Attributes:
        name: Unique plugin identifier (e.g., "os_info", "cpu_monitor")
        version: Plugin version string
        description: Human-readable description
        interval: Collection interval in seconds (0 = collect once)
        enabled: Whether the plugin is active
    """

    name: str = ""
    version: str = "1.0.0"
    description: str = ""
    interval: int = 0
    enabled: bool = True

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Store the configuration and create a per-plugin logger.

        Args:
            config: Plugin-specific configuration; a missing or empty
                value becomes an empty dict.
        """
        self._initialized = False
        self.config = config if config else {}
        # Logger name is derived from the plugin's ``name`` attribute.
        self.logger = logging.getLogger(f"plugin.{self.name}")

    @abstractmethod
    async def initialize(self) -> bool:
        """Prepare the plugin for use (check dependencies, load resources).

        Returns:
            True if initialization succeeded, False otherwise
        """

    @abstractmethod
    async def collect(self) -> Dict[str, Any]:
        """Gather the plugin's data.

        Returns:
            Dictionary mapping metric names to values, or an empty dict
            on error
        """

    async def cleanup(self) -> None:
        """Release plugin resources; the base implementation does nothing."""
        return None

    def validate_data(self, data: Dict[str, Any]) -> bool:
        """Check collected data before it is sent.

        The base implementation only checks the type; override for custom
        validation.

        Args:
            data: Data returned from collect()

        Returns:
            True if ``data`` is a dict, False otherwise
        """
        is_mapping = isinstance(data, dict)
        return is_mapping
|
||||
|
||||
|
||||
class InfoPlugin(Plugin):
    """Plugin for static or rarely-changing information.

    Typical data: OS name and version, hardware specifications, network
    interface MAC addresses.

    The first successful ``collect()`` result is cached and returned by
    later calls until ``invalidate_cache()`` is called.  Subclasses
    implement ``_collect_info()`` rather than ``collect()``.
    """

    interval: int = 0  # Collect once at startup

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        super().__init__(config)
        # None means "nothing cached yet".
        self._cached_data: Optional[Dict[str, Any]] = None

    async def get_cached_data(self) -> Optional[Dict[str, Any]]:
        """Return the cached data without triggering a collection.

        Returns:
            Cached data dict, or None if nothing has been cached
        """
        return self._cached_data

    async def collect(self) -> Dict[str, Any]:
        """Return the cached information, collecting it first if needed.

        An empty result is how ``collect()`` implementations signal an
        error, so it is returned but not cached; the next call tries
        again instead of serving the failure forever.
        """
        if self._cached_data is not None:
            return self._cached_data

        data = await self._collect_info()
        if data:
            self._cached_data = data
        return data

    @abstractmethod
    async def _collect_info(self) -> Dict[str, Any]:
        """Perform the actual data collection.

        Override this method instead of collect() for InfoPlugins.
        """

    def invalidate_cache(self) -> None:
        """Force re-collection on next collect() call."""
        self._cached_data = None
|
||||
|
||||
|
||||
class MonitorPlugin(Plugin):
    """Plugin for metrics that are sampled periodically.

    Typical data: CPU usage, memory consumption, disk I/O statistics,
    network traffic.

    Every non-empty reading is stamped with the collection time under the
    ``_timestamp`` key and remembered as the last reading.  Subclasses
    implement ``_collect_metrics()`` rather than ``collect()``.
    """

    interval: int = 30  # Default: collect every 30 seconds

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        super().__init__(config)
        self._last_reading: Optional[Dict[str, Any]] = None

    def get_last_reading(self) -> Optional[Dict[str, Any]]:
        """Return the most recent non-empty reading.

        Returns:
            Last reading dict with timestamp, or None if not yet collected
        """
        return self._last_reading

    async def collect(self) -> Dict[str, Any]:
        """Collect metrics, timestamp them and keep them as last reading."""
        import time

        metrics = await self._collect_metrics()
        if not metrics:
            # Nothing collected: pass the empty result through untouched.
            return metrics

        metrics['_timestamp'] = time.time()
        self._last_reading = metrics
        return metrics

    @abstractmethod
    async def _collect_metrics(self) -> Dict[str, Any]:
        """Perform the actual metric collection.

        Override this method instead of collect() for MonitorPlugins.
        """
|
||||
|
||||
|
||||
class PluginRegistry:
    """Name-indexed collection of loaded plugin instances.

    Offers lookups by name, by class and by collection interval.
    """

    def __init__(self):
        self.logger = logging.getLogger("plugin.registry")
        self._plugins: Dict[str, Plugin] = {}

    def register(self, plugin: Plugin) -> bool:
        """Add a plugin instance under its ``name``.

        Args:
            plugin: Plugin instance to register

        Returns:
            True if registered, False if the name is already taken
        """
        key = plugin.name
        if key in self._plugins:
            self.logger.error(f"Plugin '{plugin.name}' already registered")
            return False

        self._plugins[key] = plugin
        self.logger.info(f"Registered plugin: {plugin.name} v{plugin.version}")
        return True

    def unregister(self, name: str) -> bool:
        """Remove the plugin registered under ``name``.

        Args:
            name: Plugin name to unregister

        Returns:
            True if a plugin was removed, False if the name is unknown
        """
        try:
            del self._plugins[name]
        except KeyError:
            return False
        self.logger.info(f"Unregistered plugin: {name}")
        return True

    def get(self, name: str) -> Optional[Plugin]:
        """Look up a plugin by name.

        Args:
            name: Plugin name

        Returns:
            Plugin instance or None if not found
        """
        return self._plugins.get(name)

    def get_all(self) -> List[Plugin]:
        """Return a new list with every registered plugin."""
        return [*self._plugins.values()]

    def get_enabled(self) -> List[Plugin]:
        """Return the plugins whose ``enabled`` attribute is truthy."""
        return [entry for entry in self._plugins.values() if entry.enabled]

    def get_by_type(self, plugin_type: Type[Plugin]) -> List[Plugin]:
        """Return the plugins that are instances of ``plugin_type``.

        Args:
            plugin_type: Plugin class (InfoPlugin or MonitorPlugin)

        Returns:
            List of plugins matching the type
        """
        matches = []
        for entry in self._plugins.values():
            if isinstance(entry, plugin_type):
                matches.append(entry)
        return matches

    def get_by_interval(self, interval: int) -> List[Plugin]:
        """Return the plugins whose ``interval`` equals ``interval``.

        Args:
            interval: Interval in seconds (0 for one-time collection)

        Returns:
            List of plugins with matching interval
        """
        matches = []
        for entry in self._plugins.values():
            if entry.interval == interval:
                matches.append(entry)
        return matches
|
||||
|
||||
|
||||
class PluginLoader:
    """Load plugin modules from the filesystem and register their plugins.

    Each ``*.py`` file in a plugin directory is imported as
    ``plugins.<stem>``; every concrete ``Plugin`` subclass found in it is
    instantiated, initialized and registered with the ``PluginRegistry``.
    """

    def __init__(self, registry: PluginRegistry):
        self.registry = registry
        self.logger = logging.getLogger("plugin.loader")
        # Modules imported by this loader, keyed by their sys.modules name.
        self._loaded_modules: Dict[str, Any] = {}

    async def load_from_directory(
        self,
        directory: Path,
        config: Optional[Dict[str, Any]] = None
    ) -> int:
        """Load all plugins from a directory.

        Files whose name starts with ``_`` are skipped.  Files are
        processed in sorted order so the load order is reproducible.  A
        failure in one file or one plugin class is logged and does not
        stop the remaining ones from loading.

        Args:
            directory: Path to plugin directory
            config: Configuration dict; the entry stored under a plugin's
                ``name`` is passed to that plugin's constructor

        Returns:
            Number of plugins successfully loaded
        """
        if not directory.exists() or not directory.is_dir():
            self.logger.warning(f"Plugin directory not found: {directory}")
            return 0

        loaded_count = 0
        plugin_config = config or {}

        for plugin_file in sorted(directory.glob("*.py")):
            if plugin_file.name.startswith("_"):
                continue  # Skip __init__.py and private modules

            self.logger.debug(f"Processing plugin file: {plugin_file.name}")

            try:
                module = self._import_module(plugin_file)
                if module is None:
                    continue

                for plugin_class in self._find_plugin_classes(module):
                    if await self._load_plugin(plugin_class, plugin_config):
                        loaded_count += 1

            except Exception as e:
                self.logger.error(
                    f"Error loading plugin from {plugin_file}: {e}",
                    exc_info=True
                )

        return loaded_count

    def _import_module(self, plugin_file: Path) -> Optional[Any]:
        """Import one plugin file; return the module or None without a spec."""
        module_name = f"plugins.{plugin_file.stem}"
        spec = importlib.util.spec_from_file_location(module_name, plugin_file)
        if not spec or not spec.loader:
            self.logger.warning(f"Could not create spec for {plugin_file}")
            return None

        module = importlib.util.module_from_spec(spec)
        sys.modules[module_name] = module
        try:
            spec.loader.exec_module(module)
        except BaseException:
            # Do not leave a half-initialized module importable.
            sys.modules.pop(module_name, None)
            raise

        self._loaded_modules[module_name] = module
        self.logger.debug(f"Loaded module: {module_name}")
        return module

    def _find_plugin_classes(self, module: Any) -> List[Type[Plugin]]:
        """Return the concrete Plugin subclasses visible in ``module``."""
        found: List[Type[Plugin]] = []
        # Track classes already seen (handles module-level aliases).
        processed_classes = set()

        for name, obj in inspect.getmembers(module, inspect.isclass):
            if obj in (Plugin, InfoPlugin, MonitorPlugin):
                self.logger.debug(f"Skipping base class: {name}")
                continue
            if not issubclass(obj, Plugin):
                self.logger.debug(f"Skipping non-Plugin class: {name}")
                continue
            if inspect.isabstract(obj):
                # Intermediate bases with unimplemented abstract methods
                # cannot be instantiated.
                self.logger.debug(f"Skipping abstract class: {name}")
                continue
            if id(obj) in processed_classes:
                self.logger.debug(f"Skipping duplicate reference to: {obj.__name__}")
                continue
            processed_classes.add(id(obj))

            self.logger.debug(f"Found plugin class: {name}")
            found.append(obj)

        return found

    async def _load_plugin(
        self,
        plugin_class: Type[Plugin],
        plugin_config: Dict[str, Any]
    ) -> bool:
        """Instantiate, initialize and register one plugin class."""
        try:
            plugin = plugin_class(
                config=plugin_config.get(plugin_class.name, {})
            )
        except Exception as e:
            # A constructor may raise (for example when a dependency is
            # missing); that must not abort the other plugins.
            self.logger.error(
                f"Error instantiating plugin {plugin_class.__name__}: {e}",
                exc_info=True
            )
            return False

        try:
            initialized = await plugin.initialize()
        except Exception as e:
            self.logger.error(
                f"Error initializing plugin {plugin.name}: {e}",
                exc_info=True
            )
            return False

        if not initialized:
            self.logger.warning(
                f"Plugin {plugin.name} failed initialization, skipping"
            )
            return False

        if not self.registry.register(plugin):
            return False

        self.logger.info(
            f"Loaded plugin: {plugin.name} v{plugin.version} "
            f"(interval: {plugin.interval}s)"
        )
        return True

    async def unload_all(self) -> None:
        """Clean up and unregister every plugin, then drop loaded modules."""
        for plugin in self.registry.get_all():
            try:
                await plugin.cleanup()
            except Exception as e:
                self.logger.error(
                    f"Error cleaning up plugin {plugin.name}: {e}",
                    exc_info=True
                )
            self.registry.unregister(plugin.name)

        # Remove loaded modules
        for module_name in self._loaded_modules:
            sys.modules.pop(module_name, None)
        self._loaded_modules.clear()
|
||||
@@ -0,0 +1,129 @@
|
||||
"""CPU Monitoring Plugin for Heartbeat.
|
||||
|
||||
Collects CPU usage statistics including overall CPU percentage, per-core usage,
|
||||
load average, and process counts.
|
||||
"""
|
||||
|
||||
from typing import Any, Dict, Optional
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Import from parent package
|
||||
from hbd.client.plugin import MonitorPlugin
|
||||
|
||||
|
||||
class CPUMonitorPlugin(MonitorPlugin):
    """Monitor CPU usage and load.

    Collects:
    - Overall CPU usage percentage
    - Per-core CPU usage (if enabled in config)
    - Load average (1min, 5min, 15min)
    - Process count
    - CPU frequency (if available)
    - CPU time percentages (user, system, idle, iowait if present)

    Configuration keys read: ``per_core`` (default False) and
    ``interval`` in seconds (default 300).
    """

    name = "cpu_monitor"
    version = "1.0.0"
    description = "CPU usage and load monitoring"
    interval = 300  # MonitorPlugin: collect every 5 minutes by default

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        super().__init__(config)
        # Set by initialize() once psutil has been imported.
        self.psutil = None
        self.per_core = config.get("per_core", False) if config else False
        self.interval = config.get("interval", 300) if config else 300

    async def initialize(self) -> bool:
        """Import psutil and keep a reference to it.

        Returns:
            True if psutil is available, False otherwise
        """
        self.logger.info(f"Initializing {self.name} plugin")

        try:
            import psutil
            self.psutil = psutil
            self.logger.info(f"{self.name} initialized successfully")
            return True
        except ImportError:
            self.logger.error(
                "psutil module not available. Install with: pip install psutil"
            )
            return False

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Collect CPU metrics.

        Returns:
            Dictionary with CPU metrics; empty if psutil is unavailable
            or collection failed as a whole
        """
        import asyncio

        if not self.psutil:
            return {}

        try:
            data = {}

            # Overall CPU usage, measured over one second.  The call sleeps
            # for that second, so it runs in the default executor to keep
            # the event loop (heartbeats, other plugins) responsive.
            loop = asyncio.get_running_loop()
            data["cpu_percent"] = await loop.run_in_executor(
                None, lambda: self.psutil.cpu_percent(interval=1)
            )

            # Per-core CPU usage (if enabled)
            if self.per_core:
                per_core_percents = self.psutil.cpu_percent(interval=0, percpu=True)
                data["cpu_per_core"] = per_core_percents
                data["cpu_core_count"] = len(per_core_percents)
            else:
                # Just report core count
                data["cpu_core_count"] = self.psutil.cpu_count()

            # Load average (Unix-like systems only)
            try:
                load_avg = self.psutil.getloadavg()
                data["load_1min"] = round(load_avg[0], 2)
                data["load_5min"] = round(load_avg[1], 2)
                data["load_15min"] = round(load_avg[2], 2)
            except (AttributeError, OSError):
                # Not available on Windows
                pass

            # Process count
            try:
                data["process_count"] = len(self.psutil.pids())
            except Exception as e:
                self.logger.warning(f"Could not get process count: {e}")

            # CPU frequency (if available)
            try:
                freq = self.psutil.cpu_freq()
                if freq:
                    data["cpu_freq_current"] = round(freq.current, 2)
                    data["cpu_freq_min"] = round(freq.min, 2)
                    data["cpu_freq_max"] = round(freq.max, 2)
            except (AttributeError, OSError, RuntimeError, SystemError) as e:
                # Not available on all systems, or may fail on FreeBSD with sysctl issues
                self.logger.debug(f"CPU frequency not available: {e}")

            # CPU times (user, system, idle, etc.)
            try:
                cpu_times = self.psutil.cpu_times_percent(interval=0)
                data["cpu_user"] = round(cpu_times.user, 1)
                data["cpu_system"] = round(cpu_times.system, 1)
                data["cpu_idle"] = round(cpu_times.idle, 1)
                if hasattr(cpu_times, "iowait"):
                    data["cpu_iowait"] = round(cpu_times.iowait, 1)
            except Exception as e:
                self.logger.debug(f"Could not get CPU times: {e}")

            self.logger.debug(
                f"Collected CPU metrics: {data.get('cpu_percent', 'N/A')}% usage"
            )
            return data

        except Exception as e:
            self.logger.error(f"Error collecting CPU metrics: {e}", exc_info=True)
            return {}
|
||||
@@ -0,0 +1,199 @@
|
||||
"""
|
||||
Disk monitoring plugin for Heartbeat.
|
||||
|
||||
Collects disk usage and I/O statistics using psutil.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Dict, Any, Optional, List
|
||||
|
||||
try:
|
||||
import psutil
|
||||
except ImportError:
|
||||
psutil = None
|
||||
|
||||
from hbd.client.plugin import MonitorPlugin
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DiskMonitorPlugin(MonitorPlugin):
    """
    Heartbeat monitor plugin that reports disk usage and disk I/O via psutil.

    Reported data:
        - Device and filesystem type of each selected partition
        - Usage of each selected partition (total, used, free, percent)
        - Per-disk I/O counters (read/write bytes and operation counts)
        - Per-disk I/O timing fields, when psutil exposes them
        - Counter deltas relative to the previous collection

    Configuration keys:
        interval: Seconds between collections (default: 300)
        partitions: Mount points to report; unset or empty reports all
        include_io: Report disk I/O statistics (default: True)
        exclude_types: Filesystem types to skip
            (default: tmpfs, devtmpfs, squashfs)
    """

    name = "disk_monitor"
    interval = 300  # seconds (five minutes) unless the config overrides it

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Read the plugin settings from the configuration.

        Args:
            config: Optional configuration dict; recognised keys are
                ``interval``, ``partitions``, ``include_io`` and
                ``exclude_types`` (defaults are listed on the class).

        Raises:
            ImportError: If psutil could not be imported.
        """
        super().__init__(config)
        settings = self.config

        # A falsy value (None or an empty list) means "report every partition"
        self.partitions = settings.get('partitions', None)
        self.include_io = settings.get('include_io', True)
        self.exclude_types = set(
            settings.get('exclude_types', ['tmpfs', 'devtmpfs', 'squashfs'])
        )
        self.interval = settings.get('interval', 300)

        if psutil is None:
            raise ImportError("psutil library is required for disk_monitor plugin")

        # Most recent per-disk I/O snapshot; baseline for the *_delta fields
        self._prev_io = {}

    async def initialize(self):
        """
        Verify psutil is usable and take the first I/O snapshot.

        Returns:
            False when psutil is missing, True otherwise. A failure to read
            the initial I/O counters is only logged.
        """
        if psutil is None:
            logger.error("psutil not available - disk_monitor cannot run")
            return False

        logger.info(f"Disk monitor initialized (interval: {self.interval}s, io: {self.include_io})")

        if not self.include_io:
            return True

        # Seed the baseline so the first collection can already report deltas
        try:
            self._prev_io = psutil.disk_io_counters(perdisk=True)
        except Exception as e:
            logger.warning(f"Could not initialize disk I/O counters: {e}")

        return True

    async def collect(self) -> Dict[str, Any]:
        """
        Collect current disk statistics.

        Returns:
            An empty dict when psutil is missing, ``{"error": <message>}``
            when collection raised, otherwise a dict with:
                - partitions: Dict keyed by mount point, each entry holding
                  device, fstype, total, used, free (bytes) and percent
                - io_counters: Dict keyed by disk name (only if include_io
                  and the counters could be read), each entry holding
                  read_count, write_count, read_bytes, write_bytes, the
                  read_time / write_time / busy_time fields psutil provides,
                  and read_bytes_delta, write_bytes_delta, read_count_delta,
                  write_count_delta once a previous snapshot exists
        """
        if psutil is None:
            logger.error("psutil not available")
            return {}

        try:
            data = await self._collect_metrics()
            logger.debug(f"Collected disk metrics: {len(data.get('partitions', {}))} partitions")
            return data
        except Exception as e:
            logger.error(f"Error collecting disk metrics: {e}")
            return {"error": str(e)}

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Assemble partition usage and, if enabled, per-disk I/O data."""
        metrics = {'partitions': self._usage_by_mountpoint()}

        if self.include_io:
            try:
                snapshot = psutil.disk_io_counters(perdisk=True)
                metrics['io_counters'] = {
                    disk_name: self._io_stats(disk_name, counters)
                    for disk_name, counters in snapshot.items()
                }
                # This snapshot is the baseline for the next round of deltas
                self._prev_io = snapshot
            except Exception as e:
                logger.warning(f"Could not collect disk I/O statistics: {e}")

        return metrics

    def _usage_by_mountpoint(self) -> Dict[str, Any]:
        """Return usage data for every partition that passes the filters."""
        usage_by_mount = {}

        for part in psutil.disk_partitions(all=False):
            # Drop excluded filesystem types and, when a partition list is
            # configured, every mount point that is not on it
            if part.fstype in self.exclude_types or (
                self.partitions and part.mountpoint not in self.partitions
            ):
                continue

            try:
                usage = psutil.disk_usage(part.mountpoint)
                usage_by_mount[part.mountpoint] = {
                    'device': part.device,
                    'fstype': part.fstype,
                    'total': usage.total,
                    'used': usage.used,
                    'free': usage.free,
                    'percent': usage.percent
                }
            except PermissionError:
                logger.debug(f"Permission denied accessing {part.mountpoint}")
            except Exception as e:
                logger.warning(f"Error reading {part.mountpoint}: {e}")

        return usage_by_mount

    def _io_stats(self, disk_name, counters) -> Dict[str, Any]:
        """Build the statistics entry for one disk from its psutil counters."""
        stats = {
            field: getattr(counters, field)
            for field in ('read_count', 'write_count', 'read_bytes', 'write_bytes')
        }

        # Timing fields are copied only when the counters object has them
        for field in ('read_time', 'write_time', 'busy_time'):
            if hasattr(counters, field):
                stats[field] = getattr(counters, field)

        # Deltas need a previous sample for this disk
        if disk_name in self._prev_io:
            previous = self._prev_io[disk_name]
            for field in ('read_bytes', 'write_bytes', 'read_count', 'write_count'):
                stats[f'{field}_delta'] = getattr(counters, field) - getattr(previous, field)

        return stats

    async def cleanup(self):
        """Log the shutdown; the plugin holds nothing that needs releasing."""
        logger.info("Disk monitor cleanup")
|
||||
|
||||
|
||||
# Plugin class (not an instance) exposed for automatic discovery
|
||||
plugin = DiskMonitorPlugin
|
||||
@@ -0,0 +1,168 @@
|
||||
"""
|
||||
Filesystem information plugin for Heartbeat.
|
||||
|
||||
Collects static filesystem and partition information using psutil.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
try:
|
||||
import psutil
|
||||
except ImportError:
|
||||
psutil = None
|
||||
|
||||
from hbd.client.plugin import InfoPlugin
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class FilesystemInfoPlugin(InfoPlugin):
    """
    Heartbeat info plugin describing the mounted filesystems.

    The class sets ``interval = 0`` and derives from InfoPlugin.

    psutil is asked for partitions with ``all=include_pseudo``, so pseudo
    filesystems (proc, sysfs, tmpfs, etc.) are only requested when
    ``include_pseudo`` is True.

    Reported data:
        - One entry per mounted filesystem (device, mount point,
          filesystem type, mount options)
        - Maximum filename and path length, where os.pathconf provides them
        - The sorted set of filesystem types and the mount count
        - The system boot time, when psutil can report it

    Configuration keys:
        include_pseudo: Include pseudo/virtual filesystems (default: False)
        exclude_types: Additional filesystem types to leave out (default: [])
    """

    name = "filesystem_info"
    interval = 0  # InfoPlugin - collect once

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Read the plugin settings from the configuration.

        Args:
            config: Optional configuration dict; recognised keys are
                ``include_pseudo`` (default: False) and ``exclude_types``
                (default: []).

        Raises:
            ImportError: If psutil could not be imported.
        """
        super().__init__(config)
        settings = self.config

        self.include_pseudo = settings.get('include_pseudo', False)
        # Nothing is excluded unless the user lists filesystem types here
        self.exclude_types = set(settings.get('exclude_types', []))

        if psutil is None:
            raise ImportError("psutil library is required for filesystem_info plugin")

    async def initialize(self):
        """
        Report whether the plugin can run.

        Returns:
            False when psutil is missing, True otherwise.
        """
        if psutil is None:
            logger.error("psutil not available - filesystem_info cannot run")
            return False

        logger.info(f"Filesystem info initialized (pseudo: {self.include_pseudo})")
        return True

    async def collect(self) -> Dict[str, Any]:
        """
        Collect filesystem information.

        Returns:
            An empty dict when psutil is missing, ``{"error": <message>}``
            when collection raised, otherwise a dict with:
                - filesystems: List of dicts, each with device, mountpoint,
                  fstype, opts and, where available, maxfile (maximum
                  filename length) and maxpath (maximum path length)
                - filesystem_types: Sorted list of the filesystem types found
                - mount_count: Number of entries in ``filesystems``
                - boot_time: Value of psutil.boot_time(), if it could be read
        """
        if psutil is None:
            logger.error("psutil not available")
            return {}

        try:
            data = await self._collect_info()
            logger.info(f"Collected filesystem info: {len(data.get('filesystems', []))} filesystems")
            return data
        except Exception as e:
            logger.error(f"Error collecting filesystem info: {e}")
            return {"error": str(e)}

    async def _collect_info(self) -> Dict[str, Any]:
        """Query psutil (and os.pathconf) and assemble the info dict."""
        import os

        # Result key -> pathconf name for the per-mount limits
        path_limits = (('maxfile', 'PC_NAME_MAX'), ('maxpath', 'PC_PATH_MAX'))

        entries = []
        seen_types = set()

        # all=True makes psutil include pseudo filesystems as well
        for part in psutil.disk_partitions(all=self.include_pseudo):
            if part.fstype in self.exclude_types:
                continue

            entry = {
                'device': part.device,
                'mountpoint': part.mountpoint,
                'fstype': part.fstype,
                'opts': part.opts,
            }

            # Limits are best effort: a failing lookup just leaves the key out
            try:
                if hasattr(os, 'pathconf'):
                    for key, conf_name in path_limits:
                        try:
                            limit = os.pathconf(part.mountpoint, conf_name)
                        except (OSError, ValueError):
                            continue
                        if limit:
                            entry[key] = limit
            except Exception as e:
                logger.debug(f"Could not get pathconf for {part.mountpoint}: {e}")

            entries.append(entry)
            seen_types.add(part.fstype)

        info = {
            'filesystems': entries,
            'filesystem_types': sorted(seen_types),
            'mount_count': len(entries),
        }

        # Boot time is optional extra context; failure is only logged
        try:
            info['boot_time'] = psutil.boot_time()
        except Exception as e:
            logger.debug(f"Could not get boot time: {e}")

        return info

    async def cleanup(self):
        """Log the shutdown; the plugin holds nothing that needs releasing."""
        logger.info("Filesystem info cleanup")
|
||||
|
||||
|
||||
# Plugin class (not an instance) exposed for automatic discovery
|
||||
plugin = FilesystemInfoPlugin
|
||||
@@ -0,0 +1,147 @@
|
||||
"""
|
||||
Memory monitoring plugin for Heartbeat.
|
||||
|
||||
Collects memory and swap usage statistics using psutil.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
try:
|
||||
import psutil
|
||||
except ImportError:
|
||||
psutil = None
|
||||
|
||||
from hbd.client.plugin import MonitorPlugin
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MemoryMonitorPlugin(MonitorPlugin):
    """
    Heartbeat monitor plugin that reports RAM and swap usage via psutil.

    Reported data:
        - Physical memory totals (total, available, used, free, percent)
        - Platform-specific memory fields when psutil exposes them
          (active, inactive, buffers, cached, shared)
        - Swap usage and swap in/out counters (optional)

    Configuration keys:
        interval: Seconds between collections (default: 300)
        include_swap: Report swap statistics (default: True)
    """

    name = "memory_monitor"
    interval = 300  # seconds (five minutes) unless the config overrides it

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Read the plugin settings from the configuration.

        Args:
            config: Optional configuration dict; recognised keys are
                ``interval`` (default: 300) and ``include_swap``
                (default: True).

        Raises:
            ImportError: If psutil could not be imported.
        """
        super().__init__(config)
        settings = self.config

        self.include_swap = settings.get('include_swap', True)
        self.interval = settings.get('interval', 300)

        if psutil is None:
            raise ImportError("psutil library is required for memory_monitor plugin")

    async def initialize(self):
        """
        Report whether the plugin can run.

        Returns:
            False when psutil is missing, True otherwise.
        """
        if psutil is None:
            logger.error("psutil not available - memory_monitor cannot run")
            return False

        logger.info(f"Memory monitor initialized (interval: {self.interval}s, swap: {self.include_swap})")
        return True

    async def collect(self) -> Dict[str, Any]:
        """
        Collect current memory statistics.

        Returns:
            An empty dict when psutil is missing, ``{"error": <message>}``
            when collection raised, otherwise a dict with:
                - memory_total, memory_available, memory_used, memory_free,
                  memory_percent: values of psutil.virtual_memory()
                - memory_active, memory_inactive, memory_buffers,
                  memory_cached, memory_shared: present only when psutil
                  provides the matching field
                - swap_total, swap_used, swap_free, swap_percent: values of
                  psutil.swap_memory() (if include_swap)
                - swap_sin, swap_sout: swap in/out counters, when psutil
                  provides them (if include_swap)
        """
        if psutil is None:
            logger.error("psutil not available")
            return {}

        try:
            data = await self._collect_metrics()
            logger.debug(f"Collected memory metrics: {len(data)} fields")
            return data
        except Exception as e:
            logger.error(f"Error collecting memory metrics: {e}")
            return {"error": str(e)}

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Read virtual-memory and (optionally) swap figures from psutil."""
        vmem = psutil.virtual_memory()

        # Fields read unconditionally
        metrics = {
            f'memory_{field}': getattr(vmem, field)
            for field in ('total', 'available', 'used', 'free', 'percent')
        }

        # Fields copied only when this platform's result has them
        for field in ('active', 'inactive', 'buffers', 'cached', 'shared'):
            if hasattr(vmem, field):
                metrics[f'memory_{field}'] = getattr(vmem, field)

        if not self.include_swap:
            return metrics

        # Swap is best effort: a failure is logged and the RAM data is kept
        try:
            swap = psutil.swap_memory()
            for field in ('total', 'used', 'free', 'percent'):
                metrics[f'swap_{field}'] = getattr(swap, field)

            # In/out counters may be missing on some platforms
            for field in ('sin', 'sout'):
                if hasattr(swap, field):
                    metrics[f'swap_{field}'] = getattr(swap, field)
        except Exception as e:
            logger.warning(f"Could not collect swap statistics: {e}")

        return metrics

    async def cleanup(self):
        """Log the shutdown; the plugin holds nothing that needs releasing."""
        logger.info("Memory monitor cleanup")
|
||||
|
||||
|
||||
# Plugin class (not an instance) exposed for automatic discovery
|
||||
plugin = MemoryMonitorPlugin
|
||||
@@ -0,0 +1,283 @@
|
||||
"""Nagios Plugin Runner for Heartbeat.
|
||||
|
||||
Executes Nagios-compatible monitoring plugins and parses their output.
|
||||
|
||||
Nagios Plugin Standard:
|
||||
- Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
|
||||
- Output format: Single line status message, optional performance data
|
||||
- Performance data format: 'label'=value[UOM];[warn];[crit];[min];[max]
|
||||
|
||||
Example configuration in ~/.hb.yaml:
|
||||
```yaml
|
||||
nagios_runner:
|
||||
interval: 60
|
||||
commands:
|
||||
- name: check_disk_root
|
||||
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
|
||||
- name: check_procs
|
||||
command: /usr/lib/nagios/plugins/check_procs -w 250 -c 400
|
||||
- name: check_load
|
||||
command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
|
||||
```
|
||||
"""
|
||||
|
||||
import re
|
||||
import subprocess
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from hbd.client.plugin import MonitorPlugin
|
||||
|
||||
|
||||
# Exit codes of the Nagios plugin standard
NAGIOS_OK, NAGIOS_WARNING, NAGIOS_CRITICAL, NAGIOS_UNKNOWN = 0, 1, 2, 3

# Display name for each exit code
STATUS_NAMES = dict(
    zip(
        (NAGIOS_OK, NAGIOS_WARNING, NAGIOS_CRITICAL, NAGIOS_UNKNOWN),
        ("OK", "WARNING", "CRITICAL", "UNKNOWN"),
    )
)
|
||||
|
||||
|
||||
class NagiosRunnerPlugin(MonitorPlugin):
    """Run Nagios-compatible monitoring plugins.

    Each configured command is executed as an external process; its exit
    code, status message and performance data are folded into one flat
    result dictionary.

    Configuration:
        interval: Collection interval in seconds (default: 300)
        commands: List of command definitions with 'name' and 'command' keys
        timeout: Command execution timeout in seconds (default: 30)
        shell: Whether to execute commands via shell (default: True)

    Example:
        nagios_runner:
          interval: 300  # Check every 5 minutes
          timeout: 30
          commands:
            - name: check_disk
              command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10%
            - name: check_load
              command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
    """

    name = "nagios_runner"
    version = "1.0.0"
    description = "Execute Nagios-compatible monitoring plugins"
    interval = 300  # MonitorPlugin: collect every 5 minutes by default

    # One performance-data item: 'label'=value[UOM];[warn];[crit];[min];[max]
    # The item must start at a token boundary (start, whitespace or '|') and
    # end at whitespace/end, so a token that does not fit the format is
    # skipped as a whole instead of being re-read from its middle.
    _PERF_ITEM_RE = re.compile(
        r"(?<![^\s|])"
        r"(?:'(?P<quoted>(?:[^'=]|'')+)'|(?P<bare>[^\s'=|]+))"
        r"=(?P<value>[-+]?[\d.]+)"
        r"(?P<uom>[a-zA-Z%]*)"
        r"(?P<fields>(?:;[^;\s]*)*)"
        r"(?!\S)"
    )

    # A plain decimal number, optionally signed
    _NUMBER_RE = re.compile(r"[-+]?(?:\d+\.?\d*|\.\d+)")

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Read the plugin settings from the configuration.

        Args:
            config: Optional configuration dict with the keys 'commands',
                'timeout', 'shell' and 'interval'.
        """
        super().__init__(config)

        settings = config or {}

        # "commands:" with no entries may arrive as None; treat it as empty
        self.commands: List[Dict[str, str]] = settings.get("commands") or []
        self.timeout: int = settings.get("timeout", 30)
        self.shell: bool = settings.get("shell", True)
        self.interval = settings.get("interval", 300)

        if not self.commands:
            self.logger.warning(
                "No Nagios commands configured. Add 'nagios_runner.commands' to config."
            )

    async def initialize(self) -> bool:
        """Initialize the Nagios runner plugin.

        Returns:
            True if at least one command is configured, False otherwise
        """
        self.logger.info(f"Initializing {self.name} plugin")

        if not self.commands:
            self.logger.error("No Nagios commands configured")
            return False

        self.logger.info(f"Configured to run {len(self.commands)} Nagios plugin(s)")
        for cmd_config in self.commands:
            name = cmd_config.get("name", "unnamed")
            self.logger.info(f"  - {name}: {cmd_config.get('command', 'N/A')}")

        return True

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Collect metrics from all configured Nagios plugins.

        Returns:
            Dictionary with, per command ``name``: ``{name}_status``,
            ``{name}_status_code``, ``{name}_output`` and one
            ``{name}_{metric}`` entry per performance-data field, plus
            ``overall_status``, ``overall_status_code`` (the numerically
            highest status code seen) and ``plugin_count``.
        """
        results = {}

        # Track overall status (numerically highest code wins)
        worst_status = NAGIOS_OK

        for cmd_config in self.commands:
            name = cmd_config.get("name")
            command = cmd_config.get("command")

            if not name or not command:
                self.logger.warning("Skipping command with missing name or command")
                continue

            try:
                status_code, output, perfdata = await self._run_nagios_plugin(command)

                results[f"{name}_status"] = STATUS_NAMES.get(status_code, "UNKNOWN")
                results[f"{name}_status_code"] = status_code
                results[f"{name}_output"] = output

                worst_status = max(worst_status, status_code)

                # setdefault: a metric labelled e.g. "status" or "output"
                # must not replace the result fields stored just above
                for metric_name, metric_value in perfdata.items():
                    results.setdefault(f"{name}_{metric_name}", metric_value)

                self.logger.debug(
                    f"Executed {name}: {STATUS_NAMES.get(status_code, 'UNKNOWN')} - {output[:50]}"
                )

            except Exception as e:
                self.logger.error(f"Error running {name}: {e}", exc_info=True)
                results[f"{name}_status"] = "ERROR"
                results[f"{name}_status_code"] = NAGIOS_UNKNOWN
                results[f"{name}_output"] = str(e)
                worst_status = NAGIOS_UNKNOWN

        results["overall_status"] = STATUS_NAMES.get(worst_status, "UNKNOWN")
        results["overall_status_code"] = worst_status
        results["plugin_count"] = len(self.commands)

        return results

    async def _run_nagios_plugin(
        self,
        command: str
    ) -> Tuple[int, str, Dict[str, Any]]:
        """Execute a Nagios plugin and parse its output.

        Args:
            command: Command string to execute

        Returns:
            Tuple of (status_code, output_message, performance_data_dict).
            Timeouts and execution failures are reported as NAGIOS_UNKNOWN
            with an explanatory message and empty performance data.
        """
        # Needed only here; imported locally so the block is self-contained
        import asyncio
        import functools
        import shlex

        try:
            # Without a shell, subprocess expects an argv list; a plain
            # string would be taken as the program name, arguments included.
            argv = command
            if not self.shell and isinstance(command, str):
                argv = shlex.split(command)

            run = functools.partial(
                subprocess.run,
                argv,
                shell=self.shell,
                capture_output=True,
                timeout=self.timeout,
                text=True,
            )

            # subprocess.run blocks until the plugin exits (up to `timeout`
            # seconds); running it in the default executor keeps the event
            # loop free for other tasks in the meantime.
            result = await asyncio.get_running_loop().run_in_executor(None, run)

            # Anything outside 0-3 is UNKNOWN; this includes the negative
            # return codes subprocess reports for a signal-killed child.
            status_code = result.returncode
            if status_code not in STATUS_NAMES:
                status_code = NAGIOS_UNKNOWN

            stdout = result.stdout.strip()
            perfdata = self._parse_perfdata(stdout)

            # Status message is the text before the first pipe
            output_msg = stdout.partition('|')[0].strip()

            # Nothing on stdout (e.g. the shell could not start the plugin):
            # report stderr rather than an empty message
            if not stdout:
                output_msg = result.stderr.strip()

            return status_code, output_msg, perfdata

        except subprocess.TimeoutExpired:
            self.logger.error(f"Command timed out: {command}")
            return NAGIOS_UNKNOWN, f"Command timed out after {self.timeout}s", {}

        except Exception as e:
            self.logger.error(f"Error executing command: {e}")
            return NAGIOS_UNKNOWN, f"Execution error: {str(e)}", {}

    @classmethod
    def _to_float(cls, text: str) -> Optional[float]:
        """Return *text* as a float, or None if it is not a plain number."""
        if text and cls._NUMBER_RE.fullmatch(text):
            return float(text)
        return None

    def _parse_perfdata(self, output: str) -> Dict[str, Any]:
        """Parse Nagios performance data from plugin output.

        Nagios performance data format:
            'label'=value[UOM];[warn];[crit];[min];[max]

        Multiple metrics separated by spaces. Items whose value is not a
        plain number are skipped, as are thresholds that are not plain
        numbers (for example range expressions such as ``10:20``).

        Args:
            output: Plugin output string

        Returns:
            Dictionary of metric_name: value, plus ``{label}_uom``,
            ``{label}_warn``, ``{label}_crit``, ``{label}_min`` and
            ``{label}_max`` where present
        """
        perfdata: Dict[str, Any] = {}

        # Performance data comes after the pipe character
        _, pipe, perf_section = output.partition('|')
        if not pipe:
            return perfdata

        for match in self._PERF_ITEM_RE.finditer(perf_section):
            quoted = match.group("quoted")
            if quoted is not None:
                # Inside a quoted label a literal quote is written as ''
                label = quoted.replace("''", "'").strip()
            else:
                label = match.group("bare").strip()
            if not label:
                continue

            value = self._to_float(match.group("value"))
            if value is None:
                continue

            perfdata[label] = value

            uom = match.group("uom")
            if uom:
                perfdata[f"{label}_uom"] = uom

            # fields looks like ";warn;crit;min;max" with any part optional
            thresholds = match.group("fields").split(";")[1:5]
            for suffix, raw in zip(("warn", "crit", "min", "max"), thresholds):
                number = self._to_float(raw)
                if number is not None:
                    perfdata[f"{label}_{suffix}"] = number

        return perfdata
|
||||
@@ -0,0 +1,240 @@
|
||||
"""
|
||||
Network monitoring plugin for Heartbeat.
|
||||
|
||||
Collects network interface statistics and connection information using psutil.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Dict, Any, Optional, List
|
||||
|
||||
try:
|
||||
import psutil
|
||||
except ImportError:
|
||||
psutil = None
|
||||
|
||||
from hbd.client.plugin import MonitorPlugin
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class NetworkMonitorPlugin(MonitorPlugin):
|
||||
"""
|
||||
Monitor network interface statistics and connections.
|
||||
|
||||
Collects:
|
||||
- Network interface I/O counters (bytes sent/received, packets, errors, drops)
|
||||
- Per-interface statistics
|
||||
- Network connection counts by state
|
||||
- Interface addresses and configuration
|
||||
|
||||
Configuration:
|
||||
interval: Collection interval in seconds (default: 300)
|
||||
interfaces: List of interfaces to monitor (default: all)
|
||||
include_connections: Include connection statistics (default: True)
|
||||
include_addresses: Include interface addresses (default: False)
|
||||
"""
|
||||
|
||||
name = "network_monitor"
|
||||
interval = 300 # Collect every 5 minutes by default
|
||||
|
||||
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
||||
"""
|
||||
Initialize the network monitor plugin.
|
||||
|
||||
Args:
|
||||
config: Optional configuration dict with keys:
|
||||
- interval: Collection interval in seconds (default: 300)
|
||||
- interfaces: List of specific interfaces to monitor
|
||||
- include_connections: Include connection stats (default: True)
|
||||
- include_addresses: Include interface addresses (default: False)
|
||||
"""
|
||||
super().__init__(config)
|
||||
self.interfaces = self.config.get('interfaces', None) # None = all interfaces
|
||||
self.include_connections = self.config.get('include_connections', True)
|
||||
self.include_addresses = self.config.get('include_addresses', False)
|
||||
self.interval = self.config.get('interval', 300)
|
||||
|
||||
if psutil is None:
|
||||
raise ImportError("psutil library is required for network_monitor plugin")
|
||||
|
||||
# Store previous I/O counters for delta calculation
|
||||
self._prev_io = {}
|
||||
|
||||
async def initialize(self):
|
||||
"""Initialize the plugin (check psutil availability)."""
|
||||
if psutil is None:
|
||||
logger.error("psutil not available - network_monitor cannot run")
|
||||
return False
|
||||
|
||||
logger.info(f"Network monitor initialized (interval: {self.interval}s, "
|
||||
f"connections: {self.include_connections})")
|
||||
|
||||
# Initialize I/O counters
|
||||
try:
|
||||
self._prev_io = psutil.net_io_counters(pernic=True)
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not initialize network I/O counters: {e}")
|
||||
|
||||
return True
|
||||
|
||||
async def collect(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Collect current network statistics.
|
||||
|
||||
Returns:
|
||||
Dictionary with network metrics:
|
||||
- interfaces: Dict of interface statistics, keyed by interface name
|
||||
- bytes_sent: Total bytes sent
|
||||
- bytes_recv: Total bytes received
|
||||
- packets_sent: Total packets sent
|
||||
- packets_recv: Total packets received
|
||||
- errin: Total incoming errors
|
||||
- errout: Total outgoing errors
|
||||
- dropin: Total incoming packets dropped
|
||||
- dropout: Total outgoing packets dropped
|
||||
- bytes_sent_delta: Bytes sent since last collection
|
||||
- bytes_recv_delta: Bytes received since last collection
|
||||
- packets_sent_delta: Packets sent since last collection
|
||||
- packets_recv_delta: Packets received since last collection
|
||||
- connections: Connection statistics by state (if include_connections)
|
||||
- ESTABLISHED: Count of established connections
|
||||
- LISTEN: Count of listening sockets
|
||||
- TIME_WAIT: Count of TIME_WAIT connections
|
||||
- etc.
|
||||
- addresses: Interface address information (if include_addresses)
|
||||
- Dict keyed by interface name with address details
|
||||
"""
|
||||
if psutil is None:
|
||||
logger.error("psutil not available")
|
||||
return {}
|
||||
|
||||
try:
|
||||
data = await self._collect_metrics()
|
||||
logger.debug(f"Collected network metrics: {len(data.get('interfaces', {}))} interfaces")
|
||||
return data
|
||||
except Exception as e:
|
||||
logger.error(f"Error collecting network metrics: {e}")
|
||||
return {"error": str(e)}
|
||||
|
||||
async def _collect_metrics(self) -> Dict[str, Any]:
    """Collect network metrics from psutil.

    Every section is gathered independently: a failure in one section is
    logged and that key is simply left out of the result, so the other
    sections are still reported.

    Returns:
        Dictionary that may contain ``interfaces``, ``connections``
        (only if ``include_connections``), ``addresses`` (only if
        ``include_addresses``) and ``interface_stats``.
    """
    metrics: Dict[str, Any] = {}

    # Per-interface I/O counters and deltas
    try:
        metrics['interfaces'] = self._gather_io_counters()
    except Exception as e:
        logger.warning(f"Could not collect network I/O counters: {e}")

    # Connection statistics
    if self.include_connections:
        try:
            metrics['connections'] = self._gather_connection_counts()
        except (PermissionError, psutil.AccessDenied):
            logger.debug("Permission denied for net_connections (requires root/admin)")
        except Exception as e:
            logger.warning(f"Could not collect connection statistics: {e}")

    # Interface addresses
    if self.include_addresses:
        try:
            metrics['addresses'] = self._gather_addresses()
        except Exception as e:
            logger.warning(f"Could not collect interface addresses: {e}")

    # Interface stats (up/down status, speed, mtu)
    try:
        metrics['interface_stats'] = self._gather_interface_stats()
    except Exception as e:
        logger.warning(f"Could not collect interface stats: {e}")

    return metrics

def _wants_interface(self, iface_name: str) -> bool:
    """Return True when ``iface_name`` passes the optional interface filter."""
    return not self.interfaces or iface_name in self.interfaces

def _gather_io_counters(self) -> Dict[str, Any]:
    """Return per-interface counters plus deltas against the previous sample."""
    totals = ('bytes_sent', 'bytes_recv', 'packets_sent', 'packets_recv',
              'errin', 'errout', 'dropin', 'dropout')
    with_delta = ('bytes_sent', 'bytes_recv', 'packets_sent', 'packets_recv')

    io_counters = psutil.net_io_counters(pernic=True)
    # initialize() assigns _prev_io inside a try block, so the attribute may
    # be missing if that first psutil call failed (assuming __init__ does not
    # set it - not visible here). Treat that as "no previous sample" rather
    # than failing this section on every cycle.
    previous = getattr(self, '_prev_io', None) or {}

    interfaces_data = {}
    for iface_name, counters in io_counters.items():
        if not self._wants_interface(iface_name):
            continue

        iface_stats = {field: getattr(counters, field) for field in totals}

        prev = previous.get(iface_name)
        if prev is not None:
            for field in with_delta:
                iface_stats[f'{field}_delta'] = getattr(counters, field) - getattr(prev, field)

        interfaces_data[iface_name] = iface_stats

    # Store current counters for the next delta calculation
    self._prev_io = io_counters
    return interfaces_data

def _gather_connection_counts(self) -> Dict[str, int]:
    """Return the number of inet connections per connection status."""
    conn_stats: Dict[str, int] = {}
    for conn in psutil.net_connections(kind='inet'):
        conn_stats[conn.status] = conn_stats.get(conn.status, 0) + 1
    return conn_stats

def _gather_addresses(self) -> Dict[str, Any]:
    """Return the address entries of every monitored interface."""
    addr_data = {}
    for iface_name, addrs in psutil.net_if_addrs().items():
        if not self._wants_interface(iface_name):
            continue

        iface_addrs = []
        for addr in addrs:
            addr_info = {
                'family': str(addr.family),
                'address': addr.address,
            }
            # netmask/broadcast are only reported when psutil provides them
            if addr.netmask:
                addr_info['netmask'] = addr.netmask
            if addr.broadcast:
                addr_info['broadcast'] = addr.broadcast
            iface_addrs.append(addr_info)

        addr_data[iface_name] = iface_addrs
    return addr_data

def _gather_interface_stats(self) -> Dict[str, Any]:
    """Return isup/duplex/speed/mtu for every monitored interface."""
    return {
        iface_name: {
            'isup': stats.isup,
            'duplex': str(stats.duplex) if hasattr(stats, 'duplex') else None,
            'speed': stats.speed,
            'mtu': stats.mtu,
        }
        for iface_name, stats in psutil.net_if_stats().items()
        if self._wants_interface(iface_name)
    }
|
||||
|
||||
async def cleanup(self):
    """Shut the plugin down; there is nothing to release, so this only logs."""
    logger.info("Network monitor cleanup")
|
||||
|
||||
|
||||
# Plugin class (not an instance) exposed for automatic discovery
|
||||
plugin = NetworkMonitorPlugin
|
||||
@@ -0,0 +1,136 @@
|
||||
"""OS Information Plugin for Heartbeat.
|
||||
|
||||
Collects static operating system information including OS name, version,
|
||||
kernel, architecture, and distribution details.
|
||||
"""
|
||||
|
||||
import platform
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
# Import from parent package
|
||||
from hbd.client.plugin import InfoPlugin
|
||||
|
||||
|
||||
class OSInfoPlugin(InfoPlugin):
    """Collect operating system information.

    This plugin gathers static OS information that rarely changes:
    - OS name and version
    - Kernel version
    - Architecture (x86_64, arm64, etc.)
    - Distribution details (for Linux)
    - Python version (used by hbc)
    """

    name = "os_info"
    version = "1.0.0"
    description = "Operating system and platform information"
    interval = 0  # InfoPlugin: collect once at startup

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        super().__init__(config)

    async def initialize(self) -> bool:
        """Initialize the OS info plugin.

        Returns:
            True (always succeeds - platform module is stdlib)
        """
        self.logger.info(f"Initializing {self.name} plugin")
        return True

    async def _collect_info(self) -> Dict[str, Any]:
        """Collect OS information.

        Returns:
            Dictionary with OS details, or an empty dict if collection
            raised (the error is logged with its traceback).
        """
        try:
            # Query once; the value also selects the platform-specific branch.
            system = platform.system()
            data = {
                "system": system,  # e.g., "Linux", "Darwin", "Windows"
                "node": platform.node(),  # hostname
                "release": platform.release(),  # kernel version
                "version": platform.version(),  # detailed version
                "machine": platform.machine(),  # e.g., "x86_64", "arm64"
                "processor": platform.processor(),  # processor name
                "architecture": platform.architecture()[0],  # e.g., "64bit"
                "python_version": platform.python_version(),
                "python_implementation": platform.python_implementation(),
            }

            if system == "Linux":
                data.update(self._get_linux_distro())
            elif system == "Darwin":
                data["macos_version"] = platform.mac_ver()[0]
            elif system == "Windows":
                win_ver = platform.win32_ver()
                data["windows_release"] = win_ver[0]
                data["windows_version"] = win_ver[1]
                data["windows_sp"] = win_ver[2]
                data["windows_type"] = win_ver[3]

            self.logger.debug(f"Collected OS info: {data['system']} {data['release']}")
            return data

        except Exception as e:
            # Best effort: this plugin must never take the client down.
            self.logger.error(f"Error collecting OS info: {e}", exc_info=True)
            return {}

    def _get_linux_distro(self) -> Dict[str, str]:
        """Get Linux distribution information.

        Candidate files are tried in order and the first one that yields
        at least one known field wins: ``/etc/os-release``, then
        ``/usr/lib/os-release`` (the fallback location named by
        os-release(5)), then ``/etc/lsb-release`` for older systems.
        An unreadable file is logged and the next candidate is tried.

        Returns:
            Dictionary with distribution details (possibly empty)
        """
        os_release_fields = {
            "NAME": "distro_name",
            "VERSION": "distro_version",
            "ID": "distro_id",
            "VERSION_ID": "distro_version_id",
            "PRETTY_NAME": "distro_pretty_name",
        }
        lsb_release_fields = {
            "DISTRIB_ID": "distro_id",
            "DISTRIB_RELEASE": "distro_version",
            "DISTRIB_DESCRIPTION": "distro_name",
        }
        candidates = (
            (Path("/etc/os-release"), os_release_fields),
            (Path("/usr/lib/os-release"), os_release_fields),
            (Path("/etc/lsb-release"), lsb_release_fields),
        )

        for path, field_map in candidates:
            if not path.exists():
                continue
            try:
                distro_info = self._parse_release_file(path, field_map)
            except (OSError, UnicodeError) as e:
                self.logger.warning(f"Could not read {path}: {e}")
                continue
            if distro_info:
                return distro_info

        return {}

    @staticmethod
    def _parse_release_file(path: Path, field_map: Dict[str, str]) -> Dict[str, str]:
        """Parse a ``KEY=value`` file and return the fields named in ``field_map``.

        Blank lines, ``#`` comments and lines without ``=`` are ignored;
        surrounding double or single quotes are removed from values so both
        os-release and lsb-release entries are reported unquoted.
        """
        parsed: Dict[str, str] = {}
        with open(path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line.startswith("#") or "=" not in line:
                    continue
                key, value = line.split("=", 1)
                target = field_map.get(key.strip())
                if target is not None:
                    parsed[target] = value.strip('"').strip("'")
        return parsed
|
||||
@@ -0,0 +1,3 @@
|
||||
"""Common utilities shared between hbc and hbd."""
|
||||
|
||||
from hbd import __version__
|
||||
@@ -0,0 +1,157 @@
|
||||
"""Message encoding/decoding utilities for hbd protocol.
|
||||
|
||||
Message Types:
|
||||
HTB: Heartbeat message (client -> server)
|
||||
ACK: Acknowledgment (server -> client)
|
||||
CMD: Command message (server -> client)
|
||||
UPD: Update message (server -> client)
|
||||
PLG: Plugin data message (client -> server)
|
||||
"""
|
||||
|
||||
from typing import Dict, Any, Union
|
||||
import json
|
||||
import zlib
|
||||
|
||||
|
||||
def encode_value(v: Any) -> str:
    """Encode a value for protocol transmission.

    Messages are framed as ``key=value`` pairs joined by ``;``, so an
    encoded value must never contain a literal ``;``. Lists, dicts and
    strings that contain ``;`` are therefore sent as ``@``-prefixed JSON
    with every ``;`` written as the JSON escape ``\\u003b``; ``json.loads``
    on the receiving side restores the original text.

    Args:
        v: Value to encode (int, float, str, bool, list, dict, etc.)

    Returns:
        String representation suitable for protocol:
        floats with five decimals, bools as ``1``/``0``, lists/dicts as
        ``@<json>``, everything else via ``str()``.
    """
    if isinstance(v, float):
        return f"{v:0.5f}"
    if isinstance(v, (list, dict)):
        # ';' can only occur inside JSON strings, where \u003b is equivalent.
        return "@" + json.dumps(v).replace(";", "\\u003b")
    if isinstance(v, bool):
        return str(int(v))  # True->1, False->0
    if isinstance(v, str) and ";" in v:
        # A raw ';' would split this value into bogus extra fields.
        return "@" + json.dumps(v).replace(";", "\\u003b")
    return str(v)
|
||||
|
||||
|
||||
def decode_value(val: str) -> Any:
    """Decode a value from protocol format.

    ``@``-prefixed values are parsed as JSON. Values that start with a
    digit (or ``-`` followed by a digit) are parsed as a Python literal,
    which turns ``"42"`` into an int and ``"1.50000"`` into a float.
    Anything that is not a plain literal is returned unchanged as a string.

    SECURITY: the value comes from the network, so it is parsed with
    ``ast.literal_eval`` and never with ``eval``; expressions such as
    ``"2*3"`` or text embedding function calls are not evaluated.

    Args:
        val: String value from protocol

    Returns:
        Decoded Python object
    """
    import ast  # local import: only needed here

    if not val:
        return val

    # JSON-encoded complex types
    if val.startswith("@"):
        try:
            return json.loads(val[1:])
        except (ValueError, RecursionError):
            return val[1:]  # Return as string without @

    # Numeric-looking values
    looks_numeric = val[0].isdigit() or (
        val[0] == "-" and len(val) > 1 and val[1].isdigit()
    )
    if looks_numeric:
        try:
            return ast.literal_eval(val)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # e.g. IP addresses, dates, version strings: keep the raw text
            return val

    return val
|
||||
|
||||
|
||||
def dicttos(ID: str, d: Dict[str, Any], compress: bool = True) -> bytes:
    """Serialize a dict to protocol message bytes.

    The payload is ``key=value`` pairs joined by ``;``, each value passed
    through ``encode_value``.

    Args:
        ID: Message ID placed in the header (e.g. "HTB", "PLG")
        d: Key/value pairs to send
        compress: If True (the default, and the only behaviour before this
            flag existed) the payload is zlib-compressed and the message
            is prefixed with ``!ID:`` as the original script did.
            Otherwise the message is the plain text ``ID:key=value;...``.

    Returns:
        Encoded message bytes
    """
    pk = ";".join(f"{k}={encode_value(v)}" for k, v in d.items())
    if compress:
        return f"!{ID}:".encode() + zlib.compress(pk.encode(), 6)
    return f"{ID}:{pk}".encode()
|
||||
|
||||
|
||||
def stodict(msg: bytes) -> Dict[str, Any]:
    """Deserialize a protocol message into a dict.

    Two wire formats are accepted:
    - ``ID:key=value;...`` (plain text)
    - ``!ID:`` followed by the zlib-compressed form of ``key=value;...``

    The ID is everything between the optional ``!`` and the first ``:``,
    so IDs are not limited to three characters.

    Args:
        msg: Raw message bytes

    Returns:
        Dict with key 'ID' set to the message ID plus the parsed key/value
        pairs (values decoded by ``decode_value``; a key without ``=`` maps
        to None). An empty dict is returned for malformed input: missing
        ``:``, undecodable text, or a corrupt compressed payload.
    """
    d: Dict[str, Any] = {}
    compressed = msg[:1] == b"!"
    try:
        # Unpacking raises ValueError when the header has no ':' separator.
        header, payload = (msg[1:] if compressed else msg).split(b":", 1)
        if compressed:
            payload = zlib.decompress(payload)
        d["ID"] = header.decode()
        pk = payload.decode()
    except (ValueError, zlib.error, MemoryError):
        # malformed header, bad text encoding or bad compressed payload
        return {}

    for part in pk.split(";"):
        if not part:
            continue
        key, sep, raw = part.partition("=")
        d[key.strip()] = decode_value(raw.strip()) if sep else None
    return d
|
||||
|
||||
|
||||
def oldmtodict(msg: bytes):
    """Parse an old-style message that was sent without an ID prefix.

    The bytes are prefixed with ``HTB:`` and handed to ``stodict``, exactly
    as the original implementation did.
    """
    prefixed = b"HTB:" + msg
    return stodict(prefixed)
|
||||
|
||||
|
||||
def encode_plugin_data(plugin_name: str, data: Dict[str, Any]) -> bytes:
    """Build a PLG message carrying one plugin's data.

    The payload starts with a ``plugin`` entry holding *plugin_name*,
    followed by every item of *data*. Because *data* is merged afterwards,
    a ``plugin`` key inside *data* replaces the name.

    Args:
        plugin_name: Name of the plugin (e.g., "os_info", "cpu_monitor")
        data: Plugin data dictionary

    Returns:
        Encoded message bytes as produced by ``dicttos("PLG", ...)``
    """
    payload: Dict[str, Any] = {"plugin": plugin_name}
    payload.update(data)
    return dicttos("PLG", payload)
|
||||
|
||||
|
||||
def decode_plugin_data(msg: bytes) -> Dict[str, Any]:
    """Turn a raw PLG message back into a dictionary.

    This is a thin alias for ``stodict``.

    Args:
        msg: Raw message bytes

    Returns:
        The dict returned by ``stodict``: 'ID', 'plugin', and the plugin
        data fields carried by the message
    """
    decoded = stodict(msg)
    return decoded
|
||||
|
||||
@@ -1,68 +0,0 @@
|
||||
"""Configuration loader and defaults for hbd."""
|
||||
|
||||
import logging
|
||||
import os
|
||||
|
||||
try:
|
||||
import yaml
|
||||
except Exception:
|
||||
yaml = None
|
||||
|
||||
# Built-in configuration defaults. load_config() starts from a copy of this
# mapping and overrides entries found in the YAML file; keys that are not
# listed here are reported by load_config() as unknown and ignored.
DEFAULTS = {
    "hb_port": 50003,
    "hbd_port": 50004,
    "hbd_host": "",
    "pickfile": "/tmp/hb.pick",
    "logfile": "/var/log/heartbeat.log",
    "logfmt": "text",
    "pushsrv": "pushover",
    "pushover_token": "",
    "pushover_user": "",
    "interval": 20,
    "grace": 2,
    "dyndomains": ["wrede.org"],
    "watchhosts": [],
    "dyndnshosts": [],
    "drophosts": [],
    "nsupdate_bin": "/usr/bin/nsupdate",
    "foreground": False,
    "verbose": False,
    "debug": 0,
    "smtpserver": "smtp.fastmail.com",
    "smtpuser": "andreas@wrede.ca",
    # SECURITY(review): this looks like a real credential committed to source
    # control - rotate it and supply it through the config file instead of
    # shipping it as a default.
    "smtppassword": "pvtvefyp5gbhnch2",
    "smtpport": 587,
    "toemail": ["aew.hbd.notify@wrede.ca"],
    "fromemail": "aew.hbd@wrede.ca",
    "ws_port": 50005,
    "wss_port": None,
    "cert_path": "/usr/local/etc/ssl/",
    "wss_pem": "fullchain.pem",
    "wss_key": "privkey.pem",
}
|
||||
|
||||
|
||||
def load_config(path=None):
    """Load configuration from a YAML file and merge with defaults.

    If YAML is not available or the file does not exist, defaults are
    returned. An empty file, or one whose top level is not a mapping, is
    also treated as "no overrides" instead of raising.

    Args:
        path: Path of the YAML file; defaults to ``~/.hb.yaml``.

    Returns:
        A new dict holding every key of ``DEFAULTS``, with values replaced
        by those from the file. Keys not present in ``DEFAULTS`` are
        logged as unknown and ignored.
    """
    # Copy list defaults too, so callers mutating e.g. cfg["watchhosts"]
    # cannot alter DEFAULTS for later calls.
    cfg = {k: (list(v) if isinstance(v, list) else v) for k, v in DEFAULTS.items()}

    if not path:
        # default path (~/.hb.yaml)
        path = os.path.join(os.path.expanduser("~"), ".hb.yaml")

    if not os.path.exists(path):
        return cfg

    if not yaml:
        # yaml not installed: do not attempt to parse, but tell the user
        # that their file is being ignored.
        logging.warning("PyYAML is not installed; ignoring config file %s", path)
        return cfg

    with open(path, encoding="utf-8") as fh:
        data = yaml.safe_load(fh)

    if data is None:
        # empty file (or only comments): nothing to override
        return cfg
    if not isinstance(data, dict):
        logging.warning("config file %s does not contain a mapping; ignoring it", path)
        return cfg

    # only keep known keys
    for k, v in data.items():
        if k in cfg:
            cfg[k] = v
        else:
            logging.warning("unknown config key %s in %s", k, path)
    return cfg
|
||||
@@ -0,0 +1,196 @@
|
||||
# Example Heartbeat Client Configuration
|
||||
# This file demonstrates all available configuration options for the heartbeat client (hbc)
|
||||
# and its plugin system.
|
||||
|
||||
# ==============================================================================
|
||||
# Server Configuration
|
||||
# ==============================================================================
|
||||
server: hbd.example.com # Heartbeat server hostname or IP
|
||||
port: 50003 # Server UDP port (default: 50003)
|
||||
interval: 30 # Heartbeat interval in seconds (default: 30)
|
||||
|
||||
# ==============================================================================
|
||||
# Plugin Configuration
|
||||
# ==============================================================================
|
||||
# Plugins are configured under the "plugins" section. Each plugin can be enabled/disabled
|
||||
# and configured with plugin-specific settings.
|
||||
|
||||
plugins:
|
||||
# --------------------------------------------------------------------------
|
||||
# OS Information Plugin (InfoPlugin - runs once at startup)
|
||||
# --------------------------------------------------------------------------
|
||||
os_info:
|
||||
enabled: true
|
||||
# No additional configuration needed
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# CPU Monitor Plugin (MonitorPlugin - periodic collection)
|
||||
# --------------------------------------------------------------------------
|
||||
cpu_monitor:
|
||||
enabled: true
|
||||
interval: 300 # Collection interval in seconds (default: 300 = 5 minutes)
|
||||
per_core: false # Collect per-core CPU statistics (default: false)
|
||||
# When per_core is true, will report CPU usage for each core separately
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Memory Monitor Plugin (MonitorPlugin)
|
||||
# --------------------------------------------------------------------------
|
||||
memory_monitor:
|
||||
enabled: true
|
||||
interval: 300 # Collection interval in seconds (default: 300 = 5 minutes)
|
||||
include_swap: true # Include swap memory statistics (default: true)
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Disk Monitor Plugin (MonitorPlugin)
|
||||
# --------------------------------------------------------------------------
|
||||
disk_monitor:
|
||||
enabled: true
|
||||
interval: 300 # Collection interval in seconds (default: 300 = 5 minutes)
|
||||
include_io: true # Include I/O statistics (default: true)
|
||||
# Optional: Monitor only specific partitions
|
||||
# partitions:
|
||||
# - /
|
||||
# - /home
|
||||
# - /var
|
||||
# Optional: Exclude specific filesystem types
|
||||
exclude_types:
|
||||
- tmpfs
|
||||
- devtmpfs
|
||||
- squashfs
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Network Monitor Plugin (MonitorPlugin)
|
||||
# --------------------------------------------------------------------------
|
||||
network_monitor:
|
||||
enabled: true
|
||||
interval: 300 # Collection interval in seconds (default: 300 = 5 minutes)
|
||||
include_connections: true # Include connection statistics (default: true)
|
||||
include_addresses: false # Include interface addresses (default: false)
|
||||
# Optional: Monitor only specific interfaces
|
||||
# interfaces:
|
||||
# - eth0
|
||||
# - wlan0
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Filesystem Info Plugin (InfoPlugin - runs once at startup)
|
||||
# --------------------------------------------------------------------------
|
||||
filesystem_info:
|
||||
enabled: true
|
||||
include_pseudo: false # Include pseudo/virtual filesystems (default: false)
|
||||
# When false (default), only reports physical mounted filesystems (ext4, zfs, xfs, etc.)
|
||||
# When true, also includes pseudo filesystems (proc, sysfs, tmpfs, devtmpfs, etc.)
|
||||
# Optional: Exclude additional specific filesystem types
|
||||
# exclude_types:
|
||||
# - squashfs
|
||||
# - iso9660
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Nagios Runner Plugin (MonitorPlugin)
|
||||
# --------------------------------------------------------------------------
|
||||
nagios_runner:
|
||||
enabled: true
|
||||
interval: 300 # Collection interval in seconds (default: 300 = 5 minutes)
|
||||
timeout: 30 # Plugin execution timeout in seconds (default: 30)
|
||||
|
||||
# List of Nagios plugins to execute
|
||||
# Each command is executed as-is, so provide full paths and arguments
|
||||
commands:
|
||||
# System load monitoring
|
||||
- /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
|
||||
|
||||
# Disk space monitoring
|
||||
- /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
|
||||
- /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /home
|
||||
|
||||
# Process monitoring
|
||||
- /usr/lib/nagios/plugins/check_procs -w 250 -c 400 -s RSZDT
|
||||
|
||||
# Swap usage
|
||||
- /usr/lib/nagios/plugins/check_swap -w 20% -c 10%
|
||||
|
||||
# Custom script example
|
||||
# - /usr/local/bin/check_my_app.sh
|
||||
|
||||
# ==============================================================================
|
||||
# Advanced Options
|
||||
# ==============================================================================
|
||||
# These options control client behavior
|
||||
|
||||
# Compression: Enable zlib compression for heartbeat messages (default: true)
|
||||
compress: true
|
||||
|
||||
# Hostname: Override the system hostname (default: auto-detect)
|
||||
# hostname: myhost.example.com
|
||||
|
||||
# Message: Custom message included in heartbeat (optional)
|
||||
# message: "Production web server"
|
||||
|
||||
# Logging
|
||||
log_level: INFO # Log level: DEBUG, INFO, WARNING, ERROR (default: INFO)
|
||||
# logfile: /var/log/hbc.log # Optional log file path
|
||||
|
||||
# ==============================================================================
|
||||
# Example Profiles
|
||||
# ==============================================================================
|
||||
# Below are example configuration profiles for different use cases
|
||||
|
||||
# Minimal Configuration (default settings):
|
||||
# -----------------------------------------
|
||||
# server: hbd.example.com
|
||||
# interval: 30
|
||||
|
||||
# Monitoring Server (comprehensive metrics):
|
||||
# ------------------------------------------
|
||||
# server: monitoring.example.com
|
||||
# interval: 30
|
||||
# plugins:
|
||||
# cpu_monitor:
|
||||
# enabled: true
|
||||
# interval: 15
|
||||
# per_core: true
|
||||
# memory_monitor:
|
||||
# enabled: true
|
||||
# interval: 15
|
||||
# disk_monitor:
|
||||
# enabled: true
|
||||
# interval: 60
|
||||
# network_monitor:
|
||||
# enabled: true
|
||||
# interval: 30
|
||||
# include_connections: true
|
||||
|
||||
# Nagios Integration (leverage existing plugins):
|
||||
# -----------------------------------------------
|
||||
# server: hbd.example.com
|
||||
# plugins:
|
||||
# nagios_runner:
|
||||
# enabled: true
|
||||
# interval: 300 # Check every 5 minutes
|
||||
# commands:
|
||||
# - /usr/lib/nagios/plugins/check_http -H localhost -p 80
|
||||
# - /usr/lib/nagios/plugins/check_mysql -H localhost -u monitor -p password
|
||||
# - /usr/lib/nagios/plugins/check_smtp -H mail.example.com
|
||||
|
||||
# ==============================================================================
|
||||
# Threshold Configuration (for Heartbeat Daemon)
|
||||
# ==============================================================================
|
||||
# NOTE: Thresholds are configured on the SERVER side (hbd), not the client (hbc).
|
||||
# This is just an example - see config_thresholds_example.yaml for comprehensive examples.
|
||||
#
|
||||
# Basic threshold example:
|
||||
# thresholds:
|
||||
# cpu_monitor:
|
||||
# cpu_percent:
|
||||
# warning: 80.0
|
||||
# critical: 90.0
|
||||
# memory_monitor:
|
||||
# percent:
|
||||
# warning: 85.0
|
||||
# critical: 95.0
|
||||
# disk_monitor:
|
||||
# partitions:
|
||||
# /:
|
||||
# percent:
|
||||
# warning: 80.0
|
||||
# critical: 90.0
|
||||
|
||||
@@ -0,0 +1,296 @@
|
||||
# ==============================================================================
|
||||
# Heartbeat Daemon Multi-Threshold Configuration Example
|
||||
# ==============================================================================
|
||||
# This file demonstrates the new multi-threshold configuration feature that allows
|
||||
# different threshold settings for different hosts/clients.
|
||||
#
|
||||
# Features:
|
||||
# - Define multiple named threshold configurations
|
||||
# - Map specific hosts to specific threshold configurations
|
||||
# - Set a default configuration for unmapped hosts
|
||||
# - Backward compatible with single threshold configuration
|
||||
# ==============================================================================
|
||||
|
||||
# Global threshold settings
|
||||
threshold_renotify_interval: 3600 # Re-notify every hour for ongoing alerts (seconds)
|
||||
|
||||
# Optional: Set default threshold config (defaults to "default" if not specified)
|
||||
default_threshold_config: "default"
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Multiple Named Threshold Configurations
|
||||
# ----------------------------------------------------------------------------
|
||||
# Define multiple threshold configurations with different sensitivity levels
|
||||
threshold_configs:
|
||||
|
||||
# Default configuration - moderate thresholds for most servers
|
||||
default:
|
||||
thresholds:
|
||||
cpu_monitor:
|
||||
cpu_percent:
|
||||
warning: 80.0
|
||||
critical: 90.0
|
||||
operator: ">"
|
||||
load_1min:
|
||||
warning: 4.0
|
||||
critical: 8.0
|
||||
operator: ">"
|
||||
|
||||
memory_monitor:
|
||||
percent:
|
||||
warning: 85.0
|
||||
critical: 95.0
|
||||
operator: ">"
|
||||
|
||||
disk_monitor:
|
||||
partitions:
|
||||
/:
|
||||
percent:
|
||||
warning: 85.0
|
||||
critical: 95.0
|
||||
operator: ">"
|
||||
|
||||
rtt:
|
||||
# RTT thresholds (applies to all hosts)
|
||||
warning: 50.0 # ms
|
||||
critical: 200.0
|
||||
|
||||
# High sensitivity configuration - lower thresholds for critical systems
|
||||
high_sensitivity:
|
||||
thresholds:
|
||||
cpu_monitor:
|
||||
cpu_percent:
|
||||
warning: 60.0 # Alert earlier
|
||||
critical: 75.0
|
||||
operator: ">"
|
||||
hysteresis: 0.15 # More hysteresis to reduce flapping
|
||||
load_1min:
|
||||
warning: 2.0
|
||||
critical: 4.0
|
||||
operator: ">"
|
||||
|
||||
memory_monitor:
|
||||
percent:
|
||||
warning: 75.0 # Alert at lower memory usage
|
||||
critical: 85.0
|
||||
operator: ">"
|
||||
display: "(threshold: {op_symbol} {threshold_value}%, total: {total_gb} GB)"
|
||||
|
||||
disk_monitor:
|
||||
partitions:
|
||||
/:
|
||||
percent:
|
||||
warning: 75.0
|
||||
critical: 85.0
|
||||
operator: ">"
|
||||
/var:
|
||||
percent:
|
||||
warning: 80.0
|
||||
critical: 90.0
|
||||
operator: ">"
|
||||
|
||||
rtt:
|
||||
warning: 30.0
|
||||
critical: 100.0
|
||||
|
||||
# Low sensitivity configuration - higher thresholds for development/test systems
|
||||
low_sensitivity:
|
||||
thresholds:
|
||||
cpu_monitor:
|
||||
cpu_percent:
|
||||
warning: 90.0 # Only alert at very high usage
|
||||
critical: 95.0
|
||||
operator: ">"
|
||||
|
||||
memory_monitor:
|
||||
percent:
|
||||
warning: 90.0
|
||||
critical: 98.0
|
||||
operator: ">"
|
||||
|
||||
disk_monitor:
|
||||
partitions:
|
||||
/:
|
||||
percent:
|
||||
warning: 90.0
|
||||
critical: 95.0
|
||||
operator: ">"
|
||||
|
||||
rtt:
|
||||
warning: 100.0
|
||||
critical: 500.0
|
||||
|
||||
# Production database servers - specialized thresholds
|
||||
database:
|
||||
thresholds:
|
||||
cpu_monitor:
|
||||
cpu_percent:
|
||||
warning: 70.0
|
||||
critical: 85.0
|
||||
operator: ">"
|
||||
|
||||
memory_monitor:
|
||||
percent:
|
||||
warning: 90.0 # Databases can use high memory
|
||||
critical: 97.0
|
||||
operator: ">"
|
||||
display: "(threshold: {op_symbol} {threshold_value}%, total: {total_gb} GB, available: {available_gb} GB)"
|
||||
|
||||
disk_monitor:
|
||||
partitions:
|
||||
/:
|
||||
percent:
|
||||
warning: 80.0
|
||||
critical: 90.0
|
||||
operator: ">"
|
||||
/var/lib/mysql: # Database data partition
|
||||
percent:
|
||||
warning: 75.0 # Alert earlier for DB partition
|
||||
critical: 85.0
|
||||
operator: ">"
|
||||
|
||||
rtt:
|
||||
warning: 20.0 # Stricter latency requirements
|
||||
critical: 50.0
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Host to Threshold Configuration Mapping
|
||||
# ----------------------------------------------------------------------------
|
||||
# Hosts are mapped to a threshold configuration through the `threshold_config`
# key of each entry in the `hosts` section further below.
|
||||
# ----------------------------------------------------------------------------
|
||||
# Notification Channels
|
||||
# ----------------------------------------------------------------------------
|
||||
# Define notification providers centrally with their credentials
|
||||
# Each channel has a type (pushover, email, signal, mattermost) and type-specific config
|
||||
notification_channels:
|
||||
# Signal notifications
|
||||
signal_ops:
|
||||
type: signal
|
||||
cli_path: /usr/local/bin/signal-cli
|
||||
user: "+1234567890"  # quoted: an unquoted +123... is parsed as an integer and loses the "+"
|
||||
recipient: "+1234567890"
|
||||
|
||||
signal_oncall:
|
||||
type: signal
|
||||
cli_path: /usr/local/bin/signal-cli
|
||||
user: "+1234567890"
|
||||
recipient: "+0987654321"
|
||||
|
||||
# Email notifications
|
||||
email_ops:
|
||||
type: email
|
||||
recipients: [ops@example.com, alerts@example.com]
|
||||
sender: heartbeat@example.com
|
||||
smtp_server: smtp.example.com
|
||||
smtp_port: 587
|
||||
smtp_user: heartbeat@example.com
|
||||
smtp_password: your-smtp-password
|
||||
|
||||
# Pushover notifications
|
||||
pushover_urgent:
|
||||
type: pushover
|
||||
token: your-pushover-app-token
|
||||
user: your-pushover-user-key
|
||||
|
||||
# Mattermost notifications
|
||||
mattermost_devops:
|
||||
type: mattermost
|
||||
host: mattermost.example.com
|
||||
token: your-webhook-token
|
||||
channel: devops-alerts
|
||||
username: heartbeat-bot
|
||||
icon: https://example.com/heartbeat-icon.png
|
||||
|
||||
# Default notification channels (used if host doesn't specify channels)
|
||||
default_notification_channels: [email_ops]
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Host Definitions (New Unified Format)
|
||||
# ----------------------------------------------------------------------------
|
||||
# Define hosts with threshold configs, monitoring, DNS, and notification settings
|
||||
hosts:
|
||||
# Critical production servers - high sensitivity, multiple notification channels
|
||||
prod-web-01:
|
||||
threshold_config: high_sensitivity
|
||||
watch: true
|
||||
notification_channels: [signal_oncall, pushover_urgent, email_ops]
|
||||
dyndns: false
|
||||
|
||||
prod-web-02:
|
||||
threshold_config: high_sensitivity
|
||||
watch: true
|
||||
notification_channels: [signal_oncall, pushover_urgent, email_ops]
|
||||
dyndns: false
|
||||
|
||||
prod-api-01:
|
||||
threshold_config: high_sensitivity
|
||||
watch: true
|
||||
notification_channels: [signal_oncall, email_ops]
|
||||
dyndns: false
|
||||
|
||||
# Database servers - database-specific thresholds
|
||||
prod-db-01:
|
||||
threshold_config: database
|
||||
watch: true
|
||||
notification_channels: [signal_ops, email_ops]
|
||||
dyndns: false
|
||||
|
||||
prod-db-02:
|
||||
threshold_config: database
|
||||
watch: true
|
||||
notification_channels: [signal_ops, email_ops]
|
||||
dyndns: false
|
||||
|
||||
prod-db-replica:
|
||||
threshold_config: database
|
||||
watch: true
|
||||
notification_channels: [email_ops] # Replica gets email only
|
||||
dyndns: false
|
||||
|
||||
# Development servers - low sensitivity, minimal notifications
|
||||
dev-server-01:
|
||||
threshold_config: low_sensitivity
|
||||
watch: false # Don't monitor dev servers closely
|
||||
notification_channels: [email_ops]
|
||||
dyndns: false
|
||||
|
||||
dev-server-02:
|
||||
threshold_config: low_sensitivity
|
||||
watch: false
|
||||
notification_channels: [email_ops]
|
||||
dyndns: false
|
||||
|
||||
# Test servers
|
||||
test-server-01:
|
||||
threshold_config: low_sensitivity
|
||||
watch: false
|
||||
dyndns: false
|
||||
# No notification channels - uses default_notification_channels
|
||||
|
||||
# Home server with dynamic DNS
|
||||
home-server:
|
||||
threshold_config: default
|
||||
watch: true
|
||||
notification_channels: [signal_ops]
|
||||
dyndns: true # Update DNS when IP changes
|
||||
|
||||
# Hosts not listed in the hosts section will use:
|
||||
# - default_threshold_config for thresholds (falls back to "default")
|
||||
# - default_notification_channels for notifications
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Notes on Configuration Structure
|
||||
# ----------------------------------------------------------------------------
|
||||
#
|
||||
# All configuration is centralized in the hosts section. Each host can specify:
|
||||
# - threshold_config: Name of threshold configuration to use
|
||||
# - watch: Whether to monitor this host actively (send notifications)
|
||||
# - notification_channels: List of channels to use for this host
|
||||
# - dyndns: Whether to update DNS when IP address changes
|
||||
#
|
||||
# Notification channels are defined once at the top level and referenced
|
||||
# by name in host definitions, allowing easy reuse and updates.
|
||||
#
|
||||
# For hosts not explicitly listed, the system will still accept heartbeats
|
||||
# and track their state, but won't apply thresholds or send notifications
|
||||
# unless default settings are configured.
|
||||
@@ -0,0 +1,111 @@
|
||||
# Heartbeat Configuration Example with Nagios Plugin Runner
|
||||
|
||||
# This example shows how to configure the Nagios Runner plugin
|
||||
# to execute existing Nagios-compatible monitoring plugins
|
||||
|
||||
# Basic server settings (existing config)
|
||||
hb_port: 50003
|
||||
hbd_port: 50004
|
||||
interval: 20
|
||||
grace: 2
|
||||
|
||||
# Plugin configuration
|
||||
# Each plugin can have its own configuration section
|
||||
|
||||
# CPU Monitor Plugin
|
||||
cpu_monitor:
|
||||
interval: 300 # Collect every 5 minutes (default)
|
||||
per_core: false # Set to true to get per-core CPU usage
|
||||
|
||||
# Nagios Runner Plugin
|
||||
nagios_runner:
|
||||
interval: 300 # Run Nagios plugins every 5 minutes (default)
|
||||
timeout: 30 # Command execution timeout in seconds
|
||||
shell: true # Execute commands via shell
|
||||
|
||||
# List of Nagios plugins to run
|
||||
commands:
|
||||
|
||||
# Example 1: Check disk space
|
||||
- name: check_disk_root
|
||||
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
|
||||
|
||||
# Example 2: Check disk space for /home
|
||||
- name: check_disk_home
|
||||
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /home
|
||||
|
||||
# Example 3: Check system load
|
||||
- name: check_load
|
||||
command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
|
||||
|
||||
# Example 4: Check process count
|
||||
- name: check_procs
|
||||
command: /usr/lib/nagios/plugins/check_procs -w 250 -c 400
|
||||
|
||||
# Example 5: Check SSH service
|
||||
- name: check_ssh
|
||||
command: /usr/lib/nagios/plugins/check_ssh localhost
|
||||
|
||||
# Example 6: Check HTTP service
|
||||
- name: check_http
|
||||
command: /usr/lib/nagios/plugins/check_http -H localhost
|
||||
|
||||
# Example 7: Check swap usage
|
||||
- name: check_swap
|
||||
command: /usr/lib/nagios/plugins/check_swap -w 20% -c 10%
|
||||
|
||||
# Example 8: Custom script (Nagios plugin format)
|
||||
- name: check_custom
|
||||
command: /usr/local/bin/my_custom_check.sh
|
||||
|
||||
# Example 9: Check specific log file
|
||||
- name: check_logs
|
||||
command: /usr/lib/nagios/plugins/check_log -F /var/log/syslog -O /var/tmp/check_log.old -q "ERROR"
|
||||
|
||||
# Notes:
|
||||
#
|
||||
# 1. Nagios Plugin Output Format:
|
||||
# - Single line: STATUS - Message | performance_data
|
||||
# - Performance data format: 'label'=value[UOM];[warn];[crit];[min];[max]
|
||||
#
|
||||
# 2. Exit Codes:
|
||||
# - 0 = OK
|
||||
# - 1 = WARNING
|
||||
# - 2 = CRITICAL
|
||||
# - 3 = UNKNOWN
|
||||
#
|
||||
# 3. Performance Data:
|
||||
# - Automatically parsed and included in heartbeat data
|
||||
# - Metrics are stored as: {plugin_name}_{metric_name}
|
||||
# - Example: check_disk_root_/ will contain the disk usage percentage
|
||||
#
|
||||
# 4. Overall Status:
|
||||
# - The plugin reports the worst status from all commands
|
||||
# - Useful for quick health checks
|
||||
#
|
||||
# 5. Plugin Paths:
|
||||
# Common Nagios plugin directories:
|
||||
# - Debian/Ubuntu: /usr/lib/nagios/plugins/
|
||||
# - RHEL/CentOS: /usr/lib64/nagios/plugins/
|
||||
# - Custom installs: /usr/local/nagios/libexec/
|
||||
#
|
||||
# 6. Installing Nagios Plugins:
|
||||
# Debian/Ubuntu: sudo apt-get install nagios-plugins
|
||||
# RHEL/CentOS: sudo yum install nagios-plugins-all
|
||||
# Arch Linux: sudo pacman -S monitoring-plugins
|
||||
#
|
||||
# 7. Writing Custom Nagios Plugins:
|
||||
# Any script can be a Nagios plugin if it:
|
||||
# - Returns appropriate exit codes (0-3)
|
||||
# - Prints status message to stdout
|
||||
# - Optionally includes performance data after "|"
|
||||
#
|
||||
# Example custom plugin (save as /usr/local/bin/check_example.sh):
|
||||
# #!/bin/bash
|
||||
# if [ "$(who | wc -l)" -gt 50 ]; then
|
||||
# echo "CRITICAL - Too many users | users=52;40;50;0"
|
||||
# exit 2
|
||||
# else
|
||||
# echo "OK - Normal user count | users=25;40;50;0"
|
||||
# exit 0
|
||||
# fi
|
||||
@@ -0,0 +1,254 @@
|
||||
# ==============================================================================
|
||||
# Heartbeat Daemon Threshold Configuration Example
|
||||
# ==============================================================================
|
||||
# This file demonstrates threshold configuration for the Heartbeat monitoring system.
|
||||
# Thresholds can be defined for any metric collected by monitoring plugins.
|
||||
#
|
||||
# Threshold levels:
|
||||
# - WARNING: First level of concern, typically for early notification
|
||||
# - CRITICAL: Severe condition requiring immediate attention
|
||||
#
|
||||
# Alert notifications are sent when:
|
||||
# - A metric crosses from OK to WARNING or CRITICAL
|
||||
# - A metric crosses from WARNING to CRITICAL
|
||||
# - A metric recovers (returns to a lower severity level)
|
||||
#
|
||||
# Re-notifications are sent for ongoing alerts based on threshold_renotify_interval.
|
||||
# ==============================================================================
|
||||
|
||||
# Global threshold settings
|
||||
threshold_renotify_interval: 3600 # Re-notify every hour for ongoing alerts (seconds)
|
||||
|
||||
# Threshold definitions per plugin
|
||||
thresholds:
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# CPU Monitor Thresholds
|
||||
# ----------------------------------------------------------------------------
|
||||
cpu_monitor:
|
||||
# Overall CPU usage percentage (0-100)
|
||||
cpu_percent:
|
||||
warning: 80.0 # Warn when CPU usage exceeds 80%
|
||||
critical: 90.0 # Critical when CPU usage exceeds 90%
|
||||
operator: ">" # Alert when value is GREATER than threshold
|
||||
hysteresis: 0.1 # 10% hysteresis to prevent flapping
|
||||
enabled: true
|
||||
|
||||
# 1-minute load average
|
||||
load_1min:
|
||||
warning: 4.0 # Warn when 1-min load exceeds 4.0
|
||||
critical: 8.0 # Critical when 1-min load exceeds 8.0
|
||||
operator: ">"
|
||||
hysteresis: 0.15 # 15% hysteresis
|
||||
enabled: true
|
||||
|
||||
# 5-minute load average
|
||||
load_5min:
|
||||
warning: 3.0
|
||||
critical: 6.0
|
||||
operator: ">"
|
||||
hysteresis: 0.15
|
||||
enabled: true
|
||||
|
||||
# 15-minute load average
|
||||
load_15min:
|
||||
warning: 2.0
|
||||
critical: 4.0
|
||||
operator: ">"
|
||||
hysteresis: 0.15
|
||||
enabled: true
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Memory Monitor Thresholds
|
||||
# ----------------------------------------------------------------------------
|
||||
memory_monitor:
|
||||
# Memory usage percentage
|
||||
percent:
|
||||
warning: 85.0 # Warn at 85% memory usage
|
||||
critical: 95.0 # Critical at 95% memory usage
|
||||
operator: ">"
|
||||
hysteresis: 0.1
|
||||
enabled: true
|
||||
|
||||
# Available memory in MB (inverse threshold - alert when LOW)
|
||||
available_mb:
|
||||
warning: 1000 # Warn when less than 1GB available
|
||||
critical: 500 # Critical when less than 500MB available
|
||||
operator: "<" # Alert when value is LESS than threshold
|
||||
hysteresis: 0.1
|
||||
enabled: true
|
||||
|
||||
# Swap usage percentage
|
||||
swap_percent:
|
||||
warning: 50.0 # Warn at 50% swap usage
|
||||
critical: 80.0 # Critical at 80% swap usage
|
||||
operator: ">"
|
||||
hysteresis: 0.1
|
||||
enabled: true
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Disk Monitor Thresholds
|
||||
# ----------------------------------------------------------------------------
|
||||
disk_monitor:
|
||||
# Partition-specific thresholds
|
||||
# Use the mount point as the key
|
||||
partitions:
|
||||
# Root filesystem
|
||||
/:
|
||||
percent:
|
||||
warning: 80.0 # Warn at 80% disk usage
|
||||
critical: 90.0 # Critical at 90% disk usage
|
||||
operator: ">"
|
||||
hysteresis: 0.05 # 5% hysteresis for disk (more stable)
|
||||
enabled: true
|
||||
|
||||
free_gb:
|
||||
warning: 10.0 # Warn when less than 10GB free
|
||||
critical: 5.0 # Critical when less than 5GB free
|
||||
operator: "<"
|
||||
hysteresis: 0.1
|
||||
enabled: true
|
||||
|
||||
# Home filesystem (if separate partition)
|
||||
/home:
|
||||
percent:
|
||||
warning: 85.0
|
||||
critical: 95.0
|
||||
operator: ">"
|
||||
hysteresis: 0.05
|
||||
enabled: true
|
||||
|
||||
# Var filesystem (logs, etc.)
|
||||
/var:
|
||||
percent:
|
||||
warning: 80.0
|
||||
critical: 90.0
|
||||
operator: ">"
|
||||
hysteresis: 0.05
|
||||
enabled: true
|
||||
|
||||
free_gb:
|
||||
warning: 5.0 # Var needs space for logs
|
||||
critical: 2.0
|
||||
operator: "<"
|
||||
hysteresis: 0.1
|
||||
enabled: true
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Network Monitor Thresholds
|
||||
# ----------------------------------------------------------------------------
|
||||
network_monitor:
|
||||
# Total error count across all interfaces
|
||||
errors_total:
|
||||
warning: 100 # Warn at 100 errors
|
||||
critical: 1000 # Critical at 1000 errors
|
||||
operator: ">"
|
||||
hysteresis: 0.2 # 20% hysteresis for counters
|
||||
enabled: true
|
||||
|
||||
# Total dropped packets
|
||||
dropin_total:
|
||||
warning: 50
|
||||
critical: 200
|
||||
operator: ">"
|
||||
hysteresis: 0.2
|
||||
enabled: true
|
||||
|
||||
dropout_total:
|
||||
warning: 50
|
||||
critical: 200
|
||||
operator: ">"
|
||||
hysteresis: 0.2
|
||||
enabled: true
|
||||
|
||||
# TCP connections in TIME_WAIT state
|
||||
connections_TIME_WAIT:
|
||||
warning: 1000 # Warn at 1000 TIME_WAIT connections
|
||||
critical: 5000 # Critical at 5000 TIME_WAIT connections
|
||||
operator: ">"
|
||||
hysteresis: 0.2
|
||||
enabled: true
|
||||
|
||||
# Total established connections
|
||||
connections_ESTABLISHED:
|
||||
warning: 500
|
||||
critical: 1000
|
||||
operator: ">"
|
||||
hysteresis: 0.1
|
||||
enabled: true
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Nagios Plugin Thresholds (if using nagios_runner)
|
||||
# ----------------------------------------------------------------------------
|
||||
nagios_runner:
|
||||
# Nagios plugins report exit codes:
|
||||
# 0 = OK, 1 = WARNING, 2 = CRITICAL, 3 = UNKNOWN
|
||||
# We can threshold on the exit_code directly
|
||||
exit_code:
|
||||
warning: 1 # Map Nagios WARNING to our WARNING
|
||||
critical: 2 # Map Nagios CRITICAL (and UNKNOWN = 3, because 3 >= 2) to our CRITICAL
|
||||
operator: ">=" # Alert when exit code >= threshold
|
||||
hysteresis: 0.0 # No hysteresis for exit codes
|
||||
enabled: true
|
||||
|
||||
# ==============================================================================
|
||||
# Notification Configuration
|
||||
# ==============================================================================
|
||||
# Configure notification methods (email, pushover, etc.)
|
||||
# These are used when threshold violations occur
|
||||
|
||||
# Email notifications
|
||||
toemail:
|
||||
- admin@example.com
|
||||
- oncall@example.com
|
||||
fromemail: heartbeat@example.com
|
||||
smtpserver: smtp.example.com
|
||||
smtpport: 587
|
||||
smtpuser: heartbeat@example.com
|
||||
smtppassword: your-password-here
|
||||
|
||||
# Pushover notifications (optional)
|
||||
# pushover_token: your-pushover-app-token
|
||||
# pushover_user: your-pushover-user-key
|
||||
|
||||
# Mattermost webhook (optional)
|
||||
# mattermost_url: https://mattermost.example.com/hooks/your-webhook-id
|
||||
|
||||
# ==============================================================================
|
||||
# Watched Hosts
|
||||
# ==============================================================================
|
||||
# Hosts in this list will trigger notifications for:
|
||||
# - Heartbeat timeouts/overdue
|
||||
# - Threshold violations
|
||||
# - Boot messages
|
||||
watchhosts:
|
||||
- webserver01
|
||||
- database01
|
||||
- mailserver
|
||||
- critical-app
|
||||
|
||||
# ==============================================================================
|
||||
# Additional Server Settings
|
||||
# ==============================================================================
|
||||
hb_port: 50003 # UDP port for heartbeat messages
|
||||
hbd_port: 50004 # HTTP port for web interface
|
||||
grace: 10 # Grace period for overdue detection (seconds)
|
||||
debug: 0 # Debug level (0-3)
|
||||
verbose: false # Verbose output
|
||||
|
||||
# Journal settings (message logging)
|
||||
journal_enabled: true
|
||||
journal_path: /var/log/heartbeat/messages.journal
|
||||
journal_max_size: 104857600 # 100MB before rotation
|
||||
journal_max_backups: 10
|
||||
|
||||
# ==============================================================================
|
||||
# Example: Production Configuration with Conservative Thresholds
|
||||
# ==============================================================================
|
||||
# For production systems, consider:
|
||||
# - Higher warning thresholds to reduce alert fatigue
|
||||
# - Appropriate hysteresis values (5-15% typical)
|
||||
# - Re-notification intervals matching on-call rotation
|
||||
# - Multiple escalation contacts
|
||||
# - Integration with incident management systems
|
||||
# ==============================================================================
|
||||
-602
@@ -1,602 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# $Id: hbc,v 1.9 2012/03/29 02:08:36 andreas Exp $
|
||||
# NEW
|
||||
import argparse
|
||||
import sys
|
||||
import time
|
||||
import socket
|
||||
import os
|
||||
import signal
|
||||
import select
|
||||
import traceback
|
||||
from hashlib import md5
|
||||
import shutil
|
||||
import zlib
|
||||
import subprocess
|
||||
import syslog
|
||||
import codecs
|
||||
|
||||
from .config import load_config
|
||||
|
||||
# --- Defaults and protocol constants ----------------------------------------
PORT = 50003  # default UDP port, used when the config has no "hb_port"
INTERVAL = 10  # default seconds between heartbeats, used when not configured
REOPENC = 6  # a Conn closes and re-opens its socket every REOPENC sends
PIDFILE = "/tmp/hbc.pid"
VER = 6  # value sent in the "ver" field of every message
MAXRECV = 32767  # maximum number of bytes read per received datagram

# --- Mutable run-time state shared between the functions below --------------
running = True  # main loop flag, cleared by cleanup() and process()
dorestart = False  # set by process() after a successful self-update
warned1 = False  # ensures a socket send error is logged only once

msgonly = False
helpflag = False
verbose = False
fdaemon = False  # True when started with -d; log() then uses syslog
daemonized = False
msgboot = {}
# expanduser() consults $HOME first and falls back to the password database,
# so importing this module no longer raises KeyError when HOME is unset.
home = os.path.expanduser("~")
configfile = "%s/.hbrc" % home
cmdargs = []  # command-line options replayed by restart()
iam = socket.gethostname()
|
||||
|
||||
|
||||
def log(msg):
    """Send *msg* to syslog in daemon mode, otherwise print it."""
    if not fdaemon:
        print(msg)
        return
    syslog.syslog(syslog.LOG_ERR, msg)
|
||||
|
||||
|
||||
def handler(signum, frame):
    """Signal handler: run cleanup() on SIGTERM, do nothing otherwise."""
    if signum != signal.SIGTERM:
        return
    cleanup()
|
||||
|
||||
|
||||
class NullDevice:
    """Minimal file-like sink: write() accepts a string and discards it."""

    def write(self, s):
        return None
|
||||
|
||||
|
||||
class Conn:
    """One UDP path from this client to a heartbeat daemon address.

    Keeps the datagram socket, counters for sent and acknowledged
    messages, and the most recent round-trip times (milliseconds) in
    ``rtts`` (at most 10 entries).
    """

    def __init__(self, conId, addr, port, af):
        self.conId = conId
        self.addr = addr
        self.port = port
        self.af = af

        self.ackcount = 0  # number of ACKs received
        self.lastack = 0  # timestamp taken from / assigned to the last ACK
        self.send = 0  # number of successful sends
        self.lastsend = 0  # time() the last message was sent
        self.rtts = [0]  # latest round-trip times in ms
        self.sock = None  # created lazily by sendto()

    def __str__(self):
        return "Con(%s, %s %s)" % (self.addr, self.port, self.af)

    def open(self):
        """Create the datagram socket and switch SO_REUSEADDR on."""
        self.sock = socket.socket(self.af, socket.SOCK_DGRAM)
        self.sock.setsockopt(
            socket.SOL_SOCKET,
            socket.SO_REUSEADDR,
            self.sock.getsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR) | 1,
        )

    def sendto(self, msg, ID="HTB"):  # default ID is HearTBeat
        """Stamp *msg* with name/id/ver/time and send it to the daemon.

        *msg* is updated in place. The socket is closed and re-opened
        every REOPENC successful sends. A socket error is logged once
        (module-wide), the socket is closed, and the send is not counted.
        """
        global warned1

        if self.send % REOPENC == 0:
            self.close()
        if not self.sock:
            self.open()
        msg["name"] = shortname(iam)
        msg["id"] = self.conId
        msg["ver"] = VER
        msg["time"] = time.time()
        m = dicttos(ID, msg)  # always compress
        if verbose:
            log("conn.send('%s', (%s:%s) %s)" % (msg, self.addr, self.port, len(m)))
        try:
            self.sock.sendto(m, (self.addr, self.port))
        except socket.error as e:
            if not warned1:
                log("socket error: %s %s:%s" % (e, self.addr, self.port))
                warned1 = True
            self.close()
            return
        self.send += 1
        self.lastsend = time.time()

    def ack(self, msgDict, now):
        """Record an ACK and append the resulting round-trip time.

        When *msgDict* carries a numeric "time" it is used as the ACK
        timestamp and the difference to the last send is doubled;
        otherwise *now* is used undoubled.
        NOTE(review): the doubling presumably treats "time" as the
        daemon's clock, i.e. a one-way delay - confirm against hbd.
        """
        try:
            stamp = msgDict["time"]
        except (KeyError, TypeError):
            stamp = None
        # A "time" that is missing or not a number (e.g. left as a string
        # by the parser) must not crash the subtraction below.
        if isinstance(stamp, (int, float)):
            self.lastack = stamp
            mul = 2
        else:
            self.lastack = now
            mul = 1
        rtt = (self.lastack - self.lastsend) * mul
        if verbose:
            log("ack RTT: %0.1f ms (now %s)" % (rtt * 1000.0, now))
        self.rtts.append(rtt * 1000.0)
        if len(self.rtts) > 10:
            del self.rtts[0]
        self.ackcount += 1

    def close(self):
        """Close the socket if one is open; safe to call repeatedly."""
        if self.sock:
            self.sock.close()
            self.sock = None
|
||||
|
||||
|
||||
def shortname(name):
    """Return the part of *name* that precedes its first dot."""
    head, _, _ = name.partition(".")
    return head
|
||||
|
||||
|
||||
def dicttos(ID, d):
    """Serialize *d* into a wire message: b"!<ID>:" plus zlib data.

    Every item becomes ``key=value``; floats are written with five
    decimals, everything else with ``%s``. The items are joined with
    ``;`` and compressed at zlib level 6.
    """
    fields = [
        ("%s=%0.5f" if isinstance(value, float) else "%s=%s") % (key, value)
        for key, value in d.items()
    ]
    payload = zlib.compress(";".join(fields).encode(), 6)
    header = ("!" + ID + ":").encode()
    return header + payload
|
||||
|
||||
|
||||
def stodict(msg):
    """Parse a received datagram (bytes) into a dict.

    Two wire formats are understood:

    * compressed: ``b"!"``, a 3-character ID, ``b":"``, then zlib data
    * plain: ``<ID>:<payload>``

    The payload is a ``;``-separated list of ``key=value`` items. A key
    without ``=`` maps to None. Values that are valid Python literals
    (numbers, quoted strings, ...) are converted; everything else is kept
    as a stripped string. The message ID is stored under "ID".

    Raises an exception (zlib.error, UnicodeDecodeError, ValueError, ...)
    for data that cannot be parsed.
    """
    # Local import keeps this block self-contained.
    import ast

    d = {}
    if msg[:1] == b"!":
        pk = zlib.decompress(msg[5:]).decode()
        d["ID"] = msg[1:4].decode()
    else:
        # The original called bytes.split(":") with a str separator, which
        # always raised TypeError for received bytes.
        head, sep, rest = msg.partition(b":")
        if not sep:
            raise ValueError("message has no ':' separator")
        pk = rest.decode()
        d["ID"] = head.decode()
    for item in pk.split(";"):
        key, sep, raw = item.partition("=")
        key = key.strip()
        if not sep:
            d[key] = None
            continue
        raw = raw.strip()
        try:
            # SECURITY: this data arrives over the network. literal_eval
            # only accepts literals, unlike the eval() used previously,
            # which executed arbitrary expressions from the packet.
            d[key] = ast.literal_eval(raw)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            d[key] = raw
    if verbose:
        print("msg is %s" % d)
    return d
|
||||
|
||||
|
||||
def XXstodict(msg):
    """Legacy variant of stodict(); accepts bytes or str.

    Differences from stodict() visible in this function:

    * returns None when *msg* contains no ``:``
    * the ID of a compressed message is everything between the leading
      ``!`` and the first ``:`` (not a fixed 3 characters)
    * only values whose first character is a digit are converted; all
      other values stay strings
    """
    # Local import keeps this block self-contained.
    import ast

    if isinstance(msg, str):
        msg = msg.encode()
    header, sep, rest = msg.partition(b":")
    if not sep:
        return None
    d = {}
    if header[:1] == b"!":  # compressed
        pk = zlib.decompress(rest).decode()
        d["ID"] = header[1:].decode()
    else:
        pk = rest.decode()
        d["ID"] = header.decode()
    for item in pk.split(";"):
        key, has_value, raw = item.partition("=")
        key = key.strip()
        if not has_value:
            d[key] = None
            continue
        raw = raw.strip()
        value = raw
        if raw[:1].isdigit():
            try:
                # literal_eval instead of eval(): the data is untrusted.
                value = ast.literal_eval(raw)
            except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                value = raw
        d[key] = value
    return d
|
||||
|
||||
|
||||
def syslogtrace(note):
    """Report the exception currently being handled.

    The formatted traceback, prefixed with *note*, is passed to log(),
    written to syslog line by line and, when verbose, printed as well.
    """
    report = "%s hbc died: \n%s" % (note, traceback.format_exc())
    log(report)
    for entry in report.split("\n"):
        syslog.syslog(syslog.LOG_ERR, " tb: %s" % entry)
    if not verbose:
        return
    print(report)
|
||||
|
||||
|
||||
conId = 1  # next connection id handed out by createConnections()


def createConnections(hosts):
    """Resolve each name in *hosts* and register one Conn per address.

    New connections are stored in the module-level ``conns`` dict under a
    running id. A host that cannot be resolved is skipped so that the
    remaining hosts are still processed (previously the first resolution
    failure aborted the whole call). An address family other than
    IPv4/IPv6 terminates the program with exit status 1.
    """
    global conId
    for host in hosts:
        if verbose:
            log("createConnections for %s" % host)
        try:
            rs = socket.getaddrinfo(host, hb_port, 0, 0, socket.SOL_UDP)
        except socket.gaierror:
            logm = "%s hbc died: \n%s" % ("createConnections", traceback.format_exc())
            if verbose:
                log(logm)
            continue
        for r in rs:
            if verbose:
                log("address %s" % str(r))
            # Compare against the socket constants rather than hard-coded
            # per-OS numbers (10, 24, 28, 30).
            family = r[0]
            if family not in (socket.AF_INET, socket.AF_INET6):
                # The original formatted r[0][0], which raised TypeError
                # because r[0] is an integer.
                print("dont know this net type: %s" % family)
                sys.exit(1)

            addr = r[4][0]
            conns[conId] = Conn(conId, addr, hb_port, family)
            if verbose:
                print("cons[%s] = %s" % (conId, str(conns[conId])))
            conId += 1
|
||||
|
||||
|
||||
def doexec(conn, data):
    """Run *data* as a shell command and send the result back on *conn*.

    The reply has service "command" and a msg consisting of a status
    word ("OK", "CalledProcessError" or "cmd failed: ...") followed by
    the command output. stderr is merged into stdout.
    """
    # SECURITY NOTE(review): *data* comes from a received datagram and is
    # executed through the shell. Nothing in this function authenticates
    # the sender - confirm that this is acceptable for the deployment.
    try:
        ro = subprocess.check_output(
            data, stderr=subprocess.STDOUT, shell=True
        ).decode(errors="replace")  # undecodable bytes must not hide output
        fail = "OK"
    except subprocess.CalledProcessError as e:
        # Keep what the failing command printed; str(e) alone only holds
        # the command line and the exit status.
        captured = (e.output or b"").decode(errors="replace")
        ro = str(e)
        if captured:
            ro += "\n" + captured
        fail = "CalledProcessError"
    except Exception as e:
        syslogtrace("System")
        ro = "N/A"
        fail = "cmd failed: %s" % e
    msg = {"service": "command", "msg": fail + " " + ro}
    conns[conn].sendto(msg)
|
||||
|
||||
|
||||
def doupdate(conn, msgDict):
    """Handle an UPD message: install the new program text it carries.

    *msgDict* must contain "code" (base64 text of the new source) and
    "csum" (passed to doupdateone() for verification). The outcome is
    reported back on *conn* with service "update".

    Returns None on success, otherwise a string describing the failure.
    """
    # SECURITY NOTE(review): the replacement code arrives in a datagram;
    # nothing in this function authenticates the sender.
    fail = None
    code = None
    csum = None
    try:
        encoded = msgDict["code"]
        if isinstance(encoded, str):
            # The base64 codec only accepts bytes-like input.
            encoded = encoded.encode("ascii")
        code = codecs.decode(encoded, "base64").decode()
        csum = msgDict["csum"]
    except (KeyError, TypeError, ValueError) as e:
        fail = "csum/code missing: %s" % e
    if not fail:
        fail = doupdateone(code, csum)

    msg = {"service": "update", "msg": fail if fail else "OK"}
    conns[conn].sendto(msg)
    if not fail:
        log("hc updates, fs = %s" % (len(code)))

    return fail
|
||||
|
||||
|
||||
def doupdateone(code, csum):
    """Replace the running script (sys.argv[0]) with *code*.

    The MD5 hex digest of the UTF-8 encoded *code* must equal *csum*.
    The current file is first copied to "<script>.sav".

    Returns None on success, otherwise an error string.
    """
    # MD5 here only detects accidental corruption of the transfer.
    if md5(code.encode()).hexdigest() != csum:
        return "checksum error"

    fn = sys.argv[0]
    ofn = "%s.sav" % fn
    try:
        shutil.copy2(fn, ofn)
    except OSError as e:
        return "cannot make backup copy: %s" % e

    try:
        # "with" closes the handle even when write() fails; UTF-8 matches
        # the encoding that was used for the checksum above.
        with open(fn, "w", encoding="utf-8") as fh:
            fh.write(code)
    except (OSError, ValueError) as e:
        return "cannot write new code: %s" % e

    return None
|
||||
|
||||
|
||||
def restart():
    """Replace the current process with a fresh run of this script.

    Executes sys.argv[0] with the options collected in ``cmdargs``. The
    function only returns when os.execv() failed; the reason is then
    printed and logged.
    """
    if verbose:
        print("restart: execv %s %s" % (sys.argv[0], [sys.argv[0]] + cmdargs))
    syslog.syslog(syslog.LOG_ERR, "restart %s" % (sys.argv[0]))
    e = "fallthrough"
    try:
        os.execv(sys.argv[0], [sys.argv[0]] + cmdargs)
    except (OSError, ValueError) as exc:
        # Keep the real reason; it used to be swallowed, so the log
        # always said "fallthrough".
        e = exc
    print("should not be here:", str(e))
    log("restart failed: %s" % e)
|
||||
|
||||
|
||||
def process():
    """Main loop: send a heartbeat on every connection each interval.

    Between heartbeats the loop waits in select() on all open sockets
    and dispatches whatever arrives:

    * ACK - recorded on the connection (round-trip time)
    * UPD - self-update; on success the loop ends with ``dorestart`` set
    * CMD - the "cmd" field is executed via doexec()
    * any other ID - the raw datagram is passed to doexec() (legacy)

    The loop ends when ``running`` becomes false.
    """
    global running, dorestart

    nextReport = time.time()

    while running:
        while time.time() < nextReport:
            # Map file descriptors back to their socket and connection id.
            ifiles = {}
            conIds = {}
            for conn in conns:
                if conns[conn].sock:
                    ifiles[conns[conn].sock.fileno()] = conns[conn].sock
                    conIds[conns[conn].sock.fileno()] = conn

            sleep = nextReport - time.time()
            if sleep <= 0:
                break
            try:
                r = select.select(list(ifiles.keys()), [], [], sleep)
                # nb: delay from actual packet arrival to select is ca. 105ms!
                now = time.time()
            except KeyboardInterrupt:
                running = False
                break
            except SystemExit:
                log("daemon exit, running was %s" % running)
                if running:
                    running = False
                break
            except Exception:
                if running:
                    syslogtrace("select")
                running = False
                break
            for rfh in r[0]:
                conn = conIds[rfh]
                data, addr = ifiles[rfh].recvfrom(MAXRECV)
                if verbose:
                    print("sock.recvfrom: %s (%s) %s" % (addr, len(data), data[:4]))
                try:
                    msgDict = stodict(data)
                except Exception as e:
                    print(
                        "failed to parse incoming data from %s: %s (%s)"
                        % (addr, data, e)
                    )
                    continue

                if verbose:
                    print(
                        "sock.recvfrom: %s (%s) %s"
                        % (addr, len(data), str(msgDict)[:80])
                    )
                if msgDict is None:
                    print("bad backet from %s (%s) %s" % (addr, len(data), data))
                elif msgDict["ID"] == "ACK":
                    conns[conn].ack(msgDict, now)
                elif msgDict["ID"] == "UPD":
                    if doupdate(conn, msgDict) is None:
                        if verbose:
                            print("process: restart after update")
                        dorestart = True
                        break
                elif msgDict["ID"] == "CMD":
                    if "cmd" in msgDict:
                        doexec(conn, msgDict["cmd"])
                    else:
                        # A CMD packet without "cmd" used to raise KeyError
                        # and terminate the whole client.
                        print("CMD without cmd from %s (%s)" % (addr, len(data)))
                else:
                    # SECURITY NOTE(review): any other ID causes the raw
                    # datagram to be run as a shell command.
                    doexec(conn, data)  # deprecated until no more VER - hbc
            if dorestart:
                running = False
                break
        if not running:
            break
        for conn in conns:
            msg = {"acks": conns[conn].ackcount, "rtt": conns[conn].rtts[-1]}
            conns[conn].sendto(msg)
            # N.B. Linux (i.e. Raspberry Pi 3) drops the second pkg unless
            # delayed
            time.sleep(0.1)
        # Stay on the fixed schedule when possible; re-base it on the
        # current time after falling behind by more than one interval.
        if nextReport + interval >= time.time():
            nextReport += interval
        else:
            nextReport = time.time() + interval

    if verbose:
        log("process: done running")
|
||||
|
||||
|
||||
def cleanup():
    """Announce shutdown on every connection and close all sockets.

    Does nothing when ``running`` is already false; otherwise clears it
    first so the function acts only once.
    """
    global running
    if not running:
        return
    if verbose:
        log("cleanup")
    running = False
    for connection in conns.values():
        farewell = {"shutdown": 1, "acks": connection.ackcount}
        connection.sendto(farewell)
        connection.close()
    time.sleep(1)
    closeall()
|
||||
|
||||
|
||||
def closeall():
    """Close the socket of every registered connection."""
    if verbose:
        syslog.syslog(syslog.LOG_ERR, "closecall")
    for connection in conns.values():
        connection.close()
|
||||
|
||||
|
||||
def daemonize(
    working_dir="/", stdin="/dev/zero", stdout="/dev/null", stderr="/dev/null"
):
    """
    Detach the process using the UNIX double-fork technique, see Stevens'
    "Advanced Programming in the UNIX Environment" for details
    (ISBN 0201563177)
    http://www.yendor.com/programming/unix/apue/proc/fork2.c

    Both parents exit with status 0. The surviving process has changed to
    *working_dir*, started a new session, set its umask to 0 and had its
    standard streams redirected to *stdin*, *stdout* and *stderr*.
    """

    # first fork: the original parent exits
    try:
        if os.fork() > 0:
            os._exit(0)
    except OSError as e:
        sys.stderr.write("fork #1 failed: %d (%s)\n" % (e.errno, e.strerror))
        os._exit(1)

    # decouple from parent environment
    os.chdir(working_dir)
    os.setsid()
    os.umask(0)

    # second fork: the intermediate process exits
    try:
        if os.fork() > 0:
            os._exit(0)
    except OSError as e:
        sys.stderr.write("fork #2 failed: %d (%s)\n" % (e.errno, e.strerror))
        sys.exit(1)

    # redirect the standard file descriptors
    sys.stdout.flush()
    sys.stderr.flush()
    with open(stdin, "r") as si, open(stdout, "a+") as so, open(stderr, "a+") as se:
        os.dup2(si.fileno(), sys.stdin.fileno())
        os.dup2(so.fileno(), sys.stdout.fileno())
        os.dup2(se.fileno(), sys.stderr.fileno())
|
||||
|
||||
|
||||
#
|
||||
# Main program
|
||||
#
|
||||
def build_parser():
    """Build the command-line parser for hbc.

    Options: -b/--boot, -c/--config, -m/--message, -n/--name,
    -d/--daemon, -v/--verbose and the repeatable -x/--debug, followed by
    one or more daemon hosts.
    """
    parser = argparse.ArgumentParser(
        prog="hbc",
        # Fixed typo in the user-visible text ("heatbeat").
        description="HeartBeatClient - send a heartbeat message to a HeartBeatDaemon",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("-b", "--boot", action="store_true", help="Send a boot message")
    parser.add_argument(
        "-c", "--config", dest="configfile", help="Config file path (YAML)"
    )
    parser.add_argument("-m", "--message", dest="message", help="Send a message")
    parser.add_argument(
        "-n", "--name", dest="name", help="Name to use in heartbeat message"
    )
    parser.add_argument(
        "-d", "--daemon", action="store_true", help="Run in daemon mode"
    )
    parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output")
    parser.add_argument(
        "-x", "--debug", action="count", default=0, help="Increase debug level"
    )
    parser.add_argument("hosts", nargs="+", help="Heartbeat daemon hosts to send to")
    return parser
|
||||
|
||||
|
||||
def main(argv=None):
    """Entry point of the heartbeat client.

    Parses the command line, loads the configuration, opens a connection
    to every daemon address and then either sends a single message
    (-m) and exits, or runs the heartbeat loop until it ends, optionally
    re-executing the program after a self-update.

    argv: argument list for the parser; None lets argparse use sys.argv.
    """
    global msgonly, verbose, fdaemon, daemonized, cmdargs, iam, hb_port, conns, interval, hb_hosts
    parser = build_parser()
    args = parser.parse_args(argv)

    config = load_config(args.configfile)

    # Apply CLI overrides. Options that matter for a later restart() are
    # collected in cmdargs.
    if args.boot:
        msgboot["boot"] = 1
    if args.message:
        msgboot["service"] = "service"
        msgboot["msg"] = args.message
        msgonly = True
    if args.configfile:
        # Absolute path: daemonize() changes the working directory.
        cmdargs += ["-c", os.path.abspath(args.configfile)]
    if args.name:
        iam = args.name
        cmdargs += ["-n", iam]
    if args.daemon:
        fdaemon = True
        # NOTE(review): replayed so the restarted process logs to syslog
        # again; it will also repeat the double fork - confirm acceptable.
        cmdargs.append("-d")
    if args.verbose:
        verbose = True
        cmdargs.append("--verbose")
    if args.debug:
        config.setdefault("debug", 0)
        config["debug"] += args.debug
        cmdargs.append("-" + "x" * args.debug)
    # The hosts are a required positional argument; without them the
    # re-executed program exits with an argparse usage error.
    cmdargs += args.hosts

    if verbose:
        print("cmdargs for restart are %s" % cmdargs)

    #
    # set defaults

    hb_hosts = args.hosts
    hb_port = config.get("hb_port", PORT)
    interval = config.get("interval", INTERVAL)

    #
    if verbose:
        print("notice: hb_hosts: %s" % str(hb_hosts))
        print("notice: hb_port: %s" % hb_port)
        print("notice: interval: %s" % interval)
        print("notice: iam: %s" % iam)
        print("notice: msgonly: %s" % msgonly)
        print("notice: msgboot: %s" % msgboot)

    if not msgonly:
        msgboot["interval"] = interval

    # Retry every two seconds until at least one address was resolved.
    conns = {}
    while True:
        if verbose:
            log("create connections")
        createConnections(hb_hosts)
        if len(conns) != 0:
            break
        if verbose:
            log("no connections yet, sleep a bit")
        time.sleep(2)

    if verbose:
        log("%s connections created" % (len(conns)))

    if len(msgboot) > 0:
        if verbose:
            print("on boot")
        msgboot["acks"] = 0
        for conn in conns:
            conns[conn].sendto(msgboot)

    if msgonly:
        if verbose:
            print("msgboot done msgonly=%s" % msgonly)
        closeall()
        sys.exit(0)

    #
    syslog.openlog("hbc", syslog.LOG_PID, syslog.LOG_DAEMON)
    if fdaemon:
        print("daemonizing.")
        daemonize()
        daemonized = True
    syslog.syslog(syslog.LOG_ERR, "starting heartbeat to %s" % ",".join(hb_hosts))

    signal.signal(signal.SIGTERM, handler)
    try:
        process()
    except Exception as e:
        syslogtrace("process")
        if verbose:
            print("err: process exit: %s" % e)

    if verbose:
        log("main: cleanup")
    cleanup()
    if dorestart:
        restart()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
-381
@@ -1,381 +0,0 @@
|
||||
"""
|
||||
host and connection class shared between hbd and
|
||||
the website's heartbeat.py
|
||||
|
||||
"""
|
||||
|
||||
import time
|
||||
import json
|
||||
import copy
|
||||
import queue
|
||||
|
||||
num = 0  # count of named Host objects created so far

MAXRTTS = 10  # number of round-trip samples a Connection retains

DEBUG = 2  # any truthy value enables log() output


def log(host, m):
    """Print a diagnostic line for *host* when DEBUG is truthy."""
    if not DEBUG:
        return
    print("class log: %s %s" % (host, m))
|
||||
|
||||
|
||||
class Connection:
|
||||
# map of addrs to names
|
||||
|
||||
htab = {}
|
||||
UNKNOWN = "unknown"
|
||||
UP = "up"
|
||||
DOWN = "down"
|
||||
OVERDUE = "overdue"
|
||||
|
||||
def __init__(self, host, cid, addr, afam):
|
||||
self.host = host
|
||||
self.cid = cid
|
||||
if addr[0:7] == "::ffff:":
|
||||
addr = addr[7:]
|
||||
self.addr = addr
|
||||
self.afam = afam
|
||||
self.rtts = [0]
|
||||
self.lastbeat = time.time()
|
||||
self.statetime = self.lastbeat
|
||||
self.deltastatetime = "computed"
|
||||
self.state = Connection.UNKNOWN
|
||||
|
||||
if host:
|
||||
Connection.htab[addr] = self.host.name
|
||||
if self.host.isDynDns():
|
||||
log(self.host.name, "dns update %s" % self.addr)
|
||||
Host.dnsQ.put((self.host.name, self.addr))
|
||||
|
||||
def registerDns(self):
|
||||
Host.dnsQ.put((self.host.name, self.addr))
|
||||
|
||||
def clearstate(self):
|
||||
d = {}
|
||||
d["addr"] = ""
|
||||
d["rtt"] = ""
|
||||
d["lastbeat"] = ""
|
||||
d["state"] = ""
|
||||
d["statetime"] = ""
|
||||
d["deltastatetime"] = ""
|
||||
d["rttstate"] = ""
|
||||
return d
|
||||
|
||||
def statedict(self, Null=False):
|
||||
d = self.clearstate()
|
||||
now = time.time()
|
||||
if not Null:
|
||||
d["addr"] = self.addr
|
||||
if self.rtts[-1]:
|
||||
d["rtt"] = "%0.1f" % self.rtts[-1]
|
||||
elif self.state == Connection.UNKNOWN:
|
||||
d["rtt"] = ""
|
||||
else:
|
||||
d["rtt"] = "?"
|
||||
d["lastbeat"] = self.lastbeat
|
||||
if self.state == Connection.OVERDUE:
|
||||
d["state"] = "<b>%s</b>" % self.state
|
||||
else:
|
||||
d["state"] = self.state
|
||||
if self.state == Connection.UP:
|
||||
d["rttstate"] = d["rtt"]
|
||||
elif self.state == Connection.OVERDUE:
|
||||
d["rttstate"] = ""
|
||||
else:
|
||||
d["rttstate"] = d["state"]
|
||||
d["statetime"] = time.strftime(
|
||||
"%Y-%m-%d %H:%M:%S", time.localtime(self.statetime)
|
||||
)
|
||||
delta = now - self.statetime
|
||||
|
||||
if self.state == Connection.UNKNOWN:
|
||||
d["deltastatetime"] = ""
|
||||
elif delta > 86400:
|
||||
# d['deltastatetime'] = time.strftime("%d %H:%M:%S", time.gmtime(delta))
|
||||
d["deltastatetime"] = "%0.1f days" % (delta / 86400.0)
|
||||
elif delta > 3600:
|
||||
# d['deltastatetime'] = time.strftime("%H:%M:%S", time.gmtime(delta))
|
||||
d["deltastatetime"] = time.strftime("%k:%M hrs", time.gmtime(delta))
|
||||
# d['deltastatetime'] = "%0.1f hrs" % (delta / 3600.)
|
||||
elif delta > 60:
|
||||
# d['deltastatetime'] = time.strftime("%M:%S", time.gmtime(delta))
|
||||
d["deltastatetime"] = time.strftime("%M:%S mins", time.gmtime(delta))
|
||||
# d['deltastatetime'] = "%0.1f mins" % (delta / 60.)
|
||||
else:
|
||||
# d['deltastatetime'] = time.strftime("%S", time.gmtime(delta))
|
||||
d["deltastatetime"] = "%i secs" % (delta)
|
||||
if self.state == Connection.UNKNOWN and now - self.lastbeat > 86400 * 10:
|
||||
d = self.clearstate()
|
||||
|
||||
return d
|
||||
|
||||
def headerdict(self, afam):
    """Return the column titles for one address family.

    Args:
        afam: address-family label (e.g. "IPv4") used in the address title.

    Returns:
        dict mapping each connection field name to its table heading.
    """
    return {
        "addr": "%s Addr" % afam,
        "rtt": "Latency",  # was misspelled "Latencey"
        "lastbeat": "Last Contact",
        "state": "State",
        "statetime": "Last State",
        "rttstate": "Reach",
        "deltastatetime": "Last State",
    }
|
||||
|
||||
def jsons(self):
    """Return all instance attributes of this object as a JSON string."""
    attributes = vars(self)
    return json.dumps(attributes)
|
||||
|
||||
def newstate(self, state, now, when=0):
    """Switch to ``state`` and return the seconds spent in the previous state.

    The transition is timestamped ``now - when``; that timestamp becomes the
    new ``statetime``.
    """
    self.state = state
    changed_at = now - when
    seconds_in_previous = changed_at - self.statetime
    self.statetime = changed_at
    return seconds_in_previous
|
||||
|
||||
def getstate(self):
    """Return the connection's current state value."""
    current = self.state
    return current
|
||||
|
||||
def newaddr(self, addr, rtt, now):
    """Record a heartbeat and track a change of the peer address.

    Args:
        addr: address the heartbeat was received from.
        rtt: round-trip time sample, appended to ``self.rtts``.
        now: timestamp stored as ``self.lastbeat``.

    Returns:
        None when the address is unchanged, otherwise a
        "changed from ... to ..." description of the change.
    """
    self.lastbeat = now
    self.rtts.append(rtt)
    # Keep only the newest MAXRTTS samples.
    if len(self.rtts) > MAXRTTS:
        del self.rtts[0]

    if self.addr == addr:
        return None

    r = "changed from %s to %s" % (self.addr, addr)
    # The old address may never have been registered; pop() with a default
    # tolerates that without a blanket try/except.
    Connection.htab.pop(self.addr, None)
    self.addr = addr
    Connection.htab[addr] = self.host.name
    if self.host.isDynDns():
        # Queue the new address for the DNS update worker.
        Host.dnsQ.put((self.host.name, self.addr))
    return r
|
||||
|
||||
|
||||
#
|
||||
class Host:
    """A monitored host together with its per-address-family connections.

    Class attributes:
        hosts: registry of every named Host, keyed by host name.
        dnsQ: queue of ``(host name, address)`` tuples for DNS updates.
        hostfields_long / hostfields_short: column layouts for the HTML
            table; an entry is a field name or ``(field name, attributes)``.
    """

    # Table of Hosts
    hosts = {}
    dnsQ = queue.Queue()

    def __init__(self, name):
        """Create a host; a truthy ``name`` also registers it in ``hosts``."""
        global num
        self.name = name
        if name:
            num += 1
            Host.hosts[name] = self
        self.num = num
        self.dyn = False
        self.watched = False
        self.upcount = 0
        self.interval = 0
        self.doesack = -1  # -1 makes dispstats() report "N/A"
        self.cmds = []
        self.cver = 0
        self.connections = {}  # keyed by "IPv4" / "IPv6"
        # three [a, u] baselines subtracted from doesack / upcount in
        # dispstats(); presumably one per reporting period -- TODO confirm
        self.hdwcounts = [[0, 0], [0, 0], [0, 0]]

    def statedict(self):
        """Return the host's display fields, including both address families.

        Connection fields are stored under "<family>.<field>" keys; a missing
        family contributes the blank template from ``ubConnection``.
        """
        d = {}
        d["name"] = self.name
        if self.dyn:
            d["name"] += "*"
        if self.watched:
            # NOTE(review): the name is embedded in HTML unescaped.
            d["name"] = "<b>%s</b>" % d["name"]
        d["dyn"] = str(self.dyn)
        d["ver"] = str(self.cver)
        d["num"] = self.num
        for c in ["IPv4", "IPv6"]:
            if c in self.connections:
                cs = self.connections[c].statedict()
            else:
                cs = ubConnection.statedict(True)
            for csv in cs:
                d["%s.%s" % (c, csv)] = cs[csv]

        return d

    def headerdict(self):
        """Return the column titles matching the keys of ``statedict()``."""
        d = {}
        d["name"] = "Name"
        d["dyn"] = "Dyn"
        d["ver"] = "Ver"
        d["num"] = "??"
        for c in ["IPv4", "IPv6"]:
            cs = ubConnection.headerdict(c)
            for csv in cs:
                d["%s.%s" % (c, csv)] = cs[csv]
        return d

    def registerDns(self):
        """Call ``registerDns()`` on every connection of this host."""
        for af in self.connections:
            self.connections[af].registerDns()

    def stateinfo(self):
        """Return the host's attributes as a plain dict.

        ``connections`` becomes a list (IPv4 first, then IPv6) of per-connection
        attribute dicts whose ``host`` back-pointer is replaced by the host
        name. Other attributes are returned by reference.
        """
        ddict = {}
        for key, value in self.__dict__.items():
            if key != "connections":
                ddict[key] = value
                continue
            cl = []
            for c in ["IPv4", "IPv6"]:
                if c not in self.connections:
                    continue
                # Copy every field except the back-pointer; deep-copying
                # "host" would clone the whole Host just to discard it.
                cl.append(
                    {
                        k: (v.name if k == "host" else copy.deepcopy(v))
                        for k, v in self.connections[c].__dict__.items()
                    }
                )
            ddict[key] = cl
        return ddict

    def jsons(self):
        """Return ``stateinfo()`` serialized as a JSON string."""
        return json.dumps(self.stateinfo())

    def setcver(self, cver):
        """Store the client version reported by the host."""
        self.cver = cver

    def isDynDns(self):
        """Return the ``dyn`` flag."""
        return self.dyn

    def isIPv4(self, addr):
        """Return True when ``addr`` (a string or a tuple whose first item is
        one) contains a dot after its first character."""
        if isinstance(addr, tuple):
            return addr[0].find(".") > 0
        return addr.find(".") > 0

    def conndata(self, cid, addr, rtt, now):
        """Record a heartbeat from ``addr`` on the matching connection.

        An IPv4-mapped prefix ("::ffff:") is stripped first. The connection
        for the address family is created on first use.

        Returns:
            ``(connection, result of connection.newaddr(addr, rtt, now))``.
        """
        if addr[0:7] == "::ffff:":
            addr = addr[7:]
        afam = "IPv4" if self.isIPv4(addr) else "IPv6"

        if afam not in self.connections:
            self.connections[afam] = Connection(self, cid, addr, afam)

        conn = self.connections[afam]
        res = conn.newaddr(addr, rtt, now)
        return conn, res

    # called when reloading class from pickle, add new fields here
    def fixup(self):
        """Strip the IPv4-mapped prefix from stored connection addresses."""
        for c in ["IPv4", "IPv6"]:
            if c in self.connections:
                addr = self.connections[c].addr
                if addr[0:7] == "::ffff:":
                    self.connections[c].addr = addr[7:]

    def dispstats(self):
        """Return three HTML ``<td>`` cells derived from the ack counters."""
        if self.doesack != -1:
            if self.upcount > 0:
                r = ""
                for v in range(3):
                    a, u = self.hdwcounts[v]
                    if (self.upcount - u) != 0:
                        vs = "%0.0f" % (
                            100.0 - (((self.doesack - a) * 100.0) / (self.upcount - u))
                        )
                        if vs == "0":
                            vs = ""
                    else:
                        vs = "-"
                    r += '<td align="right">%s</td>' % vs
                return r
            return "<td>(%s)</td><td></td><td></td>" % (self.doesack)
        # Fixed: the closing tags used to read "</td<td></td>>".
        return '<td align="right">N/A</td><td></td><td></td>'

    hostfields_long = [
        "name",
        "IPv4.addr",
        "IPv4.state",
        ("IPv4.rtt", 'style="text-align: right;"'),
        ("IPv4.statetime", 'style="text-align: right;"'),
        "IPv6.addr",
        "IPv6.state",
        ("IPv6.rtt", 'style="text-align: right;"'),
        ("IPv6.statetime", 'style="text-align: right;"'),
        "ver",
    ]

    hostfields_short = [
        "name",
        ("IPv4.rttstate", 'style="text-align: right;"'),
        ("IPv4.deltastatetime", 'style="text-align: right;"'),
        ("IPv6.rttstate", 'style="text-align: right;"'),
        ("IPv6.deltastatetime", 'style="text-align: right;"'),
    ]

    def gene(self, tag, v, attrib=None):
        """Wrap ``v`` in an HTML element ``tag`` with optional attribute text."""
        a = " %s" % attrib if attrib else ""
        return "<%s%s>%s</%s>" % (tag, a, v, tag)

    def htmltable(self, tag, hd, short):
        """Render one ``<tr>`` whose cells use ``tag`` and take values from ``hd``.

        ``short`` selects ``hostfields_short`` instead of ``hostfields_long``.
        """
        hostfields = Host.hostfields_short if short else Host.hostfields_long
        h = []
        for f in hostfields:
            if isinstance(f, tuple):
                h.append(self.gene(tag, hd[f[0]], f[1]))
            else:
                h.append(self.gene(tag, hd[f]))
        return self.gene("tr", "\n".join(h))

    def buildhosttable(self, short=False):
        """Return the HTML lines of the host table, hosts sorted by name."""
        if DEBUG > 1:
            print("DBG buildhosttable: start")
        res = []
        res.append('<table id="ntable" class="sortable">')
        # htmltable()/headerdict() read no instance state, so ``self`` is
        # equivalent to the module-level ubHost used previously.
        res.append(self.htmltable("th", self.headerdict(), short))
        for h in sorted(Host.hosts):
            res.append(self.htmltable("td", Host.hosts[h].statedict(), short))
        res.append("</table>")
        if DEBUG > 1:
            print("DBG buildhosttable: %s" % res)
        return res

    def buildmsgtable(self, msgs):
        """Return HTML lines for the newest log messages.

        The number of messages shown is ``max(40 - number of hosts, 3)``.
        """
        res = []
        le = max(40 - len(Host.hosts), 3)
        res.append("<h4>Log of Events</h4>")
        # le is at least 3, so the negative slice start is never zero
        for m in msgs[-le:]:
            res.append("%s<BR>" % m)
        return res
|
||||
|
||||
|
||||
# Create placeholder ("unbound") instances so that methods which need no real
# instance state (header and table builders, blank state templates) can be
# called through them. Host(None) is not added to Host.hosts because its name
# is falsy. Original note: remove in Python 3.0.
ubHost = Host(None)
ubConnection = Connection(None, "", "", "")
|
||||
-213
@@ -1,213 +0,0 @@
|
||||
"""HTTP server implementation using aiohttp and jinja2."""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import time
|
||||
import urllib.parse
|
||||
import os
|
||||
import logging
|
||||
from aiohttp import web
|
||||
import jinja2
|
||||
|
||||
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _render_template(html_str: str, **context) -> str:
    """Render the Jinja2 template source ``html_str`` with ``context``."""
    return jinja2.Template(html_str).render(**context)
|
||||
|
||||
|
||||
async def start(
    host: str,
    port: int,
    config,
    hbdclass,
    msgs_getter,
    log=None,
    email=None,
    pushmsg=None,
    msg_to_websockets=None,
    tcss=None,
    DEBUG=0,
    verbose=False,
    get_now=None,
    VER="",
):
    """Start an aiohttp web server and block until cancelled.

    This function is intended to be awaited inside the main asyncio event loop.

    Args:
        host, port: address the TCP site binds to.
        config: mapping read with ``.get()`` for tz, templates_dir,
            http_extra_scripts, wss_port and ws_port.
        hbdclass: object exposing ``Host`` (with a ``hosts`` registry) and
            ``ubHost``.
        msgs_getter: callable returning the list of log messages.
        log: optional ``log(host, message)`` callable used by /d and /n.
        tcss: optional markup inserted into the index page's ``<head>``.
        verbose: print a start-up line when true.
        get_now: clock callable; defaults to ``time.time``.
        VER: version text shown on the index page.
        email, pushmsg, msg_to_websockets, DEBUG: accepted but not used by
            the handlers defined here.
    """
    get_now = get_now or (lambda: time.time())

    async def index(request):
        """Serve the classic status page: host table plus recent messages."""
        res = []
        res.append('<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">')
        res.append("<html>")
        res.append("<head>")
        res.append("<title>Heartbeat</title>")
        if tcss:
            res.append(tcss)
        res.append("</head>")
        res.append('<body BGCOLOR = "#FFFFFF" LINK = "#008000" VLINK = "#008000">')
        res.append(f"<H2>Heartbeat status {VER}</h2>")
        res += hbdclass.ubHost.buildhosttable()
        res += hbdclass.ubHost.buildmsgtable(msgs_getter())
        res.append(
            "<p> %s (%s)</p>"
            % (
                time.strftime("%H:%M:%S", time.localtime(get_now())),
                config.get("tz", "CET-1CDT"),
            )
        )
        res.append("</body></html>")
        body = "\n".join(res)
        return web.Response(text=body, content_type="text/html")

    async def api_hosts(request):
        """Return every host's ``stateinfo()`` as a JSON array."""
        # Serialize once; the previous dumps -> loads -> dumps round trip
        # produced the same JSON.
        hosts = hbdclass.Host.hosts
        return web.json_response([hosts[h].stateinfo() for h in hosts])

    async def api_messages(request):
        """Return the 30 newest log messages as JSON."""
        lst = msgs_getter()[-30:]
        return web.json_response(lst)

    async def cmd(request):
        """Queue a ``CMD`` entry for host ``h`` with command ``c``."""
        qa = request.rel_url.query
        uname = qa.get("h")
        ucmd = qa.get("c")
        if not ucmd or not uname:
            return web.Response(status=400, text="need h= and c= arguments")
        if uname not in hbdclass.Host.hosts:
            return web.Response(status=400, text=f"h={uname} not found")
        hbdclass.Host.hosts[uname].cmds.append(
            ("CMD", {"cmd": urllib.parse.unquote(ucmd)})
        )
        return web.Response(text=f"cmd {uname} queued")

    async def drop(request):
        """Remove host ``h`` from the registry."""
        qa = request.rel_url.query
        uname = qa.get("h")
        if not uname:
            return web.Response(status=400, text="need h= argument")
        if uname not in hbdclass.Host.hosts:
            return web.Response(status=400, text=f"h={uname} not found")
        if log:
            log(uname, "dropped")
        del hbdclass.Host.hosts[uname]
        return web.Response(text="Done")

    async def register(request):
        """Call ``registerDns()`` on host ``h`` and report its return value."""
        qa = request.rel_url.query
        uname = qa.get("h")
        if not uname:
            return web.Response(status=400, text="need h= argument")
        if uname not in hbdclass.Host.hosts:
            return web.Response(status=400, text=f"h={uname} not found")
        ll = hbdclass.Host.hosts[uname].registerDns()
        if log:
            log(uname, ll)
        return web.Response(text=str(ll))

    async def update(request):
        """Queue an ``UPD`` entry for host ``h``, or for all hosts with
        ``cver >= 2`` when ``h`` is "All"."""
        qa = request.rel_url.query
        uname = urllib.parse.unquote(qa.get("h", ""))
        ucode = qa.get("c")
        if not ucode or not uname:
            return web.Response(status=400, text="need h= and c= arguments")
        if uname != "All" and uname not in hbdclass.Host.hosts:
            return web.Response(status=400, text=f"h={uname} not found")
        if uname != "All":
            names = [uname]
        else:
            names = [n for n in hbdclass.Host.hosts if hbdclass.Host.hosts[n].cver >= 2]
        out = []
        for n in names:
            err = None
            try:
                r = {"csum": None, "code": ucode}
                hbdclass.Host.hosts[n].cmds.append(("UPD", r))
            except Exception as e:
                err = str(e)
            out.append(f"update started for {n}: {err if err else 'OK'}")
        return web.Response(text="\n".join(out))

    async def live(request):
        """Render templates/live.html with hosts, messages and websocket URL."""
        # Resolve templates directory relative to the hbd package
        pkg_dir = os.path.dirname(__file__)
        templates_dir = config.get("templates_dir", os.path.join(pkg_dir, "templates"))
        env = jinja2.Environment(loader=jinja2.FileSystemLoader(templates_dir))
        extra_scripts = config.get("http_extra_scripts", "")
        # The websocket host is whatever host the browser used for this
        # request, without the port. A bracketed IPv6 literal such as
        # "[::1]:50004" must keep its brackets; splitting on ":" would
        # reduce it to "[".
        netloc = request.host
        if netloc.startswith("["):
            ws_host = netloc.partition("]")[0] + "]"
        else:
            ws_host = netloc.split(":")[0]
        if config.get("wss_port"):
            heartbeat_ws_url = f"wss://{ws_host}:{config['wss_port']}/hbd"
        else:
            heartbeat_ws_url = f"ws://{ws_host}:{config.get('ws_port', 50005)}/hbd"
        tmpl = env.get_template("live.html")
        body = tmpl.render(
            title="Heartbeat",
            header="Heartbeat",
            request=request,
            heartbeat_ws_url=heartbeat_ws_url,
            extra_scripts=extra_scripts,
            hosts=[
                hbdclass.Host.hosts[h].stateinfo() for h in sorted(hbdclass.Host.hosts)
            ],
            messages=msgs_getter()[-30:],
        )
        return web.Response(text=body, content_type="text/html")

    async def static(request):
        """Serve files from the package static directory.

        URL form: /static/<path>
        """
        p = request.match_info.get("path", "")
        logger.debug("static file requested: %s", p)
        base = os.path.abspath(os.path.join(os.path.dirname(__file__), "static"))
        # normalize and prevent directory traversal
        target = os.path.abspath(os.path.normpath(os.path.join(base, p)))
        if not target.startswith(base + os.sep) and target != base:
            return web.Response(status=403, text="Forbidden")
        if not os.path.exists(target) or not os.path.isfile(target):
            return web.Response(status=404, text="Not Found")
        logger.info("serving static file: %s", target)
        return web.FileResponse(path=target)

    async def favicon(request):
        """Serve favicon.ico from the package static directory."""
        base = os.path.abspath(os.path.join(os.path.dirname(__file__), "static/images"))
        target = os.path.join(base, "favicon.ico")
        if not os.path.exists(target) or not os.path.isfile(target):
            return web.Response(status=404, text="Not Found")
        return web.FileResponse(path=target)

    app = web.Application()
    app.add_routes(
        [
            web.get("/", index),
            web.get("/api/0/hosts", api_hosts),
            web.get("/api/0/messages", api_messages),
            web.get("/c", cmd),
            web.get("/d", drop),
            web.get("/n", register),
            web.get("/u", update),
            web.get("/live", live),
            web.get("/static/{path:.*}", static),
            web.get("/favicon.ico", favicon),
        ]
    )

    runner = web.AppRunner(app)
    await runner.setup()
    site = web.TCPSite(runner, host, port)
    await site.start()

    if verbose:
        print(f"HTTP server started on {host}:{port}")

    try:
        # Never completes; waits here until the task is cancelled.
        await asyncio.Future()
    finally:
        await runner.cleanup()
|
||||
@@ -1,50 +0,0 @@
|
||||
"""monitor helper and thread for heartbeat daemon."""
|
||||
|
||||
from __future__ import annotations
|
||||
import asyncio
|
||||
import time
|
||||
|
||||
# Seconds (7 days) without a heartbeat after which an OVERDUE connection is
# moved to UNKNOWN by checkoverdue().
DROPOVERDUE = 7 * 24 * 3600
|
||||
|
||||
|
||||
def checkoverdue(
    config: dict,
    hbdclass,
    log: callable,
    pushmsg: callable,
    msg_to_websockets: callable,
):
    """Scan every host once and flag connections whose heartbeat is late.

    An UP connection silent for longer than the host interval plus the
    configured "grace" (default 10) becomes OVERDUE. An OVERDUE connection
    silent for longer than DROPOVERDUE becomes UNKNOWN. For each host with
    newly overdue connections the event is logged, sent to the websockets,
    and pushed if the host is listed in config["watchhosts"].
    """
    now = time.time()
    registry = hbdclass.Host.hosts
    for name in list(registry.keys()):
        newly_overdue = []
        for afam_key in registry[name].connections:
            conn = registry[name].connections[afam_key]
            if conn.state == hbdclass.Connection.DOWN:
                continue
            timeout = registry[name].interval + config.get("grace", 10)
            silent_for = now - conn.lastbeat
            if conn.state == hbdclass.Connection.UP and silent_for > timeout:
                conn.newstate(hbdclass.Connection.OVERDUE, now, config.get("grace", 10))
                newly_overdue.append(conn.afam)
            if (
                conn.state == hbdclass.Connection.OVERDUE
                and (now - conn.lastbeat) > DROPOVERDUE
            ):
                conn.newstate(hbdclass.Connection.UNKNOWN, conn.lastbeat)
        if newly_overdue == []:
            continue
        if name in config.get("watchhosts", []):
            pushmsg("%s %s overdue" % (name, " and ".join(newly_overdue)))
        log(name, "%s overdue" % " and ".join(newly_overdue))
        msg_to_websockets("host", registry[name].stateinfo())
|
||||
|
||||
|
||||
async def start(
    config: dict,
    hbdclass,
    log=None,
    pushmsg=None,
    msg_to_websockets=None,
    interval: float = 15,
):
    """Run the monitor loop forever, checking for overdue hosts.

    Sleeps ``interval`` seconds (default 15, the value that used to be
    hard-coded) and then calls ``checkoverdue`` with the given arguments.
    The loop ends only when the task is cancelled.
    """
    while True:
        await asyncio.sleep(interval)
        checkoverdue(config, hbdclass, log, pushmsg, msg_to_websockets)
|
||||
-202
@@ -1,202 +0,0 @@
|
||||
"""Notification helpers: email, pushover, mattermost, signal and dispatcher."""
|
||||
|
||||
import logging
|
||||
from typing import Optional
|
||||
import http.client
|
||||
import urllib.parse
|
||||
import subprocess
|
||||
import smtplib
|
||||
import time
|
||||
|
||||
# Accepted values for the "pushsrv" setting.
# NOTE(review): pushmsg() also handles "email", which is not listed here.
DEFAULT_PUSHPROVIDERS = ["all", "pushover", "mattermost", "signal"]

# module-level configuration set via setup()
_config = {}
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def setup(cfg: dict):
|
||||
"""Initialize notifier defaults from a configuration dict."""
|
||||
global _config
|
||||
_config = dict(cfg)
|
||||
|
||||
|
||||
def send_email(toaddrs, smtpserver, sender, subject, body, debug=0):
    """Send a pre-formatted message via SMTP. Returns True on success.

    Args:
        toaddrs: envelope recipient(s) passed to ``SMTP.sendmail``.
        smtpserver: SMTP server host name.
        sender: envelope sender address.
        subject: not used here; ``body`` is sent exactly as given.
        body: complete message text.
        debug: values above 0 enable smtplib's protocol trace.

    The port and optional credentials come from the module configuration
    ("smtpport", default 587; "smtpuser"; "smtppassword"). STARTTLS is used
    only on port 587.

    Returns:
        True if no step raised, False otherwise (the error is logged).
    """
    server = None
    try:
        smtpport = _config.get("smtpport", 587)
        server = smtplib.SMTP(smtpserver, smtpport)
        if debug > 0:
            server.set_debuglevel(1)
        if smtpport == 587:
            server.starttls()
            server.ehlo()
        smtpuser = _config.get("smtpuser", None)
        smtppassword = _config.get("smtppassword", None)
        if smtpuser and smtppassword:
            server.login(smtpuser, smtppassword)
        server.sendmail(sender, toaddrs, body)
    except Exception as e:
        logger.warning("email send failed: %s", e)
        return False
    finally:
        # ``server`` stays None when the connection itself failed; the old
        # code relied on a swallowed NameError for that case.
        if server is not None:
            try:
                server.quit()
            except Exception:
                # Best effort: a failed QUIT must not change the result.
                pass
    return True
|
||||
|
||||
|
||||
def email(subject: str, msg: str, debug: int = 0) -> bool:
    """Convenience wrapper exposed to the rest of the application.

    Uses the module-level configuration ("toemail", "fromemail",
    "smtpserver") to supply the recipient list, SMTP server and sender.

    Args:
        subject: text of the Subject header.
        msg: message body.
        debug: forwarded to ``send_email``.

    Returns:
        False if the configuration is incomplete, otherwise the result of
        ``send_email``.
    """
    toaddrs = _config.get("toemail")
    fromemail = _config.get("fromemail")
    smtpserver = _config.get("smtpserver")
    if not toaddrs or not fromemail or not smtpserver:
        logger.warning(
            "email config incomplete: toemail=%s, fromemail=%s, smtpserver=%s",
            toaddrs,
            fromemail,
            smtpserver,
        )
        return False
    # A single address given as a plain string would otherwise put only its
    # first character into the To header below.
    if isinstance(toaddrs, str):
        toaddrs = [toaddrs]
    date = time.strftime("%a, %d %b %Y %H:%M:%S %z", time.localtime())
    # Only the first recipient appears in the To header; all of them are
    # envelope recipients.
    body = "To: %s\nFrom: %s\nSubject: %s\nDate: %s\n\n%s" % (
        toaddrs[0],
        fromemail,
        subject,
        date,
        msg,
    )
    return send_email(toaddrs, smtpserver, fromemail, subject, body, debug=debug)
|
||||
|
||||
|
||||
def pushover(token: str, user: str, msg: str, debug: int = 0) -> bool:
    """Send message via Pushover API.

    Args:
        token: Pushover application token.
        user: Pushover user key.
        msg: message text.
        debug: accepted for a uniform signature; not used.

    Returns:
        True when the API answers with HTTP 200, False on any other status
        or on an error (which is logged).
    """
    conn = http.client.HTTPSConnection("api.pushover.net:443")
    try:
        conn.request(
            "POST",
            "/1/messages.json",
            urllib.parse.urlencode({"token": token, "user": user, "message": msg}),
            {"Content-type": "application/x-www-form-urlencoded"},
        )
        r = conn.getresponse()
        logger.debug("pushover response: %s %s", r.status, r.reason)
        return r.status == 200
    except Exception as e:
        logger.error("pushover error: %s", e)
        return False
    finally:
        # Release the socket; it used to be left open after every call.
        conn.close()
|
||||
|
||||
|
||||
def pushmattermost(
    host: str,
    token: str,
    channel: str,
    msg: str,
    username: str = "hbd",
    icon: Optional[str] = None,
    debug: int = 0,
) -> bool:
    """Send a message to Mattermost through an incoming webhook.

    Requires the optional ``mattermostdriver`` package; if it cannot be
    imported the failure is logged and False is returned.

    Args:
        host: Mattermost server host name.
        token: webhook identifier passed to ``call_webhook``.
        channel: target channel.
        msg: message text.
        username: display name for the post.
        icon: optional icon URL.
        debug: accepted for a uniform signature; not used.

    Returns:
        True when the webhook call returns None or an empty string, False
        otherwise or on error.
    """
    try:
        from mattermostdriver import Driver
    except Exception as e:
        # Previously silent, which made a missing dependency hard to spot.
        logger.warning("mattermost driver unavailable: %s", e)
        return False
    ses = {"url": host, "scheme": "http", "basepath": "/api/v4", "port": 8065}
    mm = Driver(ses)
    payload = {"text": msg, "channel": channel, "username": username}
    if icon:
        payload["icon_url"] = icon
    try:
        rc = mm.webhooks.call_webhook(token, payload)
        logger.debug("mattermost rc: %s", rc)
        return bool(rc is None or rc == "")
    except Exception as e:
        logger.error("mattermost error: %s", e)
        return False
|
||||
|
||||
|
||||
def pushsignal(
    signal_cli_bin: str, user: str, recipient: str, msg: str, debug: int = 0
) -> bool:
    """Send a message via signal-cli (requires local installation).

    Runs ``<signal_cli_bin> -u <user> send -m <msg> <recipient>`` as a
    subprocess (argument list, no shell).

    Returns:
        True if the command exited with status 0, False otherwise or when
        the command could not be run.
    """
    CLI = [signal_cli_bin, "-u", user, "send", "-m", msg, recipient]
    logger.debug("signal cli: %s", CLI)
    try:
        res = subprocess.run(CLI, capture_output=True)
        if res.returncode != 0:
            # Fixed: this used to read '"signal failed: %s".res...' (a "."
            # instead of ","), which raised AttributeError and hid stderr.
            logger.error("signal failed: %s", res.stderr.decode())
            return False
        logger.debug("signal sent: %s", res.stdout.decode())
        return True
    except Exception as e:
        logger.exception("signal exception: %s", e)
        return False
|
||||
|
||||
|
||||
def pushmsg(cfg: dict, msg: str, debug: int = 0):
    """Send ``msg`` through the provider(s) selected by ``cfg['pushsrv']``.

    ``cfg['pushsrv']`` (default 'pushover') is one of 'all', 'pushover',
    'mattermost', 'signal' or 'email'. Provider settings are read from:
    - cfg['pushover_token'], cfg['pushover_user']
    - cfg['matter_host'], cfg['matter_token'], cfg['matter_channel'],
      cfg['matter_username'], cfg['matter_icon']
    - cfg['signal_cli'], cfg['signal_user'], cfg['signal_recipient']

    Returns a dict mapping each provider that was tried to its result.
    """
    selected = cfg.get("pushsrv", "pushover")

    def _via_pushover():
        return pushover(
            cfg.get("pushover_token", ""),
            cfg.get("pushover_user", ""),
            msg,
            debug=debug,
        )

    def _via_mattermost():
        return pushmattermost(
            cfg.get("matter_host", ""),
            cfg.get("matter_token", ""),
            cfg.get("matter_channel", ""),
            msg,
            username=cfg.get("matter_username", "hbd"),
            icon=cfg.get("matter_icon"),
            debug=debug,
        )

    def _via_signal():
        return pushsignal(
            cfg.get("signal_cli", "/usr/local/bin/signal-cli"),
            cfg.get("signal_user", ""),
            cfg.get("signal_recipient", ""),
            msg,
            debug=debug,
        )

    def _via_email():
        return email("Heartbeat notification", msg, debug=debug)

    # Providers are tried in this fixed order.
    senders = (
        ("pushover", _via_pushover),
        ("mattermost", _via_mattermost),
        ("signal", _via_signal),
        ("email", _via_email),
    )
    results = {}
    for provider, send in senders:
        if selected in ("all", provider):
            results[provider] = send()
    logger.debug("push results: %s", results)
    return results
|
||||
|
||||
|
||||
def pushmsg_from_config(msg: str, debug: int = 0) -> dict:
    """Dispatch ``msg`` via ``pushmsg`` using the module-level configuration."""
    results = pushmsg(_config, msg, debug=debug)
    return results
|
||||
@@ -1,82 +0,0 @@
|
||||
"""Message encoding/decoding utilities for hbd protocol."""
|
||||
|
||||
from typing import Dict, Any
|
||||
import zlib
|
||||
|
||||
|
||||
def dicttos(ID: str, d: Dict[str, Any], compress: bool = False):
    """Encode ``d`` as a protocol message and return it as bytes.

    Fields are joined as ``key=value;...``; floats are written with five
    decimals. Uncompressed messages are ``ID:fields``. With ``compress`` the
    field string is zlib-compressed (level 6) and prefixed with ``!ID:``.
    """
    fields = [
        f"{key}={value:0.5f}" if isinstance(value, float) else f"{key}={value}"
        for key, value in d.items()
    ]
    payload = ";".join(fields)
    if not compress:
        return (ID + ":" + payload).encode()
    header = ("!" + ID + ":").encode()
    return header + zlib.compress(payload.encode(), 6)
|
||||
|
||||
|
||||
def stodict(msg: bytes):
    """Deserialize a protocol message into a dict.

    Accepted forms are ``ID:key=value;...`` and the compressed form
    ``!ID:<zlib payload>``. The result has the message ID under ``"ID"``
    plus one entry per field. A field without ``=`` maps to None. A value
    starting with a digit is parsed as a Python literal (e.g. int or
    float); if that fails it stays a string.

    Returns:
        The parsed dict, or ``{}`` for a malformed message.
    """
    # Local import: only this function needs it.
    import ast

    d = {}
    if msg[:1] == b"!":
        # message is: b'!ID:' + compressed_payload. The ID ends at the first
        # colon, so IDs of any length written by dicttos() are read back.
        try:
            sep = msg.index(b":", 1)
            d["ID"] = msg[1:sep].decode()
            pk = zlib.decompress(msg[sep + 1 :]).decode()
        except Exception:
            # malformed header or compressed payload
            return {}
    else:
        try:
            head, body = msg.split(b":", 1)
            pk = body.decode()
            d["ID"] = head.decode()
        except Exception:
            return {}
    if not pk:
        return d
    for item in pk.split(";"):
        if not item:
            continue
        key, has_value, raw = item.partition("=")
        key = key.strip()
        if not has_value:
            d[key] = None
            continue
        val = raw.strip()
        if val and val[0].isdigit():
            # Messages arrive from the network: literal_eval parses numbers
            # without executing code, unlike the eval() used before.
            try:
                d[key] = ast.literal_eval(val)
            except Exception:
                d[key] = val
        else:
            d[key] = val
    return d
|
||||
|
||||
|
||||
def oldmtodict(msg: bytes):
    """Parse an old-style message that carries no ID prefix.

    The payload is prefixed with ``HTB:`` and handed to ``stodict``, so the
    result has ``"ID"`` set to ``"HTB"``.
    """
    prefixed = b"HTB:" + msg
    return stodict(prefixed)
|
||||
-370
@@ -1,370 +0,0 @@
|
||||
"""Server runtime: starts UDP listener, HTTP server and websocket stubs."""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import socket
|
||||
import time
|
||||
import signal
|
||||
import sys
|
||||
import ssl
|
||||
from . import __version__
|
||||
|
||||
from . import udp
|
||||
from . import hbdclass
|
||||
|
||||
from . import ws as ws_mod
|
||||
|
||||
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Broadcast function of the websocket module; log() sends each line through it.
msg_to_websockets = ws_mod.broadcast

# File object that log() also writes to when set; None disables that.
logf = None
# Persisted by cleanup_function(); meaning not visible here -- TODO document.
lastfm = ["", "", ""]

# shared runtime collections and helpers
# Every line produced by log() is appended here.
msgs = []
|
||||
|
||||
|
||||
def initlog(logfile):
    """Open ``logfile`` for appending and return the file object.

    If the file cannot be opened, a notice is printed and ``sys.stderr`` is
    returned instead.
    """
    try:
        return open(logfile, "a+")
    except Exception as e:
        # Message typo fixed ("loffile"). sys is imported at module level,
        # so the local import was dropped.
        print("cannot open logfile %s, using STDERR: %s" % (logfile, e))
        return sys.stderr
|
||||
|
||||
|
||||
def log(host, m, service=None):
    """Record one timestamped event line.

    The line "<local time> <host> <m>" is appended to ``msgs``, logged at
    INFO level, written to ``logf`` when that is set, and broadcast to the
    websockets as a "message". ``service`` is accepted but not used.
    """
    stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
    line = f"{stamp} {host or ''} {m}"
    msgs.append(line)
    logger.info(line)
    if logf:
        try:
            logf.write(line + "\n")
            logf.flush()
        except Exception as e:
            logger.warning("failed to write to logfile: %s", e)
    msg_to_websockets("message", line)
|
||||
|
||||
|
||||
def cleanup_function(config):
    """This function will be executed upon program exit.

    Pickles the host registry, the message list and ``lastfm`` (in that
    order) into the file named by ``config["pickfile"]`` (default
    "hbd.pickle").
    """
    logger.info("Running cleanup function...")
    import pickle

    pickfile = config.get("pickfile", "hbd.pickle")

    # The context manager closes the file even if a dump raises.
    with open(pickfile, "wb") as pickf:
        pick = pickle.Pickler(pickf)
        pick.dump(hbdclass.Host.hosts)
        pick.dump(msgs)
        pick.dump(lastfm)

    logger.info("Cleanup complete.")
|
||||
|
||||
|
||||
async def _run_async(config):
    """Start all hbd services on the running loop and block until shutdown.

    Binds the UDP heartbeat socket, then starts the HTTP server, the DNS
    update worker, the WebSocket server and the monitor as background
    tasks.  A failure to start one of those four is logged and the others
    still start; a failure to bind the UDP socket propagates.  The coroutine
    returns once SIGINT/SIGTERM has been received and the started tasks have
    been cancelled (or the 2 second grace period has expired).

    Args:
        config: Mapping-like configuration; only ``.get()`` is used here.
    """
    loop = asyncio.get_running_loop()
    shutdown_event = asyncio.Event()

    # Task handles start as None so the shutdown path below can always
    # reference them, even when starting a service raised.
    http_task = None
    ws_task = None
    monitor_task = None
    dns_task = None

    # Signal handlers for graceful shutdown
    def signal_handler(signum, frame):
        sig_name = signal.Signals(signum).name if hasattr(signal, "Signals") else signum
        logger.info(f"Received {sig_name}, initiating shutdown...")
        loop.call_soon_threadsafe(shutdown_event.set)

    # Register signal handlers
    loop.add_signal_handler(signal.SIGINT, signal_handler, signal.SIGINT, None)
    loop.add_signal_handler(signal.SIGTERM, signal_handler, signal.SIGTERM, None)

    from . import http as http_mod
    from . import dns as dns_mod
    from . import notify as notify_mod
    from . import monitor as monitor_mod

    notify_mod.setup(config)

    pushmsg = notify_mod.pushmsg_from_config

    sock = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM)
    # Disable IPV6_V6ONLY option to enable dual-stack (listen on IPv4 as well)
    # This option is system-dependent; on many systems, setting it to False enables
    # the socket to handle both IPv4 and IPv6 traffic.
    try:
        sock.setsockopt(socket.IPPROTO_IPV6, socket.IPV6_V6ONLY, False)
    except OSError as e:
        logger.warning(
            f"Warning: Could not reset IPV6_V6ONLY not supported or dual-stack is unavailable. Error: {e}"
        )

    # Bind to all interfaces (::) on the configured heartbeat port.
    # UDP server endpoint (handler wired to handle_datagram with context)
    bind_addr = ("::", config.get("hb_port", 50003))
    sock.bind(bind_addr)
    logger.info("Starting UDP server on %s:%s", *bind_addr)

    def udp_handler(msg, addr, transport):
        # The context is rebuilt per datagram so current config values are used.
        ctx = dict(
            config=config,
            hbdclass=hbdclass,
            log=log,
            pushmsg=pushmsg,
            msg_to_websockets=msg_to_websockets,
            DEBUG=config.get("debug", 0),
            verbose=config.get("verbose", False),
        )
        udp.handle_datagram(msg, addr, transport, ctx)

    transport, protocol = await loop.create_datagram_endpoint(
        lambda: udp.EchoServerProtocol(config=config, handler=udp_handler),
        sock=sock,
    )

    # HTTP server (asyncio-based via aiohttp)
    try:
        http_task = asyncio.create_task(
            http_mod.start(
                host=config.get("hbd_host", ""),
                port=config.get("hbd_port", 50004),
                config=config,
                hbdclass=hbdclass,
                msgs_getter=lambda: msgs,
                log=log,
                pushmsg=pushmsg,
                msg_to_websockets=msg_to_websockets,
                tcss=None,
                DEBUG=config.get("debug", 0),
                verbose=config.get("verbose", False),
                get_now=lambda: time.time(),
                VER="",
            )
        )
        logger.info(
            "HTTP server started on %s:%s",
            config.get("hbd_host", ""),
            config.get("hbd_port", 50004),
        )
    except Exception as e:
        logger.exception("failed to start HTTP server: %s", e)

    # start dns update worker (async)
    try:
        dns_task = dns_mod.start_dns_worker(
            hbdclass, config, log=log, pushmsg=pushmsg, loop=loop
        )
        logger.info("dns update worker started")
    except Exception as e:
        logger.exception("dns worker failed to start: %s", e)

    # Start the websocket servers as a background task
    if config.get("wss_port", None):
        ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
        ssl_path = config.get("cert_path", "")
        wss_pem = ssl_path + config.get("wss_pem", "")
        wss_key = ssl_path + config.get("wss_key", "")
        try:
            ssl_context.load_cert_chain(wss_pem, keyfile=wss_key)
        except FileNotFoundError:
            logger.error("error: missing SSL keys %s or %s", wss_pem, wss_key)
            sys.exit(1)
        logger.info(
            "Starting secure WebSocket server on port %s with cert %s",
            config.get("wss_port", None),
            wss_pem,
        )
    else:
        ssl_context = None

    try:
        ws_task = asyncio.create_task(
            ws_mod.start(
                host=config.get("hbd_host", ""),
                ws_port=config.get("ws_port", None),
                wss_port=config.get("wss_port", None),
                ssl_context=ssl_context,
                get_hosts=lambda: [
                    hbdclass.Host.hosts[h].stateinfo()
                    for h in sorted(hbdclass.Host.hosts)
                ],
                get_msgs=lambda: msgs,
                verbose=config.get("verbose", False),
            )
        )
        logger.info("WebSocket task started")
    except Exception as e:
        logger.exception("websocket server failed to start: %s", e)

    # Start the monitor as a background task
    try:
        monitor_task = asyncio.create_task(
            monitor_mod.start(
                config=config,
                hbdclass=hbdclass,
                log=log,
                pushmsg=pushmsg,
                msg_to_websockets=msg_to_websockets,
            )
        )
        logger.info("Monitor task started")
    except Exception as e:
        logger.exception("monitor task failed to start: %s", e)

    try:
        # run forever until shutdown event is set
        await shutdown_event.wait()
        logger.info("Shutdown signal received, stopping services...")
    except Exception as e:
        logger.exception("Error in main loop: %s", e)
    finally:
        # Cancel all running tasks
        logger.info("Cancelling tasks...")
        try:
            transport.close()
        except Exception as e:
            logger.warning("Error closing UDP transport: %s", e)

        tasks_to_cancel = [http_task, ws_task, monitor_task]
        for task in tasks_to_cancel:
            if task:
                try:
                    task.cancel()
                    logger.debug("Cancelled task: %s", task)
                except Exception as e:
                    logger.warning("Error cancelling task: %s", e)

        # Wait for tasks to finish cancellation with timeout
        remaining_tasks = [t for t in tasks_to_cancel if t]
        if remaining_tasks:
            try:
                await asyncio.wait_for(
                    asyncio.gather(*remaining_tasks, return_exceptions=True),
                    timeout=2.0,
                )
            except asyncio.TimeoutError:
                logger.warning("Timeout waiting for tasks to cancel")
            except Exception as e:
                logger.debug("Exception during task cancellation: %s", e)

        # Signal DNS worker to exit and await it
        try:
            if dns_task:
                try:
                    # None on the queue asks the worker to exit; best effort only.
                    hbdclass.Host.dnsQ.put(None)
                except Exception:
                    pass
                try:
                    await asyncio.wait_for(dns_task, timeout=2.0)
                    logger.info("DNS worker finished")
                except asyncio.TimeoutError:
                    logger.warning("Timeout waiting for DNS worker to finish")
                    dns_task.cancel()
                except asyncio.CancelledError:
                    logger.info("DNS worker was cancelled")
                except Exception as e:
                    logger.warning("Error awaiting DNS worker: %s", e)
                finally:
                    # Clear queue bridge to release any held references
                    hbdclass.Host.dnsQ = None
        except Exception as e:
            logger.warning("Error stopping DNS worker: %s", e)

        logger.info("All tasks cancelled")
|
||||
|
||||
|
||||
def load_pickled_hosts(config, hbdclass):
    """Load pickled hosts from file, if available.

    Reads three consecutive pickles (hosts, msgs, lastfm) from the configured
    ``pickfile``.  The hosts are stored on ``hbdclass.Host.hosts``; ``msgs``
    and ``lastfm`` are module globals.  A missing third pickle falls back to
    ``["", "", ""]``.  If loading fails the error is logged and the pickle
    file is deleted.  Afterwards each host's ``dyn``/``watched`` flags are
    refreshed from the config and hosts listed in ``drophosts`` are removed.

    Args:
        config: Mapping-like configuration (``.get()`` is used).
        hbdclass: Object exposing ``Host.hosts`` and ``Connection.htab``.
    """
    global lastfm, msgs
    import os
    import pickle

    pickfile = config.get("pickfile", "hbd.pickle")
    dyndnshosts = config.get("dyndnshosts", [])
    watchhosts = config.get("watchhosts", [])
    drophosts = config.get("drophosts", [])
    verbose = config.get("verbose", False)

    if not os.path.exists(pickfile):
        if verbose:
            logger.info("no pickled data")
        return

    if verbose:
        logger.info("opening pickls %s", pickfile)

    load_failed = False
    # NOTE(review): unpickling executes arbitrary code; the pickfile must only
    # ever be written by this daemon and live in a trusted location.
    with open(pickfile, "rb") as pickf:
        pick = pickle.Unpickler(pickf)
        try:
            hbdclass.Host.hosts = pick.load()
            msgs = pick.load()
            try:
                lastfm = pick.load()
            except Exception:
                # Pickle files without a third object get a blank lastfm.
                lastfm = ["", "", ""]
        except Exception as e:
            logger.exception("load pickled failed: %s", e)
            load_failed = True
    if load_failed:
        # Remove the unreadable file only after the handle has been closed.
        os.unlink(pickfile)

    hbdclass.Connection.htab = {}
    for h in list(hbdclass.Host.hosts.keys()):
        host = hbdclass.Host.hosts[h]
        host.dyn = h in dyndnshosts
        host.watched = h in watchhosts
        host.fixup()
    for h in drophosts:
        if h in hbdclass.Host.hosts:
            del hbdclass.Host.hosts[h]
    if verbose:
        logger.info("%s pickled hosts loaded", len(hbdclass.Host.hosts))
|
||||
|
||||
|
||||
def run(config, config_path=None):
    """Start the hbd service (blocking).

    Manually manages the event loop to ensure clean shutdown.  After the
    loop has finished, state is persisted via ``cleanup_function``, the log
    file is closed, leftover tasks are cancelled and the process is ended
    with ``os._exit(0)``; this function therefore never returns.

    Args:
        config: Mapping-like configuration for the daemon.
        config_path: Path of the file ``config`` was loaded from.  The CLI
            entry point passes it for reload support; it is accepted here so
            that call does not fail, and is not otherwise used by this
            function.
    """
    global logf
    import os

    logging.basicConfig(
        level=logging.DEBUG if config.get("debug", 0) > 0 else logging.INFO
    )
    load_pickled_hosts(config, hbdclass)

    logf = initlog(logfile=config.get("logfile", "messages.log"))
    log(None, f"hbd version {__version__} starting up")

    # Create and set the event loop manually
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    try:
        loop.run_until_complete(_run_async(config))
    except KeyboardInterrupt:
        logger.info("Received KeyboardInterrupt, shutting down...")
    except Exception as e:
        logger.exception("Unhandled exception in main: %s", e)
    finally:
        cleanup_function(config)
        logger.info("hbd shutdown complete")
        if logf and logf != sys.stderr:
            try:
                logf.close()
            except Exception:
                pass
        # Explicitly close the loop
        try:
            # Cancel all remaining tasks
            pending = asyncio.all_tasks(loop)
            for task in pending:
                task.cancel()
            # Run one more cycle to process cancellations
            if pending:
                loop.run_until_complete(
                    asyncio.gather(*pending, return_exceptions=True)
                )
        except Exception:
            pass
        finally:
            loop.close()

    # Exit
    os._exit(0)
|
||||
@@ -0,0 +1,3 @@
|
||||
"""HeartBeat Daemon (hbd) - Server/daemon component."""
|
||||
|
||||
from hbd import __version__
|
||||
@@ -0,0 +1,104 @@
|
||||
"""Command line interface for hbd package."""
|
||||
|
||||
import argparse
|
||||
import getpass
|
||||
import sys
|
||||
|
||||
from .config import load_config
|
||||
from .main import run as run_server
|
||||
|
||||
PUSHSRVS = ["all", "pushover", "mattermost"]


def _add_server_options(target, inherit=False):
    """Add the server flags (-c/-f/-v/-p/-x) to *target*.

    With ``inherit=True`` every default is ``argparse.SUPPRESS`` so that a
    sub-parser only sets an attribute when the flag was actually given and
    does not overwrite a value parsed at the top level.
    """
    extra = {"default": argparse.SUPPRESS} if inherit else {}
    target.add_argument("-c", "--config", dest="configfile",
                        help="Config file path (YAML)", **extra)
    target.add_argument("-f", "--foreground", action="store_true",
                        help="Run in foreground", **extra)
    target.add_argument("-v", "--verbose", action="store_true",
                        help="Verbose output", **extra)
    target.add_argument("-p", "--pushsrv", dest="pushsrv", choices=PUSHSRVS,
                        help="Push service to use", **extra)
    target.add_argument("-x", "--debug", action="count",
                        default=argparse.SUPPRESS if inherit else 0,
                        help="Increase debug level")


def build_parser():
    """Build the ``hbd`` argument parser.

    Supports ``hbd serve [flags]``, the legacy ``hbd [flags]`` form and
    ``hbd passwd [username]``.  The parsed namespace always carries
    ``command``, ``configfile``, ``foreground``, ``verbose``, ``pushsrv`` and
    ``debug``.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(
        prog="hbd",
        description="HeartBeatDaemon - Wait for heartbeat messages and act on them (or their absence)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    subparsers = parser.add_subparsers(dest="command")

    # --- serve (default) ---
    serve_p = subparsers.add_parser("serve", help="Start the hbd server (default)")
    # argparse copies every attribute of the sub-parser namespace onto the
    # main namespace, so ordinary defaults here would reset flags given
    # before the sub-command (e.g. ``hbd -v serve``).
    _add_server_options(serve_p, inherit=True)

    # Legacy top-level flags (no subcommand) — kept for backward compatibility
    _add_server_options(parser)

    # --- passwd ---
    passwd_p = subparsers.add_parser(
        "passwd",
        help="Generate a password hash for use in the config file",
    )
    passwd_p.add_argument(
        "username",
        nargs="?",
        help="Username (informational only, for display)",
    )

    return parser
|
||||
|
||||
|
||||
def cmd_passwd(args):
    """Prompt for a password twice and print its hash for the config file.

    Keeps asking until a non-empty password has been entered and confirmed;
    complaints go to stderr, the resulting hash goes to stdout.

    Args:
        args: Parsed CLI namespace; only ``args.username`` is read.
    """
    from .users import hash_password

    username = args.username or ""
    if username:
        prompt = f"New password for {username}: "
    else:
        prompt = "New password: "

    pw = None
    while pw is None:
        candidate = getpass.getpass(prompt)
        if not candidate:
            print("Password must not be empty.", file=sys.stderr)
            continue
        confirmation = getpass.getpass("Confirm password: ")
        if candidate != confirmation:
            print("Passwords do not match, try again.", file=sys.stderr)
            continue
        pw = candidate

    hashed = hash_password(pw)
    heading = (
        f"\nAdd the following to your config under users: -> {username}:"
        if username
        else "\nPassword hash (paste into config file under the user's 'password' key):"
    )
    print(heading)
    print(f"  password: {hashed}")
|
||||
|
||||
|
||||
def main(argv=None):
    """CLI entry point: dispatch to ``passwd`` or start the server.

    Args:
        argv: Argument list to parse; ``None`` lets argparse use
            ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)

    if args.command == "passwd":
        cmd_passwd(args)
        return

    # Default: run the server (supports both `hbd serve ...` and `hbd ...`)
    config = load_config(args.configfile)

    # Apply CLI overrides; a flag that was not given leaves the file value alone.
    for switch in ("foreground", "verbose"):
        if getattr(args, switch):
            config[switch] = True
    if args.pushsrv:
        config["pushsrv"] = args.pushsrv
    if args.debug > 0:
        config["debug"] = args.debug

    # Pass config_path for reloading support
    run_server(config, config_path=args.configfile)


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,393 @@
|
||||
"""Configuration loader and defaults for hbd (HeartBeat Daemon/Server)."""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
|
||||
try:
    import yaml
except Exception:
    # PyYAML is optional; without it only the built-in defaults are used.
    yaml = None

SERVER_DEFAULTS = {
    # Network settings
    "hb_port": 50003,  # Port to listen for heartbeats
    "hbd_port": 50004,  # HTTP API port
    "hbd_host": "",  # Bind address (empty = all interfaces)

    # Persistence
    "pickfile": "/tmp/hb.pick",

    # Logging
    "logfile": "/var/log/heartbeat.log",
    "logfmt": "text",  # text or msg or json

    # Notification channels
    "notification_channels": {},  # Named channels with type and credentials
    "default_notification_channels": [],  # Default channels if host doesn't specify

    # Monitoring settings
    "interval": 20,  # Expected heartbeat interval (for server checks)
    "grace": 2,  # Grace multiplier (interval * grace = timeout)
    "threshold_renotify_interval": 3600,  # Seconds between threshold re-notifications

    # User management
    "users": {},  # username -> {full_name, avatar, password, admin, notification_channels}
    "default_owner": None,  # Username that owns hosts with no explicit owner

    # Host management
    "hosts": {},  # New unified host definitions (optional)
    "watchhosts": [],  # Hosts to monitor and notify about (legacy)
    "dyndnshosts": [],  # Hosts with dynamic DNS (legacy)
    "drophosts": [],  # Hosts to ignore
    "dyndomains": ["wrede.org"],

    # DNS updates
    "nsupdate_bin": "/usr/bin/nsupdate",

    # WebSocket settings
    "ws_port": 50005,
    "wss_port": None,
    "cert_path": "/usr/local/etc/ssl/",
    "wss_pem": "fullchain.pem",
    "wss_key": "privkey.pem",

    # Message journal configuration
    "journal_enabled": True,
    "journal_dir": "/var/log/heartbeat",
    "journal_file": "messages.journal",
    "journal_max_size": 100 * 1024 * 1024,  # 100MB
    "journal_max_backups": 10,

    # Runtime flags
    "foreground": False,
    "verbose": False,
    "debug": 0,

    # Plugin/threshold configs (for clients reporting to this server)
    "plugins": {},
    "thresholds": {},
}


def load_config(path=None):
    """Load configuration from a YAML file and merge with server defaults.

    If YAML is not available or the file does not exist, defaults are
    returned.  Top-level keys from the file replace the corresponding
    defaults; unknown keys are kept.  An empty file is treated as "no
    overrides".

    Args:
        path: Path to YAML config file (default: ~/.hb.yaml)

    Returns:
        Dictionary with configuration.  It is an independent deep copy, so
        mutating it never changes ``SERVER_DEFAULTS``.

    Raises:
        ValueError: If the file's top-level YAML value is not a mapping.
    """
    import copy

    # Deep copy: the defaults contain dicts and lists which a shallow copy
    # would share between every returned config and the defaults themselves.
    cfg = copy.deepcopy(SERVER_DEFAULTS)
    if not path:
        # default path (~/.hb.yaml)
        path = os.path.join(os.path.expanduser("~"), ".hb.yaml")

    if not os.path.exists(path):
        return cfg
    if not yaml:
        # yaml not installed: do not attempt to parse; user must ensure defaults
        return cfg

    with open(path) as fh:
        data = yaml.safe_load(fh)

    if data is None:
        # safe_load returns None for an empty document.
        return cfg
    if not isinstance(data, dict):
        raise ValueError(
            f"Config file {path} must contain a mapping at top level, "
            f"got {type(data).__name__}"
        )

    # Merge YAML data with defaults
    # Keep all keys from YAML to support plugin configs and future extensions
    cfg.update(data)
    return cfg
|
||||
|
||||
|
||||
class ReloadableConfig:
    """Configuration wrapper that supports runtime reloading.

    This class wraps the configuration dictionary and provides:
    - Reloading from the config file via :meth:`reload`
    - Backward-compatible, read-only dict-like access
    - An asyncio lock so that concurrent ``reload()`` coroutines are
      serialized

    Note: the lock is an ``asyncio.Lock``; it coordinates coroutines on one
    event loop and does not make the object safe to use from other threads.
    """

    def __init__(self, initial_config, config_path=None):
        """Initialize with initial configuration.

        Args:
            initial_config: Initial configuration dictionary
            config_path: Path to config file for reloading (optional)
        """
        self._config = initial_config
        self._config_path = config_path
        self._lock = asyncio.Lock()
        self._logger = logging.getLogger(__name__)

    async def reload(self, config_path=None):
        """Reload configuration from file.

        The stored configuration is replaced only after the file has been
        loaded successfully; on failure the previous configuration stays in
        place.

        Args:
            config_path: Path to config file (uses stored path if not provided)

        Returns:
            New configuration dictionary

        Raises:
            ValueError: If neither ``config_path`` nor a stored path is set.
            Exception: Whatever loading the file raised (logged, then re-raised).
        """
        path = config_path or self._config_path
        if not path:
            raise ValueError("No config path specified for reload")

        async with self._lock:
            try:
                new_config = load_config(path)
                # Single assignment: readers see either the old or the new dict.
                self._config = new_config
                self._logger.info(f"Configuration reloaded from {path}")
                return new_config
            except Exception as e:
                self._logger.error(f"Failed to reload config from {path}: {e}", exc_info=True)
                # Keep existing config on error
                raise

    def get(self, key, default=None):
        """Get a config value (dict-compatible)."""
        return self._config.get(key, default)

    def __getitem__(self, key):
        """Get a config value via subscript (dict-compatible)."""
        return self._config[key]

    def __contains__(self, key):
        """Check if key exists (dict-compatible)."""
        return key in self._config

    def keys(self):
        """Return config keys (dict-compatible)."""
        return self._config.keys()

    def items(self):
        """Return config items (dict-compatible)."""
        return self._config.items()

    def values(self):
        """Return config values (dict-compatible)."""
        return self._config.values()

    @property
    def config(self):
        """Get the underlying config dict (for components that need full dict)."""
        return self._config
|
||||
|
||||
|
||||
def get_watchhosts(config):
    """Extract watchhosts from config, supporting both new and legacy formats.

    Hosts come from the ``hosts`` section (entries whose attribute dict has a
    truthy ``watch``) followed by the legacy ``watchhosts`` value, which may
    be a list, a set, or a dict keyed by host name.

    Args:
        config: Configuration dictionary

    Returns:
        List of hostnames to watch, without duplicates, in first-seen order.
    """
    watchhosts = []

    # New format: hosts section with watch attribute
    if "hosts" in config:
        hosts_config = config["hosts"]
        if isinstance(hosts_config, dict):
            watchhosts.extend(
                host_name
                for host_name, host_attrs in hosts_config.items()
                if isinstance(host_attrs, dict) and host_attrs.get("watch", False)
            )

    # Legacy format: watchhosts list
    if "watchhosts" in config:
        legacy_watchhosts = config.get("watchhosts", [])
        if isinstance(legacy_watchhosts, (list, set)):
            watchhosts.extend(legacy_watchhosts)
        elif isinstance(legacy_watchhosts, dict):
            # Old dict format: {"host1": {attrs}, "host2": {attrs}}
            watchhosts.extend(legacy_watchhosts.keys())

    # dict.fromkeys drops duplicates but keeps a stable order, unlike set().
    return list(dict.fromkeys(watchhosts))
|
||||
|
||||
|
||||
def get_dyndnshosts(config):
    """Extract dyndnshosts from config, supporting both new and legacy formats.

    Hosts come from the ``hosts`` section (entries whose attribute dict has a
    truthy ``dyndns``) followed by the legacy ``dyndnshosts`` list or set.

    Args:
        config: Configuration dictionary

    Returns:
        List of hostnames with dynamic DNS, without duplicates, in
        first-seen order.
    """
    dyndnshosts = []

    # New format: hosts section with dyndns attribute
    if "hosts" in config:
        hosts_config = config["hosts"]
        if isinstance(hosts_config, dict):
            dyndnshosts.extend(
                host_name
                for host_name, host_attrs in hosts_config.items()
                if isinstance(host_attrs, dict) and host_attrs.get("dyndns", False)
            )

    # Legacy format: dyndnshosts list/set
    if "dyndnshosts" in config:
        legacy_dyndnshosts = config.get("dyndnshosts", [])
        if isinstance(legacy_dyndnshosts, (list, set)):
            dyndnshosts.extend(legacy_dyndnshosts)

    # dict.fromkeys drops duplicates but keeps a stable order, unlike set().
    return list(dict.fromkeys(dyndnshosts))
|
||||
|
||||
|
||||
def get_host_config(config, hostname):
    """Look up the attribute dict configured for one host.

    The ``hosts`` section is consulted first: if it lists the host, that
    entry is the answer (or ``{}`` when the entry is not a dict).  Otherwise
    a dict-style legacy ``watchhosts`` entry is converted to the new layout.

    Args:
        config: Configuration dictionary
        hostname: Host name

    Returns:
        Dictionary with host attributes or empty dict
    """
    if "hosts" in config:
        declared = config.get("hosts", {})
        if isinstance(declared, dict) and hostname in declared:
            entry = declared[hostname]
            if isinstance(entry, dict):
                return entry
            return {}

    # Check legacy watchhosts for notification settings
    if "watchhosts" in config:
        legacy = config.get("watchhosts", {})
        if isinstance(legacy, dict) and hostname in legacy:
            old_attrs = legacy[hostname]
            if isinstance(old_attrs, dict):
                # Convert legacy format to new format
                converted = {"watch": True}
                converted["notify"] = old_attrs.get("notify")
                converted["notify_src"] = old_attrs.get("src")
                return converted

    return {}
|
||||
|
||||
|
||||
def get_notification_channels_for_host(config, hostname):
    """Return the names of the notification channels to use for a host.

    The host's own ``notification_channels`` wins; otherwise the global
    ``default_notification_channels`` is used.  In both places a single
    string is accepted as shorthand for a one-element list.

    Args:
        config: Configuration dictionary
        hostname: Host name

    Returns:
        List of channel names to use for this host
    """

    def _usable(value):
        # Only a non-empty str or list counts; anything else means "not set".
        if not value:
            return None
        if isinstance(value, str):
            return [value]
        if isinstance(value, list):
            return value
        return None

    # Check if host specifies notification channels
    host_level = _usable(
        get_host_config(config, hostname).get("notification_channels", [])
    )
    if host_level is not None:
        return host_level

    # Fall back to default channels
    global_level = _usable(config.get("default_notification_channels", []))
    if global_level is not None:
        return global_level

    # No channels configured, return empty list (will use legacy global config)
    return []
|
||||
|
||||
|
||||
def get_channel_config(config, channel_name):
    """Fetch the settings of one named notification channel.

    Args:
        config: Configuration dictionary
        channel_name: Name of the notification channel

    Returns:
        Dictionary with channel configuration or None if not found
    """
    registry = config.get("notification_channels", {})
    if not isinstance(registry, dict):
        return None
    if channel_name not in registry:
        return None
    return registry[channel_name]
|
||||
|
||||
|
||||
def get_notification_channels_config(config, hostname):
    """Resolve a host's channel names to their channel configurations.

    Channels that are not defined, or whose configuration has no truthy
    ``type``, are left out.

    Args:
        config: Configuration dictionary
        hostname: Host name

    Returns:
        List of (channel_name, channel_config) tuples
    """
    resolved = (
        (name, get_channel_config(config, name))
        for name in get_notification_channels_for_host(config, hostname)
    )
    return [
        (name, settings)
        for name, settings in resolved
        if settings and settings.get("type")
    ]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# User / host-access helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def get_default_owner(config) -> str | None:
|
||||
"""Return the configured default_owner username, or the first admin user, or None."""
|
||||
explicit = config.get("default_owner")
|
||||
if explicit:
|
||||
return explicit
|
||||
# Fall back to first admin user found in config
|
||||
users_cfg = config.get("users", {})
|
||||
if isinstance(users_cfg, dict):
|
||||
for username, attrs in users_cfg.items():
|
||||
if isinstance(attrs, dict) and attrs.get("admin", False):
|
||||
return username
|
||||
return None
|
||||
|
||||
|
||||
def get_host_access(config, hostname) -> dict:
    """Return the access dict for *hostname*: owner, managers, monitors.

    Falls back to default_owner for hosts without an explicit owner.
    ``managers`` and ``monitors`` may each be given as a single name, a list
    of names, or left empty/null in the config.

    Returns:
        {
            "owner": str | None,
            "managers": list[str],
            "monitors": list[str],
        }
    """

    def _as_name_list(value):
        # A key that is present but empty in YAML loads as None; treat it
        # like a missing key instead of failing in list(None).
        if value is None:
            return []
        if isinstance(value, str):
            return [value]
        return list(value)

    host_cfg = get_host_config(config, hostname)

    owner = host_cfg.get("owner") or get_default_owner(config)

    return {
        "owner": owner,
        "managers": _as_name_list(host_cfg.get("managers", [])),
        "monitors": _as_name_list(host_cfg.get("monitors", [])),
    }
|
||||
@@ -0,0 +1,12 @@
|
||||
# In-memory list of recent messages for new websocket clients; the same
# messages are also logged to file via notify.eventlog.
msgs = []


class Data:
    """Small key/value store that also keeps a reference to the config."""

    def __init__(self, config):
        """Remember *config* and start with an empty data dict."""
        self.data = {}
        self.config = config

    def update(self, new_data):
        """Merge *new_data* into the stored data, overwriting existing keys."""
        self.data |= new_data

    def get(self, key, default=None):
        """Return the stored value for *key*, or *default* if it is absent."""
        try:
            return self.data[key]
        except KeyError:
            return default
|
||||
@@ -136,16 +136,7 @@ async def dns_update_worker(
|
||||
)
|
||||
if err:
|
||||
m += f", DNS update failed: {err}"
|
||||
if pushmsg:
|
||||
try:
|
||||
await loop.run_in_executor(
|
||||
None,
|
||||
pushmsg,
|
||||
"error: nsupdate failed",
|
||||
f"{name}.dy.{dyndomain}: {m}",
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
logger.error("DNS update failed for %s: %s", name, err)
|
||||
else:
|
||||
m += ", DNS updated."
|
||||
|
||||
@@ -171,7 +162,6 @@ def start_dns_worker(
|
||||
hbdclass,
|
||||
cfg: dict,
|
||||
log: Optional[callable] = None,
|
||||
pushmsg: Optional[callable] = None,
|
||||
loop: Optional[asyncio.AbstractEventLoop] = None,
|
||||
):
|
||||
"""Start the async DNS worker and return the Task.
|
||||
@@ -218,7 +208,7 @@ def start_dns_worker(
|
||||
|
||||
task = loop.create_task(
|
||||
dns_update_worker(
|
||||
hbdclass, cfg, async_queue=async_q, log=log, pushmsg=pushmsg, loop=loop
|
||||
hbdclass, cfg, async_queue=async_q, log=log, loop=loop
|
||||
)
|
||||
)
|
||||
return task
|
||||
@@ -0,0 +1,629 @@
|
||||
"""
|
||||
host and connection class shared between hbd and
|
||||
the website's heartbeat.py
|
||||
|
||||
"""
|
||||
|
||||
import time
|
||||
import json
|
||||
import copy
|
||||
import queue
|
||||
|
||||
num = 0
|
||||
|
||||
MAXRTTS = 10
|
||||
|
||||
DEBUG = 2
|
||||
|
||||
|
||||
def log(host, m):
    """Print a debug line for *host* when the module-level DEBUG is truthy."""
    if not DEBUG:
        return
    print("class log: %s %s" % (host, m))
|
||||
|
||||
|
||||
class Connection:
|
||||
# map of addrs to names
|
||||
|
||||
htab = {}
|
||||
UNKNOWN = "unknown"
|
||||
UP = "up"
|
||||
DOWN = "down"
|
||||
OVERDUE = "overdue"
|
||||
|
||||
def __init__(self, host, cid, addr, afam):
|
||||
self.host = host
|
||||
self.cid = cid
|
||||
if addr[0:7] == "::ffff:":
|
||||
addr = addr[7:]
|
||||
self.addr = addr
|
||||
self.afam = afam
|
||||
self.rtts = [0]
|
||||
self.lastbeat = time.time()
|
||||
self.statetime = self.lastbeat
|
||||
self.deltastatetime = "computed"
|
||||
self.state = Connection.UNKNOWN
|
||||
|
||||
# Timer-based reachability monitoring
|
||||
self.overdue_timer = None
|
||||
self.overdue_callback = None
|
||||
self.timeout_duration = None
|
||||
|
||||
if host:
|
||||
Connection.htab[addr] = self.host.name
|
||||
if self.host.isDynDns():
|
||||
log(self.host.name, "dns update %s" % self.addr)
|
||||
Host.dnsQ.put((self.host.name, self.addr))
|
||||
|
||||
def __getstate__(self):
|
||||
"""Prepare Connection for pickling by excluding non-serializable timer objects."""
|
||||
state = self.__dict__.copy()
|
||||
# Remove asyncio timer objects that can't be pickled
|
||||
# These will be recreated when the next HTB arrives after unpickling
|
||||
state['overdue_timer'] = None
|
||||
state['overdue_callback'] = None
|
||||
state['timeout_duration'] = None
|
||||
return state
|
||||
|
||||
def __setstate__(self, state):
|
||||
"""Restore Connection from pickle, reinitializing timer fields."""
|
||||
self.__dict__.update(state)
|
||||
# Ensure timer fields are initialized (they'll be recreated when HTB arrives)
|
||||
if not hasattr(self, 'overdue_timer'):
|
||||
self.overdue_timer = None
|
||||
if not hasattr(self, 'overdue_callback'):
|
||||
self.overdue_callback = None
|
||||
if not hasattr(self, 'timeout_duration'):
|
||||
self.timeout_duration = None
|
||||
|
||||
def registerDns(self):
    """Queue this connection's (host name, address) pair on ``Host.dnsQ``."""
    dns_request = (self.host.name, self.addr)
    Host.dnsQ.put(dns_request)
|
||||
|
||||
def clearstate(self):
|
||||
d = {}
|
||||
d["addr"] = ""
|
||||
d["rtt"] = ""
|
||||
d["lastbeat"] = ""
|
||||
d["state"] = ""
|
||||
d["statetime"] = ""
|
||||
d["deltastatetime"] = ""
|
||||
d["rttstate"] = ""
|
||||
return d
|
||||
|
||||
def statedict(self, Null=False):
|
||||
d = self.clearstate()
|
||||
now = time.time()
|
||||
if not Null:
|
||||
d["addr"] = self.addr
|
||||
if self.rtts[-1]:
|
||||
d["rtt"] = "%0.1f" % self.rtts[-1]
|
||||
elif self.state == Connection.UNKNOWN:
|
||||
d["rtt"] = ""
|
||||
else:
|
||||
d["rtt"] = "?"
|
||||
d["lastbeat"] = self.lastbeat
|
||||
if self.state == Connection.OVERDUE:
|
||||
d["state"] = "<b>%s</b>" % self.state
|
||||
else:
|
||||
d["state"] = self.state
|
||||
if self.state == Connection.UP:
|
||||
d["rttstate"] = d["rtt"]
|
||||
elif self.state == Connection.OVERDUE:
|
||||
d["rttstate"] = ""
|
||||
else:
|
||||
d["rttstate"] = d["state"]
|
||||
d["statetime"] = time.strftime(
|
||||
"%Y-%m-%d %H:%M:%S", time.localtime(self.statetime)
|
||||
)
|
||||
delta = now - self.statetime
|
||||
|
||||
if self.state == Connection.UNKNOWN:
|
||||
d["deltastatetime"] = ""
|
||||
elif delta > 86400:
|
||||
# d['deltastatetime'] = time.strftime("%d %H:%M:%S", time.gmtime(delta))
|
||||
d["deltastatetime"] = "%0.1f days" % (delta / 86400.0)
|
||||
elif delta > 3600:
|
||||
# d['deltastatetime'] = time.strftime("%H:%M:%S", time.gmtime(delta))
|
||||
d["deltastatetime"] = time.strftime("%k:%M hrs", time.gmtime(delta))
|
||||
# d['deltastatetime'] = "%0.1f hrs" % (delta / 3600.)
|
||||
elif delta > 60:
|
||||
# d['deltastatetime'] = time.strftime("%M:%S", time.gmtime(delta))
|
||||
d["deltastatetime"] = time.strftime("%M:%S mins", time.gmtime(delta))
|
||||
# d['deltastatetime'] = "%0.1f mins" % (delta / 60.)
|
||||
else:
|
||||
# d['deltastatetime'] = time.strftime("%S", time.gmtime(delta))
|
||||
d["deltastatetime"] = "%i secs" % (delta)
|
||||
if self.state == Connection.UNKNOWN and now - self.lastbeat > 86400 * 10:
|
||||
d = self.clearstate()
|
||||
|
||||
return d
|
||||
|
||||
def headerdict(self, afam):
|
||||
d = {}
|
||||
d["addr"] = "%s Addr" % afam
|
||||
d["rtt"] = "Latencey"
|
||||
d["lastbeat"] = "Last Contact"
|
||||
d["state"] = "State"
|
||||
d["statetime"] = "Last State"
|
||||
d["rttstate"] = "Reach"
|
||||
d["deltastatetime"] = "Last State"
|
||||
return d
|
||||
|
||||
def jsons(self):
|
||||
"""Serialize connection to JSON, excluding non-serializable timer objects."""
|
||||
data = {}
|
||||
for key, value in self.__dict__.items():
|
||||
# Skip timer-related fields that can't be serialized
|
||||
if key in ['overdue_timer', 'overdue_callback', 'timeout_duration']:
|
||||
continue
|
||||
# Handle host backpointer by converting to name
|
||||
if key == 'host':
|
||||
data[key] = value.name if value else None
|
||||
else:
|
||||
data[key] = value
|
||||
return json.dumps(data)
|
||||
|
||||
# set new state, return number of secs in previous state
|
||||
def newstate(self, state, now, when=0):
    """Switch to *state* and return the seconds spent in the previous one.

    Args:
        state: New state value to store.
        now: Current timestamp.
        when: Offset subtracted from *now* to obtain the time the state began.

    Returns:
        Difference between the new state time and the previous state time.
    """
    previous_started = self.statetime
    started = now - when
    self.state = state
    self.statetime = started
    return started - previous_started
|
||||
|
||||
def getstate(self):
    """Return the connection's current state value."""
    current = self.state
    return current
|
||||
|
||||
def newaddr(self, addr, rtt, now):
    """Record a heartbeat from *addr* and track address changes.

    Args:
        addr: Address the heartbeat came from.
        rtt: Round-trip time sample, or None when no sample is available.
        now: Timestamp of the heartbeat.

    Returns:
        None when the address is unchanged, otherwise a
        "changed from X to Y" description.
    """
    self.lastbeat = now
    if rtt is not None:
        self.rtts.append(rtt)
        # Keep only the most recent MAXRTTS samples.
        if len(self.rtts) > MAXRTTS:
            del self.rtts[0]

    if self.addr == addr:
        return None

    r = "changed from %s to %s" % (self.addr, addr)
    # Drop the old address from the lookup table; a missing entry is fine.
    # pop() with a default replaces the former blanket `except Exception: pass`.
    Connection.htab.pop(self.addr, None)
    self.addr = addr
    Connection.htab[addr] = self.host.name
    if self.host.isDynDns():
        # Queue the new address for the DNS updater.
        Host.dnsQ.put((self.host.name, self.addr))
    return r
|
||||
|
||||
def reset_overdue_timer(self, timeout_seconds, callback):
    """Re-arm the overdue timer for this connection.

    A pending timer is cancelled first. A new timer is then scheduled on
    the running event loop; when it fires, ``callback(self)`` is awaited
    in a new task.

    Args:
        timeout_seconds: Seconds before the callback is triggered.
        callback: Async function called with this connection on expiry.

    When no event loop is running, the parameters are stored but no timer
    is scheduled.
    """
    import asyncio

    # Cancel existing timer if any
    previous = self.overdue_timer
    if previous and not previous.cancelled():
        previous.cancel()

    # Store parameters for later reference
    self.timeout_duration = timeout_seconds
    self.overdue_callback = callback

    try:
        # get_running_loop() raises RuntimeError when no loop is running.
        # get_event_loop() is deprecated for that case and could attach the
        # timer to a loop that is never run.
        loop = asyncio.get_running_loop()
    except RuntimeError:
        # No event loop running yet
        return

    async def timer_expired():
        await callback(self)

    self.overdue_timer = loop.call_later(
        timeout_seconds, lambda: loop.create_task(timer_expired())
    )
|
||||
|
||||
def cancel_overdue_timer(self):
    """Stop the overdue timer, if one is set, and forget all timer state."""
    timer = self.overdue_timer
    if timer:
        try:
            already_cancelled = timer.cancelled()
            if not already_cancelled:
                timer.cancel()
        except Exception:
            # Best effort: a failing cancel must not prevent the cleanup below.
            pass
    # Reset every timer-related attribute.
    self.overdue_timer = self.overdue_callback = self.timeout_duration = None
|
||||
|
||||
def get_avg_rtt(self):
    """Return the mean of the positive RTT samples, or 0 when there are none."""
    total = 0
    count = 0
    for sample in self.rtts:
        if sample > 0:
            total += sample
            count += 1
    if not count:
        return 0
    return total / count
|
||||
|
||||
def get_current_rtt(self):
    """Return the newest RTT sample, or 0 when no samples are stored."""
    samples = self.rtts
    if not samples:
        return 0
    return samples[-1]
|
||||
|
||||
def check_rtt_threshold(self, warning_threshold=None, critical_threshold=None):
    """Classify the most recent RTT against optional thresholds.

    Args:
        warning_threshold: RTT above which the level is 'WARNING'.
        critical_threshold: RTT above which the level is 'CRITICAL'.

    Returns:
        Tuple ``(level, rtt)`` where level is None, 'WARNING' or 'CRITICAL'.
        A non-positive RTT always yields level None.
    """
    rtt = self.get_current_rtt()
    level = None
    if rtt > 0:
        # A falsy threshold (None or 0) disables that level.
        if critical_threshold and rtt > critical_threshold:
            level = 'CRITICAL'
        elif warning_threshold and rtt > warning_threshold:
            level = 'WARNING'
    return (level, rtt)
|
||||
|
||||
|
||||
#
|
||||
class Host:
    """A monitored host together with its per-address-family connections.

    Class attributes:
        hosts: Registry of all named hosts, keyed by host name.
        dnsQ: Queue of ``(host name, address)`` pairs awaiting a DNS update.
        hostfields_long / hostfields_short: Column layouts for the HTML table;
            an entry is either a field name or ``(field name, td attributes)``.
    """

    # Table of Hosts
    hosts = {}
    dnsQ = queue.Queue()

    def __init__(self, name):
        """Create a host; a truthy *name* also registers it in ``Host.hosts``."""
        global num
        self.name = name
        if name:
            num += 1
            Host.hosts[name] = self
        self.num = num
        self.dyn = False
        self.watched = False
        self.upcount = 0
        self.interval = 0
        self.doesack = -1
        self.cmds = []
        self.connections = {}
        # Plugin data storage: {plugin_name: [(timestamp, data), ...]}
        self.plugin_data = {}
        self.plugin_retention = 100  # Keep last N samples per plugin
        # Alert state tracking: {metric_path: AlertState}
        self.alert_states = {}
        # User access control
        self.owner: str | None = None  # username of owner
        self.managers: list = []  # usernames with manager role
        self.monitors: list = []  # usernames with monitor role

    def _alert_counts(self):
        """Count WARNING/CRITICAL alert states, split by acknowledged status.

        Returns:
            Dict with the four ``alert_<level>_<acked|unacked>`` counters.
        """
        counts = {
            "alert_warning_unacked": 0,
            "alert_warning_acked": 0,
            "alert_critical_unacked": 0,
            "alert_critical_acked": 0,
        }
        # Objects restored from an old pickle may lack alert_states until fixup().
        states = getattr(self, "alert_states", None)
        if not states:
            return counts

        # Imported here to avoid circular imports
        from .threshold import AlertLevel

        for alert_state in states.values():
            if alert_state.level == AlertLevel.WARNING:
                prefix = "alert_warning"
            elif alert_state.level == AlertLevel.CRITICAL:
                prefix = "alert_critical"
            else:
                continue
            suffix = "_acked" if alert_state.acknowledged else "_unacked"
            counts[prefix + suffix] += 1
        return counts

    @staticmethod
    def _connection_info(conn):
        """Copy a connection's attributes into a dict without timer objects."""
        info = {}
        for key, value in conn.__dict__.items():
            # Skip timer-related fields that can't be serialized
            if key in ('overdue_timer', 'overdue_callback', 'timeout_duration'):
                continue
            if key == 'host':
                # Replace the host backpointer by the host's name
                info[key] = value.name if value else None
                continue
            try:
                info[key] = copy.deepcopy(value)
            except Exception:
                # If deepcopy fails, fall back to sharing the value
                info[key] = value
        return info

    def statedict(self):
        """Return the flat display dict used to render this host's table row.

        Connection fields are prefixed with their address family, e.g.
        ``"IPv4.addr"``. A missing family is filled from the placeholder
        connection.
        """
        label = self.name
        if self.dyn:
            label += "*"
        if self.watched:
            # NOTE(review): the name is inserted into HTML unescaped; confirm
            # host names cannot contain markup.
            label = "<b>%s</b>" % label
        d = {"name": label, "dyn": str(self.dyn), "num": self.num}

        # Add alert counts (split by acknowledged status)
        d.update(self._alert_counts())

        for c in ("IPv4", "IPv6"):
            if c in self.connections:
                cs = self.connections[c].statedict()
            else:
                cs = ubConnection.statedict(True)
            for field, value in cs.items():
                d["%s.%s" % (c, field)] = value
        return d

    def headerdict(self):
        """Return the heading labels matching the keys of ``statedict()``."""
        d = {"name": "Name", "dyn": "Dyn", "num": "??"}
        for c in ("IPv4", "IPv6"):
            for field, heading in ubConnection.headerdict(c).items():
                d["%s.%s" % (c, field)] = heading
        return d

    def registerDns(self):
        """Call ``registerDns()`` on every connection of this host.

        Returns None; the per-connection results are not collected.
        """
        for conn in self.connections.values():
            conn.registerDns()

    def stateinfo(self):
        """Return a dict snapshot of this host intended for serialization.

        ``alert_states`` and ``plugin_data`` are omitted, connections become
        a list of plain dicts, and alert counters plus access lists are added.
        """
        ddict = {}
        for key, value in self.__dict__.items():
            if key in ("alert_states", "plugin_data"):
                continue
            if key == "connections":
                ddict[key] = [
                    self._connection_info(self.connections[c])
                    for c in ("IPv4", "IPv6")
                    if c in self.connections
                ]
            else:
                ddict[key] = value

        # Add alert counts (computed from alert_states)
        ddict.update(self._alert_counts())

        # User access
        ddict["owner"] = getattr(self, "owner", None)
        ddict["managers"] = list(getattr(self, "managers", []))
        ddict["monitors"] = list(getattr(self, "monitors", []))
        return ddict

    def jsons(self):
        """Return ``stateinfo()`` encoded as a JSON string."""
        return json.dumps(self.stateinfo())

    def isDynDns(self):
        """Return the host's ``dyn`` flag."""
        return self.dyn

    def isIPv4(self, addr):
        """Return True when *addr* (a string, or a tuple whose first item is
        the address string) contains a dot after its first character."""
        text = addr[0] if isinstance(addr, tuple) else addr
        return text.find(".") > 0

    def conndata(self, cid, addr, rtt, now):
        """Record a heartbeat and return ``(connection, change message)``.

        An IPv4-mapped IPv6 prefix (``::ffff:``) is stripped from *addr*, and
        the connection for the resulting address family is created on demand.
        """
        if addr[0:7] == "::ffff:":
            addr = addr[7:]
        afam = "IPv4" if self.isIPv4(addr) else "IPv6"

        if afam not in self.connections:
            self.connections[afam] = Connection(self, cid, addr, afam)

        conn = self.connections[afam]
        res = conn.newaddr(addr, rtt, now)
        return conn, res

    # called when reloading class from pickle, add new fields here
    def fixup(self):
        """Normalize stored addresses and add attributes missing on objects
        restored from older pickles."""
        for c in ("IPv4", "IPv6"):
            if c in self.connections:
                addr = self.connections[c].addr
                if addr[0:7] == "::ffff:":
                    self.connections[c].addr = addr[7:]

        # Defaults for fields added after the object was pickled
        # (plugin data, alert states, user access fields).
        defaults = (
            ("plugin_data", dict),
            ("plugin_retention", lambda: 100),
            ("alert_states", dict),
            ("owner", lambda: None),
            ("managers", list),
            ("monitors", list),
        )
        for attr, factory in defaults:
            if not hasattr(self, attr):
                setattr(self, attr, factory())

    def add_plugin_data(self, plugin_name, data, timestamp=None):
        """Store plugin data with timestamp.

        Args:
            plugin_name: Name of the plugin (e.g., "cpu_monitor")
            data: Dict of plugin data
            timestamp: Optional timestamp (default: current time)
        """
        if timestamp is None:
            timestamp = time.time()

        samples = self.plugin_data.setdefault(plugin_name, [])
        samples.append((timestamp, data))

        # Enforce retention limit (keep last N samples)
        if len(samples) > self.plugin_retention:
            self.plugin_data[plugin_name] = samples[-self.plugin_retention:]

    def get_plugin_data(self, plugin_name, limit=None):
        """Retrieve plugin data for a specific plugin.

        Args:
            plugin_name: Name of the plugin
            limit: Optional limit on number of recent samples to return;
                None, 0 or a negative value means "no limit"

        Returns:
            List of (timestamp, data) tuples, most recent last
        """
        data = self.plugin_data.get(plugin_name, [])
        # A negative limit used to reach data[-limit:], which dropped the
        # oldest samples instead of limiting the result.
        if limit is not None and 0 < limit < len(data):
            return data[-limit:]
        return data

    def get_latest_plugin_data(self, plugin_name):
        """Get the most recent plugin data for a plugin.

        Returns:
            (timestamp, data) tuple or None if no data
        """
        data = self.plugin_data.get(plugin_name, [])
        return data[-1] if data else None

    def get_all_plugin_data(self):
        """Get all plugin data for this host.

        Returns:
            Dict of {plugin_name: [(timestamp, data), ...]} (the live
            internal dict, not a copy)
        """
        return self.plugin_data

    # ------------------------------------------------------------------
    # User-role helpers
    # ------------------------------------------------------------------

    def apply_access(self, owner, managers, monitors):
        """Set owner/managers/monitors on this host (called from config load)."""
        self.owner = owner
        self.managers = list(managers)
        self.monitors = list(monitors)

    def is_owner(self, username: str) -> bool:
        """Return True when *username* is this host's owner."""
        return self.owner == username

    def is_manager(self, username: str) -> bool:
        """Return True for listed managers and for the owner."""
        return username in self.managers or self.is_owner(username)

    def is_monitor(self, username: str) -> bool:
        """Return True for listed monitors, managers and the owner."""
        return username in self.monitors or self.is_manager(username)

    def access_dict(self) -> dict:
        """Return owner/managers/monitors as a dict with copied lists."""
        return {
            "owner": self.owner,
            "managers": list(self.managers),
            "monitors": list(self.monitors),
        }

    hostfields_long = [
        "name",
        "IPv4.addr",
        "IPv4.state",
        ("IPv4.rtt", 'style="text-align: right;"'),
        ("IPv4.statetime", 'style="text-align: right;"'),
        "IPv6.addr",
        "IPv6.state",
        ("IPv6.rtt", 'style="text-align: right;"'),
        ("IPv6.statetime", 'style="text-align: right;"'),
    ]

    hostfields_short = [
        "name",
        ("IPv4.rttstate", 'style="text-align: right;"'),
        ("IPv4.deltastatetime", 'style="text-align: right;"'),
        ("IPv6.rttstate", 'style="text-align: right;"'),
        ("IPv6.deltastatetime", 'style="text-align: right;"'),
    ]

    def gene(self, tag, v, attrib=None):
        """Return ``<tag attrib>v</tag>``; *v* is inserted without escaping."""
        a = " %s" % attrib if attrib else ""
        return "<%s%s>%s</%s>" % (tag, a, v, tag)

    def htmltable(self, tag, hd, short):
        """Render one ``<tr>`` whose cells use *tag* and take values from *hd*.

        Args:
            tag: Cell element name ("th" or "td").
            hd: Dict keyed by field name (see ``statedict``/``headerdict``).
            short: Select the short column layout instead of the long one.
        """
        hostfields = Host.hostfields_short if short else Host.hostfields_long
        cells = []
        for f in hostfields:
            if isinstance(f, tuple):
                cells.append(self.gene(tag, hd[f[0]], f[1]))
            else:
                cells.append(self.gene(tag, hd[f]))
        return self.gene("tr", "\n".join(cells))

    def buildhosttable(self, short=False):
        """Return the HTML lines of the table of all hosts, sorted by name."""
        if DEBUG > 1:
            print("DBG buildhosttable: start")
        res = ['<table id="ntable" class="sortable">']
        res.append(ubHost.htmltable("th", ubHost.headerdict(), short))
        for h in sorted(Host.hosts):
            res.append(ubHost.htmltable("td", Host.hosts[h].statedict(), short))
        res.append("</table>")
        if DEBUG > 1:
            print("DBG buildhosttable: %s" % res)
        return res

    def buildmsgtable(self, msgs):
        """Return HTML lines for the tail of the event log.

        The number of messages shown shrinks as the host count grows, but
        never drops below three.
        """
        le = max(40 - len(Host.hosts), 3)
        res = ["<h4>Log of Events</h4>"]
        res.extend("%s<BR>" % m for m in msgs[-le:])
        return res
|
||||
|
||||
|
||||
# Placeholder instances: used to call methods that need no per-host state (e.g. headerdict, htmltable) without a real host or connection.
|
||||
ubHost = Host(None)
|
||||
ubConnection = Connection(None, "", "", "")
|
||||
@@ -0,0 +1,860 @@
|
||||
"""HTTP server implementation using aiohttp and jinja2."""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import time
|
||||
import urllib.parse
|
||||
import os
|
||||
import logging
|
||||
from aiohttp import web
|
||||
import jinja2
|
||||
from . import data
|
||||
from . import notify as notify_mod
|
||||
from . import settings as settings_mod
|
||||
from . import users as users_mod
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
eventlog = notify_mod.eventlog
|
||||
|
||||
def _render_template(html_str: str, **context) -> str:
    """Render the Jinja2 template source *html_str* with *context* as variables."""
    return jinja2.Template(html_str).render(**context)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Auth helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
SESSION_COOKIE = "hbd_session"
|
||||
|
||||
|
||||
def _get_token(request) -> str:
|
||||
"""Extract session token from Bearer header, X-Auth-Token header, or cookie."""
|
||||
auth = request.headers.get("Authorization", "")
|
||||
if auth.lower().startswith("bearer "):
|
||||
return auth[7:].strip()
|
||||
header_token = request.headers.get("X-Auth-Token", "").strip()
|
||||
if header_token:
|
||||
return header_token
|
||||
return request.cookies.get(SESSION_COOKIE, "")
|
||||
|
||||
|
||||
def _current_user(request):
    """Look up the user owning the request's session token.

    Returns None when user management is disabled (all access allowed);
    otherwise returns whatever the session lookup yields for the token.
    """
    if users_mod.users_enabled():
        return users_mod.get_session_user(_get_token(request))
    return None  # unauthenticated mode — all access allowed
|
||||
|
||||
|
||||
def _require_auth(request):
    """Authenticate an API request.

    Returns:
        ``(user, None)`` on success, ``(None, response)`` with a 401 JSON
        response when the session is invalid, or ``(None, None)`` when user
        management is disabled.
    """
    if not users_mod.users_enabled():
        return None, None
    token = _get_token(request)
    user = users_mod.get_session_user(token)
    if user is not None:
        return user, None
    denied = web.json_response({"error": "Unauthorized"}, status=401)
    return None, denied
|
||||
|
||||
|
||||
def _require_auth_redirect(request):
    """Authenticate a browser request, redirecting to /login on failure.

    Returns:
        ``(user, None)``, or ``(None, None)`` when user management is disabled.

    Raises:
        web.HTTPFound: Redirect to ``/login`` when no valid session exists.
    """
    if not users_mod.users_enabled():
        return None, None
    token = _get_token(request)
    user = users_mod.get_session_user(token)
    if user is None:
        raise web.HTTPFound("/login")
    return user, None
|
||||
|
||||
|
||||
def _can_view_host(user, host) -> bool:
|
||||
"""Return True if *user* may see *host* (monitor or higher, or no auth)."""
|
||||
if user is None:
|
||||
return True
|
||||
if user.admin:
|
||||
return True
|
||||
return host.is_monitor(user.username)
|
||||
|
||||
|
||||
def _can_operate_host(user, host) -> bool:
|
||||
"""Manager-level: queue commands, DNS, upgrade."""
|
||||
if user is None:
|
||||
return True
|
||||
if user.admin:
|
||||
return True
|
||||
return host.is_manager(user.username)
|
||||
|
||||
|
||||
def _can_own_host(user, host) -> bool:
|
||||
"""Owner-level: drop host, transfer ownership."""
|
||||
if user is None:
|
||||
return True
|
||||
if user.admin:
|
||||
return True
|
||||
return host.is_owner(user.username)
|
||||
|
||||
|
||||
async def start(
|
||||
host: str,
|
||||
port: int,
|
||||
config,
|
||||
hbdclass,
|
||||
tcss=None,
|
||||
verbose=False,
|
||||
get_now=None,
|
||||
VER="",
|
||||
threshold_checker=None,
|
||||
):
|
||||
"""Start an aiohttp web server and block until cancelled.
|
||||
|
||||
This function is intended to be awaited inside the main asyncio event loop.
|
||||
"""
|
||||
get_now = get_now or (lambda: time.time())
|
||||
|
||||
async def old_index(request):
    """Render the legacy plain-HTML status page (host table plus event log)."""
    _require_auth_redirect(request)
    lines = [
        '<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">',
        "<html>",
        "<head>",
        "<title>Heartbeat</title>",
    ]
    if tcss:
        lines.append(tcss)
    lines += [
        "</head>",
        '<body BGCOLOR = "#FFFFFF" LINK = "#008000" VLINK = "#008000">',
        f"<H2>Heartbeat status {VER}</h2>",
    ]
    lines.extend(hbdclass.ubHost.buildhosttable())
    lines.extend(hbdclass.ubHost.buildmsgtable(data.msgs))
    # Footer: current local time and the configured timezone label.
    clock = time.strftime("%H:%M:%S", time.localtime(get_now()))
    zone = config.get("tz", "CET-1CDT")
    lines.append("<p> %s (%s)</p>" % (clock, zone))
    lines.append("</body></html>")
    return web.Response(text="\n".join(lines), content_type="text/html")
|
||||
|
||||
async def api_hosts(request):
    """Return the state of every host the caller may view as a JSON list."""
    user, err = _require_auth(request)
    if err:
        return err
    visible = [
        host
        for host in hbdclass.Host.hosts.values()
        if _can_view_host(user, host)
    ]
    # Host.jsons() is json.dumps(stateinfo()); passing stateinfo() directly
    # avoids dumping each host, joining the strings and parsing them again.
    return web.json_response([host.stateinfo() for host in visible])
|
||||
|
||||
async def api_messages(request):
    """Return the 30 most recent event-log messages as a JSON list.

    Requires a valid session when user management is enabled, like the
    other API endpoints. Previously the log was served without any check.
    """
    _user, err = _require_auth(request)
    if err:
        return err
    return web.json_response(data.msgs[-30:])
|
||||
|
||||
async def cmd(request):
    """Queue a command for a host.

    Query arguments: ``h`` (host name) and ``c`` (URL-quoted command).
    Needs manager-level access to the host.
    """
    user, err = _require_auth(request)
    if err:
        return err
    query = request.rel_url.query
    target = query.get("h")
    command = query.get("c")
    if not (command and target):
        return web.Response(status=400, text="need h= and c= arguments")
    known_hosts = hbdclass.Host.hosts
    if target not in known_hosts:
        return web.Response(status=400, text=f"h={target} not found")
    host = known_hosts[target]
    if not _can_operate_host(user, host):
        return web.json_response({"error": "Forbidden"}, status=403)
    entry = ("CMD", {"cmd": urllib.parse.unquote(command)})
    host.cmds.append(entry)
    return web.Response(text=f"cmd {target} queued")
|
||||
|
||||
async def drop(request):
    """Remove a host from the registry.

    Query argument: ``h`` (host name). Needs owner-level access.
    """
    user, err = _require_auth(request)
    if err:
        return err
    target = request.rel_url.query.get("h")
    if not target:
        return web.Response(status=400, text="need h= argument")
    known_hosts = hbdclass.Host.hosts
    if target not in known_hosts:
        return web.Response(status=400, text=f"h={target} not found")
    if not _can_own_host(user, known_hosts[target]):
        return web.json_response({"error": "Forbidden"}, status=403)
    eventlog(target, "INFO", "dropped")
    del known_hosts[target]
    return web.Response(text="Done")
|
||||
|
||||
async def register(request):
    """Trigger DNS registration for a host.

    Query argument: ``h`` (host name). Needs manager-level access. The
    value returned by ``registerDns()`` is logged and sent back as text.
    """
    user, err = _require_auth(request)
    if err:
        return err
    target = request.rel_url.query.get("h")
    if not target:
        return web.Response(status=400, text="need h= argument")
    known_hosts = hbdclass.Host.hosts
    if target not in known_hosts:
        return web.Response(status=400, text=f"h={target} not found")
    host = known_hosts[target]
    if not _can_operate_host(user, host):
        return web.json_response({"error": "Forbidden"}, status=403)
    outcome = host.registerDns()
    eventlog(target, "INFO", outcome)
    return web.Response(text=str(outcome))
|
||||
|
||||
async def update(request):
    """Queue a code update for one host, or for all hosts when ``h=All``.

    Query arguments: ``h`` (URL-quoted host name or "All") and ``c`` (code).
    Hosts the caller may not operate are reported as skipped.
    """
    user, err = _require_auth(request)
    if err:
        return err
    query = request.rel_url.query
    target = urllib.parse.unquote(query.get("h", ""))
    code = query.get("c")
    if not code or not target:
        return web.Response(status=400, text="need h= and c= arguments")
    everyone = target == "All"
    if not everyone and target not in hbdclass.Host.hosts:
        return web.Response(status=400, text=f"h={target} not found")
    names = list(hbdclass.Host.hosts) if everyone else [target]
    report = []
    for name in names:
        host = hbdclass.Host.hosts[name]
        if not _can_operate_host(user, host):
            report.append(f"update skipped for {name}: Forbidden")
            continue
        try:
            host.cmds.append(("UPD", {"csum": None, "code": code}))
            status = "OK"
        except Exception as exc:
            # An empty error text is reported as OK.
            status = str(exc) or "OK"
        report.append(f"update started for {name}: {status}")
    return web.Response(text="\n".join(report))
|
||||
|
||||
async def live(request):
    """Render the live status page from the ``live.html`` Jinja2 template.

    The websocket URL handed to the template uses the host name from the
    request's Host header, with ``wss_port`` if configured and ``ws_port``
    (default 50005) otherwise.
    """
    current_user, _ = _require_auth_redirect(request)
    # Resolve templates directory relative to this package unless configured
    pkg_dir = os.path.dirname(__file__)
    templates_dir = config.get("templates_dir", os.path.join(pkg_dir, "templates"))
    # NOTE(review): autoescape is not enabled on this environment; confirm
    # the templates escape host names and messages themselves.
    env = jinja2.Environment(loader=jinja2.FileSystemLoader(templates_dir))
    extra_scripts = config.get("http_extra_scripts", "")

    # Strip the port from the Host header. A bracketed IPv6 literal such as
    # "[::1]:8080" must keep its brackets; splitting on ":" would leave "[".
    netloc = request.host
    closing = netloc.find("]")
    if netloc.startswith("[") and closing != -1:
        host = netloc[: closing + 1]
    else:
        host = netloc.split(":")[0]

    if config.get("wss_port"):
        heartbeat_ws_url = f"wss://{host}:{config['wss_port']}/hbd"
    else:
        heartbeat_ws_url = f"ws://{host}:{config.get('ws_port', 50005)}/hbd"

    tmpl = env.get_template("live.html")
    body = tmpl.render(
        title="Heartbeat",
        header="Heartbeat",
        request=request,
        heartbeat_ws_url=heartbeat_ws_url,
        extra_scripts=extra_scripts,
        hosts=[
            hbdclass.Host.hosts[h].stateinfo() for h in sorted(hbdclass.Host.hosts)
        ],
        messages=data.msgs[-30:],
        current_user=current_user.to_dict() if current_user else None,
        active_page="live",
    )
    return web.Response(text=body, content_type="text/html")
|
||||
|
||||
async def static(request):
    """Serve a file from the package's ``static`` directory.

    URL form: /static/<path>. Paths resolving outside the directory get a
    403, missing files or non-files a 404.
    """
    rel_path = request.match_info.get("path", "")
    logger.debug("static file requested: %s", rel_path)
    root = os.path.abspath(os.path.join(os.path.dirname(__file__), "static"))
    # Normalize first so ".." segments cannot escape the static root.
    resolved = os.path.abspath(os.path.normpath(os.path.join(root, rel_path)))
    inside_root = resolved == root or resolved.startswith(root + os.sep)
    if not inside_root:
        return web.Response(status=403, text="Forbidden")
    if not (os.path.exists(resolved) and os.path.isfile(resolved)):
        return web.Response(status=404, text="Not Found")
    logger.info("serving static file: %s", resolved)
    return web.FileResponse(path=resolved)
|
||||
|
||||
async def favicon(request):
    """Serve ``favicon.ico`` from the package's ``static/images`` directory."""
    images_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "static/images"))
    icon_path = os.path.join(images_dir, "favicon.ico")
    if os.path.exists(icon_path) and os.path.isfile(icon_path):
        return web.FileResponse(path=icon_path)
    return web.Response(status=404, text="Not Found")
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# Plugin Data API Endpoints
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
async def api_host_plugins(request):
    """Return the latest sample of every plugin for one host.

    Responds 404 for an unknown host and 403 when the caller may not view it.
    """
    user, err = _require_auth(request)
    if err:
        return err
    hostname = request.match_info.get("hostname")

    if hostname not in hbdclass.Host.hosts:
        return web.json_response({"error": f"Host '{hostname}' not found"}, status=404)

    host = hbdclass.Host.hosts[hostname]
    if not _can_view_host(user, host):
        return web.json_response({"error": "Forbidden"}, status=403)

    # One entry per plugin that has samples, built from its newest sample.
    summary = {}
    for plugin_name, samples in host.plugin_data.items():
        if not samples:
            continue
        stamp, latest = samples[-1]
        summary[plugin_name] = {
            "timestamp": stamp,
            "data": latest,
            "sample_count": len(samples),
        }

    return web.json_response({"hostname": hostname, "plugins": summary})
|
||||
|
||||
async def api_host_plugin_detail(request):
    """Return recent samples of one plugin on one host.

    Query argument ``limit`` bounds the number of samples (default 10). An
    unparsable or negative value falls back to the default.
    """
    user, err = _require_auth(request)
    if err:
        return err
    hostname = request.match_info.get("hostname")
    plugin_name = request.match_info.get("plugin_name")

    if hostname not in hbdclass.Host.hosts:
        return web.json_response({"error": f"Host '{hostname}' not found"}, status=404)

    host = hbdclass.Host.hosts[hostname]
    if not _can_view_host(user, host):
        return web.json_response({"error": "Forbidden"}, status=403)

    # Get limit from query parameter
    try:
        limit = int(request.rel_url.query.get("limit", "10"))
    except ValueError:
        limit = 10
    if limit < 0:
        # A negative limit would be used as a slice start and silently
        # drop the oldest samples instead of limiting the result.
        limit = 10

    samples = host.get_plugin_data(plugin_name, limit=limit)

    if not samples:
        return web.json_response(
            {"error": f"No data for plugin '{plugin_name}' on host '{hostname}'"},
            status=404
        )

    formatted_samples = [
        {"timestamp": ts, "data": sample} for ts, sample in samples
    ]

    return web.json_response({
        "hostname": hostname,
        "plugin": plugin_name,
        "samples": formatted_samples,
        "sample_count": len(formatted_samples),
    })
|
||||
|
||||
async def api_host_alerts(request):
    """Return all alert states of one host plus a per-level summary.

    Without a threshold checker the summary contains only zeros.
    """
    user, err = _require_auth(request)
    if err:
        return err
    hostname = request.match_info.get("hostname")

    if hostname not in hbdclass.Host.hosts:
        return web.json_response({"error": f"Host '{hostname}' not found"}, status=404)

    host = hbdclass.Host.hosts[hostname]
    if not _can_view_host(user, host):
        return web.json_response({"error": "Forbidden"}, status=403)

    alerts = [state.to_dict() for state in host.alert_states.values()]

    if threshold_checker:
        summary = threshold_checker.get_alert_summary(host.alert_states)
    else:
        summary = {"ok": 0, "warning": 0, "critical": 0, "unknown": 0}

    return web.json_response({
        "hostname": hostname,
        "alerts": alerts,
        "summary": summary,
    })
|
||||
|
||||
async def api_all_alerts(request):
    """Return the active alerts of every host the caller may view.

    Alerts are sorted by severity (critical first), then host name, then
    metric path, and come with per-level counts.
    """
    user, err = _require_auth(request)
    if err:
        return err

    collected = []
    for hostname, host in hbdclass.Host.hosts.items():
        if not _can_view_host(user, host):
            continue
        if threshold_checker:
            active = threshold_checker.get_active_alerts(host.alert_states)
        else:
            # Fallback if no threshold checker: everything that is not OK
            from hbd.server.threshold import AlertLevel
            active = [
                state for state in host.alert_states.values()
                if state.level != AlertLevel.OK
            ]
        for state in active:
            entry = state.to_dict()
            entry["hostname"] = hostname
            collected.append(entry)

    # Sort by level (critical first) then by hostname
    rank = {"CRITICAL": 0, "WARNING": 1, "UNKNOWN": 2, "OK": 3}

    def sort_key(entry):
        return (rank.get(entry["level"], 99), entry["hostname"], entry["metric_path"])

    collected.sort(key=sort_key)

    # Get summary counts
    summary = {"critical": 0, "warning": 0, "unknown": 0, "total": len(collected)}
    for entry in collected:
        bucket = entry["level"].lower()
        if bucket in summary:
            summary[bucket] += 1

    return web.json_response({
        "alerts": collected,
        "summary": summary,
        "host_count": len(hbdclass.Host.hosts),
    })
|
||||
|
||||
async def api_acknowledge_alert(request):
    """Acknowledge an alert to stop reminder notifications.

    Expects a JSON object with ``hostname`` and ``metric_path``. Responds
    400 for a malformed body, 404 for an unknown host or alert, and 403
    when the caller may not view the host.
    """
    user, err = _require_auth(request)
    if err:
        return err
    try:
        payload = await request.json()
    except Exception:
        return web.json_response(
            {"error": "Invalid JSON in request body"},
            status=400
        )
    if not isinstance(payload, dict):
        # Valid JSON that is not an object (list, string, number) has no
        # .get(); reject it instead of failing with a server error.
        return web.json_response(
            {"error": "Request body must be a JSON object"},
            status=400
        )

    hostname = payload.get("hostname")
    metric_path = payload.get("metric_path")

    if not hostname or not metric_path:
        return web.json_response(
            {"error": "Missing required fields: hostname and metric_path"},
            status=400
        )

    if hostname not in hbdclass.Host.hosts:
        return web.json_response(
            {"error": f"Host '{hostname}' not found"},
            status=404
        )

    host = hbdclass.Host.hosts[hostname]
    if not _can_view_host(user, host):
        return web.json_response({"error": "Forbidden"}, status=403)

    if metric_path not in host.alert_states:
        return web.json_response(
            {"error": f"Alert '{metric_path}' not found for host '{hostname}'"},
            status=404
        )

    alert_state = host.alert_states[metric_path]
    alert_state.acknowledge()

    return web.json_response({
        "success": True,
        "hostname": hostname,
        "metric_path": metric_path,
        "acknowledged_at": alert_state.acknowledged_at,
    })
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# UI Pages
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
async def plugins_page(request):
    """Serve the HTML page that visualizes plugin metrics.

    Only hosts the current user may view, and that have plugin data, are
    passed to the ``plugins.html`` template.
    """
    current_user, _ = _require_auth_redirect(request)
    fallback_dir = os.path.join(os.path.dirname(__file__), "templates")
    loader = jinja2.FileSystemLoader(config.get("templates_dir", fallback_dir))
    # NOTE(review): the Environment is built without an explicit autoescape
    # setting — confirm the templates escape host-supplied values.
    jinja_env = jinja2.Environment(loader=loader)

    # Walk the hosts in name order, keeping the visible ones with plugin data.
    visible_hosts = []
    for name in sorted(hbdclass.Host.hosts.keys()):
        candidate = hbdclass.Host.hosts[name]
        if _can_view_host(current_user, candidate) and candidate.plugin_data:
            entry = {
                "name": name,
                "plugins": list(candidate.plugin_data.keys()),
            }
            visible_hosts.append(entry)

    if current_user:
        user_info = current_user.to_dict()
    else:
        user_info = None

    page = jinja_env.get_template("plugins.html").render(
        title="Plugin Metrics - Heartbeat",
        header="Plugin Metrics",
        hosts=visible_hosts,
        current_user=user_info,
        active_page="plugins",
    )
    return web.Response(text=page, content_type="text/html")
|
||||
|
||||
async def alerts_page(request):
    """Serve the HTML alerts dashboard (``alerts.html`` template)."""
    current_user, _ = _require_auth_redirect(request)
    fallback_dir = os.path.join(os.path.dirname(__file__), "templates")
    loader = jinja2.FileSystemLoader(config.get("templates_dir", fallback_dir))
    jinja_env = jinja2.Environment(loader=loader)

    if current_user:
        user_info = current_user.to_dict()
    else:
        user_info = None

    page = jinja_env.get_template("alerts.html").render(
        title="Alerts Dashboard - Heartbeat",
        header="Alerts Dashboard",
        current_user=user_info,
        active_page="alerts",
    )
    return web.Response(text=page, content_type="text/html")
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# Auth endpoints
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
async def api_login(request):
    """POST /api/0/auth/login {username, password} -> {token}

    Also sets the session cookie (``SESSION_COOKIE``) for browser clients.

    Returns:
        200 with ``{"token", "username"}`` on success; 400 when the body is
        not a JSON object or the credentials are not strings; 401 when
        authentication fails; 404 when user auth is not configured.
    """
    if not users_mod.users_enabled():
        return web.json_response({"error": "Auth not configured"}, status=404)
    try:
        body = await request.json()
    except Exception:
        return web.json_response({"error": "Invalid JSON"}, status=400)
    # A JSON list/string/number parses fine but has no .get(); answer with a
    # client error rather than letting AttributeError become a 500.
    if not isinstance(body, dict):
        return web.json_response(
            {"error": "Request body must be a JSON object"}, status=400
        )
    username = body.get("username", "")
    password = body.get("password", "")
    if not isinstance(username, str) or not isinstance(password, str):
        return web.json_response(
            {"error": "username and password must be strings"}, status=400
        )
    user = users_mod.authenticate(username, password)
    if user is None:
        return web.json_response({"error": "Invalid credentials"}, status=401)
    token = users_mod.create_session(username)
    resp = web.json_response({"token": token, "username": username})
    resp.set_cookie(
        SESSION_COOKIE,
        token,
        max_age=users_mod.SESSION_TTL,
        httponly=True,
        samesite="Lax",
    )
    return resp
|
||||
|
||||
async def login_page(request):
    """GET /login — show login form; POST /login — process and redirect.

    On a successful POST a session is created, the session cookie is set and
    the client is redirected to the ``next`` query parameter (default "/").
    Only same-site absolute paths are honoured for ``next``; anything else
    falls back to "/". On failed authentication the form is shown again
    with an error message. When user auth is not enabled the request is
    redirected to "/".
    """
    if not users_mod.users_enabled():
        raise web.HTTPFound("/")

    error = ""
    if request.method == "POST":
        form = await request.post()
        username = form.get("username", "")
        password = form.get("password", "")
        user = users_mod.authenticate(username, password)
        if user:
            token = users_mod.create_session(username)
            redirect_to = request.rel_url.query.get("next", "/")
            # Open-redirect guard: "next" is attacker-controllable (it can be
            # embedded in a link to /login). Accept only a local path;
            # "//host" and "/\host" are treated by browsers as
            # protocol-relative URLs pointing at another site.
            if (
                not redirect_to.startswith("/")
                or redirect_to.startswith("//")
                or "\\" in redirect_to
            ):
                redirect_to = "/"
            resp = web.HTTPFound(redirect_to)
            resp.set_cookie(
                SESSION_COOKIE,
                token,
                max_age=users_mod.SESSION_TTL,
                httponly=True,
                samesite="Lax",
            )
            raise resp
        error = "Invalid username or password."

    # `error` is always one of the constant strings above, never user input,
    # so it is interpolated into the markup without escaping.
    html = f"""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Heartbeat — Login</title>
<style>
body {{ font-family: sans-serif; background: #f5f5f5; display: flex;
justify-content: center; align-items: center; height: 100vh; margin: 0; }}
.box {{ background: #fff; padding: 2em 2.5em; border-radius: 8px;
box-shadow: 0 2px 12px rgba(0,0,0,.15); min-width: 300px; }}
h2 {{ margin: 0 0 1.2em; color: #333; font-size: 1.4em; }}
label {{ display: block; margin-bottom: .3em; font-size: .9em; color: #555; }}
input {{ width: 100%; padding: .5em .7em; border: 1px solid #ccc;
border-radius: 4px; font-size: 1em; box-sizing: border-box; }}
button {{ margin-top: 1.2em; width: 100%; padding: .6em; background: #0066cc;
color: #fff; border: none; border-radius: 4px; font-size: 1em; cursor: pointer; }}
button:hover {{ background: #0055aa; }}
.error {{ color: #c00; font-size: .9em; margin-bottom: .8em; }}
.field {{ margin-bottom: .9em; }}
</style>
</head>
<body>
<div class="box">
<h2>Heartbeat</h2>
{'<p class="error">' + error + '</p>' if error else ''}
<form method="post">
<div class="field"><label>Username</label><input name="username" autofocus></div>
<div class="field"><label>Password</label><input name="password" type="password"></div>
<button type="submit">Sign in</button>
</form>
</div>
</body>
</html>"""
    return web.Response(text=html, content_type="text/html")
|
||||
|
||||
async def web_logout(request):
    """GET /logout — drop the session, clear its cookie, redirect to /login."""
    session_token = request.cookies.get(SESSION_COOKIE, "")
    users_mod.delete_session(session_token)

    # The redirect is built first so the cookie deletion can be attached to
    # it, and is then raised to end the request.
    redirect = web.HTTPFound("/login")
    redirect.del_cookie(SESSION_COOKIE)
    raise redirect
|
||||
|
||||
async def api_logout(request):
    """POST /api/0/auth/logout — delete the caller's session.

    Responds with ``{"success": true}`` and clears the session cookie.
    """
    users_mod.delete_session(_get_token(request))

    reply = web.json_response({"success": True})
    reply.del_cookie(SESSION_COOKIE)
    return reply
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# User endpoints
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
async def api_user_avatar(request):
    """GET /api/0/users/{username}/avatar — serve a local avatar file.

    Responds 404 when the user is unknown, when the user's avatar is not a
    local file (``avatar_is_local()`` is false), or when the configured file
    does not exist. The Content-Type is chosen from the file extension and
    defaults to ``application/octet-stream``.
    """
    _user, auth_error = _require_auth(request)
    if auth_error:
        return auth_error

    target_user = users_mod.get_user(request.match_info.get("username"))
    if target_user is None:
        return web.Response(status=404, text="User not found")
    if not target_user.avatar_is_local():
        return web.Response(status=404, text="No local avatar configured")

    avatar_path = target_user.avatar
    if not os.path.isfile(avatar_path):
        return web.Response(status=404, text="Avatar file not found")

    # Extension -> MIME type for the image formats served here.
    content_types = {
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".gif": "image/gif",
        ".webp": "image/webp",
        ".svg": "image/svg+xml",
    }
    _, suffix = os.path.splitext(avatar_path)
    content_type = content_types.get(suffix.lower(), "application/octet-stream")
    return web.FileResponse(path=avatar_path, headers={"Content-Type": content_type})
|
||||
|
||||
async def api_users(request):
    """GET /api/0/users — list all users; admin only when auth is enabled."""
    user, auth_error = _require_auth(request)
    if auth_error:
        return auth_error

    # The admin requirement only applies while user auth is enabled.
    if users_mod.users_enabled():
        if user is None or not user.admin:
            return web.json_response({"error": "Forbidden"}, status=403)

    listing = [account.to_dict() for account in users_mod.users.values()]
    return web.json_response(listing)
|
||||
|
||||
async def api_user_self(request):
    """GET /api/0/users/me — return the caller's own profile.

    Responds 404 ("Auth not configured") when no user is associated with
    the request.
    """
    user, auth_error = _require_auth(request)
    if auth_error:
        return auth_error
    if user is not None:
        return web.json_response(user.to_dict())
    return web.json_response({"error": "Auth not configured"}, status=404)
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# Host access endpoints
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
async def api_host_access_get(request):
    """GET /api/0/hosts/{hostname}/access — return a host's access settings.

    Responds 404 for an unknown host and 403 when the caller may not view it.
    """
    user, auth_error = _require_auth(request)
    if auth_error:
        return auth_error

    hostname = request.match_info.get("hostname")
    known_hosts = hbdclass.Host.hosts
    if hostname not in known_hosts:
        return web.json_response({"error": f"Host '{hostname}' not found"}, status=404)

    target = known_hosts[hostname]
    if _can_view_host(user, target):
        return web.json_response(target.access_dict())
    return web.json_response({"error": "Forbidden"}, status=403)
|
||||
|
||||
async def api_host_access_put(request):
    """PUT /api/0/hosts/{hostname}/access — owner or admin only.

    Body: {owner?: str, managers?: [str], monitors?: [str]}

    A falsy ``owner`` (null, empty string) clears the owner. The whole body
    is validated before anything is applied, so a rejected request leaves
    the host's access settings untouched.

    Returns:
        200 with the resulting access settings; 400 for a malformed body;
        403 when the caller may not own the host; 404 for an unknown host.
    """
    user, err = _require_auth(request)
    if err:
        return err
    hostname = request.match_info.get("hostname")
    if hostname not in hbdclass.Host.hosts:
        return web.json_response({"error": f"Host '{hostname}' not found"}, status=404)
    host = hbdclass.Host.hosts[hostname]
    if not _can_own_host(user, host):
        return web.json_response({"error": "Forbidden"}, status=403)
    try:
        body = await request.json()
    except Exception:
        return web.json_response({"error": "Invalid JSON"}, status=400)
    if not isinstance(body, dict):
        return web.json_response(
            {"error": "Request body must be a JSON object"}, status=400
        )

    # Validate every supplied field first; nothing is written to `host`
    # until the whole body is known to be well-formed.
    updates = {}
    if "owner" in body:
        owner = body["owner"]
        if owner and not isinstance(owner, str):
            return web.json_response(
                {"error": "owner must be a string or null"}, status=400
            )
        updates["owner"] = owner or None
    for field in ("managers", "monitors"):
        if field not in body:
            continue
        names = body[field]
        # list("abc") would silently become ['a', 'b', 'c'] and list(None)
        # would raise, so require an actual JSON array of strings.
        if not isinstance(names, list) or not all(isinstance(n, str) for n in names):
            return web.json_response(
                {"error": f"{field} must be a list of strings"}, status=400
            )
        updates[field] = list(names)

    if "owner" in updates:
        host.owner = updates["owner"]
    if "managers" in updates:
        host.managers = updates["managers"]
    if "monitors" in updates:
        host.monitors = updates["monitors"]

    return web.json_response(host.access_dict())
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# User profile page
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
async def profile_page(request):
    """GET /profile — current user's settings and host access summary.

    Each host is listed under the first role that matches the user, checked
    in the order owner, manager, monitor.
    """
    current_user, _ = _require_auth_redirect(request)
    fallback_dir = os.path.join(os.path.dirname(__file__), "templates")
    loader = jinja2.FileSystemLoader(config.get("templates_dir", fallback_dir))
    jinja_env = jinja2.Environment(loader=loader)

    owned = []
    managed = []
    monitored = []
    notif_channels = []
    user_info = None

    if current_user:
        # Host access summary: the first matching role wins for each host.
        for hostname, host in sorted(hbdclass.Host.hosts.items()):
            role_buckets = (
                (host.is_owner, owned),
                (host.is_manager, managed),
                (host.is_monitor, monitored),
            )
            for has_role, bucket in role_buckets:
                if has_role(current_user.username):
                    bucket.append(hostname)
                    break

        # Resolve each of the user's channel names to its configured type;
        # unknown channels are shown with an empty type.
        for channel_name in (current_user.notification_channels or []):
            channel_cfg = config.get("notification_channels", {}).get(channel_name, {})
            notif_channels.append(
                {"name": channel_name, "type": channel_cfg.get("type", "")}
            )

        user_info = current_user.to_dict()

    page = jinja_env.get_template("profile.html").render(
        title="Profile - Heartbeat",
        header="My Profile",
        current_user=user_info,
        owned_hosts=owned,
        managed_hosts=managed,
        monitored_hosts=monitored,
        notification_channels=notif_channels,
        active_page="profile",
    )
    return web.Response(text=page, content_type="text/html")
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# Settings page (admin only)
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
async def settings_page(request):
    """GET /settings — read-only view of the current server configuration.

    Raises:
        web.HTTPForbidden: when a user is present but is not an admin.
    """
    current_user, _ = _require_auth_redirect(request)
    if current_user:
        if not current_user.admin:
            raise web.HTTPForbidden(reason="Admin access required")
        user_info = current_user.to_dict()
    else:
        # No user object: the page is rendered without a user context.
        user_info = None

    fallback_dir = os.path.join(os.path.dirname(__file__), "templates")
    loader = jinja2.FileSystemLoader(config.get("templates_dir", fallback_dir))
    jinja_env = jinja2.Environment(loader=loader)

    page = jinja_env.get_template("settings.html").render(
        title="Settings - Heartbeat",
        sections=settings_mod.get_settings_sections(config),
        current_user=user_info,
        active_page="settings",
    )
    return web.Response(text=page, content_type="text/html")
|
||||
|
||||
app = web.Application()
|
||||
app.add_routes(
|
||||
[
|
||||
web.get("/", live),
|
||||
web.get("/old", old_index),
|
||||
# Auth
|
||||
web.get("/login", login_page),
|
||||
web.post("/login", login_page),
|
||||
web.get("/logout", web_logout),
|
||||
web.post("/api/0/auth/login", api_login),
|
||||
web.post("/api/0/auth/logout", api_logout),
|
||||
# Users
|
||||
web.get("/api/0/users", api_users),
|
||||
web.get("/api/0/users/me", api_user_self),
|
||||
web.get("/api/0/users/{username}/avatar", api_user_avatar),
|
||||
# Hosts
|
||||
web.get("/api/0/hosts", api_hosts),
|
||||
web.get("/api/0/messages", api_messages),
|
||||
web.get("/api/0/hosts/{hostname}/plugins", api_host_plugins),
|
||||
web.get("/api/0/hosts/{hostname}/plugins/{plugin_name}", api_host_plugin_detail),
|
||||
web.get("/api/0/hosts/{hostname}/alerts", api_host_alerts),
|
||||
web.get("/api/0/hosts/{hostname}/access", api_host_access_get),
|
||||
web.put("/api/0/hosts/{hostname}/access", api_host_access_put),
|
||||
web.get("/api/0/alerts", api_all_alerts),
|
||||
web.post("/api/0/alerts/acknowledge", api_acknowledge_alert),
|
||||
web.get("/c", cmd),
|
||||
web.get("/d", drop),
|
||||
web.get("/n", register),
|
||||
web.get("/u", update),
|
||||
web.get("/live", live),
|
||||
web.get("/plugins", plugins_page),
|
||||
web.get("/alerts", alerts_page),
|
||||
web.get("/profile", profile_page),
|
||||
web.get("/settings", settings_page),
|
||||
web.get("/static/{path:.*}", static),
|
||||
web.get("/favicon.ico", favicon),
|
||||
]
|
||||
)
|
||||
|
||||
runner = web.AppRunner(app)
|
||||
await runner.setup()
|
||||
site = web.TCPSite(runner, host, port)
|
||||
await site.start()
|
||||
|
||||
if verbose:
|
||||
print(f"HTTP server started on {host}:{port}")
|
||||
|
||||
try:
|
||||
await asyncio.Future()
|
||||
finally:
|
||||
await runner.cleanup()
|
||||
@@ -0,0 +1,342 @@
|
||||
"""
|
||||
Journal logging for heartbeat messages.
|
||||
|
||||
Provides size-based rotating log files for all received heartbeat messages.
|
||||
Messages are logged in JSON format for easy parsing and analysis.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import asyncio
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MessageJournal:
    """
    Journal logger for heartbeat messages with size-based rotation.

    Features:
    - Logs received messages in JSON format, one entry per line
      (messages with ID 'HTB' are skipped)
    - Automatic rotation when the file size would exceed the threshold
    - Keeps a configurable number of rotated logs
    - Writes are serialized with an asyncio.Lock, i.e. safe between
      coroutines; the lock does not protect against other threads
    - Configurable log directory and file naming

    Configuration keys (read from the config mapping):
        journal_dir: Directory for journal files (default: /var/log/heartbeat)
        journal_file: Base filename (default: messages.journal)
        journal_max_size: Maximum file size in bytes before rotation (default: 100MB)
        journal_max_backups: Number of backup files to keep (default: 10)
        journal_enabled: Enable/disable journaling (default: True)
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Initialize the message journal.

        No file is touched here; call ``initialize()`` to open the journal.

        Args:
            config: Configuration dictionary with journal settings
        """
        self.config = config or {}

        # Configuration options
        self.journal_dir = Path(self.config.get('journal_dir', '/var/log/heartbeat'))
        self.journal_file = self.config.get('journal_file', 'messages.journal')
        self.max_size = self.config.get('journal_max_size', 100 * 1024 * 1024)  # 100MB default
        self.max_backups = self.config.get('journal_max_backups', 10)
        self.enabled = self.config.get('journal_enabled', True)

        # Runtime state
        self._file_handle = None
        self._current_size = 0
        self._lock = asyncio.Lock()
        self._initialized = False

        # Full path to current journal file
        self.journal_path = self.journal_dir / self.journal_file

    async def initialize(self) -> bool:
        """
        Initialize the journal.

        Creates journal directory if needed and opens the journal file.
        Calling it again while the journal is already open is a no-op.
        On failure journaling is disabled (``enabled`` is set to False).

        Returns:
            True if initialization successful (or journaling is disabled
            by configuration), False otherwise
        """
        if not self.enabled:
            logger.info("Message journal disabled in configuration")
            return True

        # Re-opening would leak the handle that is already open.
        if self._initialized and self._file_handle:
            return True

        try:
            # Create journal directory if it doesn't exist
            self.journal_dir.mkdir(parents=True, exist_ok=True)

            # Open journal file in append mode
            self._file_handle = open(self.journal_path, 'a', encoding='utf-8')
            self._current_size = self._size_on_disk()

            self._initialized = True
            logger.info(f"Message journal initialized: {self.journal_path} "
                        f"(current size: {self._current_size:,} bytes, "
                        f"max: {self.max_size:,} bytes)")
            return True

        except Exception as e:
            logger.error(f"Failed to initialize message journal: {e}")
            self.enabled = False
            return False

    def _size_on_disk(self) -> int:
        """Return the journal file's current size in bytes, 0 if unreadable."""
        try:
            return os.path.getsize(self.journal_path)
        except OSError:
            return 0

    async def _append_line(self, line: str) -> bool:
        """
        Append one serialized entry, rotating first if it would overflow.

        Must be called with ``self._lock`` held.

        Returns:
            True if the line was written, False if no file is open
        """
        size = len(line.encode('utf-8'))

        # Rotating an empty file would only produce an empty backup, so an
        # oversized single entry is written to the fresh file as-is.
        if self._current_size and self._current_size + size > self.max_size:
            await self._rotate()

        # A failed rotation may leave the journal without an open file.
        if not self._file_handle:
            return False

        self._file_handle.write(line)
        self._file_handle.flush()  # Ensure data is written
        self._current_size += size
        return True

    async def log_message(
        self,
        msg: Dict[str, Any],
        addr: tuple,
        timestamp: Optional[float] = None
    ):
        """
        Log a received message to the journal.

        Messages whose ID is 'HTB' are skipped. Errors are logged, never
        raised.

        Args:
            msg: Parsed message dictionary
            addr: Source address (ip, port) tuple
            timestamp: Message timestamp (defaults to current time)
        """
        if not self.enabled or not self._initialized:
            return

        # Skip HTB (heartbeat) messages - too verbose
        if msg.get('ID', '') == 'HTB':
            return

        async with self._lock:
            try:
                if timestamp is None:
                    import time
                    timestamp = time.time()

                is_sequence = isinstance(addr, (tuple, list))
                source_ip = addr[0] if is_sequence else str(addr)
                entry = {
                    'timestamp': timestamp,
                    'datetime': datetime.fromtimestamp(timestamp).isoformat(),
                    'source_ip': source_ip,
                    'source_port': addr[1] if is_sequence and len(addr) > 1 else None,
                    'message': msg
                }

                # Serialize to JSON (one line per entry)
                json_line = json.dumps(entry, separators=(',', ':')) + '\n'

                if await self._append_line(json_line):
                    logger.debug(f"Logged message from {source_ip}: {msg.get('ID', 'UNKNOWN')}")

            except Exception as e:
                logger.error(f"Error writing to journal: {e}")

    async def _rotate(self):
        """
        Rotate the journal file.

        Renames current file with timestamp, opens new file, and removes
        old backups exceeding max_backups limit. If the timestamped name is
        already taken (several rotations within one second), a numeric
        suffix is appended so the earlier backup is not overwritten.

        If rotation fails the journal file is reopened; if that fails as
        well, journaling is disabled.
        """
        try:
            # Close current file
            if self._file_handle:
                self._file_handle.close()
                self._file_handle = None

            # Generate backup filename with timestamp
            timestamp_str = datetime.now().strftime('%Y%m%d-%H%M%S')
            backup_name = f"{self.journal_file}.{timestamp_str}"
            backup_path = self.journal_dir / backup_name

            # rename() would replace an existing backup created in the same
            # second; pick a free name. The zero-padded suffix keeps the
            # names in chronological order when sorted.
            collision = 0
            while backup_path.exists():
                collision += 1
                backup_path = self.journal_dir / f"{backup_name}.{collision:03d}"

            # Rename current file to backup
            if self.journal_path.exists():
                self.journal_path.rename(backup_path)
                logger.info(f"Rotated journal: {backup_path} "
                            f"(size: {self._current_size:,} bytes)")

            # Open new journal file
            self._file_handle = open(self.journal_path, 'a', encoding='utf-8')
            self._current_size = 0

            # Clean up old backups
            await self._cleanup_old_backups()

        except Exception as e:
            logger.error(f"Error rotating journal: {e}")
            # Try to reopen the file even if rotation failed (unless the new
            # file was already opened before the error occurred).
            if self._file_handle is None:
                try:
                    self._file_handle = open(self.journal_path, 'a', encoding='utf-8')
                    # The old file may still be in place; keep the size
                    # counter in step with what is actually on disk.
                    self._current_size = self._size_on_disk()
                except Exception as e2:
                    logger.error(f"Failed to reopen journal after rotation error: {e2}")
                    self.enabled = False

    async def _cleanup_old_backups(self):
        """
        Remove old backup files exceeding max_backups limit.

        Keeps only the most recent backups based on filename (which includes timestamp).
        """
        try:
            # Find all backup files ("<journal_file>.<anything>")
            backup_pattern = f"{self.journal_file}.*"
            backup_files = sorted(self.journal_dir.glob(backup_pattern))

            # Remove oldest backups if we have too many
            excess = len(backup_files) - self.max_backups
            if excess > 0:
                for backup_file in backup_files[:excess]:
                    try:
                        backup_file.unlink()
                        logger.info(f"Removed old backup: {backup_file.name}")
                    except Exception as e:
                        logger.warning(f"Failed to remove old backup {backup_file}: {e}")

        except Exception as e:
            logger.error(f"Error cleaning up old backups: {e}")

    async def log_threshold_event(
        self,
        host_name: str,
        metric_path: str,
        old_level: str,
        new_level: str,
        value: Any,
        timestamp: Optional[float] = None
    ):
        """
        Log a threshold state change event.

        Errors are logged, never raised.

        Args:
            host_name: Name of the host
            metric_path: Full metric path (e.g., "cpu_monitor.cpu_percent")
            old_level: Previous alert level
            new_level: New alert level
            value: Current metric value
            timestamp: Event timestamp (default: current time)
        """
        if not self.enabled or not self._initialized:
            return

        try:
            if timestamp is None:
                import time
                timestamp = time.time()

            event = {
                'timestamp': timestamp,
                'iso_time': datetime.fromtimestamp(timestamp).isoformat(),
                'event_type': 'threshold',
                'host': host_name,
                'metric': metric_path,
                'old_level': old_level,
                'new_level': new_level,
                'value': value,
            }

            async with self._lock:
                # Same rotate-then-write path as log_message().
                await self._append_line(json.dumps(event) + '\n')

        except Exception as e:
            logger.error(f"Error logging threshold event: {e}")

    async def close(self):
        """
        Close the journal and release resources.

        Should be called during shutdown.
        """
        async with self._lock:
            if self._file_handle:
                try:
                    self._file_handle.close()
                    logger.info("Message journal closed")
                except Exception as e:
                    logger.error(f"Error closing journal: {e}")
                finally:
                    self._file_handle = None
                    self._initialized = False

    def get_stats(self) -> Dict[str, Any]:
        """
        Get journal statistics.

        Returns:
            Dictionary with journal stats; 'rotation_threshold' is the
            current size as a percentage of max_size ("0.0%" when max_size
            is zero or unset)
        """
        # Guard the percentage against a zero/empty max_size.
        if self.max_size:
            fill_percent = self._current_size / self.max_size * 100
        else:
            fill_percent = 0.0

        return {
            'enabled': self.enabled,
            'initialized': self._initialized,
            'current_file': str(self.journal_path),
            'current_size': self._current_size,
            'max_size': self.max_size,
            'max_backups': self.max_backups,
            'rotation_threshold': f"{fill_percent:.1f}%"
        }
|
||||
|
||||
|
||||
# Global journal instance
|
||||
_journal_instance: Optional[MessageJournal] = None
|
||||
|
||||
|
||||
def get_journal(config: Optional[Dict[str, Any]] = None) -> MessageJournal:
    """
    Return the process-wide journal, creating it on first use.

    Args:
        config: Configuration dictionary; only consulted when the instance
            is created, ignored on later calls

    Returns:
        The shared MessageJournal instance
    """
    global _journal_instance
    if _journal_instance is not None:
        return _journal_instance

    _journal_instance = MessageJournal(config)
    return _journal_instance
|
||||
|
||||
|
||||
async def log_message(msg: Dict[str, Any], addr: tuple, timestamp: Optional[float] = None):
    """
    Log a message through the shared journal returned by get_journal().

    Args:
        msg: Parsed message dictionary
        addr: Source address (ip, port) tuple
        timestamp: Message timestamp (defaults to current time)
    """
    await get_journal().log_message(msg, addr, timestamp)
|
||||
@@ -0,0 +1,520 @@
|
||||
"""Server runtime: starts UDP listener, HTTP server and websocket stubs."""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import socket
|
||||
import time
|
||||
import signal
|
||||
import sys
|
||||
import ssl
|
||||
from . import __version__
|
||||
|
||||
from . import udp
|
||||
from . import hbdclass
|
||||
|
||||
from . import ws as ws_mod
|
||||
from . import notify as notify_mod
|
||||
from . import data
|
||||
from . import users as users_mod
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
msg_to_websockets = ws_mod.broadcast
|
||||
eventlog = notify_mod.eventlog
|
||||
|
||||
# shared runtime collections and helpers
|
||||
|
||||
def save_state(config, hbdclass):
    """Persist the host table and the message store to the pickle file.

    The data is written to ``<pickfile>.tmp`` and then moved over the real
    file with ``os.replace``. Failures are logged, not raised, and the
    temporary file is removed on a best-effort basis.

    Args:
        config: Mapping providing ``pickfile`` (default "hbd.pickle").
        hbdclass: Object exposing ``Host.hosts``, the host table to save.
    """
    import os
    import pickle

    # Timer-related attributes can't be serialized: cancel any overdue timer
    # and blank the related fields on every connection before pickling.
    volatile_attrs = ('overdue_timer', 'overdue_callback', 'timeout_duration')
    for host in list(hbdclass.Host.hosts.values()):
        for conn in host.connections.values():
            if hasattr(conn, 'cancel_overdue_timer'):
                conn.cancel_overdue_timer()
            for attr in volatile_attrs:
                if hasattr(conn, attr):
                    setattr(conn, attr, None)

    target = config.get("pickfile", "hbd.pickle")
    scratch = target + ".tmp"

    try:
        with open(scratch, "wb") as out:
            pickler = pickle.Pickler(out)
            pickler.dump(hbdclass.Host.hosts)
            pickler.dump(data.msgs)
        os.replace(scratch, target)
    except Exception as exc:
        logger.error("Failed to save state: %s", exc)
        # Best effort: the temp file may not exist, so ignore any failure.
        try:
            os.unlink(scratch)
        except Exception:
            pass
|
||||
|
||||
|
||||
def cleanup_function(config, hbdclass):
    """Exit hook: save the current state, logging before and after."""
    logger.info("Running cleanup function...")
    save_state(config, hbdclass)
    logger.info("Cleanup complete.")
|
||||
|
||||
|
||||
async def reload_configuration(config_obj, config_path, components):
    """Reload the configuration file and push it to the running components.

    Steps, in order: reload the config object, update the notify module,
    reload users, re-apply host access to every known host, and reload the
    threshold checker if one is present in ``components``.

    Args:
        config_obj: ReloadableConfig instance
        config_path: Path to config file
        components: Dict with threshold_checker and other components

    Returns:
        True if reload succeeded, False otherwise (the error is logged)
    """
    banner = "=" * 60
    try:
        logger.info(banner)
        logger.info("Starting configuration reload...")
        logger.info(banner)

        refreshed = await config_obj.reload(config_path)

        # Notification settings first, then the user database.
        notify_mod.reload_config(refreshed)
        users_mod.load_users(refreshed)

        # Push the (possibly changed) access lists onto every known host.
        from . import config as config_mod
        for hostname, host in hbdclass.Host.hosts.items():
            rights = config_mod.get_host_access(refreshed, hostname)
            host.apply_access(rights["owner"], rights["managers"], rights["monitors"])

        if 'threshold_checker' in components:
            components['threshold_checker'].reload(refreshed)

        # Settings that still need a restart to take effect:
        #   - hb_port, hbd_port, ws_port (already bound)
        #   - SSL certificates (already loaded)
        #   - pickfile (already opened)
        #   - journal settings (journal already initialized)
        #
        # Settings that are picked up immediately:
        #   - notification_channels
        #   - threshold_configs
        #   - hosts (watchhosts, dyndnshosts, notification_channels)
        #   - grace period (used on next heartbeat)
        #   - debug/verbose flags (used on next message)

        logger.info(banner)
        logger.info("Configuration reload completed successfully")
        logger.info(banner)
        return True

    except Exception as e:
        # NOTE(review): steps that completed before the failure are not
        # rolled back here — confirm "Keeping previous configuration" holds.
        logger.error(banner)
        logger.error(f"Failed to reload configuration: {e}", exc_info=True)
        logger.error("Keeping previous configuration")
        logger.error(banner)
        return False
|
||||
|
||||
|
||||
async def _run_async(config, config_path=None):
    """Run all hbd services until a shutdown signal arrives.

    Starts, in order: the notifier, the message journal, the threshold
    checker, the UDP heartbeat listener, the HTTP server task, the DNS
    update worker, the WebSocket server task and the periodic autosave
    task.  It then waits for either a shutdown signal (SIGINT/SIGTERM) or
    a reload signal (SIGHUP) and tears everything down on exit.

    Args:
        config: Configuration dictionary.
        config_path: Path of the config file; when falsy, SIGHUP only logs
            a warning instead of reloading.
    """
    loop = asyncio.get_running_loop()
    shutdown_event = asyncio.Event()
    reload_event = asyncio.Event()

    # Signal handlers for graceful shutdown and reload
    def signal_handler(signum, frame):
        sig_name = signal.Signals(signum).name if hasattr(signal, "Signals") else signum
        logger.info(f"Received {sig_name}, initiating shutdown...")
        loop.call_soon_threadsafe(shutdown_event.set)

    def reload_handler(signum, frame):
        sig_name = signal.Signals(signum).name if hasattr(signal, "Signals") else signum
        logger.info(f"Received {sig_name}, initiating config reload...")
        loop.call_soon_threadsafe(reload_event.set)

    # Register signal handlers; the extra positional arguments are passed
    # to the handler as (signum, frame).
    loop.add_signal_handler(signal.SIGINT, signal_handler, signal.SIGINT, None)
    loop.add_signal_handler(signal.SIGTERM, signal_handler, signal.SIGTERM, None)
    loop.add_signal_handler(signal.SIGHUP, reload_handler, signal.SIGHUP, None)

    from . import http as http_mod
    from . import dns as dns_mod
    from . import notify as notify_mod
    from . import journal as journal_mod
    from . import threshold as threshold_mod

    notify_mod.setup(config)

    # Initialize message journal
    msg_journal = journal_mod.get_journal(config)
    await msg_journal.initialize()

    # Initialize threshold checker
    threshold_checker = threshold_mod.ThresholdChecker(
        config=config,
        renotify_interval=config.get("threshold_renotify_interval", 3600),
        journal=msg_journal,
    )
    logger.info("Threshold checker initialized")

    # Components dict for reload orchestration
    components = {
        'threshold_checker': threshold_checker,
        'msg_journal': msg_journal,
    }

    sock = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM)
    # Disable IPV6_V6ONLY option to enable dual-stack (listen on IPv4 as well)
    # This option is system-dependent; on many systems, setting it to False enables
    # the socket to handle both IPv4 and IPv6 traffic.
    try:
        sock.setsockopt(socket.IPPROTO_IPV6, socket.IPV6_V6ONLY, False)
    except OSError as e:
        logger.warning(
            f"Warning: Could not reset IPV6_V6ONLY not supported or dual-stack is unavailable. Error: {e}"
        )

    bind_addr = ("::", config.get("hb_port", 50003))
    sock.bind(bind_addr)
    logger.info("Starting UDP server on %s:%s", *bind_addr)

    # Try to enable kernel receive timestamps (Linux SO_TIMESTAMPNS).
    # If supported, read datagrams via recvmsg() so RTT uses the kernel
    # timestamp rather than the time.time() call after asyncio scheduling.
    use_kernel_ts = udp.enable_kernel_timestamps(sock)
    if use_kernel_ts:
        logger.info("SO_TIMESTAMPNS enabled: using kernel receive timestamps for RTT")
    else:
        logger.info("SO_TIMESTAMPNS not available: using time.time() for RTT")

    def udp_handler(msg, addr, transport, recv_ts=None):
        # The context is rebuilt per datagram, so debug/verbose are read
        # from the config on every message.
        ctx = dict(
            config=config,
            hbdclass=hbdclass,
            log=eventlog,
            msg_to_websockets=msg_to_websockets,
            msg_journal=msg_journal,
            threshold_checker=threshold_checker,
            DEBUG=config.get("debug", 0),
            verbose=config.get("verbose", False),
            recv_ts=recv_ts,
        )
        udp.handle_datagram(msg, addr, transport, ctx)

    if use_kernel_ts:
        # recvmsg path: manage the socket ourselves with loop.add_reader()
        sock.setblocking(False)
        transport = udp.RecvmsgTransport(loop, sock)
        reader = udp.make_recvmsg_reader(sock, udp_handler, transport)
        loop.add_reader(sock.fileno(), reader)
    else:
        transport, _ = await loop.create_datagram_endpoint(
            lambda: udp.EchoServerProtocol(config=config, handler=udp_handler),
            sock=sock,
        )

    # Restore connection timers for hosts loaded from pickle
    restore_ctx = dict(
        config=config,
        hbdclass=hbdclass,
        log=eventlog,
        msg_to_websockets=msg_to_websockets,
        threshold_checker=threshold_checker,
    )
    udp.restore_connection_timers(hbdclass, restore_ctx)

    # HTTP server (asyncio-based via aiohttp)
    # Pre-bind to None so the shutdown path below can always reference the
    # name, even when startup fails before the task is created.
    http_task = None
    try:
        http_task = asyncio.create_task(
            http_mod.start(
                host=config.get("hbd_host", ""),
                port=config.get("hbd_port", 50004),
                config=config,
                hbdclass=hbdclass,
                tcss=None,
                verbose=config.get("verbose", False),
                get_now=lambda: time.time(),
                VER="",
            )
        )
        logger.info(
            "HTTP server started on %s:%s",
            config.get("hbd_host", ""),
            config.get("hbd_port", 50004),
        )
    except Exception as e:
        logger.exception("failed to start HTTP server: %s", e)

    # start dns update worker (async)
    dns_task = None
    try:
        dns_task = dns_mod.start_dns_worker(
            hbdclass, config, log=eventlog, loop=loop
        )
        logger.info("dns update worker started")
    except Exception as e:
        logger.exception("dns worker failed to start: %s", e)

    # Start the websocket servers as a background task
    if config.get("wss_port", None):
        ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
        ssl_path = config.get("cert_path", "")
        wss_pem = ssl_path + config.get("wss_pem", "")
        wss_key = ssl_path + config.get("wss_key", "")
        try:
            ssl_context.load_cert_chain(wss_pem, keyfile=wss_key)
        except FileNotFoundError:
            logger.error("error: missing SSL keys %s or %s", wss_pem, wss_key)
            sys.exit(1)
        logger.info(
            "Starting secure WebSocket server on port %s with cert %s",
            config.get("wss_port", None),
            wss_pem,
        )
    else:
        ssl_context = None

    # Same pre-binding as http_task: keeps the cleanup list well-defined.
    ws_task = None
    try:
        ws_port = config.get("ws_port", 50005)
        logger.info("Starting WebSocket server on port %s", ws_port)
        ws_task = asyncio.create_task(
            ws_mod.start(
                host=config.get("hbd_host", ""),
                ws_port=ws_port,
                wss_port=config.get("wss_port", None),
                ssl_context=ssl_context,
                get_hosts=lambda: [
                    hbdclass.Host.hosts[h].stateinfo()
                    for h in sorted(hbdclass.Host.hosts)
                ],
                # get_msgs=lambda: msgs,
                config=config,
            )
        )
        logger.info("WebSocket task started")
    except Exception as e:
        logger.exception("websocket server failed to start: %s", e)

    # Periodic autosave task
    autosave_interval = config.get("autosave_interval", 300)  # default: 5 minutes

    async def autosave_task():
        while True:
            await asyncio.sleep(autosave_interval)
            logger.debug("Autosaving state...")
            save_state(config, hbdclass)
            logger.debug("Autosave complete (%d hosts)", len(hbdclass.Host.hosts))

    autosave = asyncio.create_task(autosave_task())
    logger.info("Autosave task started (interval: %ds)", autosave_interval)

    # Main event loop - monitor shutdown and reload events
    try:
        while True:
            # Wait for either shutdown or reload event
            _, pending = await asyncio.wait(
                [
                    asyncio.create_task(shutdown_event.wait()),
                    asyncio.create_task(reload_event.wait()),
                ],
                return_when=asyncio.FIRST_COMPLETED
            )

            # Shutdown is checked first, so it wins if both events are set.
            if shutdown_event.is_set():
                logger.info("Shutdown signal received, stopping services...")
                # Cancel pending wait tasks
                for task in pending:
                    task.cancel()
                break

            if reload_event.is_set():
                # Clear the event for next reload
                reload_event.clear()

                # Cancel pending wait tasks
                for task in pending:
                    task.cancel()

                # Perform reload if config_path is available
                if config_path:
                    await reload_configuration(config, config_path, components)
                else:
                    logger.warning("Cannot reload: no config path available")

                # Continue main loop
                continue

    except Exception as e:
        logger.exception("Error in main loop: %s", e)
    finally:
        # Cancel all running tasks
        logger.info("Cancelling tasks...")
        try:
            transport.close()
        except Exception as e:
            logger.warning("Error closing UDP transport: %s", e)

        tasks_to_cancel = [http_task, ws_task, autosave]
        for task in tasks_to_cancel:
            if task:
                try:
                    task.cancel()
                    logger.debug("Cancelled task: %s", task)
                except Exception as e:
                    logger.warning("Error cancelling task: %s", e)

        # Wait for tasks to finish cancellation with timeout
        remaining_tasks = [t for t in tasks_to_cancel if t]
        if remaining_tasks:
            try:
                await asyncio.wait_for(
                    asyncio.gather(*remaining_tasks, return_exceptions=True),
                    timeout=2.0,
                )
            except asyncio.TimeoutError:
                logger.warning("Timeout waiting for tasks to cancel")
            except Exception as e:
                logger.debug("Exception during task cancellation: %s", e)

        # Close message journal
        try:
            await msg_journal.close()
        except Exception as e:
            logger.warning("Error closing message journal: %s", e)

        # Signal DNS worker to exit and await it
        try:
            if dns_task:
                try:
                    # A None item is the worker's stop sentinel.
                    hbdclass.Host.dnsQ.put(None)
                except Exception:
                    pass
                try:
                    await asyncio.wait_for(dns_task, timeout=2.0)
                    logger.info("DNS worker finished")
                except asyncio.TimeoutError:
                    logger.warning("Timeout waiting for DNS worker to finish")
                    dns_task.cancel()
                except asyncio.CancelledError:
                    logger.info("DNS worker was cancelled")
                except Exception as e:
                    logger.warning("Error awaiting DNS worker: %s", e)
                finally:
                    # Clear queue bridge to release any held references
                    hbdclass.Host.dnsQ = None
        except Exception as e:
            logger.warning("Error stopping DNS worker: %s", e)

        logger.info("All tasks cancelled")
|
||||
|
||||
|
||||
def load_pickled_hosts(config, hbdclass):
    """Load pickled hosts from file, if available.

    Reads the state file named by ``config["pickfile"]`` (default
    ``hbd.pickle``).  The file holds two consecutive pickles: the host
    table, stored into ``hbdclass.Host.hosts``, and the message list,
    stored into ``data.msgs``.  After loading, each host gets its
    ``dyn`` / ``watched`` flags and access lists refreshed from the
    config, and hosts named in ``config["drophosts"]`` are removed.

    If unpickling fails the error is logged and the state file is deleted.

    NOTE: unpickling can execute arbitrary code, so the state file must
    only ever come from a trusted location.

    Args:
        config: Configuration dictionary.
        hbdclass: Module/namespace providing ``Host`` and ``Connection``.
    """
    import os
    import pickle
    from . import config as config_mod

    pickfile = config.get("pickfile", "hbd.pickle")
    dyndnshosts = config_mod.get_dyndnshosts(config)
    watchhosts = config_mod.get_watchhosts(config)
    drophosts = config.get("drophosts", [])
    verbose = config.get("verbose", False)

    if not os.path.exists(pickfile):
        if verbose:
            logger.info("no pickled data")
        return

    if verbose:
        logger.info("opening pickls %s", pickfile)

    load_failed = False
    # The context manager closes the handle on every path, including a
    # failed load.
    with open(pickfile, "rb") as pickf:
        pick = pickle.Unpickler(pickf)
        try:
            hbdclass.Host.hosts = pick.load()
            data.msgs = pick.load()
        except Exception as e:
            logger.exception("load pickled failed: %s", e)
            load_failed = True
    if load_failed:
        # Unlink only after the handle is closed.
        os.unlink(pickfile)

    hbdclass.Connection.htab = {}
    for h in list(hbdclass.Host.hosts.keys()):
        host = hbdclass.Host.hosts[h]
        host.dyn = h in dyndnshosts
        host.watched = h in watchhosts
        host.fixup()
        access = config_mod.get_host_access(config, h)
        host.apply_access(
            access["owner"], access["managers"], access["monitors"]
        )
    for h in drophosts:
        if h in hbdclass.Host.hosts:
            del hbdclass.Host.hosts[h]
    if verbose:
        logger.info("%s pickled hosts loaded", len(hbdclass.Host.hosts))
|
||||
|
||||
|
||||
def run(config, config_path=None):
    """Start the hbd service and block until it has shut down.

    The event loop is created, driven and closed by hand so that shutdown
    is orderly; the process is then terminated with ``os._exit(0)``.

    Args:
        config: Configuration dictionary
        config_path: Path to config file (for reload support)
    """
    import os

    log_level = logging.INFO
    if config.get("debug", 0) > 0:
        log_level = logging.DEBUG
    logging.basicConfig(level=log_level)
    load_pickled_hosts(config, hbdclass)

    notify_mod.initlog(logfile=config.get("logfile", "messages.log"))
    users_mod.load_users(config)
    eventlog(None, "INFO", f"hbd version {__version__} starting up")

    if not config_path:
        logger.warning("No config path provided - reload via SIGHUP disabled")
    else:
        logger.info(f"Config file: {config_path} (reload with SIGHUP)")

    # Own the event loop explicitly instead of relying on asyncio.run().
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    try:
        loop.run_until_complete(_run_async(config, config_path=config_path))
    except KeyboardInterrupt:
        logger.info("Received KeyboardInterrupt, shutting down...")
    except Exception as e:
        logger.exception("Unhandled exception in main: %s", e)
    finally:
        cleanup_function(config, hbdclass)
        logger.info("hbd shutdown complete")
        eventlog(None, "INFO", f"hbd version {__version__} shutdown")
        notify_mod.closelog()
        # Drain whatever is still scheduled, then close the loop.
        try:
            leftovers = asyncio.all_tasks(loop)
            for leftover in leftovers:
                leftover.cancel()
            if leftovers:
                # One more loop cycle lets the cancellations be delivered.
                loop.run_until_complete(
                    asyncio.gather(*leftovers, return_exceptions=True)
                )
        except Exception:
            pass
        finally:
            loop.close()

    # Hard exit
    os._exit(0)
|
||||
@@ -0,0 +1,28 @@
|
||||
"""Monitor helper for heartbeat daemon.
|
||||
|
||||
This module provides monitoring tasks for the heartbeat daemon.
|
||||
The primary reachability monitoring is now event-driven (timers set/reset
|
||||
on HTB arrival in udp.py) rather than periodic polling.
|
||||
|
||||
This module can be extended for additional monitoring tasks.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
import asyncio
|
||||
import time
|
||||
from . import notify as notify_mod
|
||||
|
||||
DROPOVERDUE = 7 * 24 * 3600
|
||||
eventlog = notify_mod.eventlog
|
||||
|
||||
|
||||
async def cleanup_connections(hbdclass):
    """Cancel the overdue timer of every known connection.

    Walks a snapshot of ``hbdclass.Host.hosts`` and, for each connection
    object that offers ``cancel_overdue_timer``, calls it.  Intended for
    shutdown, so no timer callback fires afterwards.
    """
    for host in list(hbdclass.Host.hosts.values()):
        for conn in host.connections.values():
            if not hasattr(conn, 'cancel_overdue_timer'):
                continue
            conn.cancel_overdue_timer()
|
||||
|
||||
@@ -0,0 +1,326 @@
|
||||
"""Notification helpers: email, pushover, mattermost, signal and dispatcher."""
|
||||
|
||||
import logging
|
||||
from typing import Optional
|
||||
import http.client
|
||||
import urllib.parse
|
||||
import subprocess
|
||||
import smtplib
|
||||
import time
|
||||
import sys
|
||||
from . import data
|
||||
from . import ws as ws_mod
|
||||
from . import main as main_mod
|
||||
|
||||
DEFAULT_PUSHPROVIDERS = ["all", "pushover", "mattermost", "signal"]
|
||||
msg_to_websockets = ws_mod.broadcast
|
||||
|
||||
# module-level configuration set via setup()
|
||||
_config = {}
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
logf = None
|
||||
|
||||
def initlog(logfile):
    """Open the event log for appending and remember it in ``logf``.

    If the file cannot be opened, a notice is printed and ``sys.stderr``
    is used as the log target instead.

    Returns:
        The file object now stored in the module-level ``logf``.
    """
    global logf
    try:
        handle = open(logfile, "a+")
    except Exception as e:
        import sys

        print("cannot open logfile %s, using STDERR: %s" % (logfile, e))
        handle = sys.stderr
    logf = handle
    return logf
|
||||
|
||||
def closelog():
    """Close the event log file opened by ``initlog``.

    ``sys.stderr`` (the fallback target) is never closed.  After a real
    file has been closed, ``logf`` is reset to ``None`` so that later
    ``eventlog`` calls skip the file write instead of failing on a closed
    handle.
    """
    global logf
    if logf and logf != sys.stderr:
        try:
            logf.close()
        except Exception:
            # Best effort: a failing close must not abort shutdown.
            pass
        logf = None
|
||||
|
||||
def eventlog(host, lvl, m, service=None):
    """Record one event line in every sink.

    The line is ``"<YYYY-mm-dd HH:MM:SS> <lvl> [<host> ]<m>"`` in local
    time.  It is appended to ``data.msgs``, logged at INFO level, written
    to the log file when one is open, and broadcast to WebSocket clients
    as a ``"message"``.

    Args:
        host: Host name to include, or a falsy value to omit it.
        lvl: Level label placed after the timestamp.
        m: Message text.
        service: Accepted but not used by this function.
    """
    stamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    host_part = f"{host} " if host else ""
    line = f"{stamp} {lvl} {host_part}" + m

    data.msgs.append(line)
    logger.info(line)
    if logf:
        try:
            logf.write(line + "\n")
            logf.flush()
        except Exception as e:
            logger.warning("failed to write to logfile: %s", e)
    msg_to_websockets("message", line)
|
||||
|
||||
def setup(cfg: dict):
|
||||
"""Initialize notifier defaults from a configuration dict."""
|
||||
global _config
|
||||
_config = dict(cfg)
|
||||
|
||||
|
||||
def reload_config(cfg: dict):
    """Replace the notifier's module configuration at runtime.

    Used during config reloads: stores a shallow copy of *cfg* exactly as
    ``setup`` does, then logs that the reload happened.

    Args:
        cfg: New configuration dictionary
    """
    setup(cfg)
    logger.info("Notification configuration reloaded")
|
||||
|
||||
|
||||
def send_email(toaddrs, smtpserver, sender, subject, body, debug=0):
    """Send a plain email via SMTP. Returns True on success.

    Port, user and password come from the module configuration
    (``smtpport`` defaults to 587, ``smtpuser``, ``smtppassword``).
    STARTTLS is negotiated only when the port is 587, and login is
    attempted only when both user and password are set.

    Args:
        toaddrs: Recipient address list passed to ``sendmail``.
        smtpserver: SMTP server host.
        sender: Envelope sender address.
        subject: Unused here; the caller embeds the subject in *body*.
        body: Complete message text including headers.
        debug: Values above 0 enable smtplib protocol tracing.

    Returns:
        True if the message was handed to the server, False on any error.
    """
    # Bound up front so cleanup is well-defined even when connecting fails.
    server = None
    try:
        smtpport = _config.get("smtpport", 587)
        server = smtplib.SMTP(smtpserver, smtpport)
        if debug > 0:
            server.set_debuglevel(1)
        if smtpport == 587:
            server.starttls()
            server.ehlo()
        smtpuser = _config.get("smtpuser", None)
        smtppassword = _config.get("smtppassword", None)
        if smtpuser and smtppassword:
            server.login(smtpuser, smtppassword)
        server.sendmail(sender, toaddrs, body)
        return True
    except Exception as e:
        logger.warning("email send failed: %s", e)
        return False
    finally:
        if server is not None:
            try:
                server.quit()
            except Exception:
                # Best effort: the send result is already decided.
                pass
|
||||
|
||||
|
||||
def email(subject: str, msg: str, debug: int = 0) -> bool:
    """Send *msg* by e-mail using the module-level configuration.

    Recipients (``toemail``), sender (``fromemail``) and server
    (``smtpserver``) are read from the module configuration; if any of
    them is missing a warning is logged and nothing is sent.  The
    ``To:`` header carries only the first recipient.

    Returns:
        The result of ``send_email``, or False when the config is incomplete.
    """
    toaddrs = _config.get("toemail")
    fromemail = _config.get("fromemail")
    smtpserver = _config.get("smtpserver")
    if not (toaddrs and fromemail and smtpserver):
        logger.warning(
            "email config incomplete: toemail=%s, fromemail=%s, smtpserver=%s",
            toaddrs,
            fromemail,
            smtpserver,
        )
        return False

    date = time.strftime("%a, %d %b %Y %H:%M:%S %z", time.localtime())
    first_recipient = toaddrs[0] if toaddrs else ""
    body = (
        f"To: {first_recipient!s}\n"
        f"From: {fromemail!s}\n"
        f"Subject: {subject!s}\n"
        f"Date: {date!s}\n"
        f"\n{msg!s}"
    )
    return send_email(toaddrs, smtpserver, fromemail, subject, body, debug=debug)
|
||||
|
||||
|
||||
def pushover(token: str, user: str, msg: str, debug: int = 0) -> bool:
    """Send message via Pushover API.

    Args:
        token: Pushover application token.
        user: Pushover user key.
        msg: Message text.
        debug: Accepted for signature parity; not used here.

    Returns:
        True when the API answers with HTTP 200, False otherwise
        (including on any connection error).
    """
    conn = http.client.HTTPSConnection("api.pushover.net:443")
    try:
        conn.request(
            "POST",
            "/1/messages.json",
            urllib.parse.urlencode({"token": token, "user": user, "message": msg}),
            {"Content-type": "application/x-www-form-urlencoded"},
        )
        r = conn.getresponse()
        logger.debug("pushover response: %s %s", r.status, r.reason)
        return r.status == 200
    except Exception as e:
        logger.error("pushover error: %s", e)
        return False
    finally:
        # Release the socket on every path.
        conn.close()
|
||||
|
||||
|
||||
def pushmattermost(
    host: str,
    token: str,
    channel: str,
    msg: str,
    username: str = "hbd",
    icon: Optional[str] = None,
    debug: int = 0,
) -> bool:
    """Post *msg* to Mattermost through an incoming webhook.

    Requires the optional ``mattermostdriver`` package; when it cannot be
    imported the function returns False without doing anything.  The
    driver is configured for plain HTTP on port 8065 with base path
    ``/api/v4``.

    Returns:
        True when the webhook call returns ``None`` or an empty string,
        False otherwise or on any error.
    """
    try:
        from mattermostdriver import Driver
    except Exception:
        return False

    driver = Driver(
        {"url": host, "scheme": "http", "basepath": "/api/v4", "port": 8065}
    )
    payload = {"text": msg, "channel": channel, "username": username}
    if icon:
        payload["icon_url"] = icon

    try:
        rc = driver.webhooks.call_webhook(token, payload)
        logger.debug("mattermost rc: %s", rc)
    except Exception as e:
        logger.error("mattermost error: %s", e)
        return False
    return bool(rc is None or rc == "")
|
||||
|
||||
|
||||
def pushsignal(
    signal_cli_bin: str, user: str, recipient: str, msg: str, debug: int = 0
) -> bool:
    """Send a message via signal-cli (requires local installation).

    Runs ``<signal_cli_bin> -u <user> send -m <msg> <recipient>`` as a
    subprocess, with an argument list and no shell.

    Args:
        signal_cli_bin: Path of the signal-cli executable.
        user: Sending account passed to ``-u``.
        recipient: Receiving account.
        msg: Message text.
        debug: Accepted for signature parity; not used here.

    Returns:
        True if the command exited with status 0, False otherwise.
    """
    CLI = [signal_cli_bin, "-u", user, "send", "-m", msg, recipient]
    logger.debug("signal cli: %s", CLI)
    try:
        res = subprocess.run(CLI, capture_output=True)
        if res.returncode != 0:
            # errors="replace" keeps logging from failing on odd bytes.
            logger.error("signal failed: %s", res.stderr.decode(errors="replace"))
            return False
        logger.debug("signal sent: %s", res.stdout.decode(errors="replace"))
        return True
    except Exception as e:
        logger.exception("signal exception: %s", e)
        return False
|
||||
|
||||
|
||||
def _dispatch_to_channel(channel_name: str, channel_config: dict, msg: str, debug: int = 0) -> bool:
|
||||
"""Dispatch a message to a specific notification channel.
|
||||
|
||||
Args:
|
||||
channel_name: Name of the channel (for logging)
|
||||
channel_config: Channel configuration dictionary with 'type' and type-specific fields
|
||||
msg: Message to send
|
||||
debug: Debug level
|
||||
|
||||
Returns:
|
||||
True if notification sent successfully, False otherwise
|
||||
"""
|
||||
channel_type = channel_config.get("type")
|
||||
|
||||
if channel_type == "pushover":
|
||||
return pushover(
|
||||
channel_config.get("token", ""),
|
||||
channel_config.get("user", ""),
|
||||
msg,
|
||||
debug=debug
|
||||
)
|
||||
|
||||
elif channel_type == "email":
|
||||
# Build email from channel config
|
||||
recipients = channel_config.get("recipients", [])
|
||||
sender = channel_config.get("sender", "")
|
||||
smtp_server = channel_config.get("smtp_server", "")
|
||||
smtp_port = channel_config.get("smtp_port", 587)
|
||||
smtp_user = channel_config.get("smtp_user")
|
||||
smtp_password = channel_config.get("smtp_password")
|
||||
|
||||
if not recipients or not sender or not smtp_server:
|
||||
logger.warning(
|
||||
"Email channel '%s' missing required fields: recipients=%s, sender=%s, smtp_server=%s",
|
||||
channel_name, recipients, sender, smtp_server
|
||||
)
|
||||
return False
|
||||
|
||||
# Temporarily update _config for email() function
|
||||
old_config = dict(_config)
|
||||
_config["toemail"] = recipients
|
||||
_config["fromemail"] = sender
|
||||
_config["smtpserver"] = smtp_server
|
||||
_config["smtpport"] = smtp_port
|
||||
if smtp_user:
|
||||
_config["smtpuser"] = smtp_user
|
||||
if smtp_password:
|
||||
_config["smtppassword"] = smtp_password
|
||||
|
||||
result = email("Heartbeat notification", msg, debug=debug)
|
||||
|
||||
# Restore config
|
||||
_config.clear()
|
||||
_config.update(old_config)
|
||||
|
||||
return result
|
||||
|
||||
elif channel_type == "signal":
|
||||
return pushsignal(
|
||||
channel_config.get("cli_path", "/usr/local/bin/signal-cli"),
|
||||
channel_config.get("user", ""),
|
||||
channel_config.get("recipient", ""),
|
||||
msg,
|
||||
debug=debug
|
||||
)
|
||||
|
||||
elif channel_type == "mattermost":
|
||||
return pushmattermost(
|
||||
channel_config.get("host", ""),
|
||||
channel_config.get("token", ""),
|
||||
channel_config.get("channel", ""),
|
||||
msg,
|
||||
username=channel_config.get("username", "hbd"),
|
||||
icon=channel_config.get("icon"),
|
||||
debug=debug
|
||||
)
|
||||
|
||||
else:
|
||||
logger.warning("Unknown channel type '%s' for channel '%s'", channel_type, channel_name)
|
||||
return False
|
||||
|
||||
|
||||
def pushmsg_for_host(hostname: str, msg: str, debug: int = 0) -> dict:
    """Notify every channel configured for *hostname*.

    The channel list is resolved from the module configuration via
    ``config.get_notification_channels_config``; each channel is tried
    independently, so one failing channel does not stop the others.

    Args:
        hostname: Name of the host to send notification for
        msg: Message to send
        debug: Debug level

    Returns:
        Mapping of channel name to True/False delivery result; empty when
        the host has no channels configured.
    """
    from . import config as config_mod

    channels = config_mod.get_notification_channels_config(_config, hostname)
    if not channels:
        logger.warning("No notification channels configured for host '%s'", hostname)
        return {}

    outcome = {}
    for name, settings in channels:
        try:
            delivered = _dispatch_to_channel(name, settings, msg, debug=debug)
            outcome[name] = delivered
            if not delivered:
                logger.warning("Failed to send notification to channel '%s'", name)
            else:
                logger.info("Notification sent to channel '%s': %s", name, msg)
        except Exception as e:
            logger.error("Error sending to channel '%s': %s", name, e)
            outcome[name] = False

    return outcome
|
||||
@@ -0,0 +1,330 @@
|
||||
"""Settings descriptor: maps config keys to display metadata.
|
||||
|
||||
``get_settings_sections(config)`` returns an ordered list of sections, each
|
||||
containing a list of field descriptors. The template iterates this structure
|
||||
generically, so adding editability later is a matter of:
|
||||
|
||||
1. Setting ``"editable": True`` on a field.
|
||||
2. Adding the matching ``<input>``/``<select>`` in the template
|
||||
(guided by ``"type"``).
|
||||
3. Wiring a POST handler in http.py.
|
||||
|
||||
Field descriptor keys
|
||||
---------------------
|
||||
key str Config key (for future form POST matching)
|
||||
label str Human-readable label
|
||||
description str One-line help text shown below the value
|
||||
value any Sanitized display value (secrets replaced with "•••")
|
||||
type str One of: text | number | port | boolean | path | duration |
|
||||
list | secret | size | select
|
||||
editable bool Reserved for future use — currently always False
|
||||
sensitive bool True when the raw value must never be shown
|
||||
"""
|
||||
|
||||
# Credential field names that should always be masked.
|
||||
_SECRET_KEYS = frozenset({
|
||||
"password", "token", "user_key", "api_key", "secret",
|
||||
"smtp_password", "smtp_user",
|
||||
})
|
||||
|
||||
_CHANNEL_TYPE_LABELS = {
|
||||
"pushover": "Pushover",
|
||||
"email": "E-mail",
|
||||
"signal": "Signal",
|
||||
"mattermost": "Mattermost",
|
||||
}
|
||||
|
||||
|
||||
def _mask(value):
|
||||
"""Return a masked placeholder for sensitive values."""
|
||||
if not value:
|
||||
return ""
|
||||
return "•••"
|
||||
|
||||
|
||||
def _fmt_size(n):
|
||||
"""Format a byte count as a human-readable string."""
|
||||
try:
|
||||
n = int(n)
|
||||
except (TypeError, ValueError):
|
||||
return str(n)
|
||||
for unit in ("B", "KB", "MB", "GB"):
|
||||
if n < 1024:
|
||||
return f"{n} {unit}"
|
||||
n //= 1024
|
||||
return f"{n} TB"
|
||||
|
||||
|
||||
def _fmt_duration(seconds):
|
||||
"""Format seconds into a human-readable duration string."""
|
||||
try:
|
||||
s = int(seconds)
|
||||
except (TypeError, ValueError):
|
||||
return str(seconds)
|
||||
if s < 60:
|
||||
return f"{s}s"
|
||||
if s < 3600:
|
||||
m, sec = divmod(s, 60)
|
||||
return f"{m}m {sec}s" if sec else f"{m}m"
|
||||
h, rem = divmod(s, 3600)
|
||||
m = rem // 60
|
||||
return f"{h}h {m}m" if m else f"{h}h"
|
||||
|
||||
|
||||
def _sanitize_channel(name, cfg):
    """Return a new dict mirroring *cfg* with secret fields masked.

    Keys listed in ``_SECRET_KEYS`` get their value replaced by
    ``_mask(value)``; every other value (lists included) is carried over
    as the same object.  *name* is accepted but not used.
    """
    return {
        key: (_mask(value) if key in _SECRET_KEYS else value)
        for key, value in cfg.items()
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def get_settings_sections(config: dict) -> list:
|
||||
"""Return ordered list of setting sections for the settings page.
|
||||
|
||||
Each section:
|
||||
{
|
||||
"title": str,
|
||||
"description": str,
|
||||
"fields": [ field_descriptor, ... ]
|
||||
}
|
||||
|
||||
Each field_descriptor:
|
||||
{
|
||||
"key": str,
|
||||
"label": str,
|
||||
"description": str,
|
||||
"value": display_value,
|
||||
"raw": raw_config_value, # None for sensitive
|
||||
"type": str,
|
||||
"editable": bool,
|
||||
"sensitive": bool,
|
||||
}
|
||||
"""
|
||||
def field(key, label, ftype, description="", editable=False, sensitive=False):
|
||||
raw = config.get(key)
|
||||
if sensitive:
|
||||
display = _mask(raw)
|
||||
raw_out = None
|
||||
elif ftype == "size":
|
||||
display = _fmt_size(raw)
|
||||
raw_out = raw
|
||||
elif ftype == "duration":
|
||||
display = _fmt_duration(raw)
|
||||
raw_out = raw
|
||||
elif ftype == "boolean":
|
||||
display = bool(raw)
|
||||
raw_out = raw
|
||||
elif ftype == "list":
|
||||
val = raw or []
|
||||
display = list(val) if not isinstance(val, list) else val
|
||||
raw_out = display
|
||||
else:
|
||||
display = raw if raw is not None else ""
|
||||
raw_out = raw
|
||||
return {
|
||||
"key": key,
|
||||
"label": label,
|
||||
"description": description,
|
||||
"value": display,
|
||||
"raw": raw_out,
|
||||
"type": ftype,
|
||||
"editable": editable,
|
||||
"sensitive": sensitive,
|
||||
}
|
||||
|
||||
# ---- Notification channels (complex, built separately) ----------------
|
||||
notif_channels = []
|
||||
for ch_name, ch_cfg in (config.get("notification_channels") or {}).items():
|
||||
if not isinstance(ch_cfg, dict):
|
||||
continue
|
||||
ch_type = ch_cfg.get("type", "")
|
||||
fields = []
|
||||
for k, v in ch_cfg.items():
|
||||
if k == "type":
|
||||
continue
|
||||
sensitive = k in _SECRET_KEYS
|
||||
fields.append({
|
||||
"key": k,
|
||||
"label": k.replace("_", " ").title(),
|
||||
"value": _mask(v) if sensitive else (
|
||||
", ".join(v) if isinstance(v, list) else str(v)
|
||||
),
|
||||
"sensitive": sensitive,
|
||||
})
|
||||
notif_channels.append({
|
||||
"name": ch_name,
|
||||
"type": ch_type,
|
||||
"type_label": _CHANNEL_TYPE_LABELS.get(ch_type, ch_type.title()),
|
||||
"fields": fields,
|
||||
})
|
||||
|
||||
# ---- Users (show metadata only, never password hashes) ----------------
|
||||
users_list = []
|
||||
for username, attrs in (config.get("users") or {}).items():
|
||||
if not isinstance(attrs, dict):
|
||||
continue
|
||||
users_list.append({
|
||||
"username": username,
|
||||
"full_name": attrs.get("full_name", ""),
|
||||
"admin": bool(attrs.get("admin", False)),
|
||||
"avatar": attrs.get("avatar", ""),
|
||||
"notification_channels": attrs.get("notification_channels", []),
|
||||
})
|
||||
|
||||
# ---- Hosts summary ----------------------------------------------------
|
||||
hosts_list = []
|
||||
for hname, hcfg in (config.get("hosts") or {}).items():
|
||||
if not isinstance(hcfg, dict):
|
||||
continue
|
||||
hosts_list.append({
|
||||
"name": hname,
|
||||
"watch": bool(hcfg.get("watch", False)),
|
||||
"dyndns": bool(hcfg.get("dyndns", False)),
|
||||
"owner": hcfg.get("owner", ""),
|
||||
"managers": hcfg.get("managers", []),
|
||||
"monitors": hcfg.get("monitors", []),
|
||||
"threshold_config": hcfg.get("threshold_config", ""),
|
||||
"notification_channels": hcfg.get("notification_channels", []),
|
||||
})
|
||||
|
||||
return [
|
||||
{
|
||||
"id": "network",
|
||||
"title": "Network",
|
||||
"description": "Ports and bind addresses for all server sockets.",
|
||||
"fields": [
|
||||
field("hb_port", "Heartbeat UDP port", "port",
|
||||
"UDP port the server listens on for heartbeat datagrams."),
|
||||
field("hbd_host", "HTTP bind address", "text",
|
||||
"Interface to bind the HTTP server to. Empty = all interfaces."),
|
||||
field("hbd_port", "HTTP API port", "port",
|
||||
"TCP port for the HTTP API and web UI."),
|
||||
field("ws_port", "WebSocket port", "port",
|
||||
"TCP port for the plain WebSocket server."),
|
||||
field("wss_port", "Secure WebSocket port", "port",
|
||||
"TCP port for WSS (TLS WebSocket). Leave empty to disable."),
|
||||
],
|
||||
},
|
||||
{
|
||||
"id": "tls",
|
||||
"title": "TLS / WebSocket Security",
|
||||
"description": "Certificate paths used when wss_port is set.",
|
||||
"fields": [
|
||||
field("cert_path", "Certificate directory", "path",
|
||||
"Directory containing the TLS certificate and key files."),
|
||||
field("wss_pem", "Certificate file", "text",
|
||||
"Filename of the TLS certificate chain (PEM format)."),
|
||||
field("wss_key", "Key file", "text",
|
||||
"Filename of the TLS private key (PEM format)."),
|
||||
],
|
||||
},
|
||||
{
|
||||
"id": "monitoring",
|
||||
"title": "Monitoring",
|
||||
"description": "Heartbeat timing and alert re-notification behaviour.",
|
||||
"fields": [
|
||||
field("interval", "Heartbeat interval", "duration",
|
||||
"Expected time between heartbeat messages from each client."),
|
||||
field("grace", "Grace multiplier", "number",
|
||||
"A host is marked overdue after interval × grace seconds of silence."),
|
||||
field("threshold_renotify_interval", "Re-notify interval", "duration",
|
||||
"How often to re-send notifications for ongoing threshold alerts."),
|
||||
field("autosave_interval", "Autosave interval", "duration",
|
||||
"How often the server saves its state to disk."),
|
||||
],
|
||||
},
|
||||
{
|
||||
"id": "persistence",
|
||||
"title": "Persistence & Logging",
|
||||
"description": "State file and event log settings.",
|
||||
"fields": [
|
||||
field("pickfile", "State file", "path",
|
||||
"Path to the pickle file used to persist host state across restarts."),
|
||||
field("logfile", "Event log", "path",
|
||||
"Path to the event log file."),
|
||||
field("logfmt", "Log format", "select",
|
||||
"Format for event log entries: text, msg, or json."),
|
||||
],
|
||||
},
|
||||
{
|
||||
"id": "journal",
|
||||
"title": "Message Journal",
|
||||
"description": "All received heartbeat and plugin messages are journalled here.",
|
||||
"fields": [
|
||||
field("journal_enabled", "Enabled", "boolean",
|
||||
"Turn journalling on or off."),
|
||||
field("journal_dir", "Journal directory","path",
|
||||
"Directory where journal files are written."),
|
||||
field("journal_file", "Journal filename", "text",
|
||||
"Base filename for the journal (rotated copies get a numeric suffix)."),
|
||||
field("journal_max_size", "Max file size", "size",
|
||||
"Rotate the journal when it exceeds this size."),
|
||||
field("journal_max_backups", "Backup count", "number",
|
||||
"Number of rotated journal files to keep."),
|
||||
],
|
||||
},
|
||||
{
|
||||
"id": "dns",
|
||||
"title": "Dynamic DNS",
|
||||
"description": "nsupdate-based DNS registration for dynamic hosts.",
|
||||
"fields": [
|
||||
field("nsupdate_bin", "nsupdate binary", "path",
|
||||
"Full path to the nsupdate executable."),
|
||||
field("dyndomains", "Dynamic domains", "list",
|
||||
"DNS zones managed by nsupdate for dynamic hosts."),
|
||||
field("drophosts", "Drop hosts", "list",
|
||||
"Hostnames to silently ignore — no state, no alerts."),
|
||||
],
|
||||
},
|
||||
{
|
||||
"id": "users",
|
||||
"title": "Users",
|
||||
"description": "Accounts defined in the config file. Password hashes are never shown.",
|
||||
"users": users_list,
|
||||
"fields": [
|
||||
field("default_owner", "Default owner", "text",
|
||||
"Username that owns hosts with no explicit owner. "
|
||||
"Falls back to the first admin user."),
|
||||
],
|
||||
},
|
||||
{
|
||||
"id": "channels",
|
||||
"title": "Notification Channels",
|
||||
"description": "Named notification providers. Credentials are masked.",
|
||||
"channels": notif_channels,
|
||||
"fields": [
|
||||
field("default_notification_channels", "Default channels", "list",
|
||||
"Channels used when a host does not specify its own."),
|
||||
],
|
||||
},
|
||||
{
|
||||
"id": "hosts",
|
||||
"title": "Hosts",
|
||||
"description": "Host definitions loaded from the config file.",
|
||||
"hosts": hosts_list,
|
||||
"fields": [],
|
||||
},
|
||||
{
|
||||
"id": "runtime",
|
||||
"title": "Runtime",
|
||||
"description": "Flags set at startup (require restart to change).",
|
||||
"fields": [
|
||||
field("foreground", "Foreground mode", "boolean",
|
||||
"Run in the foreground instead of daemonising."),
|
||||
field("verbose", "Verbose logging", "boolean",
|
||||
"Enable verbose log output."),
|
||||
field("debug", "Debug level", "number",
|
||||
"0 = off. Higher values increase log verbosity."),
|
||||
],
|
||||
},
|
||||
]
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 181 KiB |
@@ -139,4 +139,5 @@
|
||||
font-size: 9px;
|
||||
float: left;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,557 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
{% include 'head.html' %}
|
||||
|
||||
<style>
|
||||
body {
|
||||
margin: 20px;
|
||||
background: #f5f5f5;
|
||||
}
|
||||
|
||||
.container {
|
||||
max-width: 1400px;
|
||||
margin: 0 auto;
|
||||
}
|
||||
|
||||
h1 {
|
||||
color: #333;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
|
||||
.subtitle {
|
||||
color: #666;
|
||||
margin-bottom: 30px;
|
||||
}
|
||||
|
||||
.summary-cards {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
|
||||
gap: 20px;
|
||||
margin-bottom: 30px;
|
||||
}
|
||||
|
||||
.summary-card {
|
||||
background: white;
|
||||
border-radius: 8px;
|
||||
padding: 20px;
|
||||
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.summary-card.critical {
|
||||
border-left: 5px solid #f44336;
|
||||
}
|
||||
|
||||
.summary-card.warning {
|
||||
border-left: 5px solid #ff9800;
|
||||
}
|
||||
|
||||
.summary-card.ok {
|
||||
border-left: 5px solid #4caf50;
|
||||
}
|
||||
|
||||
.summary-number {
|
||||
font-size: 3em;
|
||||
font-weight: bold;
|
||||
margin: 10px 0;
|
||||
}
|
||||
|
||||
.summary-number.critical {
|
||||
color: #f44336;
|
||||
}
|
||||
|
||||
.summary-number.warning {
|
||||
color: #ff9800;
|
||||
}
|
||||
|
||||
.summary-number.ok {
|
||||
color: #4caf50;
|
||||
}
|
||||
|
||||
.summary-label {
|
||||
color: #666;
|
||||
text-transform: uppercase;
|
||||
font-size: 0.9em;
|
||||
letter-spacing: 1px;
|
||||
}
|
||||
|
||||
.filters {
|
||||
background: white;
|
||||
border-radius: 8px;
|
||||
padding: 15px;
|
||||
margin-bottom: 20px;
|
||||
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
|
||||
display: flex;
|
||||
gap: 15px;
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
.filter-label {
|
||||
font-weight: bold;
|
||||
color: #555;
|
||||
}
|
||||
|
||||
.filter-button {
|
||||
padding: 8px 16px;
|
||||
border: 2px solid #ddd;
|
||||
background: white;
|
||||
border-radius: 20px;
|
||||
cursor: pointer;
|
||||
transition: all 0.2s;
|
||||
font-size: 0.9em;
|
||||
}
|
||||
|
||||
.filter-button:hover {
|
||||
border-color: #2196f3;
|
||||
}
|
||||
|
||||
.filter-button.active {
|
||||
background: #2196f3;
|
||||
color: white;
|
||||
border-color: #2196f3;
|
||||
}
|
||||
|
||||
.alerts-container {
|
||||
background: white;
|
||||
border-radius: 8px;
|
||||
padding: 20px;
|
||||
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
|
||||
}
|
||||
|
||||
.alert-item {
|
||||
border-left: 5px solid #ddd;
|
||||
padding: 15px;
|
||||
margin-bottom: 15px;
|
||||
background: #fafafa;
|
||||
border-radius: 4px;
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
transition: all 0.2s;
|
||||
}
|
||||
|
||||
.alert-item.acknowledged {
|
||||
opacity: 0.6;
|
||||
background: #f0f0f0;
|
||||
}
|
||||
|
||||
.alert-item:hover {
|
||||
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
|
||||
transform: translateX(5px);
|
||||
}
|
||||
|
||||
.alert-item.critical {
|
||||
border-left-color: #f44336;
|
||||
background: #ffebee;
|
||||
}
|
||||
|
||||
.alert-item.warning {
|
||||
border-left-color: #ff9800;
|
||||
background: #fff3e0;
|
||||
}
|
||||
|
||||
.alert-item.unknown {
|
||||
border-left-color: #9e9e9e;
|
||||
background: #f5f5f5;
|
||||
}
|
||||
|
||||
.alert-main {
|
||||
flex: 1;
|
||||
}
|
||||
|
||||
.alert-header {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 15px;
|
||||
margin-bottom: 8px;
|
||||
}
|
||||
|
||||
.alert-level {
|
||||
padding: 4px 12px;
|
||||
border-radius: 12px;
|
||||
font-size: 0.75em;
|
||||
font-weight: bold;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.5px;
|
||||
}
|
||||
|
||||
.alert-level.critical {
|
||||
background: #f44336;
|
||||
color: white;
|
||||
}
|
||||
|
||||
.alert-level.warning {
|
||||
background: #ff9800;
|
||||
color: white;
|
||||
}
|
||||
|
||||
.alert-level.unknown {
|
||||
background: #9e9e9e;
|
||||
color: white;
|
||||
}
|
||||
|
||||
.alert-hostname {
|
||||
font-weight: bold;
|
||||
color: #333;
|
||||
font-size: 1.1em;
|
||||
}
|
||||
|
||||
.alert-metric {
|
||||
color: #666;
|
||||
font-family: 'Courier New', monospace;
|
||||
font-size: 0.9em;
|
||||
}
|
||||
|
||||
.alert-details {
|
||||
display: flex;
|
||||
gap: 20px;
|
||||
color: #666;
|
||||
font-size: 0.9em;
|
||||
}
|
||||
|
||||
.alert-value {
|
||||
font-weight: bold;
|
||||
color: #333;
|
||||
}
|
||||
|
||||
.alert-duration {
|
||||
color: #999;
|
||||
font-size: 0.85em;
|
||||
}
|
||||
|
||||
.alert-actions {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 8px;
|
||||
margin-left: 15px;
|
||||
}
|
||||
|
||||
.acknowledge-btn {
|
||||
padding: 8px 16px;
|
||||
background: #2196f3;
|
||||
color: white;
|
||||
border: none;
|
||||
border-radius: 4px;
|
||||
cursor: pointer;
|
||||
font-size: 0.85em;
|
||||
transition: all 0.2s;
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
.acknowledge-btn:hover {
|
||||
background: #1976d2;
|
||||
transform: scale(1.05);
|
||||
}
|
||||
|
||||
.acknowledge-btn:disabled {
|
||||
background: #ccc;
|
||||
cursor: not-allowed;
|
||||
transform: none;
|
||||
}
|
||||
|
||||
.acknowledged-badge {
|
||||
padding: 4px 8px;
|
||||
background: #4caf50;
|
||||
color: white;
|
||||
border-radius: 4px;
|
||||
font-size: 0.75em;
|
||||
text-align: center;
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
.no-alerts {
|
||||
text-align: center;
|
||||
padding: 60px 20px;
|
||||
color: #999;
|
||||
}
|
||||
|
||||
.no-alerts-icon {
|
||||
font-size: 4em;
|
||||
margin-bottom: 20px;
|
||||
}
|
||||
|
||||
.loading {
|
||||
text-align: center;
|
||||
padding: 40px;
|
||||
color: #666;
|
||||
}
|
||||
|
||||
.error {
|
||||
background: #ffebee;
|
||||
border-left: 4px solid #f44336;
|
||||
padding: 20px;
|
||||
margin: 20px 0;
|
||||
border-radius: 4px;
|
||||
color: #c62828;
|
||||
}
|
||||
|
||||
.refresh-info {
|
||||
text-align: center;
|
||||
color: #999;
|
||||
font-size: 0.85em;
|
||||
margin-top: 20px;
|
||||
padding-top: 20px;
|
||||
border-top: 1px solid #e0e0e0;
|
||||
}
|
||||
|
||||
.last-update {
|
||||
color: #666;
|
||||
font-size: 0.9em;
|
||||
text-align: right;
|
||||
margin-bottom: 15px;
|
||||
}
|
||||
</style>
|
||||
|
||||
<body>
|
||||
{% include 'nav.html' %}
|
||||
|
||||
<div class="container">
|
||||
<h1>{{ header }}</h1>
|
||||
<p class="subtitle">Real-time monitoring alerts and threshold violations</p>
|
||||
|
||||
<div class="summary-cards" id="summary-cards">
|
||||
<div class="summary-card critical">
|
||||
<div class="summary-label">Critical</div>
|
||||
<div class="summary-number critical" id="critical-count">-</div>
|
||||
</div>
|
||||
<div class="summary-card warning">
|
||||
<div class="summary-label">Warning</div>
|
||||
<div class="summary-number warning" id="warning-count">-</div>
|
||||
</div>
|
||||
<div class="summary-card ok">
|
||||
<div class="summary-label">Total Hosts</div>
|
||||
<div class="summary-number ok" id="host-count">-</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="filters">
|
||||
<span class="filter-label">Show:</span>
|
||||
<button class="filter-button active" onclick="filterAlerts('all')">All</button>
|
||||
<button class="filter-button" onclick="filterAlerts('critical')">Critical Only</button>
|
||||
<button class="filter-button" onclick="filterAlerts('warning')">Warning Only</button>
|
||||
</div>
|
||||
|
||||
<div class="alerts-container">
|
||||
<div class="last-update">Last updated: <span id="last-update-time">Never</span></div>
|
||||
<div id="alerts-list">
|
||||
<div class="loading">Loading alerts...</div>
|
||||
</div>
|
||||
<div class="refresh-info">
|
||||
Auto-refreshing every 15 seconds
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
let currentFilter = 'all';
|
||||
let allAlerts = [];
|
||||
|
||||
/**
 * Fetch the current alerts from `/api/0/alerts` and refresh the page.
 *
 * On success the module-level `allAlerts` list is replaced, the three
 * summary counters and the "last updated" clock are refreshed, and the list
 * is re-rendered through `renderAlerts`. Any failure (network error, non-2xx
 * status, malformed payload) is shown inside the `#alerts-list` element.
 */
async function loadAlerts() {
    try {
        const response = await fetch('/api/0/alerts');
        if (!response.ok) {
            throw new Error(`HTTP ${response.status}`);
        }

        const data = await response.json();
        // Fail with a readable message instead of an opaque TypeError deep
        // inside the renderer when the payload has no alert list.
        if (!data || !Array.isArray(data.alerts)) {
            throw new Error('malformed response: "alerts" is not a list');
        }
        allAlerts = data.alerts;

        // Update summary cards
        document.getElementById('critical-count').textContent = data.summary.critical || 0;
        document.getElementById('warning-count').textContent = data.summary.warning || 0;
        document.getElementById('host-count').textContent = data.host_count || 0;

        // Update last update time
        document.getElementById('last-update-time').textContent = new Date().toLocaleTimeString();

        // Render alerts
        renderAlerts(allAlerts);
    } catch (error) {
        // Build the error box with textContent so the message text is never
        // parsed as HTML.
        const box = document.createElement('div');
        box.className = 'error';
        box.textContent = `Failed to load alerts: ${error.message}`;

        const list = document.getElementById('alerts-list');
        list.innerHTML = '';
        list.appendChild(box);
    }
}
|
||||
|
||||
/**
 * Render the given alerts into `#alerts-list`, honouring `currentFilter`.
 *
 * When nothing matches, a placeholder is shown instead: "All Systems Normal"
 * if there are no alerts at all, otherwise "No <filter> alerts".
 *
 * @param {Array<Object>} alerts - Alert objects; each must carry a string `level`.
 */
function renderAlerts(alerts) {
    const listEl = document.getElementById('alerts-list');

    // Narrow the list down to the selected severity, if any.
    const visible = currentFilter === 'all'
        ? alerts
        : alerts.filter(item => item.level.toLowerCase() === currentFilter);

    if (visible.length !== 0) {
        listEl.innerHTML = visible.map(item => renderAlert(item)).join('');
        return;
    }

    const nothingAtAll = currentFilter === 'all' && alerts.length === 0;
    if (nothingAtAll) {
        listEl.innerHTML = `
            <div class="no-alerts">
                <div class="no-alerts-icon">✓</div>
                <h2>All Systems Normal</h2>
                <p>No active alerts at this time</p>
            </div>
        `;
    } else {
        listEl.innerHTML = `
            <div class="no-alerts">
                <p>No ${currentFilter} alerts</p>
            </div>
        `;
    }
}
|
||||
|
||||
/**
 * Build the HTML fragment for a single alert entry.
 *
 * Every value taken from the alert object is escaped before it is placed in
 * the markup, so host names, metric paths or messages containing `<`, `"` or
 * `'` can neither break the layout nor inject script.
 *
 * @param {Object} alert - Alert with `level`, `hostname`, `metric_path`,
 *     `since`, `last_value` and optionally `acknowledged`,
 *     `formatted_message`, `threshold_value`, `operator`.
 * @returns {string} HTML for one `.alert-item` element.
 */
function renderAlert(alert) {
    // Escape text for use in HTML content and in quoted attribute values.
    const escapeHtml = (text) => String(text).replace(/[&<>"']/g, (ch) => ({
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#39;',
    })[ch]);
    // Encode a value as a JavaScript string literal (JSON form), then escape
    // that literal so it survives inside the double-quoted onclick attribute.
    const jsArg = (text) => escapeHtml(JSON.stringify(String(text)));

    const level = alert.level.toLowerCase();
    const duration = getDuration(alert.since);
    const acknowledged = alert.acknowledged || false;

    // Use formatted message if available, otherwise build from individual fields.
    // NOTE(review): formatted_message is treated as plain text here — confirm
    // the server never relies on sending markup in this field.
    let valueText = `Value: <span class="alert-value">${escapeHtml(formatValue(alert.last_value))}</span>`;
    if (alert.formatted_message) {
        valueText += ` <span class="threshold-info">${escapeHtml(alert.formatted_message)}</span>`;
    } else if (alert.threshold_value !== undefined && alert.threshold_value !== null && alert.operator) {
        valueText += ` <span class="threshold-info">(threshold: ${escapeHtml(alert.operator)} ${escapeHtml(formatValue(alert.threshold_value))})</span>`;
    }

    // Build actions section
    let actionsHtml = '';
    if (acknowledged) {
        actionsHtml = `
            <div class="alert-actions">
                <div class="acknowledged-badge">✓ Acknowledged</div>
            </div>
        `;
    } else {
        actionsHtml = `
            <div class="alert-actions">
                <button class="acknowledge-btn" onclick="acknowledgeAlert(${jsArg(alert.hostname)}, ${jsArg(alert.metric_path)}, event)">
                    Acknowledge
                </button>
            </div>
        `;
    }

    return `
        <div class="alert-item ${escapeHtml(level)} ${acknowledged ? 'acknowledged' : ''}">
            <div class="alert-main">
                <div class="alert-header">
                    <span class="alert-level ${escapeHtml(level)}">${escapeHtml(alert.level)}</span>
                    <span class="alert-hostname">${escapeHtml(alert.hostname)}</span>
                </div>
                <div class="alert-metric">${escapeHtml(alert.metric_path)}</div>
                <div class="alert-details">
                    <span>${valueText}</span>
                    <span class="alert-duration">Active for ${escapeHtml(duration)}</span>
                </div>
            </div>
            ${actionsHtml}
        </div>
    `;
}
|
||||
|
||||
/**
 * Format a metric value for display.
 *
 * Numbers above 1000 are rendered with locale grouping, other numbers with
 * two decimals. Anything that is not a number is returned unchanged.
 *
 * @param {*} value - Raw metric value.
 * @returns {*} Formatted string for numbers, otherwise the input itself.
 */
function formatValue(value) {
    if (typeof value !== 'number') {
        return value;
    }
    return value > 1000 ? value.toLocaleString() : value.toFixed(2);
}
|
||||
|
||||
/**
 * Describe how long ago `timestamp` was, as a short human-readable string.
 *
 * @param {number} timestamp - Start time in seconds since the Unix epoch.
 * @returns {string} "Ns", "Nm", "Nh Mm" or "Nd Nh"; "0s" for timestamps in
 *     the future; "unknown" when the timestamp is not a usable number.
 */
function getDuration(timestamp) {
    const now = Date.now() / 1000;
    const elapsed = Math.floor(now - timestamp);

    // A missing or non-numeric timestamp would otherwise print "NaNd NaNh".
    if (!Number.isFinite(elapsed)) {
        return 'unknown';
    }

    // Clock skew between server and browser can put `since` slightly in the
    // future; never show a negative duration.
    const seconds = Math.max(0, elapsed);

    if (seconds < 60) {
        return `${seconds}s`;
    }
    if (seconds < 3600) {
        return `${Math.floor(seconds / 60)}m`;
    }
    if (seconds < 86400) {
        const hours = Math.floor(seconds / 3600);
        const minutes = Math.floor((seconds % 3600) / 60);
        return `${hours}h ${minutes}m`;
    }
    const days = Math.floor(seconds / 86400);
    const hours = Math.floor((seconds % 86400) / 3600);
    return `${days}d ${hours}h`;
}
|
||||
|
||||
/**
 * Switch the active severity filter and re-render the alert list.
 *
 * @param {string} filter - 'all', 'critical' or 'warning'.
 * @param {Event} [evt] - Click event of the filter button. Optional: existing
 *     inline handlers that pass only `filter` keep working through the
 *     legacy `window.event` fallback.
 */
function filterAlerts(filter, evt) {
    currentFilter = filter;

    // Prefer an explicitly passed event; `window.event` is deprecated and is
    // undefined when this function is called outside an event handler.
    const source = evt || window.event;
    const clicked = source ? source.target : null;

    // Update active button. Without a known source button the highlight is
    // left alone rather than throwing.
    if (clicked && clicked.classList) {
        document.querySelectorAll('.filter-button').forEach(btn => {
            btn.classList.remove('active');
        });
        clicked.classList.add('active');
    }

    // Re-render with new filter
    renderAlerts(allAlerts);
}
|
||||
|
||||
/**
 * Acknowledge one alert on the server and update the list in place.
 *
 * POSTs `{hostname, metric_path}` as JSON to `/api/0/alerts/acknowledge`.
 * On success the matching entry in `allAlerts` is marked acknowledged (with
 * the `acknowledged_at` value from the response) and the list is
 * re-rendered. On failure a dialog is shown and the button is re-enabled.
 *
 * @param {string} hostname - Host the alert belongs to.
 * @param {string} metricPath - Metric path identifying the alert on that host.
 * @param {Event} [event] - Click event of the "Acknowledge" button.
 */
async function acknowledgeAlert(hostname, metricPath, event) {
    // Prevent event bubbling
    if (event) {
        event.stopPropagation();
    }

    // Disable the button while the request is in flight. The event is
    // optional, so every use of the button is guarded.
    const button = event ? event.target : null;
    if (button) {
        button.disabled = true;
        button.textContent = 'Acknowledging...';
    }

    try {
        const response = await fetch('/api/0/alerts/acknowledge', {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
            },
            body: JSON.stringify({
                hostname: hostname,
                metric_path: metricPath,
            }),
        });

        if (!response.ok) {
            throw new Error(`HTTP ${response.status}`);
        }

        const result = await response.json();

        // Update the alert in our local data. The variable is deliberately
        // not called `alert`, so it cannot shadow the dialog function.
        const match = allAlerts.find(a => a.hostname === hostname && a.metric_path === metricPath);
        if (match) {
            match.acknowledged = true;
            match.acknowledged_at = result.acknowledged_at;
        }

        // Re-render alerts
        renderAlerts(allAlerts);
    } catch (error) {
        window.alert(`Failed to acknowledge alert: ${error.message}`);
        if (button) {
            button.disabled = false;
            button.textContent = 'Acknowledge';
        }
    }
}
|
||||
|
||||
// Auto-refresh every 15 seconds
|
||||
setInterval(loadAlerts, 15000);
|
||||
|
||||
// Initial load
|
||||
loadAlerts();
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
@@ -1,5 +1,5 @@
|
||||
<footer>
|
||||
<div id="copyright">
|
||||
©2002-2021 <A HREF="mailto:andreas@wrede.ca">Andreas Wrede</A> All Rights Reserved.</p>
|
||||
©2002-2026 <A HREF="mailto:andreas@wrede.ca">Andreas Wrede</A> All Rights Reserved.</p>
|
||||
</div>
|
||||
</footer>
|
||||
@@ -0,0 +1,61 @@
|
||||
<head>
|
||||
<meta http-equiv="content-type" content="text/html; charset=utf-8" />
|
||||
<link rel="stylesheet" href="/static/style.css" type="text/css" />
|
||||
<link rel="icon" href="/static/images/favicon.ico" sizes="32x32" />
|
||||
<title>{{ title }}</title>
|
||||
{% if extra_scripts %}<script src="{{ extra_scripts }}"></script>{% endif %}
|
||||
<style>
|
||||
/* Navigation bar — shared across all pages */
|
||||
.nav {
|
||||
background: #fff;
|
||||
padding: 10px 15px;
|
||||
margin-bottom: 10px;
|
||||
box-shadow: 0 2px 4px rgba(0,0,0,.1);
|
||||
border-radius: 4px;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: space-between;
|
||||
}
|
||||
.nav-links { display: flex; align-items: center; }
|
||||
.nav a {
|
||||
margin-right: 20px;
|
||||
text-decoration: none;
|
||||
color: #0066cc;
|
||||
font-weight: 500;
|
||||
font-size: 0.9em;
|
||||
}
|
||||
.nav a:hover { text-decoration: underline; }
|
||||
.nav a.active { color: #333; font-weight: bold; }
|
||||
.nav-user {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 8px;
|
||||
text-decoration: none;
|
||||
color: #333;
|
||||
font-size: 0.9em;
|
||||
font-weight: 500;
|
||||
padding: 4px 8px;
|
||||
border-radius: 20px;
|
||||
transition: background 0.15s;
|
||||
}
|
||||
.nav-user:hover { background: #f0f4ff; text-decoration: none; }
|
||||
.nav-avatar {
|
||||
width: 28px; height: 28px;
|
||||
border-radius: 50%;
|
||||
object-fit: cover;
|
||||
flex-shrink: 0;
|
||||
}
|
||||
.nav-initials {
|
||||
width: 28px; height: 28px;
|
||||
border-radius: 50%;
|
||||
background: #0066cc;
|
||||
color: #fff;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
font-size: 0.75em;
|
||||
font-weight: 700;
|
||||
flex-shrink: 0;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
@@ -0,0 +1,477 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
{% include 'head.html' %}
|
||||
|
||||
<style>
|
||||
body {
|
||||
margin: 10px;
|
||||
background: #f5f5f5;
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
.container {
|
||||
max-width: 1600px;
|
||||
margin: 0 auto;
|
||||
max-height: calc(100vh - 120px);
|
||||
overflow-y: auto;
|
||||
padding-right: 10px;
|
||||
}
|
||||
|
||||
h1 {
|
||||
color: #333;
|
||||
margin-bottom: 5px;
|
||||
font-size: 1.5em;
|
||||
}
|
||||
|
||||
h2 {
|
||||
color: #333;
|
||||
margin-bottom: 10px;
|
||||
font-size: 1.2em;
|
||||
padding: 10px 15px;
|
||||
background: white;
|
||||
border-radius: 6px;
|
||||
box-shadow: 0 1px 4px rgba(0,0,0,0.1);
|
||||
}
|
||||
|
||||
.subtitle {
|
||||
color: #666;
|
||||
margin-bottom: 15px;
|
||||
font-size: 0.9em;
|
||||
}
|
||||
|
||||
.content {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 15px;
|
||||
}
|
||||
|
||||
.table-section {
|
||||
background: white;
|
||||
border-radius: 6px;
|
||||
padding: 15px;
|
||||
box-shadow: 0 1px 4px rgba(0,0,0,0.1);
|
||||
}
|
||||
|
||||
.log-section {
|
||||
background: white;
|
||||
border-radius: 6px;
|
||||
padding: 15px;
|
||||
box-shadow: 0 1px 4px rgba(0,0,0,0.1);
|
||||
max-height: 400px;
|
||||
overflow-y: auto;
|
||||
}
|
||||
|
||||
#ntable {
|
||||
border-collapse: collapse;
|
||||
width: 100%;
|
||||
font-size: 0.9em;
|
||||
}
|
||||
|
||||
#ntable td,
|
||||
#ntable th {
|
||||
border: 1px solid #e0e0e0;
|
||||
text-align: left;
|
||||
padding: 8px 10px;
|
||||
}
|
||||
|
||||
#ntable tr:nth-child(even) {
|
||||
background-color: #fafafa;
|
||||
}
|
||||
|
||||
#ntable tr:hover {
|
||||
background-color: #e3f2fd;
|
||||
}
|
||||
|
||||
#ntable th {
|
||||
padding: 12px 10px;
|
||||
background-color: #2196f3;
|
||||
color: white;
|
||||
font-weight: 600;
|
||||
position: sticky;
|
||||
top: 0;
|
||||
z-index: 10;
|
||||
}
|
||||
|
||||
#ntable
|
||||
th:not(.sorttable_sorted):not(.sorttable_sorted_reverse):not(.sorttable_nosort):after {
|
||||
content: " ⇅";
|
||||
opacity: 0.5;
|
||||
}
|
||||
|
||||
/* Alert count column styling */
|
||||
#ntable td.alert-warning {
|
||||
color: #ff9800;
|
||||
font-weight: bold;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
#ntable td.alert-critical {
|
||||
color: #f44336;
|
||||
font-weight: bold;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
/* Scrollbar styling */
|
||||
.container::-webkit-scrollbar,
|
||||
.log-section::-webkit-scrollbar {
|
||||
width: 8px;
|
||||
}
|
||||
|
||||
.container::-webkit-scrollbar-track,
|
||||
.log-section::-webkit-scrollbar-track {
|
||||
background: #f1f1f1;
|
||||
border-radius: 4px;
|
||||
}
|
||||
|
||||
.container::-webkit-scrollbar-thumb,
|
||||
.log-section::-webkit-scrollbar-thumb {
|
||||
background: #888;
|
||||
border-radius: 4px;
|
||||
}
|
||||
|
||||
.container::-webkit-scrollbar-thumb:hover,
|
||||
.log-section::-webkit-scrollbar-thumb:hover {
|
||||
background: #555;
|
||||
}
|
||||
|
||||
/* Message styling */
|
||||
#messages {
|
||||
font-size: 0.85em;
|
||||
line-height: 1.6;
|
||||
}
|
||||
|
||||
#messages div {
|
||||
padding: 5px 0;
|
||||
border-bottom: 1px solid #f0f0f0;
|
||||
}
|
||||
|
||||
/* Modal for connection status messages */
|
||||
.connection-modal {
|
||||
display: none;
|
||||
position: fixed;
|
||||
z-index: 1000;
|
||||
left: 0;
|
||||
top: 0;
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
background-color: rgba(0, 0, 0, 0.5);
|
||||
}
|
||||
|
||||
.connection-modal.show {
|
||||
display: flex;
|
||||
justify-content: center;
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
.connection-modal-content {
|
||||
background-color: white;
|
||||
padding: 30px 40px;
|
||||
border-radius: 8px;
|
||||
text-align: center;
|
||||
box-shadow: 0 4px 20px rgba(0, 0, 0, 0.3);
|
||||
min-width: 300px;
|
||||
}
|
||||
|
||||
.connection-modal-content p {
|
||||
margin: 0;
|
||||
font-size: 16px;
|
||||
color: #333;
|
||||
}
|
||||
|
||||
/* State indicators */
|
||||
.state-up {
|
||||
color: #4caf50;
|
||||
font-weight: 600;
|
||||
}
|
||||
|
||||
.state-down {
|
||||
color: #f44336;
|
||||
font-weight: 700;
|
||||
}
|
||||
|
||||
.state-overdue {
|
||||
color: #ff9800;
|
||||
font-weight: 700;
|
||||
}
|
||||
</style>
|
||||
<script type="text/javascript">
|
||||
var cnt = 0;
|
||||
var nTable = document;
|
||||
var name_idx = {};
|
||||
var c = 0;
|
||||
|
||||
/**
 * Rebuild `name_idx`, the map from host name to its row in `#ntable`.
 *
 * The first row (header) is skipped; each remaining row is keyed by the text
 * of its first cell.
 */
function setup() {
    name_idx = {};
    nTable = document.getElementById("ntable");
    for (var i = 1; i < nTable.rows.length; i++) {
        var row = nTable.rows[i];
        // Local variable on purpose: an undeclared `name` would assign to
        // `window.name`, which outlives this page.
        var hostName = row.cells[0].innerText;
        name_idx[hostName] = row;
    }
}
|
||||
|
||||
/**
 * Append a new row for a host to `#ntablebody` and register it in `name_idx`.
 *
 * The row has 11 cells: name, warning count, critical count, then
 * address / state / latency / last-state for the first connection and the
 * same four for the second. Only name, counts, address and state are filled
 * here; latency and last-state start out empty.
 *
 * @param {Object} data - Host record with `name`, optional `dyn`, the four
 *     `alert_*_unacked` / `alert_*_acked` counters and a `connections` array.
 */
function createRow(data) {
    var row = document.createElement("tr");

    // Create a <td> with optional inline styles and append it to the row.
    function addCell(styles) {
        var cell = document.createElement("td");
        for (var prop in styles || {}) {
            cell.style[prop] = styles[prop];
        }
        row.appendChild(cell);
        return cell;
    }

    // "unacked/acked" when something is acknowledged, the bare unacked count
    // otherwise, and an empty string when there are no alerts at all.
    function countText(unacked, acked) {
        if (unacked > 0 || acked > 0) {
            return acked > 0 ? unacked + "/" + acked : String(unacked);
        }
        return "";
    }

    var right = { textAlign: "right" };
    var c_name = addCell();
    var c_warning = addCell({ textAlign: "center", color: "#ff9800", fontWeight: "bold" });
    var c_critical = addCell({ textAlign: "center", color: "#f44336", fontWeight: "bold" });
    // Four cells per connection: address, state, latency, last-state time.
    var connCells = [];
    for (var k = 0; k < 2; k++) {
        connCells.push({ addr: addCell(), state: addCell() });
        addCell(right); // latency
        addCell(right); // last-state time
    }

    // Host names are written as text, never parsed as HTML.
    if (data.dyn) {
        var bold = document.createElement("b");
        bold.textContent = data.name;
        c_name.appendChild(bold);
    } else {
        c_name.textContent = data.name;
    }

    // Set alert counts in "x/y" format (unacked/acked)
    c_warning.textContent = countText(data.alert_warning_unacked || 0, data.alert_warning_acked || 0);
    c_critical.textContent = countText(data.alert_critical_unacked || 0, data.alert_critical_acked || 0);

    // A host may arrive with no connections, or with only one.
    var connections = data.connections || [];
    for (var i = 0; i < connections.length && i < connCells.length; i++) {
        connCells[i].addr.textContent = connections[i].addr;
        connCells[i].state.textContent = connections[i].state;
    }

    document.getElementById("ntablebody").appendChild(row);
    // Index by host name; keying by the <td> element stored the row under
    // "[object HTMLTableCellElement]".
    name_idx[data.name] = row;
}
|
||||
|
||||
/**
 * Format a timestamp given in seconds as a German-locale ("de-DE")
 * date/time string.
 *
 * @param {number} ts - Timestamp in seconds; multiplied by 1000 for `Date`.
 * @returns {string} Localised date and time.
 */
function formatTS(ts) {
    return new Date(ts * 1000).toLocaleString("de-DE");
}
|
||||
|
||||
/**
 * Apply one host update to the status table, creating the row if needed.
 *
 * Cell layout per row: 0 name, 1 warning count, 2 critical count, then four
 * cells per connection (address, state, latency, last-state time) starting
 * at index 3.
 *
 * @param {Object} data - Host record with `name`, the four
 *     `alert_*_unacked` / `alert_*_acked` counters and a `connections` array
 *     whose items carry `addr`, `state`, `statetime` and (when up) `rtts`.
 */
function update_table(data) {
    // Own-property check so host names such as "constructor" are not
    // mistaken for existing rows.
    if (!Object.prototype.hasOwnProperty.call(name_idx, data.name)) {
        createRow(data);
        setup();
    }

    var row = name_idx[data.name];
    if (!row) {
        // The index is keyed by rendered cell text; if that differs from
        // data.name there is nothing to update.
        return;
    }
    var cells = row.cells;

    // Escape text before embedding it in markup.
    function escapeHtml(text) {
        return String(text).replace(/[&<>"']/g, function (ch) {
            return { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" }[ch];
        });
    }

    // "unacked/acked" when something is acknowledged, the bare unacked count
    // otherwise, and an empty string when there are no alerts at all.
    function countText(unacked, acked) {
        if (unacked > 0 || acked > 0) {
            return acked > 0 ? unacked + "/" + acked : unacked;
        }
        return "";
    }

    // Update warning and critical counts in "x/y" format (unacked/acked)
    cells[1].innerHTML = countText(data.alert_warning_unacked || 0, data.alert_warning_acked || 0);
    cells[2].innerHTML = countText(data.alert_critical_unacked || 0, data.alert_critical_acked || 0);

    var connections = data.connections || [];
    for (var i = 0; i < connections.length; i++) {
        var conn = connections[i];
        // Offset by 3 for the name and the warning/critical count columns
        var base = 3 + i * 4;
        if (base + 3 >= cells.length) {
            // The table only has columns for a fixed number of connections.
            break;
        }

        // Locals on purpose: undeclared `state` / `latency` leaked as globals.
        var addr = conn.addr;
        var stateTime = formatTS(conn.statetime);
        var state;
        var latency;

        if (conn.state == "up") {
            state = '<span class="state-up">up</span>';
            latency = conn.rtts && conn.rtts.length
                ? Number.parseFloat(conn.rtts[0]).toFixed(2)
                : "-";
        } else if (conn.state == "unknown") {
            // Nothing is known about this connection: blank all four cells.
            state = "";
            latency = "";
            addr = "";
            stateTime = "";
        } else if (conn.state == "down") {
            state = '<span class="state-down">down</span>';
            latency = "-";
        } else if (conn.state == "overdue") {
            state = '<span class="state-overdue">overdue</span>';
            latency = "-";
        } else {
            state = "<b>" + escapeHtml(conn.state) + "</b>";
            latency = "-";
        }

        cells[base].innerHTML = addr;
        cells[base + 1].innerHTML = state;
        cells[base + 2].innerHTML = latency;
        cells[base + 3].innerHTML = stateTime;
    }
}
|
||||
|
||||
/**
 * Open the heartbeat WebSocket and keep the page in sync with the server.
 *
 * After connecting it sends "heartbeat_web". Incoming JSON messages of type
 * "host" go to `update_table`; messages of type "message" are prepended to
 * `#messages`. When the socket closes, the `#connectionModal` overlay is
 * shown and a reconnect is attempted after 3 seconds.
 */
function WS_Connect() {
    if (!("WebSocket" in window)) {
        // The browser doesn't support WebSocket
        console.log("WebSocket NOT supported by your Browser!");
        return;
    }

    // Show or hide the connection-status overlay, if the page has one.
    function setModalVisible(visible) {
        var overlay = document.getElementById("connectionModal");
        if (!overlay) {
            return;
        }
        if (visible) {
            overlay.classList.add("show");
        } else {
            overlay.classList.remove("show");
        }
    }

    //N.B: subprotocol field causes chrome to error 1006
    var socket = new WebSocket("{{heartbeat_ws_url}}");

    socket.onopen = function () {
        // Web Socket is connected, send data using send()
        console.log("ws connect {{heartbeat_ws_url}}");
        setModalVisible(false);
        socket.send("heartbeat_web");
    };

    socket.onerror = function (event) {
        console.log(event);
    };

    socket.onmessage = function (event) {
        var payload = JSON.parse(event.data);
        if (payload.type == "host") {
            update_table(payload.data);
        } else if (payload.type == "message") {
            document
                .getElementById("messages")
                .insertAdjacentHTML("afterbegin", "<div>" + payload.data + "</div>");
        }
        cnt++;
    };

    socket.onclose = function (event) {
        console.log("Connection is closed, reopening");
        setModalVisible(true);
        // Retry after a short pause.
        setTimeout(function () {
            WS_Connect();
        }, 3000);
    };
}
|
||||
WS_Connect();
|
||||
</script>
|
||||
<body>
|
||||
{% include 'nav.html' %}
|
||||
|
||||
{% include 'menu.html' %}
|
||||
|
||||
<div class="container">
|
||||
<h1>{{ header }}</h1>
|
||||
<p class="subtitle">Real-time host monitoring and event log</p>
|
||||
|
||||
<div class="table-section">
|
||||
<table id="ntable" class="sortable">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Name</th>
|
||||
<th style="text-align: center" title="Warning Alerts">⚠️</th>
|
||||
<th style="text-align: center" title="Critical Alerts">🔴</th>
|
||||
<th>IPv4 Addr</th>
|
||||
<th>State</th>
|
||||
<th style="text-align: right">Latency</th>
|
||||
<th style="text-align: right">Last State</th>
|
||||
<th>IPv6 Addr</th>
|
||||
<th>State</th>
|
||||
<th style="text-align: right">Latency</th>
|
||||
<th style="text-align: right">Last State</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody id="ntablebody">
|
||||
{% for host in hosts %}
|
||||
<tr>
|
||||
<td>{{ host.name }}</td>
|
||||
<td style="text-align: center; color: #ff9800; font-weight: bold;">
|
||||
{%- set warning_unacked = host.alert_warning_unacked -%}
|
||||
{%- set warning_acked = host.alert_warning_acked -%}
|
||||
{%- if warning_unacked > 0 or warning_acked > 0 -%}
|
||||
{{ warning_unacked }}{% if warning_acked > 0 %}/{{ warning_acked }}{% endif %}
|
||||
{%- endif -%}
|
||||
</td>
|
||||
<td style="text-align: center; color: #f44336; font-weight: bold;">
|
||||
{%- set critical_unacked = host.alert_critical_unacked -%}
|
||||
{%- set critical_acked = host.alert_critical_acked -%}
|
||||
{%- if critical_unacked > 0 or critical_acked > 0 -%}
|
||||
{{ critical_unacked }}{% if critical_acked > 0 %}/{{ critical_acked }}{% endif %}
|
||||
{%- endif -%}
|
||||
</td>
|
||||
{% for conn in host.connections %}
|
||||
<td>{{ conn.addr if conn.addr else '' }}</td>
|
||||
<td>{{ conn.state if conn.state else '' }}</td>
|
||||
<td style="text-align: right">{{ conn.latency if conn.latency else '' }}</td>
|
||||
<td style="text-align: right">{{ conn.last_state_ts if conn.last_state_ts else '' }}</td>
|
||||
{% endfor %}
|
||||
{% if host.connections|length == 0 %}
|
||||
<td></td><td></td><td></td><td></td>
|
||||
<td></td><td></td><td></td><td></td>
|
||||
{% elif host.connections|length == 1 %}
|
||||
<td></td><td></td><td></td><td></td>
|
||||
{% endif %}
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
|
||||
<div class="log-section">
|
||||
<h2>Log of Events</h2>
|
||||
<div id="messages"></div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{% include 'foot.html' %}
|
||||
|
||||
<!-- Connection status modal -->
|
||||
<div id="connectionModal" class="connection-modal">
|
||||
<div class="connection-modal-content">
|
||||
<p>⚠️ Connection is closed, reopening...</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
setup();
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
@@ -0,0 +1,2 @@
|
||||
<!-- <label for="drawer-toggle" id="drawer-toggle-label"></label>
|
||||
<header>{{ header }}</header> -->
|
||||
@@ -0,0 +1,19 @@
|
||||
<div class="nav">
|
||||
<div class="nav-links">
|
||||
<a href="/live"{% if active_page == "live" %} class="active"{% endif %}>Live Dashboard</a>
|
||||
<a href="/plugins"{% if active_page == "plugins" %} class="active"{% endif %}>Plugin Metrics</a>
|
||||
<a href="/alerts"{% if active_page == "alerts" %} class="active"{% endif %}>Alerts</a>
|
||||
{% if current_user and current_user.admin %}
|
||||
<a href="/settings"{% if active_page == "settings" %} class="active"{% endif %}>Settings</a>
|
||||
{% endif %}
|
||||
</div>
|
||||
{% if current_user %}
|
||||
<a href="/profile" class="nav-user{% if active_page == 'profile' %} active{% endif %}" title="{{ current_user.full_name or current_user.username }}">
|
||||
{% if current_user.avatar %}
|
||||
<img class="nav-avatar" src="{{ current_user.avatar_url }}" alt="{{ current_user.full_name or current_user.username }}">
|
||||
{% else %}
|
||||
<span class="nav-initials">{{ (current_user.full_name or current_user.username)[:1] | upper }}</span>
|
||||
{% endif %}
|
||||
</a>
|
||||
{% endif %}
|
||||
</div>
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,334 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
{% include 'head.html' %}
|
||||
|
||||
<style>
|
||||
body {
|
||||
margin: 20px;
|
||||
background: #f5f5f5;
|
||||
font-family: 'Segoe UI', system-ui, sans-serif;
|
||||
}
|
||||
|
||||
.container {
|
||||
max-width: 900px;
|
||||
margin: 0 auto;
|
||||
}
|
||||
|
||||
h1 {
|
||||
color: #333;
|
||||
margin-bottom: 4px;
|
||||
font-size: 1.5em;
|
||||
}
|
||||
|
||||
.subtitle {
|
||||
color: #666;
|
||||
margin-bottom: 24px;
|
||||
font-size: 0.9em;
|
||||
}
|
||||
|
||||
/* ---- Profile card ---- */
|
||||
.profile-card {
|
||||
background: #fff;
|
||||
border-radius: 8px;
|
||||
box-shadow: 0 1px 6px rgba(0,0,0,0.1);
|
||||
padding: 28px 32px;
|
||||
margin-bottom: 24px;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 28px;
|
||||
}
|
||||
|
||||
.avatar-large {
|
||||
width: 80px;
|
||||
height: 80px;
|
||||
border-radius: 50%;
|
||||
object-fit: cover;
|
||||
flex-shrink: 0;
|
||||
box-shadow: 0 2px 8px rgba(0,0,0,0.15);
|
||||
}
|
||||
|
||||
.avatar-initials-large {
|
||||
width: 80px;
|
||||
height: 80px;
|
||||
border-radius: 50%;
|
||||
background: #0066cc;
|
||||
color: #fff;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
font-size: 2em;
|
||||
font-weight: 700;
|
||||
flex-shrink: 0;
|
||||
box-shadow: 0 2px 8px rgba(0,0,0,0.15);
|
||||
}
|
||||
|
||||
.profile-info { flex: 1; }
|
||||
|
||||
.profile-name {
|
||||
font-size: 1.4em;
|
||||
font-weight: 700;
|
||||
color: #222;
|
||||
margin-bottom: 4px;
|
||||
}
|
||||
|
||||
.profile-username {
|
||||
font-size: 0.9em;
|
||||
color: #666;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
|
||||
.badge {
|
||||
display: inline-block;
|
||||
padding: 2px 10px;
|
||||
border-radius: 12px;
|
||||
font-size: 0.78em;
|
||||
font-weight: 600;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.4px;
|
||||
}
|
||||
|
||||
.badge-admin { background: #e8f0fe; color: #1a73e8; }
|
||||
.badge-user { background: #f1f3f4; color: #555; }
|
||||
|
||||
.profile-logout {
|
||||
margin-top: 14px;
|
||||
}
|
||||
|
||||
.btn-logout {
|
||||
display: inline-block;
|
||||
padding: 6px 16px;
|
||||
border-radius: 4px;
|
||||
background: #f44336;
|
||||
color: #fff;
|
||||
font-size: 0.85em;
|
||||
font-weight: 500;
|
||||
text-decoration: none;
|
||||
transition: background 0.15s;
|
||||
}
|
||||
.btn-logout:hover { background: #d32f2f; text-decoration: none; }
|
||||
|
||||
/* ---- Section cards ---- */
|
||||
.section {
|
||||
background: #fff;
|
||||
border-radius: 8px;
|
||||
box-shadow: 0 1px 6px rgba(0,0,0,0.1);
|
||||
padding: 20px 24px;
|
||||
margin-bottom: 20px;
|
||||
}
|
||||
|
||||
.section h2 {
|
||||
font-size: 1em;
|
||||
font-weight: 700;
|
||||
color: #333;
|
||||
margin: 0 0 16px;
|
||||
padding-bottom: 10px;
|
||||
border-bottom: 1px solid #eee;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.5px;
|
||||
}
|
||||
|
||||
/* ---- Settings rows ---- */
|
||||
.settings-row {
|
||||
display: flex;
|
||||
align-items: baseline;
|
||||
padding: 8px 0;
|
||||
border-bottom: 1px solid #f5f5f5;
|
||||
font-size: 0.9em;
|
||||
}
|
||||
.settings-row:last-child { border-bottom: none; }
|
||||
|
||||
.settings-label {
|
||||
width: 180px;
|
||||
flex-shrink: 0;
|
||||
color: #666;
|
||||
font-size: 0.88em;
|
||||
}
|
||||
|
||||
.settings-value { color: #222; }
|
||||
|
||||
.settings-empty { color: #aaa; font-style: italic; }
|
||||
|
||||
/* ---- Host lists ---- */
|
||||
.host-grid {
|
||||
display: flex;
|
||||
flex-wrap: wrap;
|
||||
gap: 8px;
|
||||
}
|
||||
|
||||
.host-chip {
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
gap: 6px;
|
||||
padding: 4px 12px;
|
||||
border-radius: 16px;
|
||||
font-size: 0.85em;
|
||||
font-weight: 500;
|
||||
text-decoration: none;
|
||||
}
|
||||
|
||||
.host-chip.owner { background: #e8f5e9; color: #2e7d32; }
|
||||
.host-chip.manager { background: #e3f2fd; color: #1565c0; }
|
||||
.host-chip.monitor { background: #f3e5f5; color: #6a1b9a; }
|
||||
|
||||
.host-chip-dot {
|
||||
width: 7px; height: 7px; border-radius: 50%;
|
||||
}
|
||||
.owner .host-chip-dot { background: #2e7d32; }
|
||||
.manager .host-chip-dot { background: #1565c0; }
|
||||
.monitor .host-chip-dot { background: #6a1b9a; }
|
||||
|
||||
.no-hosts {
|
||||
color: #aaa;
|
||||
font-size: 0.9em;
|
||||
font-style: italic;
|
||||
}
|
||||
|
||||
/* ---- Notification channels ---- */
|
||||
.channel-row {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 10px;
|
||||
padding: 6px 0;
|
||||
border-bottom: 1px solid #f5f5f5;
|
||||
font-size: 0.9em;
|
||||
}
|
||||
.channel-row:last-child { border-bottom: none; }
|
||||
|
||||
.channel-type {
|
||||
display: inline-block;
|
||||
padding: 2px 8px;
|
||||
border-radius: 10px;
|
||||
font-size: 0.78em;
|
||||
font-weight: 600;
|
||||
text-transform: uppercase;
|
||||
background: #f1f3f4;
|
||||
color: #555;
|
||||
min-width: 70px;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.channel-name { color: #333; }
|
||||
</style>
|
||||
|
||||
<body>
|
||||
{% include 'nav.html' %}
|
||||
|
||||
<div class="container">
|
||||
<h1>{{ header }}</h1>
|
||||
<p class="subtitle">Your account settings and host access</p>
|
||||
|
||||
<!-- Profile card -->
|
||||
<div class="profile-card">
|
||||
{% if current_user and current_user.avatar %}
|
||||
<img class="avatar-large" src="{{ current_user.avatar_url }}" alt="">
|
||||
{% else %}
|
||||
<div class="avatar-initials-large">
|
||||
{{ ((current_user.full_name if current_user else '') or (current_user.username if current_user else '?'))[:1] | upper }}
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
<div class="profile-info">
|
||||
<div class="profile-name">{{ current_user.full_name if current_user and current_user.full_name else (current_user.username if current_user else '—') }}</div>
|
||||
<div class="profile-username">@{{ current_user.username if current_user else '—' }}</div>
|
||||
{% if current_user and current_user.admin %}
|
||||
<span class="badge badge-admin">Admin</span>
|
||||
{% else %}
|
||||
<span class="badge badge-user">User</span>
|
||||
{% endif %}
|
||||
<div class="profile-logout">
|
||||
<a href="/logout" class="btn-logout">Sign out</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Account settings -->
|
||||
<div class="section">
|
||||
<h2>Account</h2>
|
||||
<div class="settings-row">
|
||||
<span class="settings-label">Username</span>
|
||||
<span class="settings-value">{{ current_user.username if current_user else '—' }}</span>
|
||||
</div>
|
||||
<div class="settings-row">
|
||||
<span class="settings-label">Full name</span>
|
||||
{% if current_user and current_user.full_name %}
|
||||
<span class="settings-value">{{ current_user.full_name }}</span>
|
||||
{% else %}
|
||||
<span class="settings-empty">Not set</span>
|
||||
{% endif %}
|
||||
</div>
|
||||
<div class="settings-row">
|
||||
<span class="settings-label">Role</span>
|
||||
<span class="settings-value">{{ 'Administrator' if current_user and current_user.admin else 'User' }}</span>
|
||||
</div>
|
||||
<div class="settings-row">
|
||||
<span class="settings-label">Avatar</span>
|
||||
{% if current_user and current_user.avatar %}
|
||||
<span class="settings-value" style="word-break:break-all;">{{ current_user.avatar }}</span>
|
||||
{% else %}
|
||||
<span class="settings-empty">Not set (initials used)</span>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Notification channels -->
|
||||
<div class="section">
|
||||
<h2>Notification Channels</h2>
|
||||
{% if notification_channels %}
|
||||
{% for ch in notification_channels %}
|
||||
<div class="channel-row">
|
||||
<span class="channel-type">{{ ch.type }}</span>
|
||||
<span class="channel-name">{{ ch.name }}</span>
|
||||
</div>
|
||||
{% endfor %}
|
||||
{% else %}
|
||||
<span class="no-hosts">No personal notification channels configured.</span>
|
||||
{% endif %}
|
||||
</div>
|
||||
|
||||
<!-- Host access -->
|
||||
<div class="section">
|
||||
<h2>Host Access</h2>
|
||||
|
||||
<div class="settings-row" style="align-items: flex-start; padding-bottom: 14px;">
|
||||
<span class="settings-label" style="padding-top: 2px;">Owner</span>
|
||||
<div class="host-grid">
|
||||
{% if owned_hosts %}
|
||||
{% for h in owned_hosts %}
|
||||
<span class="host-chip owner"><span class="host-chip-dot"></span>{{ h }}</span>
|
||||
{% endfor %}
|
||||
{% else %}
|
||||
<span class="no-hosts">None</span>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="settings-row" style="align-items: flex-start; padding-bottom: 14px;">
|
||||
<span class="settings-label" style="padding-top: 2px;">Manager</span>
|
||||
<div class="host-grid">
|
||||
{% if managed_hosts %}
|
||||
{% for h in managed_hosts %}
|
||||
<span class="host-chip manager"><span class="host-chip-dot"></span>{{ h }}</span>
|
||||
{% endfor %}
|
||||
{% else %}
|
||||
<span class="no-hosts">None</span>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="settings-row" style="align-items: flex-start; padding-bottom: 4px;">
|
||||
<span class="settings-label" style="padding-top: 2px;">Monitor</span>
|
||||
<div class="host-grid">
|
||||
{% if monitored_hosts %}
|
||||
{% for h in monitored_hosts %}
|
||||
<span class="host-chip monitor"><span class="host-chip-dot"></span>{{ h }}</span>
|
||||
{% endfor %}
|
||||
{% else %}
|
||||
<span class="no-hosts">None</span>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
@@ -0,0 +1,429 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
{% include 'head.html' %}
|
||||
|
||||
<style>
|
||||
body {
|
||||
margin: 20px;
|
||||
background: #f5f5f5;
|
||||
font-family: 'Segoe UI', system-ui, sans-serif;
|
||||
}
|
||||
|
||||
.container {
|
||||
max-width: 960px;
|
||||
margin: 0 auto;
|
||||
}
|
||||
|
||||
h1 { color: #333; margin-bottom: 4px; font-size: 1.5em; }
|
||||
.subtitle { color: #666; margin-bottom: 24px; font-size: 0.9em; }
|
||||
|
||||
/* ---- Sidebar + content layout ---- */
|
||||
.settings-layout {
|
||||
display: flex;
|
||||
gap: 24px;
|
||||
align-items: flex-start;
|
||||
}
|
||||
|
||||
.settings-sidebar {
|
||||
width: 180px;
|
||||
flex-shrink: 0;
|
||||
position: sticky;
|
||||
top: 20px;
|
||||
}
|
||||
|
||||
.sidebar-nav a {
|
||||
display: block;
|
||||
padding: 6px 10px;
|
||||
border-radius: 4px;
|
||||
text-decoration: none;
|
||||
font-size: 0.85em;
|
||||
color: #444;
|
||||
margin-bottom: 2px;
|
||||
transition: background 0.1s, color 0.1s;
|
||||
}
|
||||
.sidebar-nav a:hover { background: #e8eaf6; color: #1a237e; }
|
||||
.sidebar-nav a.active { background: #e3f2fd; color: #0066cc; font-weight: 600; }
|
||||
|
||||
.settings-main { flex: 1; min-width: 0; }
|
||||
|
||||
/* ---- Section card ---- */
|
||||
.section {
|
||||
background: #fff;
|
||||
border-radius: 8px;
|
||||
box-shadow: 0 1px 4px rgba(0,0,0,.08);
|
||||
margin-bottom: 24px;
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
.section-header {
|
||||
padding: 14px 20px 12px;
|
||||
border-bottom: 1px solid #eee;
|
||||
}
|
||||
|
||||
.section-title {
|
||||
font-size: 0.95em;
|
||||
font-weight: 700;
|
||||
color: #222;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.5px;
|
||||
margin: 0 0 3px;
|
||||
}
|
||||
|
||||
.section-desc {
|
||||
font-size: 0.82em;
|
||||
color: #888;
|
||||
margin: 0;
|
||||
}
|
||||
|
||||
/* ---- Field rows ---- */
|
||||
.field-row {
|
||||
display: flex;
|
||||
align-items: baseline;
|
||||
padding: 10px 20px;
|
||||
border-bottom: 1px solid #f5f5f5;
|
||||
gap: 16px;
|
||||
}
|
||||
.field-row:last-child { border-bottom: none; }
|
||||
|
||||
.field-label {
|
||||
width: 200px;
|
||||
flex-shrink: 0;
|
||||
font-size: 0.88em;
|
||||
font-weight: 500;
|
||||
color: #444;
|
||||
}
|
||||
|
||||
.field-body { flex: 1; min-width: 0; }
|
||||
|
||||
.field-value {
|
||||
font-size: 0.9em;
|
||||
color: #222;
|
||||
word-break: break-all;
|
||||
}
|
||||
|
||||
.field-desc {
|
||||
font-size: 0.78em;
|
||||
color: #999;
|
||||
margin-top: 2px;
|
||||
}
|
||||
|
||||
/* ---- Value type renderers ---- */
|
||||
.val-boolean {
|
||||
display: inline-block;
|
||||
padding: 2px 9px;
|
||||
border-radius: 10px;
|
||||
font-size: 0.8em;
|
||||
font-weight: 600;
|
||||
}
|
||||
.val-boolean.on { background: #e8f5e9; color: #2e7d32; }
|
||||
.val-boolean.off { background: #fce4ec; color: #c62828; }
|
||||
|
||||
.val-masked {
|
||||
font-family: monospace;
|
||||
color: #bbb;
|
||||
letter-spacing: 2px;
|
||||
}
|
||||
|
||||
.val-list { display: flex; flex-wrap: wrap; gap: 5px; }
|
||||
.val-tag {
|
||||
display: inline-block;
|
||||
padding: 2px 9px;
|
||||
background: #e8eaf6;
|
||||
color: #283593;
|
||||
border-radius: 10px;
|
||||
font-size: 0.8em;
|
||||
}
|
||||
.val-empty { color: #ccc; font-style: italic; font-size: 0.88em; }
|
||||
|
||||
/* ---- Users table ---- */
|
||||
.mini-table {
|
||||
width: 100%;
|
||||
border-collapse: collapse;
|
||||
font-size: 0.875em;
|
||||
}
|
||||
.mini-table th {
|
||||
background: #f5f5f5;
|
||||
padding: 7px 12px;
|
||||
text-align: left;
|
||||
font-weight: 600;
|
||||
color: #555;
|
||||
font-size: 0.82em;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.4px;
|
||||
border-bottom: 1px solid #e0e0e0;
|
||||
}
|
||||
.mini-table td {
|
||||
padding: 7px 12px;
|
||||
border-bottom: 1px solid #f0f0f0;
|
||||
color: #333;
|
||||
vertical-align: middle;
|
||||
}
|
||||
.mini-table tbody tr:last-child td { border-bottom: none; }
|
||||
.mini-table tbody tr:hover { background: #fafafa; }
|
||||
|
||||
.badge {
|
||||
display: inline-block;
|
||||
padding: 1px 8px;
|
||||
border-radius: 10px;
|
||||
font-size: 0.75em;
|
||||
font-weight: 600;
|
||||
}
|
||||
.badge-admin { background: #e8f0fe; color: #1a73e8; }
|
||||
.badge-user { background: #f1f3f4; color: #666; }
|
||||
|
||||
/* ---- Notification channels ---- */
|
||||
.channel-card {
|
||||
border: 1px solid #e8eaf6;
|
||||
border-radius: 6px;
|
||||
margin: 12px 20px;
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
.channel-header {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 10px;
|
||||
padding: 9px 14px;
|
||||
background: #f8f9ff;
|
||||
border-bottom: 1px solid #e8eaf6;
|
||||
}
|
||||
|
||||
.channel-name-text { font-weight: 600; font-size: 0.9em; color: #222; }
|
||||
|
||||
.ch-type-badge {
|
||||
padding: 2px 8px;
|
||||
border-radius: 8px;
|
||||
font-size: 0.75em;
|
||||
font-weight: 600;
|
||||
background: #e8eaf6;
|
||||
color: #3949ab;
|
||||
}
|
||||
|
||||
.channel-fields { padding: 6px 0; }
|
||||
|
||||
.channel-field {
|
||||
display: flex;
|
||||
padding: 5px 14px;
|
||||
font-size: 0.85em;
|
||||
border-bottom: 1px solid #f5f5f5;
|
||||
gap: 12px;
|
||||
}
|
||||
.channel-field:last-child { border-bottom: none; }
|
||||
.channel-field-label { width: 130px; flex-shrink: 0; color: #777; }
|
||||
.channel-field-value { color: #333; word-break: break-all; }
|
||||
|
||||
/* ---- Hosts table ---- */
|
||||
.host-bool { text-align: center; }
|
||||
.dot-yes { color: #2e7d32; font-size: 1.1em; }
|
||||
.dot-no { color: #ddd; font-size: 1.1em; }
|
||||
</style>
|
||||
|
||||
<body>
|
||||
{% include 'nav.html' %}
|
||||
|
||||
<div class="container">
|
||||
<h1>Settings</h1>
|
||||
<p class="subtitle">Current server configuration — read from the config file at startup.</p>
|
||||
|
||||
<div class="settings-layout">
|
||||
|
||||
<!-- Sidebar navigation -->
|
||||
<nav class="settings-sidebar">
|
||||
<div class="sidebar-nav" id="sidebar-nav">
|
||||
{% for section in sections %}
|
||||
<a href="#{{ section.id }}">{{ section.title }}</a>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</nav>
|
||||
|
||||
<!-- Main content -->
|
||||
<div class="settings-main">
|
||||
{% for section in sections %}
|
||||
<div class="section" id="{{ section.id }}">
|
||||
<div class="section-header">
|
||||
<p class="section-title">{{ section.title }}</p>
|
||||
{% if section.description %}<p class="section-desc">{{ section.description }}</p>{% endif %}
|
||||
</div>
|
||||
|
||||
{# ---- Standard field rows ---- #}
|
||||
{% for f in section.fields %}
|
||||
<div class="field-row">
|
||||
<div class="field-label">{{ f.label }}</div>
|
||||
<div class="field-body">
|
||||
{% if f.sensitive %}
|
||||
<div class="field-value"><span class="val-masked">••••••••</span></div>
|
||||
{% elif f.type == "boolean" %}
|
||||
<div class="field-value">
|
||||
<span class="val-boolean {{ 'on' if f.value else 'off' }}">
|
||||
{{ 'Enabled' if f.value else 'Disabled' }}
|
||||
</span>
|
||||
</div>
|
||||
{% elif f.type == "list" %}
|
||||
<div class="field-value">
|
||||
{% if f.value %}
|
||||
<span class="val-list">
|
||||
{% for item in f.value %}<span class="val-tag">{{ item }}</span>{% endfor %}
|
||||
</span>
|
||||
{% else %}
|
||||
<span class="val-empty">None</span>
|
||||
{% endif %}
|
||||
</div>
|
||||
{% elif f.value is none or f.value == "" %}
|
||||
<div class="field-value"><span class="val-empty">Not set</span></div>
|
||||
{% else %}
|
||||
<div class="field-value">{{ f.value }}</div>
|
||||
{% endif %}
|
||||
{% if f.description %}
|
||||
<div class="field-desc">{{ f.description }}</div>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
{% endfor %}
|
||||
|
||||
{# ---- Users section ---- #}
|
||||
{% if section.id == "users" and section.users %}
|
||||
<div style="padding: 0 0 4px;">
|
||||
<table class="mini-table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Username</th>
|
||||
<th>Full Name</th>
|
||||
<th>Role</th>
|
||||
<th>Avatar</th>
|
||||
<th>Channels</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for u in section.users %}
|
||||
<tr>
|
||||
<td><strong>{{ u.username }}</strong></td>
|
||||
<td>{{ u.full_name or '—' }}</td>
|
||||
<td>
|
||||
{% if u.admin %}
|
||||
<span class="badge badge-admin">Admin</span>
|
||||
{% else %}
|
||||
<span class="badge badge-user">User</span>
|
||||
{% endif %}
|
||||
</td>
|
||||
<td style="font-size:0.8em; color:#888;">
|
||||
{% if u.avatar %}{{ u.avatar }}{% else %}—{% endif %}
|
||||
</td>
|
||||
<td>
|
||||
{% if u.notification_channels %}
|
||||
<span class="val-list">
|
||||
{% for ch in u.notification_channels %}
|
||||
<span class="val-tag">{{ ch }}</span>
|
||||
{% endfor %}
|
||||
</span>
|
||||
{% else %}—{% endif %}
|
||||
</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
{# ---- Notification channels section ---- #}
|
||||
{% if section.id == "channels" %}
|
||||
{% for ch in section.channels %}
|
||||
<div class="channel-card">
|
||||
<div class="channel-header">
|
||||
<span class="channel-name-text">{{ ch.name }}</span>
|
||||
<span class="ch-type-badge">{{ ch.type_label }}</span>
|
||||
</div>
|
||||
<div class="channel-fields">
|
||||
{% for cf in ch.fields %}
|
||||
<div class="channel-field">
|
||||
<span class="channel-field-label">{{ cf.label }}</span>
|
||||
<span class="channel-field-value">
|
||||
{% if cf.sensitive %}
|
||||
<span class="val-masked">••••••••</span>
|
||||
{% elif cf.value is iterable and cf.value is not string %}
|
||||
{{ cf.value | join(', ') }}
|
||||
{% else %}
|
||||
{{ cf.value }}
|
||||
{% endif %}
|
||||
</span>
|
||||
</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
{% endfor %}
|
||||
{% if not section.channels %}
|
||||
<div class="field-row"><span class="val-empty">No notification channels configured.</span></div>
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
|
||||
{# ---- Hosts section ---- #}
|
||||
{% if section.id == "hosts" %}
|
||||
{% if section.hosts %}
|
||||
<div style="overflow-x: auto;">
|
||||
<table class="mini-table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Host</th>
|
||||
<th>Watch</th>
|
||||
<th>DynDNS</th>
|
||||
<th>Owner</th>
|
||||
<th>Threshold config</th>
|
||||
<th>Channels</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for h in section.hosts %}
|
||||
<tr>
|
||||
<td><strong>{{ h.name }}</strong></td>
|
||||
<td class="host-bool">
|
||||
<span class="{{ 'dot-yes' if h.watch else 'dot-no' }}">●</span>
|
||||
</td>
|
||||
<td class="host-bool">
|
||||
<span class="{{ 'dot-yes' if h.dyndns else 'dot-no' }}">●</span>
|
||||
</td>
|
||||
<td>{{ h.owner or '—' }}</td>
|
||||
<td>{{ h.threshold_config or '—' }}</td>
|
||||
<td>
|
||||
{% if h.notification_channels %}
|
||||
<span class="val-list">
|
||||
{% for ch in h.notification_channels %}
|
||||
<span class="val-tag">{{ ch }}</span>
|
||||
{% endfor %}
|
||||
</span>
|
||||
{% else %}—{% endif %}
|
||||
</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
{% else %}
|
||||
<div class="field-row"><span class="val-empty">No hosts defined in config.</span></div>
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
|
||||
</div>{# /section #}
|
||||
{% endfor %}
|
||||
</div>{# /settings-main #}
|
||||
</div>{# /settings-layout #}
|
||||
</div>{# /container #}
|
||||
|
||||
<script>
|
||||
// Highlight sidebar link for the section currently in view
|
||||
const sections = document.querySelectorAll('.section');
|
||||
const navLinks = document.querySelectorAll('.sidebar-nav a');
|
||||
|
||||
const observer = new IntersectionObserver(entries => {
|
||||
entries.forEach(entry => {
|
||||
if (entry.isIntersecting) {
|
||||
const id = entry.target.id;
|
||||
navLinks.forEach(a => {
|
||||
a.classList.toggle('active', a.getAttribute('href') === '#' + id);
|
||||
});
|
||||
}
|
||||
});
|
||||
}, { threshold: 0.25 });
|
||||
|
||||
sections.forEach(s => observer.observe(s));
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,467 @@
|
||||
"""UDP listener and datagram processing."""
|
||||
|
||||
import asyncio
|
||||
import socket
|
||||
import struct
|
||||
import time
|
||||
import zlib
|
||||
import logging
|
||||
|
||||
from ..common.proto import stodict, oldmtodict
|
||||
from ..common.utils import dur
|
||||
from . import notify as notify_mod
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
eventlog = notify_mod.eventlog
|
||||
|
||||
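# SO_TIMESTAMP: kernel attaches a struct timeval to each received datagram.
# The option exists on Linux, FreeBSD, and macOS, but Python's socket module
# does not expose the constant on all platforms.  When it is absent we fall
# back to the Linux value (29).  That number is Linux-specific (FreeBSD and
# macOS use 0x0400), so on other platforms setsockopt() is expected to fail
# and enable_kernel_timestamps() then returns False.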
|
||||
_SO_TIMESTAMP = getattr(socket, 'SO_TIMESTAMP', 29)
|
||||
# struct timeval uses two native C longs: tv_sec and tv_usec
|
||||
_TIMEVAL = struct.Struct('@ll')
|
||||
|
||||
|
||||
def enable_kernel_timestamps(sock) -> bool:
    """Ask the kernel to timestamp datagrams received on *sock*.

    Sets the ``SO_TIMESTAMP`` option at the ``SOL_SOCKET`` level.

    Args:
        sock: Socket object to configure.

    Returns:
        True when ``setsockopt()`` succeeded, False when it raised
        ``OSError`` (e.g. the platform or kernel rejects the option, or
        permissions are insufficient).
    """
    try:
        sock.setsockopt(socket.SOL_SOCKET, _SO_TIMESTAMP, 1)
    except OSError:
        return False
    else:
        return True
|
||||
|
||||
|
||||
def _extract_kernel_ts(ancdata) -> float | None:
    """Return the kernel receive time found in recvmsg() ancillary data.

    Args:
        ancdata: Iterable of ``(cmsg_level, cmsg_type, cmsg_data)`` tuples as
            returned by ``socket.recvmsg()``.

    Returns:
        Seconds as a float taken from the first ``SO_TIMESTAMP`` control
        message that is large enough to hold a ``struct timeval``, or None
        when no such message is present.
    """
    for level, kind, payload in ancdata:
        if level != socket.SOL_SOCKET or kind != _SO_TIMESTAMP:
            continue
        if len(payload) < _TIMEVAL.size:
            # Truncated control message: keep looking at the remaining ones.
            continue
        seconds, micros = _TIMEVAL.unpack_from(payload)
        return seconds + micros * 1e-6
    return None
|
||||
|
||||
|
||||
class RecvmsgTransport:
    """Thin wrapper used when SO_TIMESTAMP is active (add_reader path).

    Exposes the same sendto() / close() interface as asyncio's
    DatagramTransport so the rest of the code does not need to know which
    receive path is in use.
    """

    def __init__(self, loop, sock):
        """Remember the event *loop* and the datagram *sock* being wrapped."""
        self._loop = loop
        self._sock = sock

    def sendto(self, data, addr):
        """Send *data* to *addr*.

        Best-effort: any failure is logged at debug level and not raised.
        """
        try:
            self._sock.sendto(data, addr)
        except Exception as e:
            logger.debug("sendto failed: %s", e)

    def close(self):
        """Unregister the socket from the loop and close it.

        Both steps are best-effort and independent: a failure in one is
        logged at debug level (instead of being silently discarded) and does
        not prevent the other from running.
        """
        try:
            self._loop.remove_reader(self._sock.fileno())
        except Exception as e:
            logger.debug("remove_reader failed during close: %s", e)
        try:
            self._sock.close()
        except Exception as e:
            logger.debug("socket close failed: %s", e)
|
||||
|
||||
|
||||
def make_recvmsg_reader(sock, handler, transport):
    """Build a callback suitable for ``loop.add_reader()``.

    Each invocation of the returned callback reads one datagram with
    ``recvmsg()`` so that the ancillary data (and with it the kernel receive
    timestamp) is available.

    Args:
        sock: Datagram socket to read from.
        handler: Callable invoked as ``handler(msg, addr, transport,
            kernel_ts)`` for every datagram that parses to a non-empty
            message.  ``kernel_ts`` is None when no timestamp control message
            was present; any fallback to wall-clock time is left to the
            handler.
        transport: Object passed through to *handler* for sending replies.

    Returns:
        A zero-argument function; it never raises for receive or processing
        errors, it logs them instead.
    """
    bufsize = 65536
    ancbufsize = 128  # enough for one struct timeval cmsg

    def _read():
        try:
            payload, ancillary, _flags, sender = sock.recvmsg(bufsize, ancbufsize)
        except OSError as exc:
            # Nothing to read right now is not worth a log line; every other
            # receive error is reported.
            if not isinstance(exc, BlockingIOError):
                logger.warning("recvmsg error: %s", exc)
            return

        try:
            stamp = _extract_kernel_ts(ancillary)
            parsed = parse_message(payload)
            if not parsed:
                return
            handler(parsed, sender, transport, stamp)
        except Exception:
            logger.exception("Error processing datagram from %s", sender)

    return _read
|
||||
|
||||
|
||||
class EchoServerProtocol(asyncio.DatagramProtocol):
    """asyncio datagram protocol that parses datagrams and hands them to a handler.

    Args:
        config: Optional configuration mapping; stored as ``self.config``
            (an empty dict when omitted).
        handler: Optional callable invoked as ``handler(msg, addr,
            transport)`` for every datagram that parses to a non-empty
            message.
    """

    def __init__(self, config=None, handler=None):
        super().__init__()
        self.config = config or {}
        self.handler = handler
        # Set by connection_made(); None until the endpoint is up.
        self.transport = None

    def connection_made(self, transport):
        """Keep the transport so handlers can send replies through it."""
        self.transport = transport
        logger.info("UDP Server listening...")

    def datagram_received(self, data, addr):
        """Parse one datagram and dispatch it to the handler.

        Datagrams that decode to an empty message are dropped, the same way
        the recvmsg()-based reader in this module treats them.  Any exception
        raised while parsing or handling is logged and not propagated.
        """
        logger.debug("Received from %s", addr)
        try:
            msg = parse_message(data)
            if msg and self.handler:
                # handler can be a callable provided by the application;
                # pass the transport so handlers can send replies (ACKs/commands)
                self.handler(msg, addr, self.transport)
        except Exception:
            logger.exception("Error while processing datagram from %s", addr)
|
||||
|
||||
|
||||
def parse_message(data: bytes):
    """Decode a raw datagram into a message.

    The current protocol decoder (``stodict``) is tried first.  When it
    yields a falsy result (e.g. an empty dict), the legacy decoder
    (``oldmtodict``) is used instead, for compatibility with older clients.

    Args:
        data: Raw datagram payload.

    Returns:
        Whatever the selected decoder returned.
    """
    # ``or`` returns the first truthy operand, i.e. the legacy decoder is
    # only called when the current one produced nothing.
    return stodict(data) or oldmtodict(data)
|
||||
|
||||
|
||||
def dicttos(ID, d):
    """Serialise mapping *d* into a compressed, ID-prefixed packet.

    Every item becomes ``key=value`` (floats are rendered with five decimal
    places); the items are joined with ``;``, zlib-compressed at level 6 and
    prefixed with ``!<ID>:``.

    Args:
        ID: Identifier string placed between ``!`` and ``:`` in the prefix.
        d: Mapping of field names to values.

    Returns:
        The packet as bytes.
    """
    fields = [
        "%s=%0.5f" % (key, value) if isinstance(value, float) else "%s=%s" % (key, value)
        for key, value in d.items()
    ]
    compressed = zlib.compress(";".join(fields).encode(), 6)
    prefix = "!" + ID + ":"
    return prefix.encode() + compressed
|
||||
|
||||
|
||||
DROPOVERDUE = 7 * 24 * 3600 # seconds before an overdue host becomes UNKNOWN
|
||||
|
||||
|
||||
def _make_timer_callbacks(uname, host, watchhosts, ctx):
    """Build the async timer callbacks for one host's connections.

    All arguments are captured when this function is called, so it is safe
    to call it inside a loop over hosts.

    Args:
        uname: Host name used for event-log entries, push messages and
            threshold checks.
        host: Host object; its ``stateinfo()`` and ``alert_states`` are used.
        watchhosts: Container of host names that get CRITICAL severity and a
            push notification when they go overdue.
        ctx: Mapping that may provide ``msg_to_websockets``,
            ``threshold_checker`` and ``config``.

    Returns:
        Tuple ``(on_overdue, on_unknown)`` of coroutine functions, each
        taking the affected connection.
    """
    broadcast = ctx.get("msg_to_websockets")
    checker = ctx.get("threshold_checker")
    settings = ctx.get("config", {})

    def _publish_host_state():
        # Send the host's current state to web clients, when a sender is configured.
        if broadcast:
            broadcast("host", host.stateinfo())

    async def on_unknown(connection):
        # Move the connection to UNKNOWN, stamped with its last heartbeat.
        states = connection.__class__
        connection.newstate(states.UNKNOWN, connection.lastbeat)
        _publish_host_state()

    async def on_overdue(connection):
        states = connection.__class__
        # Only a connection that is currently UP can become overdue.
        if connection.getstate() != states.UP:
            return

        overdue_at = time.time()
        connection.newstate(states.OVERDUE, overdue_at, settings.get("grace", 2))

        msg = f"{connection.afam} overdue"
        watched = uname in watchhosts
        eventlog(uname, "CRITICAL" if watched else "WARNING", msg)
        if watched:
            notify_mod.pushmsg_for_host(uname, f"{uname} {msg}")

        if checker:
            # Report an infinite round-trip time for the overdue connection.
            checker.check_value(
                host_name=uname,
                metric_path="rtt",
                value=float("inf"),
                alert_states=host.alert_states,
            )

        _publish_host_state()
        # Re-arm the timer so on_unknown runs after DROPOVERDUE seconds.
        connection.reset_overdue_timer(DROPOVERDUE, on_unknown)

    return on_overdue, on_unknown
|
||||
|
||||
|
||||
def restore_connection_timers(hbdclass, ctx):
    """Re-arm the overdue timers of all loaded connections after a pickle restore.

    UP connections (with a positive host interval) get a timer for the time
    left until they would be overdue, computed from ``lastbeat`` and never
    shorter than one second, so clients that vanished while hbd was down are
    detected. OVERDUE connections get their UNKNOWN drop timer back, or are
    marked UNKNOWN immediately when the drop window has already passed.
    DOWN connections are skipped.
    """
    now = time.time()
    settings = ctx.get("config", {})
    grace = settings.get("grace", 2)
    from . import config as config_mod

    watched = config_mod.get_watchhosts(settings)
    states = hbdclass.Connection

    rearmed = 0
    for uname, host in list(hbdclass.Host.hosts.items()):
        interval = host.interval
        for afam, conn in list(host.connections.items()):
            state = conn.getstate()
            if state == states.DOWN:
                continue

            on_overdue, on_unknown = _make_timer_callbacks(uname, host, watched, ctx)

            if state == states.UP and interval > 0:
                since_beat = now - conn.lastbeat
                # Fire after at least one second, even if already past due
                left = max(1.0, (interval + grace) - since_beat)
                conn.reset_overdue_timer(left, on_overdue)
                logger.debug(
                    "Restored UP timer %s/%s: %.0fs remaining (elapsed %.0fs)",
                    uname, afam, left, since_beat,
                )
                rearmed += 1

            elif state == states.OVERDUE:
                overdue_for = now - conn.statetime
                left = DROPOVERDUE - overdue_for
                if left <= 1:
                    # Already past the drop window: mark UNKNOWN immediately
                    conn.newstate(states.UNKNOWN, conn.lastbeat)
                    logger.info(
                        "Marking %s/%s UNKNOWN (overdue %.1f days)",
                        uname, afam, overdue_for / 86400,
                    )
                else:
                    conn.reset_overdue_timer(left, on_unknown)
                    logger.debug(
                        "Restored OVERDUE timer %s/%s: %.0fs remaining",
                        uname, afam, left,
                    )
                    rearmed += 1

    logger.info("Restored timers for %d connection(s)", rearmed)
|
||||
|
||||
|
||||
def handle_datagram(msg: dict, addr, transport, ctx: dict):
    """Handle a parsed datagram message.

    Registers the sending host when it is new, answers heartbeats ("HTB")
    with an ACK, stores plugin data ("PLG"), updates the connection state,
    re-arms the overdue timer, checks RTT thresholds, sends queued commands
    to the client and finally pushes the host state to the websockets.
    An empty/None *msg* is ignored.

    ctx is a dictionary with runtime dependencies:
      - config: dict of configuration
      - hbdclass: module providing Host/Connection classes
      - log: callable(loghost, message)
      - msg_to_websockets: callable(typ, data)
      - msg_journal: MessageJournal instance for logging all messages
      - threshold_checker: optional checker for plugin and RTT thresholds
      - recv_ts: optional receive timestamp (defaults to time.time())
      - DEBUG, verbose
    """
    if not msg:
        return
    now = ctx.get("recv_ts") or time.time()

    # Log message to journal
    msg_journal = ctx.get("msg_journal")
    if msg_journal:
        # Create async task to log message (non-blocking)
        import asyncio

        try:
            loop = asyncio.get_event_loop()
            loop.create_task(msg_journal.log_message(msg, addr, now))
        except Exception as e:
            logger.debug(f"Failed to log message to journal: {e}")

    cfg = ctx.get("config", {})
    hbdcls = ctx.get("hbdclass")
    log = ctx.get("log")
    msg_to_websockets = ctx.get("msg_to_websockets")
    DEBUG = ctx.get("DEBUG", 0)
    verbose = ctx.get("verbose", False)

    # normalize addr (ip, port)
    ip = addr[0] if isinstance(addr, (list, tuple)) else addr
    name = msg.get("name", "unknown")
    from ..common.utils import shortname
    from . import config as config_mod

    uname = shortname(name)

    if uname not in hbdcls.Host.hosts:
        host = hbdcls.Host(uname)
        # Use new config function to check dyndns
        dyndnshosts = config_mod.get_dyndnshosts(cfg)
        host.dyn = uname in dyndnshosts
        # Apply user-access settings from config
        access = config_mod.get_host_access(cfg, uname)
        host.apply_access(access["owner"], access["managers"], access["monitors"])
        if verbose:
            print(("XX: New host, num now %s" % (len(hbdcls.Host.hosts))))
        newh = True
    else:
        host = hbdcls.Host.hosts[uname]
        newh = False

    # Get watchhosts once for use throughout message handling
    watchhosts = config_mod.get_watchhosts(cfg)

    cid = msg.get("id", 0)
    try:
        rtt = float(msg.get("rtt"))
    except (TypeError, ValueError):
        # Missing (None) or malformed RTT: treat as "no measurement" instead
        # of aborting the whole message on a bad field.
        rtt = None

    if msg.get("ID") == "HTB":
        host.doesack = msg.get("acks", -1)
        # send ACK back
        rmsg = {"time": time.time()}
        opkt = dicttos("ACK", rmsg)
        try:
            transport.sendto(opkt, addr)
        except Exception as e:
            if DEBUG > 0:
                print(("cannot send ack: %s" % e))

    elif msg.get("ID") == "PLG":
        # Handle plugin data message
        plugin_name = msg.get("plugin")
        if plugin_name:
            # Extract plugin fields, dropping protocol metadata fields
            plugin_data = {
                k: v for k, v in msg.items()
                if k not in ("ID", "plugin", "id", "name")
            }
            # Store plugin data with timestamp
            host.add_plugin_data(plugin_name, plugin_data, timestamp=now)
            if DEBUG > 1:
                print(f"Stored plugin data for {uname}: {plugin_name}")

            # Check thresholds if checker is available
            threshold_checker = ctx.get("threshold_checker")
            if threshold_checker:
                try:
                    state_changes = threshold_checker.check_plugin_data(
                        host_name=uname,
                        plugin_name=plugin_name,
                        data=plugin_data,
                        alert_states=host.alert_states,
                    )
                    if DEBUG > 1 and state_changes:
                        print(f"Threshold state changes for {uname}: {state_changes}")
                except Exception as e:
                    logger.error(f"Error checking thresholds for {uname}.{plugin_name}: {e}")

            # Notify websockets of plugin update
            if msg_to_websockets:
                try:
                    msg_to_websockets("plugin", {
                        "host": uname,
                        "plugin": plugin_name,
                        "data": plugin_data,
                        "timestamp": now,
                    })
                except Exception as e:
                    # Best effort: keep processing, but leave a trace
                    logger.debug("cannot send plugin update to websockets: %s", e)

    try:
        conn, res = host.conndata(cid, ip, rtt, now)
    except Exception as e:
        if DEBUG > 0:
            print("conndata failed: %s" % e)
        return

    if res:
        eventlog(uname, "WARNING", res)
        if uname in watchhosts:
            notify_mod.pushmsg_for_host(uname, "%s %s" % (host.name, res))

    interval = int(msg.get("interval", 0) or 0)
    shutdown = msg.get("shutdown", 0)
    service = msg.get("service", "unknown")
    message = msg.get("msg", None)
    boot = msg.get("boot", 0)

    if boot:
        eventlog(uname, "INFO", "booted")
        if uname in watchhosts:
            m = "%s booted" % (host.name)
            notify_mod.pushmsg_for_host(uname, m)
    if message:
        eventlog(uname, "INFO", "msg: %s" % message, service=service)
        if uname in watchhosts:
            notify_mod.pushmsg_for_host(uname, message)

    if conn.getstate() != hbdcls.Connection.UP:
        lasts = conn.state
        d = conn.newstate(hbdcls.Connection.UP, now)
        if d == 0 or lasts == "unknown":
            m = "%s is up" % (conn.afam)
        else:
            m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d))
        eventlog(uname, "RECOVER", m)
        if uname in watchhosts:
            notify_mod.pushmsg_for_host(uname, "%s %s is back" % (uname, conn.afam))

    if boot or newh:
        host.upcount = host.doesack
    else:
        host.upcount += 1

    if shutdown:
        eventlog(uname, "INFO", "%s shutdown" % conn.afam)
        if uname in watchhosts:
            notify_mod.pushmsg_for_host(uname, "%s %s shutdown" % (uname, conn.afam))
        conn.newstate(hbdcls.Connection.DOWN, now)

    if interval > 0:
        host.interval = interval

    # Timer-based reachability monitoring
    # Reset overdue timer on every heartbeat
    if interval > 0 and conn.getstate() != hbdcls.Connection.DOWN:
        grace = cfg.get("grace", 2)
        timeout_seconds = interval + grace
        on_overdue, _ = _make_timer_callbacks(uname, host, watchhosts, ctx)
        conn.reset_overdue_timer(timeout_seconds, on_overdue)

    # Check RTT thresholds using the threshold checker
    threshold_checker = ctx.get("threshold_checker")
    if threshold_checker and rtt and rtt > 0:
        # Check against configured thresholds (handles alerts, notifications, etc.)
        # Metric path for RTT is simply "rtt"
        threshold_checker.check_value(
            host_name=uname,
            metric_path="rtt",
            value=rtt,
            alert_states=host.alert_states,
        )

    # send any commands we have queued
    while host.cmds:
        op, rmsg = host.cmds[0]
        # Always dequeue the entry. Previously only "CMD"/"UPD" entries were
        # removed, so any other op stayed at the head of the queue and this
        # loop never terminated.
        del host.cmds[0]
        if log:
            if op == "CMD":
                log(uname, "command sent")
            elif op == "UPD":
                log(uname, "update initiated")
        opkt = dicttos(op, rmsg)
        try:
            transport.sendto(opkt, addr)
        except Exception as e:
            if DEBUG > 0:
                print(("cannot send cmd/update: %s" % e))

    if msg_to_websockets:
        try:
            msg_to_websockets("host", host.stateinfo())
        except Exception as e:
            if DEBUG > 0:
                print(("cannot send websocket message: %s" % e))
|
||||
@@ -0,0 +1,228 @@
|
||||
"""User management: loading, authentication, and session tracking.
|
||||
|
||||
Users are defined in the config file under the ``users`` key:
|
||||
|
||||
users:
|
||||
alice:
|
||||
full_name: Alice Smith
|
||||
avatar: /path/to/avatar.png # file path, URL, or base64 data URI
|
||||
password: pbkdf2:sha256:... # generated with: hbd passwd
|
||||
admin: true # optional server-level admin
|
||||
notification_channels: [pushover_standard]
|
||||
|
||||
Roles are assigned per-host:
|
||||
|
||||
hosts:
|
||||
webserver01:
|
||||
owner: alice
|
||||
managers: [bob]
|
||||
monitors: [carol]
|
||||
|
||||
If no users are defined the server runs in unauthenticated mode (backwards
|
||||
compatible). When users are defined every API call must carry a valid session
|
||||
token in an ``Authorization: Bearer <token>`` or ``X-Auth-Token`` header,
|
||||
obtained via ``POST /api/0/auth/login``.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import hmac
|
||||
import logging
|
||||
import secrets
|
||||
import time
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Session lifetime in seconds (24 hours).
|
||||
SESSION_TTL = 86400
|
||||
|
||||
# Global session store: token -> {"username": str, "expires": float, "created": float}
|
||||
_sessions: dict = {}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# User class
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class User:
|
||||
def __init__(
|
||||
self,
|
||||
username: str,
|
||||
full_name: str = "",
|
||||
avatar: str = "",
|
||||
password_hash: str = "",
|
||||
admin: bool = False,
|
||||
notification_channels: list | None = None,
|
||||
):
|
||||
self.username = username
|
||||
self.full_name = full_name
|
||||
self.avatar = avatar
|
||||
self.password_hash = password_hash
|
||||
self.admin = admin
|
||||
self.notification_channels: list = notification_channels or []
|
||||
|
||||
def check_password(self, password: str) -> bool:
|
||||
if not self.password_hash:
|
||||
return False
|
||||
return _verify_password(password, self.password_hash)
|
||||
|
||||
def avatar_is_local(self) -> bool:
|
||||
"""Return True when the avatar is a local filesystem path (starts with '/')."""
|
||||
return bool(self.avatar and self.avatar.startswith("/"))
|
||||
|
||||
def avatar_url(self) -> str:
|
||||
"""Return the URL to use as an <img src>.
|
||||
|
||||
Local file paths are served via the /api/0/users/{username}/avatar
|
||||
endpoint. External URLs and data URIs are returned as-is.
|
||||
"""
|
||||
if self.avatar_is_local():
|
||||
return f"/api/0/users/{self.username}/avatar"
|
||||
return self.avatar
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"username": self.username,
|
||||
"full_name": self.full_name,
|
||||
"avatar": self.avatar,
|
||||
"avatar_url": self.avatar_url(),
|
||||
"admin": self.admin,
|
||||
"notification_channels": self.notification_channels,
|
||||
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Password hashing (PBKDF2-HMAC-SHA256, stdlib only)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def hash_password(password: str) -> str:
    """Return a storable hash for *password*.

    Format: ``pbkdf2:sha256:<iterations>:<salt>:<hex-digest>``

    A fresh random salt (16 bytes, hex-encoded) is generated on every call
    and PBKDF2-HMAC-SHA256 is run for 260,000 iterations.

    Use this to generate the ``password`` value in the config file::

        python -c "from hbd.server.users import hash_password; print(hash_password('secret'))"

    Or via the CLI::

        hbd passwd
    """
    rounds = 260_000
    salt_hex = secrets.token_hex(16)
    digest = hashlib.pbkdf2_hmac(
        "sha256",
        password.encode("utf-8"),
        salt_hex.encode("utf-8"),
        rounds,
    )
    return ":".join(("pbkdf2", "sha256", str(rounds), salt_hex, digest.hex()))
|
||||
|
||||
|
||||
def _verify_password(password: str, stored_hash: str) -> bool:
|
||||
"""Return True if *password* matches *stored_hash*."""
|
||||
try:
|
||||
parts = stored_hash.split(":")
|
||||
if len(parts) != 5 or parts[0] != "pbkdf2" or parts[1] != "sha256":
|
||||
return False
|
||||
_, _, iterations_str, salt, expected_hex = parts
|
||||
iterations = int(iterations_str)
|
||||
dk = hashlib.pbkdf2_hmac(
|
||||
"sha256", password.encode("utf-8"), salt.encode("utf-8"), iterations
|
||||
)
|
||||
return hmac.compare_digest(dk.hex(), expected_hex)
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Global user registry
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# username -> User
|
||||
users: dict = {}
|
||||
|
||||
|
||||
def load_users(config: dict) -> dict:
    """Rebuild the global user registry from *config* and return it.

    Called once at startup and again on SIGHUP config reload. When the
    ``users`` entry is not a mapping the registry is emptied. Entries whose
    value is not a mapping are skipped with a warning.
    """
    global users
    section = config.get("users", {})
    if not isinstance(section, dict):
        users = {}
        return users

    loaded: dict = {}
    for username, attrs in section.items():
        if isinstance(attrs, dict):
            loaded[username] = User(
                username=username,
                full_name=attrs.get("full_name", ""),
                avatar=attrs.get("avatar", ""),
                password_hash=attrs.get("password", ""),
                admin=bool(attrs.get("admin", False)),
                notification_channels=attrs.get("notification_channels", []),
            )
        else:
            logger.warning("Skipping user %r: expected a mapping", username)

    users = loaded
    logger.info("Loaded %d user(s) from config", len(users))
    return users
|
||||
|
||||
|
||||
def users_enabled() -> bool:
    """Return True if at least one user is configured (auth-required mode)."""
    return len(users) > 0
|
||||
|
||||
|
||||
def get_user(username: str) -> "User | None":
    """Return the registered User named *username*, or None if unknown."""
    try:
        return users[username]
    except KeyError:
        return None
|
||||
|
||||
|
||||
def authenticate(username: str, password: str) -> "User | None":
    """Return the User if credentials are valid, else None."""
    account = users.get(username)
    if not account:
        return None
    return account if account.check_password(password) else None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Session management
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def create_session(username: str) -> str:
    """Create a new session for *username* and return the opaque token.

    Expired sessions are purged first. The token is a random 64-character
    hex string; the session expires SESSION_TTL seconds after creation.
    """
    _purge_expired_sessions()
    token = secrets.token_hex(32)
    # Read the clock once so that expires - created is exactly SESSION_TTL
    now = time.time()
    _sessions[token] = {
        "username": username,
        "expires": now + SESSION_TTL,
        "created": now,
    }
    return token
|
||||
|
||||
|
||||
def get_session_user(token: str) -> "User | None":
    """Return the User for a valid *token*, or None if missing/expired.

    An expired session is removed from the store when it is looked up.
    """
    entry = _sessions.get(token) if token else None
    if not entry:
        return None
    if entry["expires"] < time.time():
        # Expired: drop it so it cannot be used again
        del _sessions[token]
        return None
    return get_user(entry["username"])
|
||||
|
||||
|
||||
def delete_session(token: str) -> None:
    """Invalidate *token* (logout). Unknown tokens are ignored."""
    try:
        del _sessions[token]
    except KeyError:
        pass
|
||||
|
||||
|
||||
def _purge_expired_sessions() -> None:
    """Remove every session whose expiry time lies in the past."""
    cutoff = time.time()
    # Collect first, delete afterwards, so the store is not changed mid-scan
    stale = []
    for token, session in list(_sessions.items()):
        if session["expires"] < cutoff:
            stale.append(token)
    for token in stale:
        del _sessions[token]
|
||||
@@ -0,0 +1,158 @@
|
||||
"""WebSocket server and broadcast helpers for hbd.
|
||||
|
||||
Provides an asyncio-based WebSocket server and a thread-safe broadcast
|
||||
function that other threads or synchronous code can call.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
from typing import Callable, Iterable, Optional
|
||||
from . import data
|
||||
|
||||
import websockets
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.setLevel(logging.INFO)
|
||||
_connections = set()
|
||||
_loop: Optional[asyncio.AbstractEventLoop] = None
|
||||
_get_hosts: Optional[Callable[[], Iterable]] = None
|
||||
#_get_msgs: Optional[Callable[[], Iterable]] = None
|
||||
_verbose = False
|
||||
|
||||
|
||||
async def _handler(websocket, path=None):
    """Serve one WebSocket client.

    Registers the connection, sends the current hosts (from ``_get_hosts``)
    and the stored messages (``data.msgs``) to the new client, then reads
    incoming frames until the client disconnects. The connection is always
    removed from ``_connections`` on exit.

    Args:
        websocket: The connection object passed in by the websockets library.
        path: Request path; when None it is read from ``websocket.path``
            if that attribute exists.
    """
    _connections.add(websocket)
    remote_address = websocket.remote_address
    if path is None:
        path = getattr(websocket, "path", None)
    logger.info("WebSocket connection from %s: %s", remote_address, path)
    try:
        # send initial hosts
        if _get_hosts:
            try:
                hosts = list(_get_hosts())
                logger.debug("Sending %d hosts to new WebSocket client", len(hosts))
                for h in hosts:
                    jmsg = json.dumps({"type": "host", "data": h})
                    await websocket.send(jmsg)
            except Exception as e:
                logger.error("Error sending initial hosts: %s", e, exc_info=True)
        # send recent messages
        if data.msgs:
            try:
                # Snapshot first: data.msgs is shared module state and each
                # send() below yields to the event loop, so the container may
                # be modified by other tasks while we iterate.
                recent = list(data.msgs)
                logger.debug("Sending %d recent messages to new WebSocket client", len(recent))
                for m in recent:
                    jmsg = json.dumps({"type": "message", "data": m})
                    await websocket.send(jmsg)
            except Exception as e:
                logger.error("Error sending initial messages: %s", e, exc_info=True)

        # keep connection open until client disconnects
        async for incoming in websocket:
            # we don't expect meaningful incoming messages besides the initial
            # client 'hello' that some clients send; ignore for now
            if _verbose:
                logger.debug("received ws data: %s", incoming)

    except (
        websockets.exceptions.ConnectionClosedOK,
        websockets.exceptions.ConnectionClosedError,
    ) as e:
        logger.info("WebSocket closed from %s: %r", remote_address, e)
    except Exception as e:
        logger.exception("WebSocket handler exception from %s: %s", remote_address, e)
    finally:
        logger.debug("Removing WebSocket connection from %s", remote_address)
        _connections.discard(websocket)
|
||||
|
||||
|
||||
async def start(
    host: str,
    ws_port: int,
    wss_port: Optional[int] = None,
    ssl_context=None,
    get_hosts: Optional[Callable] = None,
    config: Optional[dict] = None,
):
    """Start WebSocket servers and block until cancelled.

    This is intended to be awaited inside the main asyncio event loop.
    If `wss_port` and `ssl_context` are provided, a WSS server will also be
    started.

    Args:
        host: Interface/address to listen on.
        ws_port: Port of the plain WebSocket server.
        wss_port: Port of the TLS WebSocket server (optional).
        ssl_context: SSL context for the TLS server (optional).
        get_hosts: Callable returning the hosts sent to each new client.
        config: Configuration mapping; only ``verbose`` is read here.

    On cancellation all active client connections are closed, then the
    listening servers are stopped and awaited.
    """
    global _loop, _get_hosts, _verbose
    cfg = config or {}
    _loop = asyncio.get_running_loop()
    _get_hosts = get_hosts
    # No trailing comma here: "x = value," builds a one-element tuple, which
    # is always truthy and silently forced verbose mode on.
    _verbose = cfg.get("verbose", False)

    # Start servers and keep the server objects for clean shutdown
    running_servers = []
    ws_server = await websockets.serve(_handler, host, ws_port)
    running_servers.append(ws_server)
    if wss_port and ssl_context:
        wss_server = await websockets.serve(_handler, host, wss_port, ssl=ssl_context)
        running_servers.append(wss_server)

    logger.info(
        "WebSocket server(s) started on port %s (wss %s)", ws_port, wss_port
    )

    try:
        # Block until cancelled
        await asyncio.Future()
    except asyncio.CancelledError:
        pass
    finally:
        # Close all active browser connections so their handler coroutines exit
        active = list(_connections)
        if active:
            logger.info("Closing %d active WebSocket connection(s)...", len(active))
            await asyncio.gather(
                *[ws.close() for ws in active],
                return_exceptions=True,
            )
        # Stop the listening servers and wait for all handlers to finish
        for srv in running_servers:
            srv.close()
        await asyncio.gather(
            *[srv.wait_closed() for srv in running_servers],
            return_exceptions=True,
        )
        logger.info("WebSocket server(s) stopped")
|
||||
|
||||
|
||||
def broadcast(typ: str, data) -> bool:
    """Schedule a message for every connected WebSocket client.

    The message ``{"type": typ, "data": data}`` is JSON-encoded once and a
    send coroutine is scheduled on the server's event loop for each open
    connection. Connections that are not open, or for which scheduling
    fails, are removed from the connection set.

    Returns False if the server loop is not set, True otherwise.
    """
    if not _loop:
        return False
    payload = json.dumps({"type": typ, "data": data})
    stale = []
    for peer in list(_connections):
        if peer.state != websockets.protocol.State.OPEN:
            stale.append(peer)
        else:
            try:
                asyncio.run_coroutine_threadsafe(peer.send(payload), _loop)
            except Exception:
                stale.append(peer)
                logger.debug("ws.send exception: closed")
    for peer in stale:
        try:
            asyncio.run_coroutine_threadsafe(peer.wait_closed(), _loop)
        except Exception:
            pass
        _connections.discard(peer)
    return True
|
||||
|
||||
|
||||
def connection_count() -> int:
    """Return the number of WebSocket connections currently tracked."""
    tracked = _connections
    return len(tracked)
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 5.3 KiB |
@@ -1,7 +0,0 @@
|
||||
<head>
|
||||
<meta http-equiv="content-type" content="text/html; charset=utf-8" />
|
||||
<link rel="stylesheet" href="/static/style.css" type="text/css" />
|
||||
<link rel="icon" href="/static/images/favicon.ico" sizes="32x32" />
|
||||
<title>{{ title }}</title>
|
||||
<script src="{{ extra_scripts }}"></script>
|
||||
</head>
|
||||
@@ -1,281 +0,0 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
{% include 'head.html' %}
|
||||
|
||||
<style>
|
||||
.content {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
}
|
||||
|
||||
.table {
|
||||
/* flex: 1; */
|
||||
flex-grow: none;
|
||||
}
|
||||
|
||||
.log {
|
||||
flex: 2;
|
||||
flex-grow: 1;
|
||||
|
||||
}
|
||||
|
||||
#ntable {
|
||||
border-collapse: collapse;
|
||||
font-size: 95%;
|
||||
/* width: 100%; */
|
||||
}
|
||||
|
||||
#ntable td,
|
||||
#ntable th {
|
||||
border: 1px solid #ddd;
|
||||
text-align: left;
|
||||
padding: 0px;
|
||||
}
|
||||
|
||||
#ntable tr:nth-child(even) {
|
||||
background-color: #f2f2f2;
|
||||
}
|
||||
|
||||
#ntable tr:hover {
|
||||
background-color: #ddd;
|
||||
}
|
||||
|
||||
#ntable th {
|
||||
padding-top: 12px;
|
||||
padding-bottom: 12px;
|
||||
background-color: #9d9d9d;
|
||||
color: white;
|
||||
}
|
||||
|
||||
#ntable
|
||||
th:not(.sorttable_sorted):not(.sorttable_sorted_reverse):not(.sorttable_nosort):after {
|
||||
content: " \2195";
|
||||
}
|
||||
|
||||
/* Modal for connection status messages */
|
||||
.connection-modal {
|
||||
display: none;
|
||||
position: fixed;
|
||||
z-index: 1000;
|
||||
left: 0;
|
||||
top: 0;
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
background-color: rgba(0, 0, 0, 0.4);
|
||||
}
|
||||
|
||||
.connection-modal.show {
|
||||
display: flex;
|
||||
justify-content: center;
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
.connection-modal-content {
|
||||
background-color: #f9f9f9;
|
||||
padding: 20px;
|
||||
border: 1px solid #888;
|
||||
border-radius: 5px;
|
||||
text-align: center;
|
||||
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
|
||||
min-width: 300px;
|
||||
}
|
||||
|
||||
.connection-modal-content p {
|
||||
margin: 10px 0;
|
||||
font-size: 16px;
|
||||
color: #333;
|
||||
}
|
||||
</style>
|
||||
<script type="text/javascript">
|
||||
var cnt = 0;
|
||||
var nTable = document;
|
||||
var name_idx = {};
|
||||
var c = 0;
|
||||
|
||||
function setup() {
  // Rebuild name_idx (host name -> table row) from the rows currently in
  // the #ntable table. The first row is the header and is skipped.
  name_idx = {};
  nTable = document.getElementById("ntable");
  if (!nTable) {
    // Table not in the DOM (yet): leave the index empty
    return;
  }
  for (var i = 1; i < nTable.rows.length; i++) {
    var row = nTable.rows[i];
    // Local variable on purpose: a bare "name" would overwrite window.name
    var hostName = row.cells[0].innerText;
    name_idx[hostName] = row;
  }
}
|
||||
|
||||
function createRow(data) {
  // Append a new 10-column row for host `data` to #ntablebody and index it
  // in name_idx under the host name.
  // Columns: 0 name, 1 version, 2-5 IPv4 addr/state/latency/last state,
  //          6-9 IPv6 addr/state/latency/last state.
  var row = document.createElement("tr");
  var cells = [];
  for (var i = 0; i < 10; i++) {
    var cell = document.createElement("td");
    // latency and last-state columns are right-aligned
    if (i === 4 || i === 5 || i === 8 || i === 9) {
      cell.style.textAlign = "right";
    }
    row.appendChild(cell);
    cells.push(cell);
  }

  // Values arrive over the network; write them as text, never as HTML
  if (data.dyn) {
    var bold = document.createElement("b");
    bold.textContent = data.name;
    cells[0].appendChild(bold);
  } else {
    cells[0].textContent = data.name;
  }
  cells[1].textContent = data.cver;

  var conns = data.connections || [];
  if (conns.length > 0) {
    cells[2].textContent = conns[0].addr;
    cells[3].textContent = conns[0].state;
  }
  if (conns.length > 1) {
    cells[6].textContent = conns[1].addr;
    cells[7].textContent = conns[1].state;
  }

  var table = document.getElementById("ntablebody"); // find table to append to
  table.appendChild(row); // append row to table
  // Key by host name (the old code used the <td> element as the key)
  name_idx[data.name] = row;
}
|
||||
|
||||
function formatTS(ts) {
  // Format a Unix timestamp given in seconds as a "de-DE" locale date/time string.
  return new Date(ts * 1000).toLocaleString("de-DE");
}
|
||||
|
||||
function update_table(data) {
  // Refresh the table row of host `data`, creating the row first when the
  // host is not in name_idx yet. Connection i occupies cells 2+4i .. 5+4i
  // (address, state, latency, last state change).
  if (!(data.name in name_idx)) {
    createRow(data);
    setup();
  }

  var cells = name_idx[data.name].cells;
  for (var i = 0; i < data.connections.length; i++) {
    var conn = data.connections[i];
    var base = i * 4;
    // Declared locally; these used to leak as implicit globals
    var state;
    var latency;

    cells[2 + base].innerHTML = conn.addr;
    cells[5 + base].innerHTML = formatTS(conn.statetime);
    if (conn.state == "up") {
      state = "up";
      latency = Number.parseFloat(conn.rtts[0]).toFixed(2);
    } else if (conn.state == "unknown") {
      // Unknown connections are shown as a blank group of cells
      state = "";
      latency = "";
      cells[2 + base].innerHTML = "";
      cells[5 + base].innerHTML = "";
    } else {
      // Any other state is highlighted in bold, with no latency value
      state = "<b>" + conn.state + "</b>";
      latency = "-";
    }
    cells[3 + base].innerHTML = state;
    cells[4 + base].innerHTML = latency;
  }
}
|
||||
|
||||
function WS_Connect() {
  // Open the heartbeat WebSocket and install its handlers. "host" messages
  // update the table, "message" messages are prepended to the event log.
  // When the socket closes, an overlay is shown and a reconnect is
  // attempted after 3 seconds.
  if (!("WebSocket" in window)) {
    // The browser doesn't support WebSocket
    console.log("WebSocket NOT supported by your Browser!");
    return;
  }

  // Show or hide the "connection closed" overlay, if the page has one
  function showModal(visible) {
    var modal = document.getElementById("connectionModal");
    if (!modal) {
      return;
    }
    if (visible) {
      modal.classList.add("show");
    } else {
      modal.classList.remove("show");
    }
  }

  //N.B: subprotocol field causes chrome to error 1006
  var ws_hbd = new WebSocket("{{heartbeat_ws_url}}");

  ws_hbd.onopen = function () {
    // Web Socket is connected, send data using send()
    console.log("ws connect {{heartbeat_ws_url}}");
    showModal(false);
    ws_hbd.send("heartbeat_web");
  };

  ws_hbd.onerror = function (event) {
    console.log(event);
  };

  ws_hbd.onmessage = function (event) {
    var state = JSON.parse(event.data);
    if (state.type == "host") {
      update_table(state.data);
    } else if (state.type == "message") {
      // NOTE(review): the message is inserted as HTML; confirm the server
      // escapes any client-supplied text before it is broadcast.
      var msgs = document.getElementById("messages");
      msgs.insertAdjacentHTML("afterbegin", state.data + "<br>");
    }
    cnt++;
  };

  ws_hbd.onclose = function (event) {
    console.log("Connection is closed, reopening");
    showModal(true);
    setTimeout(function () {
      WS_Connect();
    }, 3000);
  };
}
|
||||
WS_Connect();
|
||||
</script>
|
||||
<body>
|
||||
{% include 'menu.html' %}
|
||||
|
||||
<div id="content" class="content" style="overflow: hidden">
|
||||
<div id="table" class="table" style="overflow: hidden">
|
||||
<!-- <h2>{{title}}</h2> -->
|
||||
<table id="ntable" class="sortable">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Name</th>
|
||||
<th>Ver</th>
|
||||
<th>IPv4 Addr</th>
|
||||
<th>State</th>
|
||||
<th style="text-align: right">Latency</th>
|
||||
<th style="text-align: right">Last State</th>
|
||||
<th>IPv6 Addr</th>
|
||||
<th>State</th>
|
||||
<th style="text-align: right">Latency</th>
|
||||
<th style="text-align: right">Last State</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody id="ntablebody"></tbody>
|
||||
</table>
|
||||
</div>
|
||||
<div id="log" class="log" style="overflow: auto;">
|
||||
<h2>Log of Events</h2>
|
||||
<div id="messages">
|
||||
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{% include 'foot.html' %}
|
||||
|
||||
<!-- Connection status modal -->
|
||||
<div id="connectionModal" class="connection-modal">
|
||||
<div class="connection-modal-content">
|
||||
<p>⚠️ Connection is closed, reopening...</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
setup();
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
@@ -1,3 +0,0 @@
|
||||
<label for="drawer-toggle" id="drawer-toggle-label"></label>
|
||||
<header>{{ header }}</header>
|
||||
|
||||
-220
@@ -1,220 +0,0 @@
|
||||
"""UDP listener and datagram processing."""
|
||||
|
||||
import asyncio
|
||||
import zlib
|
||||
import logging
|
||||
|
||||
from .proto import stodict, oldmtodict
|
||||
from hbd.utils import dur
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class EchoServerProtocol(asyncio.DatagramProtocol):
    """Datagram protocol that decodes each packet and hands it to a handler.

    ``config`` is stored as given (an empty dict when falsy); ``handler`` is
    an optional callable invoked as ``handler(msg, addr, transport)``.
    """

    def __init__(self, config=None, handler=None):
        super().__init__()
        self.handler = handler
        self.config = config if config else {}

    def connection_made(self, transport):
        """Remember the transport so handlers can send replies through it."""
        self.transport = transport
        logger.info("UDP Server listening...")

    def datagram_received(self, data, addr):
        """Parse one datagram and dispatch it; failures are logged, not raised."""
        logger.debug("Received from %s", addr)
        try:
            decoded = parse_message(data)
            callback = self.handler
            if not callback:
                return
            # The transport is passed along so the application-provided
            # handler can answer the sender (ACKs / commands).
            callback(decoded, addr, self.transport)
        except Exception:
            logger.exception("Error while processing datagram from %s", addr)
|
||||
|
||||
|
||||
def parse_message(data: bytes):
    """Decode a raw datagram into a message dict.

    The current protocol decoder is tried first; when it yields a falsy
    result (e.g. an empty dict) the legacy decoder is used instead, which
    keeps older clients working.
    """
    decoded = stodict(data)
    return decoded if decoded else oldmtodict(data)
|
||||
|
||||
|
||||
def dicttos(ID, d, compress=False):
    """Serialise ``d`` as ``ID:key=value;key=value...``.

    Float values are rendered with five decimals; everything else uses its
    ``%s`` form.  Without compression the result is a ``str``.  With
    ``compress`` true the payload is zlib-compressed (level 6) and the
    result is ``bytes`` of the form ``b"!" + ID + b":" + <compressed>``.
    """
    fields = [
        ("%s=%0.5f" if isinstance(value, float) else "%s=%s") % (key, value)
        for key, value in d.items()
    ]
    payload = ";".join(fields)
    if not compress:
        return ID + ":" + payload
    header = ("!" + ID + ":").encode()
    return header + zlib.compress(payload.encode(), 6)
|
||||
|
||||
|
||||
def handle_datagram(msg: dict, addr, transport, ctx: dict):
    """Handle a parsed datagram message.

    Updates (or creates) the Host/Connection state for the sender, logs and
    pushes notable events, sends an ACK back and flushes any commands queued
    for the host.

    ctx is a dictionary with runtime dependencies:
    - config: dict of configuration
    - hbdclass: module providing Host/Connection classes
    - log: callable(loghost, message)
    - pushmsg: callable(message)
    - msg_to_websockets: callable(typ, data)
    - DEBUG, verbose

    Returns None.  Send failures are reported (when DEBUG > 0) but never
    raised.
    """
    import time  # local import: only needed inside this function

    from hbd.utils import shortname

    if not msg:
        return
    now = time.time()
    cfg = ctx.get("config", {})
    hbdcls = ctx.get("hbdclass")
    log = ctx.get("log")
    pushmsg = ctx.get("pushmsg")
    msg_to_websockets = ctx.get("msg_to_websockets")
    DEBUG = ctx.get("DEBUG", 0)
    verbose = ctx.get("verbose", False)

    def to_bytes(pkt):
        # Ensure the packet is bytes before it is sent; dicttos() returns
        # str when it does not compress.
        return pkt.encode() if isinstance(pkt, str) else pkt

    # normalize addr (ip, port)
    ip = addr[0] if isinstance(addr, (list, tuple)) else addr
    name = msg.get("name", "unknown")
    uname = shortname(name)
    watched = uname in cfg.get("watchhosts", [])

    if uname not in hbdcls.Host.hosts:
        host = hbdcls.Host(uname)
        host.dyn = uname in cfg.get("dyndnshosts", [])
        if verbose:
            print(("XX: New host, num now %s" % (len(hbdcls.Host.hosts))))
        newh = True
    else:
        host = hbdcls.Host.hosts[uname]
        newh = False

    cid = msg.get("id", 0)
    try:
        rtt = float(msg.get("rtt", None))
    except (TypeError, ValueError):
        # missing or unparsable rtt
        rtt = None

    if msg.get("ID") == "HTB":
        host.doesack = msg.get("acks", -1)
        host.setcver(msg.get("ver", 0))

    try:
        conn, res = host.conndata(cid, ip, rtt, now)
    except Exception as e:
        if DEBUG > 0:
            print("conndata failed: %s" % e)
        return

    if res:
        if log:
            log(uname, res)
        if watched and pushmsg:
            pushmsg("%s %s" % (host.name, res))

    interval = int(msg.get("interval", 0) or 0)
    shutdown = msg.get("shutdown", 0)
    service = msg.get("service", "unknown")
    message = msg.get("msg", None)
    boot = msg.get("boot", 0)

    if boot:
        if log:
            log(uname, "booted")
        if watched and pushmsg:
            pushmsg("%s booted" % (host.name))
    if message:
        if log:
            log(uname, "msg: %s" % message, service=service)
        if watched and pushmsg:
            pushmsg(message)

    if conn.getstate() != hbdcls.Connection.UP:
        lasts = conn.state
        d = conn.newstate(hbdcls.Connection.UP, now)
        m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d))
        if log:
            log(uname, m)
        if watched and pushmsg:
            pushmsg("%s %s is back" % (uname, conn.afam))

    if boot or newh:
        host.upcount = host.doesack
    else:
        host.upcount += 1

    if shutdown:
        if log:
            log(uname, "%s shutdown" % conn.afam)
        if watched and pushmsg:
            pushmsg("%s %s shutdown" % (uname, conn.afam))
        conn.newstate(hbdcls.Connection.DOWN, now)

    if interval > 0:
        host.interval = interval

    # send ACK back
    rmsg = {"time": time.time()}
    if host.cver < 1:
        opkt = b"ACK"
    else:
        opkt = to_bytes(dicttos("ACK", rmsg, host.cver > 1))
    try:
        transport.sendto(opkt, addr)
    except Exception as e:
        if DEBUG > 0:
            print(("cannot send ack: %s" % e))

    # send any commands we have queued
    while host.cmds:
        # Always consume the head entry.  Previously only "CMD"/"UPD" entries
        # were removed, so any other op stayed at the head and this loop
        # never terminated.
        op, rmsg = host.cmds.pop(0)
        if op == "CMD":
            if log:
                log(uname, "command sent")
            if host.cver < 1:
                rmsg = rmsg["cmd"]
        elif op == "UPD":
            if log:
                log(uname, "update initiated")
            if host.cver < 1:
                if log:
                    log(uname, " ver 0 does not support UPD")
                continue
        if host.cver < 1:
            opkt = rmsg if isinstance(rmsg, (bytes, str)) else str(rmsg)
        else:
            opkt = dicttos(op, rmsg, True)
        try:
            transport.sendto(to_bytes(opkt), addr)
        except Exception as e:
            if DEBUG > 0:
                print(("cannot send cmd/update: %s" % e))

    if msg_to_websockets:
        try:
            msg_to_websockets("host", host.stateinfo())
        except Exception:
            # Best effort: a failing websocket push must not break datagram
            # handling, but leave a trace instead of hiding it completely.
            logger.debug("websocket push for %s failed", uname, exc_info=True)
|
||||
@@ -1,143 +0,0 @@
|
||||
"""WebSocket server and broadcast helpers for hbd.
|
||||
|
||||
Provides an asyncio-based WebSocket server and a thread-safe broadcast
|
||||
function that other threads or synchronous code can call.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
from typing import Callable, Iterable, Optional
|
||||
|
||||
import websockets
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_connections = set()
|
||||
_loop: Optional[asyncio.AbstractEventLoop] = None
|
||||
_get_hosts: Optional[Callable[[], Iterable]] = None
|
||||
_get_msgs: Optional[Callable[[], Iterable]] = None
|
||||
_verbose = False
|
||||
|
||||
|
||||
async def _handler(websocket, path=None):
    """Serve one websocket client.

    Registers the connection, replays the current hosts and the last 100
    messages (when the corresponding getters are configured), then drains
    incoming frames until the client goes away.  The connection is always
    unregistered on exit.
    """
    _connections.add(websocket)
    peer = websocket.remote_address
    if path is None:
        path = getattr(websocket, "path", None)
    if _verbose:
        logger.info("DBG ws_serve: %s: %s", peer, path)

    async def push(kind, payload):
        # Encode and send a single typed frame to this client.
        await websocket.send(json.dumps({"type": kind, "data": payload}))

    try:
        # initial snapshot of hosts
        if _get_hosts:
            for host_state in _get_hosts():
                await push("host", host_state)
        # recent messages
        if _get_msgs:
            for text in list(_get_msgs())[-100:]:
                await push("message", text)

        # Stay in the receive loop until the client disconnects; incoming
        # data (e.g. a client 'hello') is ignored apart from optional logging.
        async for frame in websocket:
            if _verbose:
                logger.debug("received ws data: %s", frame)

    except (
        websockets.exceptions.ConnectionClosedOK,
        websockets.exceptions.ConnectionClosedError,
    ) as closed:
        if _verbose:
            logger.info("ws closed: %r", closed)
    except Exception as err:
        logger.exception("ws handler exception: %s", err)
    finally:
        _connections.discard(websocket)
        await websocket.wait_closed()
|
||||
|
||||
|
||||
async def start(
    host: str,
    ws_port: int,
    wss_port: Optional[int] = None,
    ssl_context=None,
    get_hosts: Optional[Callable] = None,
    get_msgs: Optional[Callable] = None,
    verbose: bool = False,
):
    """Start the WebSocket server(s) and wait forever.

    Meant to be awaited from the main asyncio event loop.  A plain server is
    always started on ``ws_port``; a TLS server is additionally started when
    both ``wss_port`` and ``ssl_context`` are given.  The running loop and
    the getter callables are stored in module globals for later use by the
    handler and the broadcast helper.
    """
    global _loop, _get_hosts, _get_msgs, _verbose
    _loop = asyncio.get_running_loop()
    _get_hosts, _get_msgs, _verbose = get_hosts, get_msgs, verbose

    level = logging.DEBUG if verbose else logging.INFO
    logging.getLogger("websockets.server").setLevel(level)

    # plain WebSocket first, secure WebSocket second (optional)
    pending = [websockets.serve(_handler, host, ws_port)]
    if wss_port and ssl_context:
        pending.append(websockets.serve(_handler, host, wss_port, ssl=ssl_context))

    # wait until every server is actually listening
    for server in pending:
        await server

    if _verbose:
        logger.info(
            "WebSocket server(s) started on port %s (wss %s)", ws_port, wss_port
        )

    # never completes on its own; ends only through cancellation
    await asyncio.Future()
|
||||
|
||||
|
||||
def broadcast(typ: str, data) -> bool:
    """Queue a ``{"type": typ, "data": data}`` frame for every open client.

    The sends are scheduled on the server's event loop with
    ``asyncio.run_coroutine_threadsafe``.  Connections that are not open, or
    whose send could not be scheduled, are dropped from the registry.
    Returns False when no server loop has been recorded, True otherwise.
    """
    if not _loop:
        return False
    payload = json.dumps({"type": typ, "data": data})
    stale = []
    for sock in list(_connections):
        if sock.state == websockets.protocol.State.OPEN:
            try:
                asyncio.run_coroutine_threadsafe(sock.send(payload), _loop)
            except Exception:
                stale.append(sock)
                logger.debug("ws.send exception: closed")
        else:
            stale.append(sock)
    for sock in stale:
        try:
            asyncio.run_coroutine_threadsafe(sock.wait_closed(), _loop)
        except Exception:
            pass
        _connections.discard(sock)
    return True
|
||||
|
||||
|
||||
def connection_count() -> int:
    """Return how many websocket connections are currently registered."""
    registered = len(_connections)
    return registered
|
||||
-380
@@ -1,380 +0,0 @@
|
||||
"""
|
||||
host and connection class shared between hbd and
|
||||
the website's heartbeat.py
|
||||
|
||||
"""
|
||||
|
||||
import time
|
||||
import json
|
||||
import copy
|
||||
import queue
|
||||
|
||||
num = 0
|
||||
|
||||
MAXRTTS = 10
|
||||
|
||||
DEBUG = 2
|
||||
|
||||
|
||||
def log(host, m):
    """Print a debug line for ``host`` when the module DEBUG flag is set."""
    if not DEBUG:
        return
    print("class log: %s %s" % (host, m))
|
||||
|
||||
|
||||
class Connection:
    """State of one address-family link (e.g. IPv4 or IPv6) of a host.

    Keeps the peer address, a bounded history of round-trip times, the time
    of the last heartbeat, and the current state with the time it was
    entered.  ``host`` is a back-pointer to the owning Host (or None for the
    placeholder instance used to build empty/header rows).
    """

    # map of addrs to names
    htab = {}

    UNKNOWN = "unknown"
    UP = "up"
    DOWN = "down"
    OVERDUE = "overdue"

    def __init__(self, host, cid, addr, afam):
        self.host = host
        self.cid = cid
        # strip the IPv4-mapped IPv6 prefix so the plain IPv4 address is kept
        if addr[0:7] == "::ffff:":
            addr = addr[7:]
        self.addr = addr
        self.afam = afam
        self.rtts = [0]
        self.lastbeat = time.time()
        self.statetime = self.lastbeat
        self.deltastatetime = "computed"
        self.state = Connection.UNKNOWN

        if host:
            Connection.htab[addr] = self.host.name
            if self.host.isDynDns():
                log(self.host.name, "dns update %s" % self.addr)
                Host.dnsQ.put((self.host.name, self.addr))

    def registerDns(self):
        """Queue a DNS update for this connection's address."""
        Host.dnsQ.put((self.host.name, self.addr))

    def clearstate(self):
        """Return a state dict with every field set to the empty string."""
        return {
            "addr": "",
            "rtt": "",
            "lastbeat": "",
            "state": "",
            "statetime": "",
            "deltastatetime": "",
            "rttstate": "",
        }

    def statedict(self, Null=False):
        """Return the display state of this connection as a dict of strings.

        With ``Null`` true, or when the connection is still UNKNOWN and no
        heartbeat has been seen for more than ten days, all fields are empty.
        """
        d = self.clearstate()
        now = time.time()
        if not Null:
            d["addr"] = self.addr
            if self.rtts[-1]:
                d["rtt"] = "%0.1f" % self.rtts[-1]
            elif self.state == Connection.UNKNOWN:
                d["rtt"] = ""
            else:
                d["rtt"] = "?"
            d["lastbeat"] = self.lastbeat
            if self.state == Connection.OVERDUE:
                d["state"] = "<b>%s</b>" % self.state
            else:
                d["state"] = self.state
            if self.state == Connection.UP:
                d["rttstate"] = d["rtt"]
            elif self.state == Connection.OVERDUE:
                d["rttstate"] = ""
            else:
                d["rttstate"] = d["state"]
            d["statetime"] = time.strftime(
                "%Y-%m-%d %H:%M:%S", time.localtime(self.statetime)
            )
            delta = now - self.statetime

            if self.state == Connection.UNKNOWN:
                d["deltastatetime"] = ""
            elif delta > 86400:
                d["deltastatetime"] = "%0.1f days" % (delta / 86400.0)
            elif delta > 3600:
                # Formatted by hand (space-padded hour, zero-padded minute)
                # because the "%k" strftime directive used before is a
                # platform extension and not available everywhere.
                elapsed = time.gmtime(delta)
                d["deltastatetime"] = "%2d:%02d hrs" % (
                    elapsed.tm_hour,
                    elapsed.tm_min,
                )
            elif delta > 60:
                d["deltastatetime"] = time.strftime("%M:%S mins", time.gmtime(delta))
            else:
                d["deltastatetime"] = "%i secs" % (delta)
        if self.state == Connection.UNKNOWN and now - self.lastbeat > 86400 * 10:
            d = self.clearstate()

        return d

    def headerdict(self, afam):
        """Return the column titles matching the keys of ``statedict``."""
        return {
            "addr": "%s Addr" % afam,
            "rtt": "Latencey",
            "lastbeat": "Last Contact",
            "state": "State",
            "statetime": "Last State",
            "rttstate": "Reach",
            "deltastatetime": "Last State",
        }

    def jsons(self):
        """Return the connection's attributes as a JSON string.

        The ``host`` back-pointer is replaced by the host's name, because a
        Host object is not JSON serialisable (dumping ``__dict__`` directly
        raised TypeError for every connection that had a host).
        """
        state = dict(self.__dict__)
        owner = state.get("host")
        if owner is not None:
            state["host"] = owner.name
        return json.dumps(state)

    def newstate(self, state, now, when=0):
        """Enter ``state`` and return the seconds spent in the previous one.

        The transition time is taken as ``now - when``.
        """
        self.state = state
        changed_at = now - when
        spent = changed_at - self.statetime
        self.statetime = changed_at
        return spent

    def getstate(self):
        """Return the current state string."""
        return self.state

    def newaddr(self, addr, rtt, now):
        """Record a heartbeat from ``addr`` with round-trip time ``rtt``.

        Returns None when the address is unchanged, otherwise a
        "changed from X to Y" message after updating the address table
        (and queuing a DNS update for dynamic-DNS hosts).
        """
        self.lastbeat = now
        self.rtts.append(rtt)
        if len(self.rtts) > MAXRTTS:
            del self.rtts[0]

        if self.addr == addr:
            return None

        r = "changed from %s to %s" % (self.addr, addr)
        # the old address may legitimately be absent from the table
        Connection.htab.pop(self.addr, None)
        self.addr = addr
        Connection.htab[addr] = self.host.name
        if self.host.isDynDns():
            Host.dnsQ.put((self.host.name, self.addr))
        return r
|
||||
|
||||
|
||||
#
|
||||
class Host:
    """A monitored host with its per-address-family connections.

    All named hosts are registered in the class-level ``hosts`` table;
    ``dnsQ`` is the queue of pending ``(name, addr)`` DNS updates.
    """

    # Table of Hosts
    hosts = {}
    dnsQ = queue.Queue()

    def __init__(self, name):
        global num
        self.name = name
        # only named hosts are counted and registered (the placeholder
        # instance is created with name None)
        if name:
            num += 1
            Host.hosts[name] = self
        self.num = num
        self.dyn = False
        self.watched = False
        self.upcount = 0
        self.interval = 0
        self.doesack = -1
        self.cmds = []
        self.cver = 0
        self.connections = {}
        self.hdwcounts = [[0, 0], [0, 0], [0, 0]]

    def statedict(self):
        """Return the display state of the host, including both families.

        Connection fields are stored under ``"<family>.<field>"`` keys; a
        missing family contributes empty strings.
        """
        d = {}
        d["name"] = self.name
        if self.dyn:
            d["name"] += "*"
        if self.watched:
            d["name"] = "<b>%s</b>" % d["name"]
        d["dyn"] = str(self.dyn)
        d["ver"] = str(self.cver)
        d["num"] = self.num
        for c in ["IPv4", "IPv6"]:
            if c in self.connections:
                cs = self.connections[c].statedict()
            else:
                cs = ubConnection.statedict(True)
            for csv in cs:
                d["%s.%s" % (c, csv)] = cs[csv]

        return d

    def headerdict(self):
        """Return the column titles matching the keys of ``statedict``."""
        d = {}
        d["name"] = "Name"
        d["dyn"] = "Dyn"
        d["ver"] = "Ver"
        d["num"] = "??"
        for c in ["IPv4", "IPv6"]:
            cs = ubConnection.headerdict(c)
            for csv in cs:
                d["%s.%s" % (c, csv)] = cs[csv]
        return d

    def registerDns(self):
        """Queue a DNS update for every connection of this host."""
        for af in self.connections:
            self.connections[af].registerDns()

    @staticmethod
    def _conninfo(conn):
        """Return a copy of a connection's attributes with ``host`` as a name.

        Every value except the host back-pointer is deep-copied; copying the
        back-pointer would duplicate the whole Host (and its connections)
        only to be thrown away again.
        """
        return {
            key: (getattr(value, "name", None) if key == "host" else copy.deepcopy(value))
            for key, value in conn.__dict__.items()
        }

    def stateinfo(self):
        """Return the host's attributes as a dict suitable for serialisation.

        ``connections`` becomes a list of per-connection dicts in which the
        host back-pointer is replaced by the host name.
        """
        ddict = {}
        for key, value in self.__dict__.items():
            if key == "connections":
                ddict[key] = [self._conninfo(conn) for conn in value.values()]
            else:
                ddict[key] = value
        return ddict

    def jsons(self):
        """Return ``stateinfo()`` encoded as JSON."""
        return json.dumps(self.stateinfo())

    def setcver(self, cver):
        """Set the client protocol version reported by the host."""
        self.cver = cver

    def isDynDns(self):
        """Return whether this host is flagged for dynamic DNS updates."""
        return self.dyn

    def isIPv4(self, addr):
        """Return True when ``addr`` (a string or an address tuple) has a dot
        after its first character, which is how IPv4 is recognised here."""
        if isinstance(addr, tuple):
            return addr[0].find(".") > 0
        else:
            return addr.find(".") > 0

    def conndata(self, cid, addr, rtt, now):
        """Record a heartbeat and return ``(connection, change_message)``.

        The connection for the address family is created on first use.
        ``change_message`` is None unless the peer address changed.
        """
        # strip the IPv4-mapped IPv6 prefix
        if addr[0:7] == "::ffff:":
            addr = addr[7:]
        afam = "IPv4" if self.isIPv4(addr) else "IPv6"

        if afam not in self.connections:
            self.connections[afam] = Connection(self, cid, addr, afam)

        conn = self.connections[afam]
        res = conn.newaddr(addr, rtt, now)
        return conn, res

    # called when reloading class from pickle, add new fields here
    def fixup(self):
        """Normalise stored addresses by stripping the ``::ffff:`` prefix."""
        for c in ["IPv4", "IPv6"]:
            if c in self.connections:
                addr = self.connections[c].addr
                if addr[0:7] == "::ffff:":
                    self.connections[c].addr = addr[7:]

    def dispstats(self):
        """Return three ``<td>`` cells summarising ack statistics.

        For each of the three ``hdwcounts`` windows the cell holds
        ``100 - (doesack - a) * 100 / (upcount - u)`` rounded to an integer
        (blank when that is 0, ``-`` when the window is empty).
        """
        if self.doesack != -1:
            if self.upcount > 0:
                cells = []
                for a, u in self.hdwcounts[:3]:
                    if (self.upcount - u) != 0:
                        vs = "%0.0f" % (
                            100.0 - (((self.doesack - a) * 100.0) / (self.upcount - u))
                        )
                        if vs == "0":
                            vs = ""
                    else:
                        vs = "-"
                    cells.append('<td align="right">%s</td>' % vs)
                return "".join(cells)
            else:
                return "<td>(%s)</td><td></td><td></td>" % (self.doesack)
        # fixed: the markup used to be '...<td></td<td></td>>' (broken tags)
        return '<td align="right">N/A</td><td></td><td></td>'

    # Columns of the long table; a tuple is (field, extra tag attributes).
    hostfields_long = [
        "name",
        "IPv4.addr",
        "IPv4.state",
        ("IPv4.rtt", 'style="text-align: right;"'),
        ("IPv4.statetime", 'style="text-align: right;"'),
        "IPv6.addr",
        "IPv6.state",
        ("IPv6.rtt", 'style="text-align: right;"'),
        ("IPv6.statetime", 'style="text-align: right;"'),
        "ver",
    ]

    # Columns of the short table.
    hostfields_short = [
        "name",
        ("IPv4.rttstate", 'style="text-align: right;"'),
        ("IPv4.deltastatetime", 'style="text-align: right;"'),
        ("IPv6.rttstate", 'style="text-align: right;"'),
        ("IPv6.deltastatetime", 'style="text-align: right;"'),
    ]

    def gene(self, tag, v, attrib=None):
        """Return ``v`` wrapped in an HTML element ``tag`` with optional attributes."""
        a = " %s" % attrib if attrib else ""
        return "<%s%s>%s</%s>" % (tag, a, v, tag)

    def htmltable(self, tag, hd, short):
        """Return one ``<tr>`` built from ``hd`` using ``tag`` for each cell."""
        hostfields = Host.hostfields_short if short else Host.hostfields_long
        h = []
        for f in hostfields:
            if isinstance(f, tuple):
                h.append(self.gene(tag, hd[f[0]], f[1]))
            else:
                h.append(self.gene(tag, hd[f]))
        return self.gene("tr", "\n".join(h))

    def buildhosttable(self, short=False):
        """Return the host table as a list of HTML fragments, sorted by name."""
        if DEBUG > 1:
            print("DBG buildhosttable: start")
        res = ['<table id="ntable" class="sortable">']
        res.append(ubHost.htmltable("th", ubHost.headerdict(), short))
        for h in sorted(Host.hosts.keys()):
            res.append(ubHost.htmltable("td", Host.hosts[h].statedict(), short))
        res.append("</table>")
        if DEBUG > 1:
            print("DBG buildhosttable: %s" % res)
        return res

    def buildmsgtable(self, msgs):
        """Return the event log as HTML fragments.

        Shows the last ``max(40 - number_of_hosts, 3)`` messages.
        """
        le = max(40 - len(Host.hosts), 3)
        res = ["<h4>Log of Events</h4>"]
        res.extend("%s<BR>" % m for m in msgs[-le:])
        return res
|
||||
|
||||
|
||||
# create fake "unbound objects", remove in Python 3.0
|
||||
ubHost = Host(None)
|
||||
ubConnection = Connection(None, "", "", "")
|
||||
Executable
+4
@@ -0,0 +1,4 @@
|
||||
#!/bin/sh
|
||||
|
||||
#echo "OK - all is well"
|
||||
echo "WARNING - 12 apps require update: calendar->6.2.2 ,call_summary_bot->3.3.0 ,collectives->4.2.0 ,contacts->8.3.7 ,forum->0.36.0 ,mail->5.7.6 ,news->28.1.0 ,notes->4.13.1 ,notify_push->1.3.1 ,ownershiptransfer->1.4.0 ,richdocuments->9.0.5 ,spreed->22.0.10"
|
||||
+27
-11
@@ -4,26 +4,41 @@ build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "hbd"
|
||||
version = "5.0.5"
|
||||
description = "Heartbeat daemon (hbd) — receive heartbeats and act on them"
|
||||
version = "5.0.12"
|
||||
description = "Heartbeat monitoring system — client (hbc) and server (hbd)"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.11"
|
||||
license = "MIT"
|
||||
keywords = ["heartbeat", "monitoring", "dns", "websocket"]
|
||||
keywords = ["heartbeat", "monitoring", "dns", "websocket", "system-monitoring"]
|
||||
authors = [
|
||||
{ name = "heartbeat contributors" }
|
||||
]
|
||||
|
||||
# Core dependencies (required for both client and server)
|
||||
dependencies = [
|
||||
"websockets>=13.2",
|
||||
"mattermostdriver>=7.3.0",
|
||||
"PyYAML>=6.0",
|
||||
"aiohttp>=3.11",
|
||||
"Jinja2>=3.1.6",
|
||||
"fastapi>=0.128.0",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
# Client-only dependencies (hbc - system monitoring client)
|
||||
client = [
|
||||
"psutil>=5.9.0",
|
||||
]
|
||||
|
||||
# Server-only dependencies (hbd - heartbeat daemon/server)
|
||||
server = [
|
||||
"websockets>=13.2",
|
||||
"mattermostdriver>=7.3.0",
|
||||
"aiohttp>=3.11",
|
||||
"Jinja2>=3.1.6",
|
||||
]
|
||||
|
||||
# Install both client and server
|
||||
all = [
|
||||
"hbd[client,server]",
|
||||
]
|
||||
|
||||
# Development dependencies
|
||||
dev = [
|
||||
"pytest>=7.0",
|
||||
"pytest-cov>=4.0",
|
||||
@@ -35,15 +50,16 @@ dev = [
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
hbd = "hbd.cli:main"
|
||||
hbc = "hbd.hbc:main"
|
||||
hbd = "hbd.server.cli:main"
|
||||
hbc = "hbd.client.main:main"
|
||||
|
||||
[tool.setuptools.packages.find]
|
||||
where = ["."]
|
||||
include = ["hbd*"]
|
||||
|
||||
[tool.setuptools.package-data]
|
||||
"hbd" = ["*.yaml", "static/*", "static/*/*", "templates/*"]
|
||||
"hbd.server" = ["*.yaml", "static/*", "static/*/*", "templates/*"]
|
||||
"hbd.client" = ["*.yaml"]
|
||||
|
||||
|
||||
[tool.black]
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
set -e
|
||||
uv version --bump patch
|
||||
VER=$(uv version --short)
|
||||
sed -i "" "s/__version__ = \"[0-9.]*\"\(.*\)$/__version__ = \"$VER\"\1/" hbd/__init__.py
|
||||
sed -i".bak" "s/__version__ = \"[0-9.]*\"\(.*\)$/__version__ = \"$VER\"\1/" hbd/__init__.py
|
||||
|
||||
# commit pyproject.toml
|
||||
git commit -m "version $VER" pyproject.toml hbd/__init__.py
|
||||
@@ -11,3 +11,5 @@ git push
|
||||
# tag version
|
||||
git tag -a v$VER -m "Version $VER"
|
||||
git push --tags
|
||||
|
||||
rm hbd/__init__.py.bak
|
||||
|
||||
@@ -0,0 +1,390 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Demo script for HTTP API endpoints.
|
||||
Tests and demonstrates the plugin data and alert APIs.
|
||||
"""
|
||||
|
||||
import requests
|
||||
import json
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from time import sleep
|
||||
|
||||
BASE_URL = "http://localhost:50004"
|
||||
|
||||
def print_section(title):
    """Print ``title`` framed by a blank line and two 70-character rules."""
    rule = '=' * 70
    print("\n" + rule)
    print(f" {title}")
    print(rule)
|
||||
|
||||
def format_timestamp(timestamp):
    """Render a Unix timestamp as ``YYYY-MM-DD HH:MM:SS`` in local time."""
    moment = datetime.fromtimestamp(timestamp)
    return moment.strftime('%Y-%m-%d %H:%M:%S')
|
||||
|
||||
def format_duration(seconds):
    """Render a duration in seconds using its two largest units.

    Produces ``"Ns"``, ``"Nm Ns"``, ``"Nh Nm"`` or ``"Nd Nh"`` depending on
    the magnitude; every component is truncated to an integer.
    """
    if seconds >= 86400:
        whole_days = int(seconds / 86400)
        rest_hours = int((seconds % 86400) / 3600)
        return f"{whole_days}d {rest_hours}h"
    if seconds >= 3600:
        whole_hours = int(seconds / 3600)
        rest_minutes = int((seconds % 3600) / 60)
        return f"{whole_hours}h {rest_minutes}m"
    if seconds >= 60:
        whole_minutes = int(seconds / 60)
        rest_seconds = int(seconds % 60)
        return f"{whole_minutes}m {rest_seconds}s"
    return f"{int(seconds)}s"
|
||||
|
||||
def test_hosts_api():
    """Test GET /api/0/hosts endpoint.

    Prints a short summary for every host and returns the decoded host
    list, or an empty list when the request fails.
    """
    print_section("1. List All Monitored Hosts")

    try:
        response = requests.get(f"{BASE_URL}/api/0/hosts", timeout=5)
        response.raise_for_status()
        hosts = response.json()

        print(f"Found {len(hosts)} hosts:\n")
        for host in hosts:
            name = host.get('name', 'unknown')
            # `ver` was printed below without ever being assigned, which
            # raised NameError on the first host.
            # NOTE(review): the key name is an assumption ('ver', falling
            # back to 'cver') - confirm against the API's host payload.
            ver = host.get('ver', host.get('cver', '?'))
            dyn = host.get('dyn', False)
            conn_count = len(host.get('connections', []))

            print(f" • {name}")
            print(f" - Protocol: IPv{ver}")
            print(f" - Dynamic: {dyn}")
            print(f" - Connections: {conn_count}")

        return hosts

    except requests.RequestException as e:
        print(f"❌ Error: {e}")
        return []
|
||||
|
||||
def test_host_plugins_api(hostname):
    """Test GET /api/0/hosts/{hostname}/plugins endpoint.

    Prints each plugin with its first three metrics and returns the list of
    plugin names, or an empty list when the request fails.
    """
    print_section(f"2. Get All Plugins for Host: {hostname}")

    try:
        reply = requests.get(f"{BASE_URL}/api/0/hosts/{hostname}/plugins", timeout=5)
        reply.raise_for_status()
        payload = reply.json()
    except requests.RequestException as e:
        print(f"❌ Error: {e}")
        return []

    plugins = payload.get('plugins', {})
    print(f"Found {len(plugins)} plugins:\n")

    preview = 3  # number of metrics shown per plugin
    for plugin_name, plugin_data in plugins.items():
        metrics = plugin_data.get('data', {})

        print(f" 📦 {plugin_name}")
        print(f" Last update: {format_timestamp(plugin_data.get('timestamp', 0))}")
        print(f" Samples: {plugin_data.get('sample_count', 0)}")
        print(f" Metrics: {len(metrics)}")

        for metric, value in list(metrics.items())[:preview]:
            if isinstance(value, float):
                rendered = f"{value:.2f}"
            elif isinstance(value, dict):
                rendered = f"[nested data, {len(value)} keys]"
            else:
                rendered = f"{value}"
            print(f" - {metric}: {rendered}")

        hidden = len(metrics) - preview
        if hidden > 0:
            print(f" ... and {hidden} more")
        print()

    return list(plugins.keys())
|
||||
|
||||
def test_plugin_detail_api(hostname, plugin_name, limit=5):
    """Test GET /api/0/hosts/{hostname}/plugins/{plugin_name} endpoint.

    Requests at most ``limit`` samples, prints the first five metrics (in
    sorted order) of each, and returns the sample list, or an empty list
    when the request fails.
    """
    print_section(f"3. Get Detailed Data: {hostname}/{plugin_name}")

    try:
        reply = requests.get(
            f"{BASE_URL}/api/0/hosts/{hostname}/plugins/{plugin_name}",
            params={'limit': limit},
            timeout=5,
        )
        reply.raise_for_status()
        payload = reply.json()
    except requests.RequestException as e:
        print(f"❌ Error: {e}")
        return []

    samples = payload.get('samples', [])
    print(f"Retrieved {len(samples)} samples (limit={limit}):\n")

    for position, sample in enumerate(samples, start=1):
        metrics = sample.get('data', {})

        print(f" [{position}] {format_timestamp(sample.get('timestamp', 0))}")
        for metric, value in sorted(metrics.items())[:5]:  # first 5 metrics only
            if isinstance(value, float):
                rendered = f"{value:.2f}"
            elif isinstance(value, dict):
                rendered = f"[nested: {len(value)} keys]"
            else:
                rendered = f"{value}"
            print(f" {metric}: {rendered}")
        print()

    return samples
|
||||
|
||||
def test_host_alerts_api(hostname):
    """Test GET /api/0/hosts/{hostname}/alerts endpoint.

    Prints the alert summary and every alert whose level is not ``OK``.
    Returns the decoded response, or an empty dict when the request fails.
    """
    print_section(f"4. Get Alerts for Host: {hostname}")

    try:
        reply = requests.get(f"{BASE_URL}/api/0/hosts/{hostname}/alerts", timeout=5)
        reply.raise_for_status()
        data = reply.json()
    except requests.RequestException as e:
        print(f"❌ Error: {e}")
        return {}

    summary = data.get('summary', {})
    print("Summary:")
    print(f" ✓ OK: {summary.get('ok', 0)}")
    print(f" ⚠️ Warning: {summary.get('warning', 0)}")
    print(f" 🔴 Critical: {summary.get('critical', 0)}")
    print(f" ❓ Unknown: {summary.get('unknown', 0)}")
    print()

    # everything that is not OK counts as active
    active_alerts = [
        entry for entry in data.get('alerts', []) if entry.get('level') != 'OK'
    ]
    if not active_alerts:
        print("✓ No active alerts - all systems normal!")
        return data

    print(f"Active Alerts ({len(active_alerts)}):")
    for entry in active_alerts:
        metric = entry.get('metric_path', 'unknown')
        level = entry.get('level', 'UNKNOWN')
        value = entry.get('last_value', 0)
        elapsed = datetime.now().timestamp() - entry.get('since', 0)

        icon = '⚠️' if level == 'WARNING' else '🔴'
        value_text = f"{value:.2f}" if isinstance(value, float) else f"{value}"
        print(f" {icon} {metric}")
        print(f" Level: {level}")
        print(f" Value: {value_text}")
        print(f" Duration: {format_duration(elapsed)}")
        print()

    return data
|
||||
|
||||
def test_all_alerts_api():
    """Test GET /api/0/alerts endpoint.

    Prints the host count and summary counters from the response, then
    the details of every alert it lists.

    Returns:
        The decoded JSON body, or an empty dict if the request raised
        requests.RequestException.
    """
    # Levels without an entry fall back to the critical icon.
    level_icons = {'WARNING': '⚠️', 'UNKNOWN': '❓'}

    print_section("5. Get All Active Alerts Across All Hosts")

    try:
        response = requests.get(f"{BASE_URL}/api/0/alerts", timeout=5)
        response.raise_for_status()
        data = response.json()

        alerts = data.get('alerts', [])
        summary = data.get('summary', {})
        host_count = data.get('host_count', 0)

        print(f"Monitoring {host_count} hosts")
        print(f"Active Alerts: {summary.get('total', 0)}")
        print(f" 🔴 Critical: {summary.get('critical', 0)}")
        print(f" ⚠️ Warning: {summary.get('warning', 0)}")
        print()

        if alerts:
            print("Alert Details:")
            now = datetime.now().timestamp()
            for alert in alerts:
                hostname = alert.get('hostname', 'unknown')
                metric = alert.get('metric_path', 'unknown')
                level = alert.get('level', 'UNKNOWN')
                value = alert.get('last_value', 0)
                since = alert.get('since')
                notification_count = alert.get('notification_count', 0)

                # Without a start time the subtraction would report the
                # time elapsed since the epoch, so say "unknown" instead.
                if since:
                    duration_text = format_duration(now - since)
                else:
                    duration_text = "unknown"

                print(f" {level_icons.get(level, '🔴')} {hostname} / {metric}")
                print(f" Level: {level}")
                print(f" Value: {value:.2f}" if isinstance(value, float) else f" Value: {value}")
                print(f" Duration: {duration_text}")
                print(f" Notifications: {notification_count}")
                print()
        else:
            print("✅ All systems normal - no active alerts!")

        return data

    except requests.RequestException as e:
        print(f"❌ Error: {e}")
        return {}
|
||||
|
||||
def test_messages_api():
    """Test GET /api/0/messages endpoint.

    Prints the number of messages returned and then the last five.

    Returns:
        The decoded JSON body, or an empty list if the request raised
        requests.RequestException.
    """
    print_section("6. Get Recent Messages")

    try:
        reply = requests.get(f"{BASE_URL}/api/0/messages", timeout=5)
        reply.raise_for_status()
        messages = reply.json()

        print(f"Last {len(messages)} messages:\n")

        # Only the five most recent entries are printed.
        tail = messages[-5:]
        for entry in tail:
            when = entry.get('time', 0)
            sender = entry.get('host', 'unknown')
            body = entry.get('msg', '')
            print(f" [{format_timestamp(when)}] {sender}: {body}")

        return messages

    except requests.RequestException as err:
        print(f"❌ Error: {err}")
        return []
|
||||
|
||||
def _report_expected_404(response):
    """Print whether *response* carries the 404 status the error tests expect."""
    if response.status_code == 404:
        error_data = response.json()
        print(f" ✓ Correctly returned 404: {error_data.get('error', 'No error message')}")
    else:
        print(f" ⚠️ Unexpected status code: {response.status_code}")


def test_error_handling():
    """Test API error handling.

    Requests a non-existent host and a non-existent plugin on the first
    listed host, and reports whether each request returned 404. Any
    exception raised by a check is printed and does not stop the
    remaining checks.
    """
    print_section("7. Error Handling Tests")

    # Test non-existent host
    print("Testing non-existent host...")
    try:
        response = requests.get(f"{BASE_URL}/api/0/hosts/nonexistenthost/plugins", timeout=5)
        _report_expected_404(response)
    except Exception as e:
        print(f" ❌ Error: {e}")

    # Test non-existent plugin
    print("\nTesting non-existent plugin...")
    try:
        # Get first host
        hosts = requests.get(f"{BASE_URL}/api/0/hosts", timeout=5).json()
        if hosts:
            hostname = hosts[0]['name']
            response = requests.get(
                f"{BASE_URL}/api/0/hosts/{hostname}/plugins/nonexistentplugin",
                timeout=5
            )
            _report_expected_404(response)
        else:
            # Say so explicitly rather than leaving the heading without a result.
            print(" ⚠️ Skipped: no hosts available")
    except Exception as e:
        print(f" ❌ Error: {e}")
|
||||
|
||||
def demo_monitoring_loop(iterations=5, interval=3):
    """Demonstrate continuous monitoring.

    Polls GET /api/0/alerts repeatedly and prints one status line per
    poll. With the default arguments the output is the same as the
    former hard-coded five polls, three seconds apart.

    Args:
        iterations: Number of polls to perform.
        interval: Seconds to sleep between polls.
    """
    print_section(f"8. Continuous Monitoring Demo ({iterations} iterations)")

    print(f"Monitoring alerts every {interval} seconds (Ctrl+C to stop)...\n")

    try:
        for i in range(iterations):
            response = requests.get(f"{BASE_URL}/api/0/alerts", timeout=5)
            response.raise_for_status()
            data = response.json()

            summary = data.get('summary', {})
            critical = summary.get('critical', 0)
            warning = summary.get('warning', 0)

            timestamp = datetime.now().strftime('%H:%M:%S')
            status = "🔴 CRITICAL" if critical > 0 else "⚠️ WARNING" if warning > 0 else "✅ OK"

            print(f"[{timestamp}] {status} - Critical: {critical}, Warning: {warning}")

            if i < iterations - 1:  # Don't sleep after last iteration
                sleep(interval)

    except KeyboardInterrupt:
        print("\n\nMonitoring stopped by user")
    except Exception as e:
        print(f"\n❌ Error: {e}")
|
||||
|
||||
def main():
    """Run all API tests.

    Checks that the API answers at BASE_URL (exiting with status 1 if
    not), then runs the host, plugin, alert, message, error-handling
    and monitoring demos in order. Returns early, without running the
    remaining demos, when the host list is empty.
    """
    # Local import: only this function needs URL parsing.
    from urllib.parse import urlsplit

    top_border = "╔══════════════════════════════════════════════════════════════╗"
    bottom_border = "╚══════════════════════════════════════════════════════════════╝"
    title = "Heartbeat Daemon HTTP API Demo & Test Suite"
    # Pad the title row to the border width so the box edges line up.
    title_row = "║" + title.center(len(top_border) - 2) + "║"
    print("\n" + top_border + "\n" + title_row + "\n" + bottom_border + "\n")

    print(f"Testing API at: {BASE_URL}")
    print("Ensure the heartbeat daemon is running!")

    # Test basic connectivity
    try:
        response = requests.get(f"{BASE_URL}/api/0/hosts", timeout=2)
        response.raise_for_status()
        print("✅ API is reachable\n")
    except Exception as e:
        # Parse the URL instead of splitting on ':' so a URL without an
        # explicit port, or with a path, still yields a sensible port.
        parts = urlsplit(BASE_URL)
        port = parts.port or (443 if parts.scheme == 'https' else 80)

        print(f"❌ Cannot connect to API: {e}")
        print("\nPlease ensure:")
        print(" 1. Heartbeat daemon is running")
        print(" 2. HTTP server is enabled in configuration")
        print(f" 3. Server is listening on port {port}")
        sys.exit(1)

    # Run test suite
    hosts = test_hosts_api()

    if not hosts:
        print("\n⚠️ No hosts found. Ensure clients are sending heartbeats.")
        return

    # Pick first host for detailed testing
    hostname = hosts[0].get('name', '')

    if hostname:
        plugins = test_host_plugins_api(hostname)

        if plugins:
            # Test detailed plugin data
            test_plugin_detail_api(hostname, plugins[0], limit=3)

        # Test alert endpoints
        test_host_alerts_api(hostname)

    # Test global endpoints
    test_all_alerts_api()
    test_messages_api()

    # Test error handling
    test_error_handling()

    # Continuous monitoring demo
    demo_monitoring_loop()

    print_section("Test Suite Complete")
    print("""
Next Steps:
• View the web UI at http://localhost:50004/live
• Check plugin metrics at http://localhost:50004/plugins
• Monitor alerts at http://localhost:50004/alerts
• Read API documentation: docs/HTTP_API.md
""")
|
||||
|
||||
if __name__ == '__main__':
    # Ctrl+C is a normal way to leave the demo, so exit with status 0.
    try:
        main()
    except KeyboardInterrupt:
        print("\n\nDemo interrupted by user")
        raise SystemExit(0)
|
||||
+11
-1
@@ -1,6 +1,16 @@
|
||||
#!/bin/sh
|
||||
|
||||
# install hbd/hbc from wheel and create symlinks for hbd and hbc in ~/bin
|
||||
# Install the heartbeat tools. By default, this will install the hbc
|
||||
# client only. The server is installed when the arg 'server' is passed
|
||||
# to the script. The script will install the heartbeat tools in a python
|
||||
# virtual environment in ~/venvs/hbd. The hbd and hbc commands will be
|
||||
# installed from the wheel and symlinked to ~/bin/hbd and ~/bin/hbc,
|
||||
# respectively. If the virtual environment already exists, it will be
|
||||
# reused. The script will also remove any existing symlinks for hbd and hbc
|
||||
# in ~/bin before creating new ones.
|
||||
|
||||
|
||||
# Install hbd/hbc from the wheel and create symlinks for hbd and hbc in ~/bin.
|
||||
|
||||
set -e
|
||||
if [ ! -d ~/venvs/hbd ]; then
|
||||
|
||||
@@ -0,0 +1,99 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test all plugins together.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
# Setup path
|
||||
import sys
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
from hbd.plugin import PluginRegistry, PluginLoader
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s %(name)s: %(message)s"
|
||||
)
|
||||
|
||||
async def test_all_plugins():
    """Test loading all plugins.

    Loads every plugin from the ``hbd/plugins`` directory next to this
    script, lists them, calls ``collect()`` on each one and prints a
    sample of the returned fields. Plugins are unloaded even if an
    unexpected error escapes the listing or collection steps, and the
    closing line names the plugins whose collection failed.
    """
    print("=" * 70)
    print("Testing All Plugins")
    print("=" * 70)

    # Create registry and loader
    registry = PluginRegistry()
    loader = PluginLoader(registry)

    # Configuration for plugins
    config = {
        "cpu_monitor": {
            "interval": 30,
            "per_core": False
        },
        "nagios_runner": {
            "interval": 60,
            "commands": [
                {
                    "name": "test_ok",
                    "command": "echo 'OK - test passed | metric=100%;;;0;100'"
                },
                {
                    "name": "test_warning",
                    "command": "echo 'WARNING - test result | value=85%;80;90;0;100' && exit 1"
                }
            ]
        }
    }

    # Load plugins
    plugin_dir = Path(__file__).parent / "hbd" / "plugins"
    print(f"\n1. Loading plugins from: {plugin_dir}")

    count = await loader.load_from_directory(plugin_dir, config)
    print(f" ✓ Loaded {count} plugins")

    failed = []  # names of plugins whose collect() raised
    try:
        # List loaded plugins
        print("\n2. Loaded plugins:")
        for plugin in registry.get_all():
            print(f" - {plugin.name} v{plugin.version}")
            print(f" Type: {plugin.__class__.__name__}")
            print(f" Interval: {plugin.interval}s")
            print(f" Description: {plugin.description}")

        # Test collection for each plugin
        print("\n3. Testing data collection:")
        for plugin in registry.get_all():
            print(f"\n {plugin.name}:")
            try:
                data = await plugin.collect()
                print(f" ✓ Collected {len(data)} fields")

                # Show sample of data
                sample_count = min(5, len(data))
                for key, value in list(data.items())[:sample_count]:
                    value_str = str(value)
                    if len(value_str) > 50:
                        value_str = value_str[:47] + "..."
                    print(f" {key}: {value_str}")

                if len(data) > sample_count:
                    print(f" ... and {len(data) - sample_count} more fields")

            except Exception as e:
                print(f" ✗ Error: {e}")
                failed.append(str(plugin.name))
    finally:
        # Cleanup
        print("\n4. Cleanup...")
        await loader.unload_all()
        print(" ✓ All plugins unloaded")

    print("\n" + "=" * 70)
    if failed:
        # Do not claim success when a collection raised.
        print(f"Tested {count} plugins, {len(failed)} failed: {', '.join(failed)}")
    else:
        print(f"Successfully tested {count} plugins!")
    print("=" * 70)
|
||||
|
||||
if __name__ == "__main__":
    # Build the coroutine first, then run it to completion.
    suite = test_all_plugins()
    asyncio.run(suite)
|
||||
@@ -0,0 +1,160 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test script for all monitoring plugins.
|
||||
|
||||
Tests all available plugins including the new ones:
|
||||
- memory_monitor
|
||||
- disk_monitor
|
||||
- network_monitor
|
||||
- filesystem_info
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import sys
|
||||
import os
|
||||
import logging
|
||||
|
||||
# Add parent directory to path so we can import hbd
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from hbd.plugin import PluginLoader
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def format_bytes(bytes_val):
    """Format a byte count into a human readable string.

    The value is divided by 1024 until its magnitude drops below 1024
    or the largest unit (PB) is reached. The magnitude is compared, so
    negative values are scaled the same way as positive ones.

    Args:
        bytes_val: Number of bytes (int or float).

    Returns:
        The value with two decimals followed by its unit, e.g. "1.50 KB".
    """
    value = bytes_val
    for unit in ('B', 'KB', 'MB', 'GB', 'TB'):
        if abs(value) < 1024.0:
            return f"{value:.2f} {unit}"
        value /= 1024.0
    return f"{value:.2f} PB"
|
||||
|
||||
|
||||
def _format_scalar(key, value):
    """Return the display text for one non-container value, chosen by key name and type."""
    is_number = isinstance(value, (int, float))

    # Byte-like counters: show a scaled size plus the raw number.
    if '_bytes' in key or key.endswith(('_sent', '_recv')) or 'memory_' in key or 'swap_' in key:
        if is_number and value > 1024:
            return f"{format_bytes(value)} ({value:,})"
        return f"{value}"
    # Apply the percent format only to numbers; a non-numeric value
    # (e.g. None or a string) would make the ".1f" format raise.
    if 'percent' in key and is_number:
        return f"{value:.1f}%"
    if isinstance(value, float):
        return f"{value:.2f}"
    if isinstance(value, int) and value > 1000:
        return f"{value:,}"
    return f"{value}"


def print_plugin_data(plugin_name, data, indent=2):
    """Pretty print plugin data.

    Dicts are printed one key per line; nested dicts recurse with two
    more columns of indent. Lists print their length, and their items
    too when there are at most five. Anything that is not a dict is
    printed as-is.

    Args:
        plugin_name: Passed through to recursive calls; not printed.
        data: The value to print.
        indent: Number of leading spaces for this level.
    """
    prefix = " " * indent

    if not isinstance(data, dict):
        print(f"{prefix}{data}")
        return

    for key, value in data.items():
        if isinstance(value, dict):
            print(f"{prefix}{key}:")
            print_plugin_data(plugin_name, value, indent + 2)
        elif isinstance(value, list):
            print(f"{prefix}{key}: [{len(value)} items]")
            if len(value) <= 5:  # Only show small lists
                for item in value:
                    if isinstance(item, dict):
                        print_plugin_data(plugin_name, item, indent + 2)
                    else:
                        print(f"{prefix} - {item}")
        else:
            # Format output based on key name for better readability
            print(f"{prefix}{key}: {_format_scalar(key, value)}")
|
||||
|
||||
|
||||
async def main():
    """Main test function.

    Loads every plugin from the ``hbd/plugins`` directory, calls
    ``collect()`` on each one except ``nagios_runner``, prints the
    collected data and a pass/fail summary, and unloads the plugins
    afterwards.

    Returns:
        0 when every plugin succeeded, 1 otherwise (including when no
        plugin directory is found).
    """
    print("="*60)
    print("Plugin System Test Suite")
    print("="*60)

    # Load all available plugins using the plugin loader
    from hbd.plugin import PluginRegistry, PluginLoader
    from pathlib import Path

    registry = PluginRegistry()
    loader = PluginLoader(registry)

    # The module-level sys.path setup treats the package root as one
    # level above this script, while the lookup here used the script's
    # own directory; accept either layout.
    script_dir = Path(__file__).resolve().parent
    candidates = [
        script_dir / "hbd" / "plugins",
        script_dir.parent / "hbd" / "plugins",
    ]
    plugin_dir = next((path for path in candidates if path.exists()), None)
    if plugin_dir is None:
        print(f"✗ Plugin directory not found: {candidates[0]}")
        return 1

    # Load plugins from directory
    count = await loader.load_from_directory(plugin_dir, {})

    results = {}
    try:
        print(f"\nLoaded {count} plugins:")
        plugins = registry.get_all()
        for plugin in plugins:
            doc = plugin.__class__.__doc__
            # First sentence of the class docstring, without the
            # whitespace that follows the opening quotes.
            summary = doc.split('.')[0].strip() if doc else 'No description'
            print(f" - {plugin.name}: {summary}")

        # Test each plugin
        for plugin in plugins:
            # Skip nagios_runner as it needs specific configuration
            if plugin.name == 'nagios_runner':
                print(f"\n{'='*60}")
                print(f"Skipping: {plugin.name} (requires specific configuration)")
                print(f"{'='*60}")
                results[plugin.name] = True  # Mark as success since it loaded OK
                continue

            print(f"\n{'='*60}")
            print(f"Testing: {plugin.name}")
            print(f"{'='*60}")

            try:
                # Collect data
                data = await plugin.collect()
                if not data:
                    print("⚠ No data collected")
                    results[plugin.name] = False
                elif 'error' in data:
                    print(f"✗ Collection error: {data['error']}")
                    results[plugin.name] = False
                else:
                    print(f"✓ Data collected: {len(data)} top-level fields")
                    print_plugin_data(plugin.name, data)
                    results[plugin.name] = True
            except Exception as e:
                print(f"✗ Failed to collect data: {e}")
                import traceback
                traceback.print_exc()
                results[plugin.name] = False
    finally:
        # Release the loaded plugins however the run ended.
        await loader.unload_all()

    # Summary
    print(f"\n{'='*60}")
    print("Test Summary")
    print(f"{'='*60}")

    success_count = sum(1 for v in results.values() if v)
    total_count = len(results)

    print(f"\nResults: {success_count}/{total_count} plugins successful")
    for name, success in results.items():
        status = "✓" if success else "✗"
        print(f" {status} {name}")

    if success_count == total_count:
        print("\n🎉 All plugins passed!")
        return 0

    print(f"\n⚠ {total_count - success_count} plugin(s) failed")
    return 1
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # The coroutine's return value becomes the process exit status.
    raise SystemExit(asyncio.run(main()))
|
||||
+331
@@ -0,0 +1,331 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test script for message journal functionality.
|
||||
|
||||
Tests:
|
||||
- Journal initialization
|
||||
- Message logging
|
||||
- File rotation based on size
|
||||
- Backup management
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import sys
|
||||
import os
|
||||
import json
|
||||
import tempfile
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
# Add parent directory to path
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from hbd.journal import MessageJournal, get_journal
|
||||
|
||||
|
||||
async def test_basic_logging():
    """Test basic message logging.

    Logs three messages to a journal in a temporary directory, checks
    that the journal file exists and that its first line is a JSON
    object with the expected keys, then prints the journal statistics.
    The journal is closed on every path once it has been initialized,
    and the temporary directory is always removed.

    Returns:
        True if the test passed, False otherwise.
    """
    print("="*60)
    print("Test 1: Basic Message Logging")
    print("="*60)

    # Create temporary directory for journal
    temp_dir = tempfile.mkdtemp(prefix="journal_test_")
    print(f"Using temp directory: {temp_dir}")

    try:
        # Create journal with config
        config = {
            'journal_enabled': True,
            'journal_dir': temp_dir,
            'journal_file': 'test.journal',
            'journal_max_size': 1024,  # 1KB for testing
            'journal_max_backups': 3
        }

        journal = MessageJournal(config)
        await journal.initialize()

        try:
            # Log some test messages
            test_messages = [
                {
                    'ID': 'HTB',
                    'name': 'testhost1',
                    'interval': 30,
                },
                {
                    'ID': 'PLG',
                    'plugin': 'cpu_monitor',
                    'cpu_percent': 45.2,
                    'load_1min': 1.5
                },
                {
                    'ID': 'HTB',
                    'name': 'testhost2',
                    'interval': 60,
                    'boot': 1
                }
            ]

            for i, msg in enumerate(test_messages):
                await journal.log_message(msg, ('192.168.1.100', 50000 + i), 1000.0 + i)
                print(f"✓ Logged message {i+1}: {msg['ID']}")

            # Check journal file exists
            journal_path = Path(temp_dir) / 'test.journal'
            if not journal_path.exists():
                print("✗ Journal file not created")
                return False

            print(f"✓ Journal file created: {journal_path}")

            # Read and verify content
            with open(journal_path, 'r') as f:
                lines = f.readlines()
            print(f"✓ Journal has {len(lines)} entries")

            # Parse first entry
            entry = json.loads(lines[0])
            print(f"✓ First entry structure: {list(entry.keys())}")
            assert 'timestamp' in entry
            assert 'datetime' in entry
            assert 'source_ip' in entry
            assert 'message' in entry
            print("✓ Entry structure validated")

            # Get stats
            stats = journal.get_stats()
            print("\nJournal stats:")
            print(f" Enabled: {stats['enabled']}")
            print(f" Current size: {stats['current_size']} bytes")
            print(f" Max size: {stats['max_size']} bytes")
            print(f" Rotation threshold: {stats['rotation_threshold']}")
        finally:
            # Close on the early-return and error paths too.
            await journal.close()

        print("\n✅ Test 1 PASSED")
        return True

    except Exception as e:
        print(f"\n✗ Test 1 FAILED: {e}")
        import traceback
        traceback.print_exc()
        return False
    finally:
        # Cleanup
        shutil.rmtree(temp_dir, ignore_errors=True)
|
||||
|
||||
|
||||
async def test_rotation():
    """Test log rotation based on size.

    Logs twenty padded messages to a journal configured with a 500-byte
    size limit, lists the resulting ``test.journal*`` files, and fails
    if more backup files exist than ``journal_max_backups`` allows. The
    journal is closed on every path once it has been initialized, and
    the temporary directory is always removed.

    Returns:
        True if the test passed, False otherwise.
    """
    print("\n" + "="*60)
    print("Test 2: Log Rotation")
    print("="*60)

    # Create temporary directory for journal
    temp_dir = tempfile.mkdtemp(prefix="journal_test_")
    print(f"Using temp directory: {temp_dir}")

    try:
        # Create journal with small max size
        config = {
            'journal_enabled': True,
            'journal_dir': temp_dir,
            'journal_file': 'test.journal',
            'journal_max_size': 500,  # 500 bytes - very small for testing
            'journal_max_backups': 3
        }

        journal = MessageJournal(config)
        await journal.initialize()

        try:
            # Log many messages to trigger rotation
            print("Logging messages to trigger rotation...")
            for i in range(20):
                msg = {
                    'ID': 'HTB',
                    'name': f'testhost{i}',
                    'interval': 30,
                    'data': 'x' * 50  # Add some padding
                }
                await journal.log_message(msg, ('192.168.1.100', 50000 + i), 1000.0 + i)

                # Give rotation time to complete
                await asyncio.sleep(0.01)

            print("✓ Logged 20 messages")

            # Check for rotated files
            journal_dir = Path(temp_dir)
            all_files = list(journal_dir.glob('test.journal*'))
            print(f"✓ Found {len(all_files)} journal files")

            for f in sorted(all_files):
                size = f.stat().st_size
                print(f" - {f.name}: {size} bytes")

            # Should have current file + some backups
            if len(all_files) > 1:
                print(f"✓ Rotation occurred ({len(all_files) - 1} backup files)")
            else:
                print("⚠ No rotation occurred (may not have reached threshold)")

            # Check max backups limit
            backup_files = [f for f in all_files if f.name != 'test.journal']
            if len(backup_files) > config['journal_max_backups']:
                print(f"✗ Too many backups: {len(backup_files)} > {config['journal_max_backups']}")
                return False
            print(f"✓ Backup count within limit: {len(backup_files)} <= {config['journal_max_backups']}")
        finally:
            # Close on the early-return and error paths too.
            await journal.close()

        print("\n✅ Test 2 PASSED")
        return True

    except Exception as e:
        print(f"\n✗ Test 2 FAILED: {e}")
        import traceback
        traceback.print_exc()
        return False
    finally:
        # Cleanup
        shutil.rmtree(temp_dir, ignore_errors=True)
|
||||
|
||||
|
||||
async def test_disabled_journal():
    """Test that disabled journal doesn't write anything.

    Creates a journal with ``journal_enabled`` set to False, logs one
    message and fails if the journal file exists afterwards. The
    journal is closed on every path once it has been initialized, and
    the temporary directory is always removed.

    Returns:
        True if the test passed, False otherwise.
    """
    print("\n" + "="*60)
    print("Test 3: Disabled Journal")
    print("="*60)

    temp_dir = tempfile.mkdtemp(prefix="journal_test_")
    print(f"Using temp directory: {temp_dir}")

    try:
        config = {
            'journal_enabled': False,
            'journal_dir': temp_dir,
            'journal_file': 'test.journal'
        }

        journal = MessageJournal(config)
        await journal.initialize()

        try:
            # Try to log a message
            msg = {'ID': 'HTB', 'name': 'testhost'}
            await journal.log_message(msg, ('192.168.1.100', 50000), 1000.0)

            # Check that no file was created
            journal_path = Path(temp_dir) / 'test.journal'
            if journal_path.exists():
                print("✗ Journal file was created despite being disabled")
                return False
            print("✓ No journal file created (as expected)")
        finally:
            # Close on the early-return and error paths too.
            await journal.close()

        print("\n✅ Test 3 PASSED")
        return True

    except Exception as e:
        print(f"\n✗ Test 3 FAILED: {e}")
        import traceback
        traceback.print_exc()
        return False
    finally:
        shutil.rmtree(temp_dir, ignore_errors=True)
|
||||
|
||||
|
||||
async def test_global_instance():
    """Test global journal instance.

    Checks that two ``get_journal`` calls return the same object, then
    logs one message through the module-level ``log_message`` helper
    and fails if the journal file does not exist afterwards. The
    journal is closed on every path once it has been initialized, and
    the temporary directory is always removed.

    Returns:
        True if the test passed, False otherwise.
    """
    print("\n" + "="*60)
    print("Test 4: Global Journal Instance")
    print("="*60)

    temp_dir = tempfile.mkdtemp(prefix="journal_test_")

    try:
        config = {
            'journal_enabled': True,
            'journal_dir': temp_dir,
            'journal_file': 'global.journal'
        }

        # Get global instance
        journal1 = get_journal(config)
        journal2 = get_journal()  # Should return same instance

        if journal1 is not journal2:
            print("✗ Global instance returns different objects")
            return False
        print("✓ Global instance returns same object")

        await journal1.initialize()

        try:
            # Log through convenience function
            from hbd.journal import log_message
            msg = {'ID': 'HTB', 'name': 'testhost'}
            await log_message(msg, ('192.168.1.100', 50000))

            journal_path = Path(temp_dir) / 'global.journal'
            if not journal_path.exists():
                print("✗ Global journal did not log message")
                return False
            print("✓ Global journal logged message")
        finally:
            # Close on the early-return and error paths too.
            await journal1.close()

        print("\n✅ Test 4 PASSED")
        return True

    except Exception as e:
        print(f"\n✗ Test 4 FAILED: {e}")
        import traceback
        traceback.print_exc()
        return False
    finally:
        shutil.rmtree(temp_dir, ignore_errors=True)
|
||||
|
||||
|
||||
async def main():
    """Run all tests.

    Awaits each journal test in order, prints a pass/fail line per
    test, and returns 0 when all of them passed or 1 otherwise.
    """
    separator = "="*60

    print("Message Journal Test Suite")
    print(separator)

    tests = [
        test_basic_logging,
        test_rotation,
        test_disabled_journal,
        test_global_instance
    ]

    # Tests run one after another; each outcome is stored in order.
    results = [await run_test() for run_test in tests]

    # Summary
    print("\n" + separator)
    print("Test Summary")
    print(separator)

    total = len(results)
    passed = sum(results)

    print(f"Passed: {passed}/{total}")

    for number, (run_test, outcome) in enumerate(zip(tests, results), start=1):
        if outcome:
            label = "✅ PASS"
        else:
            label = "❌ FAIL"
        print(f" {label} - Test {number}: {run_test.__name__}")

    if passed != total:
        print(f"\n⚠ {total - passed} test(s) failed")
        return 1

    print("\n🎉 All tests passed!")
    return 0
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # The coroutine's return value becomes the process exit status.
    raise SystemExit(asyncio.run(main()))
|
||||
+150
@@ -0,0 +1,150 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test the Nagios Runner Plugin.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
# Setup path
|
||||
import sys
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
from hbd.plugins.nagios_runner import NagiosRunnerPlugin, NAGIOS_OK, NAGIOS_WARNING
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s %(name)s %(levelname)s: %(message)s"
|
||||
)
|
||||
|
||||
async def test_nagios_runner():
    """Test Nagios runner plugin.

    Runs the plugin against four shell commands that print
    Nagios-style output and prints the status, output and remaining
    per-command keys for each. If a Nagios plugin directory with a
    ``check_users`` executable is found, that plugin is run through a
    second runner instance as well. Both runner instances are cleaned
    up even when a step raises.
    """
    print("=" * 70)
    print("Testing Nagios Runner Plugin")
    print("=" * 70)

    # Create test configuration with simple shell commands
    # These mimic Nagios plugin output format
    config = {
        "interval": 60,
        "timeout": 10,
        "commands": [
            {
                "name": "check_uptime",
                "command": "echo 'OK - uptime is 5 days | uptime=432000s;;;0'"
            },
            {
                "name": "check_memory",
                "command": "echo 'OK - Memory usage 45% | memory=45%;80;90;0;100'"
            },
            {
                "name": "check_cpu",
                "command": "echo 'WARNING - CPU load high | load1=5.2;5.0;10.0;0 load5=4.8;4.0;8.0;0 load15=3.2;3.0;6.0;0' && exit 1"
            },
            {
                "name": "check_disk",
                "command": "echo 'OK - Disk usage 62% | /=62%;80;90;0;100 /home=45%;80;90;0;100'"
            }
        ]
    }

    print("\n1. Creating Nagios Runner plugin with test configuration")
    print(f" Configured {len(config['commands'])} test commands")

    plugin = NagiosRunnerPlugin(config)

    print("\n2. Initializing plugin...")
    initialized = await plugin.initialize()
    print(f" Initialized: {initialized}")

    if not initialized:
        print(" ERROR: Plugin failed to initialize!")
        return

    try:
        print("\n3. Collecting metrics from Nagios plugins...")
        data = await plugin.collect()

        print(f" ✓ Collected {len(data)} data points")

        print("\n4. Results:")
        print(f" Overall Status: {data.get('overall_status')} (code: {data.get('overall_status_code')})")
        print(f" Plugins Executed: {data.get('plugin_count')}")

        # Show individual plugin results
        print("\n5. Individual Plugin Results:")
        for cmd_config in config["commands"]:
            name = cmd_config["name"]
            status = data.get(f"{name}_status", "N/A")
            status_code = data.get(f"{name}_status_code", "N/A")
            output = data.get(f"{name}_output", "N/A")

            print(f"\n {name}:")
            print(f" Status: {status} (code: {status_code})")
            print(f" Output: {output}")

            # Show performance data if present
            prefix = f"{name}_"
            reserved = {f"{name}_status", f"{name}_status_code", f"{name}_output"}
            perf_keys = [k for k in data if k.startswith(prefix) and k not in reserved]
            if perf_keys:
                print(" Performance Data:")
                for key in perf_keys:
                    # Strip only the leading prefix; str.replace would also
                    # remove later occurrences inside the metric name.
                    print(f" {key[len(prefix):]}: {data[key]}")

        print("\n6. Testing Nagios plugin detection (if available)...")

        # Try to find actual Nagios plugins on the system
        common_nagios_paths = [
            "/usr/lib/nagios/plugins",
            "/usr/local/nagios/libexec",
            "/usr/lib64/nagios/plugins",
            "/usr/local/libexec/nagios",  # FreeBSD ports/packages location
        ]

        nagios_plugin_dir = None
        for path in common_nagios_paths:
            if Path(path).exists():
                nagios_plugin_dir = Path(path)
                print(f" ✓ Found Nagios plugins at: {nagios_plugin_dir}")
                break

        if nagios_plugin_dir:
            # Try check_users if it exists
            check_users = nagios_plugin_dir / "check_users"
            if check_users.exists():
                print("\n Testing real Nagios plugin: check_users")
                real_config = {
                    "commands": [{
                        "name": "users",
                        "command": f"{check_users} -w 10 -c 20"
                    }]
                }
                real_plugin = NagiosRunnerPlugin(real_config)
                await real_plugin.initialize()
                try:
                    real_data = await real_plugin.collect()

                    print(f" Status: {real_data.get('users_status')}")
                    print(f" Output: {real_data.get('users_output')}")

                    # Show any performance data
                    for key in real_data:
                        if key.startswith("users_") and "status" not in key and "output" not in key:
                            print(f" {key}: {real_data[key]}")
                finally:
                    # The second runner instance needs its own cleanup.
                    await real_plugin.cleanup()
            else:
                print(f" check_users not found at {check_users}")
        else:
            print(" No Nagios plugins directory found")
            print(" Install nagios-plugins to test with real plugins:")
            print(" sudo apt-get install nagios-plugins # Debian/Ubuntu")
            print(" sudo yum install nagios-plugins-all # RHEL/CentOS")
    finally:
        print("\n7. Cleanup...")
        await plugin.cleanup()
        print(" ✓ Cleanup complete")

    print("\n" + "=" * 70)
    print("Test complete!")
    print("=" * 70)
|
||||
|
||||
if __name__ == "__main__":
    # Build the coroutine first, then run it to completion.
    suite = test_nagios_runner()
    asyncio.run(suite)
|
||||
+119
@@ -0,0 +1,119 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test script for plugin system.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
# Setup path
|
||||
import sys
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
from hbd.plugin import PluginRegistry, PluginLoader
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.DEBUG,
|
||||
format="%(asctime)s %(name)s %(levelname)s: %(message)s"
|
||||
)
|
||||
|
||||
async def _collect_and_report(plugins, max_fields):
    """Collect once from each plugin and print a short preview of the result.

    Args:
        plugins: Iterable of plugin objects exposing ``name`` and an
            awaitable ``collect()`` whose result supports ``len()`` and
            ``.items()``.
        max_fields: How many leading fields to print before summarising the
            remainder as a count.
    """
    for plugin in plugins:
        print(f"\n Collecting {plugin.name}...")
        try:
            data = await plugin.collect()
            print(f" ✓ Success! Got {len(data)} fields")
            for key, value in list(data.items())[:max_fields]:
                print(f" {key}: {value}")
            if len(data) > max_fields:
                print(f" ... and {len(data) - max_fields} more fields")
        except Exception as e:
            # Deliberately broad: this is a smoke test, and one failing
            # plugin must not stop the remaining plugins from being tried.
            print(f" ✗ Error: {e}")


async def test_plugins():
    """Smoke-test plugin loading, collection and protocol round-tripping.

    Loads every plugin found under ``hbd/plugins`` next to this script, lists
    them, collects once from each InfoPlugin and MonitorPlugin, round-trips a
    sample payload through ``dicttos``/``stodict``, and finally unloads the
    plugins. Results are printed; nothing is returned.
    """
    print("=" * 60)
    print("Testing Plugin System")
    print("=" * 60)

    # Create registry and loader
    registry = PluginRegistry()
    loader = PluginLoader(registry)

    # Load plugins
    plugin_dir = Path(__file__).parent / "hbd" / "plugins"
    print(f"\n1. Loading plugins from: {plugin_dir}")

    if not plugin_dir.exists():
        print(" ERROR: Plugin directory does not exist!")
        return

    count = await loader.load_from_directory(plugin_dir)
    print(f" Loaded {count} plugins")

    # From here on plugins are loaded, so guarantee they are unloaded even if
    # a later step raises (e.g. an import or encoding failure).
    try:
        # List loaded plugins
        print("\n2. Loaded plugins:")
        for plugin in registry.get_all():
            print(f" - {plugin.name} v{plugin.version} ({plugin.__class__.__name__})")
            print(f" Description: {plugin.description}")
            print(f" Interval: {plugin.interval}s")

        # Test InfoPlugins
        print("\n3. Testing InfoPlugins (collect once):")
        from hbd.plugin import InfoPlugin

        await _collect_and_report(registry.get_by_type(InfoPlugin), 5)

        # Test MonitorPlugins
        print("\n4. Testing MonitorPlugins (periodic collection):")
        from hbd.plugin import MonitorPlugin

        await _collect_and_report(registry.get_by_type(MonitorPlugin), 8)

        # Test protocol encoding
        print("\n5. Testing protocol encoding:")
        from hbd.proto import dicttos, stodict

        # Sample payload mixing scalars, a list and a nested dict.
        test_data = {
            "plugin": "test_plugin",
            "cpu_percent": 42.5,
            "memory_mb": 1024,
            "processes": 156,
            "load_avg": [1.2, 0.8, 0.5],
            "disk_info": {"sda": {"used": 50, "total": 100}},
        }
        print(f" Original data: {test_data}")

        # Encode
        encoded = dicttos("PLG", test_data)
        print(f" Encoded ({len(encoded)} bytes): {encoded[:50]}...")

        # Decode
        decoded = stodict(encoded)
        print(f" Decoded: {decoded}")

        # Verify: only the message ID and the plugin name are checked.
        if decoded.get("ID") == "PLG" and decoded.get("plugin") == "test_plugin":
            print(" ✓ Protocol encoding/decoding works!")
        else:
            print(" ✗ Protocol encoding/decoding failed!")
    finally:
        # Cleanup
        print("\n6. Cleaning up...")
        await loader.unload_all()
        print(" ✓ Cleanup complete")

    print("\n" + "=" * 60)
    print("Test complete!")
    print("=" * 60)
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: drive the async plugin smoke test to completion.
    asyncio.run(test_plugins())
|
||||
@@ -0,0 +1,60 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Debug plugin loading.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
# Setup path
|
||||
import sys
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
from hbd.plugin import PluginRegistry, PluginLoader
|
||||
|
||||
# Emit everything down to DEBUG so import/initialisation problems are visible.
logging.basicConfig(
    format="%(asctime)s %(name)s %(levelname)s: %(message)s",
    level=logging.DEBUG,
)
|
||||
|
||||
async def test_manual_load():
    """Import two plugins directly, initialise them and print what they collect.

    Bypasses the plugin loader: the OS-info and CPU-monitor plugin classes are
    imported by module path, instantiated, initialised, and (when
    initialisation reports success) asked to collect once.
    """
    print("Testing manual plugin import...")

    # Direct imports, no loader/registry involved.
    from hbd.plugins.os_info import OSInfoPlugin
    from hbd.plugins.cpu_monitor import CPUMonitorPlugin

    os_info = OSInfoPlugin()
    cpu_monitor = CPUMonitorPlugin()

    print(f"OS Plugin: {os_info.name} v{os_info.version}")
    print(f"CPU Plugin: {cpu_monitor.name} v{cpu_monitor.version}")

    print("\nInitializing plugins...")
    os_ready = await os_info.initialize()
    cpu_ready = await cpu_monitor.initialize()

    print(f"OS plugin initialized: {os_ready}")
    print(f"CPU plugin initialized: {cpu_ready}")

    if os_ready:
        print("\nCollecting OS info...")
        os_fields = await os_info.collect()
        print(f"Got {len(os_fields)} fields:")
        # Only the first ten fields are shown for the OS plugin.
        preview = list(os_fields.items())[:10]
        for name, value in preview:
            print(f" {name}: {value}")

    if cpu_ready:
        print("\nCollecting CPU info...")
        cpu_fields = await cpu_monitor.collect()
        print(f"Got {len(cpu_fields)} fields:")
        # The CPU plugin output is printed in full.
        for name, value in cpu_fields.items():
            print(f" {name}: {value}")
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run the manual-import debug routine to completion.
    asyncio.run(test_manual_load())
|
||||
@@ -0,0 +1,495 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test suite for the threshold checking and alerting system.
|
||||
|
||||
Tests cover:
|
||||
- Threshold configuration parsing
|
||||
- Threshold evaluation (all operators)
|
||||
- Hysteresis functionality
|
||||
- Alert state tracking
|
||||
- State change detection
|
||||
- Notification triggering
|
||||
- Re-notification logic
|
||||
"""
|
||||
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
# Add parent directory to path for imports
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from hbd.threshold import (
|
||||
ThresholdChecker,
|
||||
ThresholdConfig,
|
||||
AlertLevel,
|
||||
AlertState,
|
||||
ComparisonOperator,
|
||||
)
|
||||
|
||||
|
||||
def test_threshold_config_basic():
    """Check OK / WARNING / CRITICAL classification for a ``>`` threshold."""
    print("Test 1: Basic threshold configuration...")

    config = ThresholdConfig(
        metric_path="cpu_monitor.cpu_percent",
        warning=80.0,
        critical=90.0,
        operator=">",
    )

    # (reading, expected level, label used in the failure message)
    cases = (
        (50.0, AlertLevel.OK, "OK"),                # below warning
        (80.0, AlertLevel.OK, "OK at boundary"),    # exactly at warning; ">" is strict
        (85.0, AlertLevel.WARNING, "WARNING"),      # between warning and critical
        (95.0, AlertLevel.CRITICAL, "CRITICAL"),    # above critical
    )
    for reading, expected, label in cases:
        result = config.evaluate(reading)
        assert result == expected, f"Expected {label}, got {result}"

    print(" ✓ Basic threshold configuration works")
|
||||
|
||||
|
||||
def test_threshold_operators():
    """Exercise the ``>``, ``<``, ``>=`` and ``<=`` comparison operators."""
    print("\nTest 2: Comparison operators...")

    # ">" : alert once the reading exceeds the threshold.
    above = ThresholdConfig(
        metric_path="test.metric",
        warning=80.0,
        critical=90.0,
        operator=">",
    )
    for reading, expected in ((85.0, AlertLevel.WARNING), (75.0, AlertLevel.OK)):
        assert above.evaluate(reading) == expected

    # "<" : inverse thresholds such as available memory, where low is bad.
    below = ThresholdConfig(
        metric_path="memory.available_mb",
        warning=1000,
        critical=500,
        operator="<",
    )
    inverse_cases = (
        (800, AlertLevel.WARNING, "Should warn when below 1000"),
        (400, AlertLevel.CRITICAL, "Should be critical when below 500"),
        (1500, AlertLevel.OK, "Should be OK when above 1000"),
    )
    for reading, expected, why in inverse_cases:
        assert below.evaluate(reading) == expected, why

    # ">=" : the boundary value itself already warns.
    at_or_above = ThresholdConfig(
        metric_path="test.metric",
        warning=80.0,
        operator=">=",
    )
    for reading, expected in ((80.0, AlertLevel.WARNING), (79.9, AlertLevel.OK)):
        assert at_or_above.evaluate(reading) == expected

    # "<=" : the boundary value itself already warns.
    at_or_below = ThresholdConfig(
        metric_path="test.metric",
        warning=20.0,
        operator="<=",
    )
    for reading, expected in ((20.0, AlertLevel.WARNING), (20.1, AlertLevel.OK)):
        assert at_or_below.evaluate(reading) == expected

    print(" ✓ All comparison operators work correctly")
|
||||
|
||||
|
||||
def test_hysteresis():
    """Check that hysteresis keeps an alert from flapping around a threshold.

    With warning=80, critical=90 and 10% hysteresis, the test expects recovery
    from WARNING only below 80 - 8 = 72, and recovery from CRITICAL only below
    90 - 9 = 81.
    """
    print("\nTest 3: Hysteresis...")

    config = ThresholdConfig(
        metric_path="cpu_monitor.cpu_percent",
        warning=80.0,
        critical=90.0,
        operator=">",
        hysteresis=0.1,  # 10% hysteresis
    )

    ok, warning, critical = AlertLevel.OK, AlertLevel.WARNING, AlertLevel.CRITICAL

    # (reading, level we are currently in, expected new level, failure message)
    steps = (
        # OK -> WARNING on crossing 80.
        (85.0, ok, warning, "Should enter WARNING state"),
        # 77 is under 80 but not under 72, so WARNING is held.
        (77.0, warning, warning, "Should stay in WARNING (hysteresis)"),
        # 70 is under 72: recovered.
        (70.0, warning, ok, "Should recover to OK"),
        # Escalation is immediate.
        (95.0, warning, critical, "Should escalate to CRITICAL"),
        # 85 is under 90 but not under 81, so CRITICAL is held.
        (85.0, critical, critical, "Should stay CRITICAL (hysteresis, still above 81)"),
        # 75 is under 81 and also under the warning threshold.
        (75.0, critical, ok, "Should drop to OK (below warning threshold)"),
        # Straight from OK to CRITICAL.
        (95.0, ok, critical, "Should go to CRITICAL"),
        # 82 is between warning and critical but still inside the hysteresis band.
        (82.0, critical, critical, "Should stay CRITICAL (in hysteresis)"),
        # 80.5 is under 81 yet above the warning threshold of 80.
        (80.5, critical, warning, "Should drop to WARNING when below critical hysteresis"),
    )
    for reading, current, expected, why in steps:
        result = config.evaluate_with_hysteresis(reading, current)
        assert result == expected, why

    print(" ✓ Hysteresis prevents flapping")
|
||||
|
||||
|
||||
def test_alert_state():
    """Verify that AlertState tracks level and value and reports state changes.

    ``AlertState.update(level, value)`` is expected to return a truthy value
    when the alert level changes (entering, escalating or recovering) and a
    falsy value when only the reading changes.
    """
    print("\nTest 4: Alert state tracking...")

    alert = AlertState("cpu_monitor.cpu_percent")

    # Initial state: OK, nothing notified yet.
    assert alert.level == AlertLevel.OK
    assert alert.notification_count == 0

    # Update to WARNING - should trigger notification
    changed = alert.update(AlertLevel.WARNING, 85.0)
    assert changed, "State change should return True"
    assert alert.level == AlertLevel.WARNING
    assert alert.last_value == 85.0

    # Update with same level - no notification, but the reading is recorded
    changed = alert.update(AlertLevel.WARNING, 86.0)
    assert not changed, "No state change should return False"
    assert alert.last_value == 86.0

    # Escalate to CRITICAL
    changed = alert.update(AlertLevel.CRITICAL, 95.0)
    assert changed, "Escalation should trigger notification"
    assert alert.level == AlertLevel.CRITICAL

    # Recover to OK
    changed = alert.update(AlertLevel.OK, 50.0)
    assert changed, "Recovery should trigger notification"
    assert alert.level == AlertLevel.OK

    print(" ✓ Alert state tracking works correctly")
|
||||
|
||||
|
||||
def test_threshold_checker_parsing():
    """Check that a nested config dict is flattened into dotted metric paths."""
    print("\nTest 5: Configuration parsing...")

    cpu_rules = {
        "cpu_percent": {
            "warning": 80.0,
            "critical": 90.0,
            "operator": ">",
            "hysteresis": 0.1,
        },
        "load_1min": {"warning": 4.0, "critical": 8.0},
    }
    memory_rules = {
        "percent": {"warning": 85.0, "critical": 95.0},
        "available_mb": {"warning": 1000, "critical": 500, "operator": "<"},
    }
    disk_rules = {
        "partitions": {
            "/": {"percent": {"warning": 80.0, "critical": 90.0}},
            "/home": {"percent": {"warning": 85.0, "critical": 95.0}},
        },
    }

    checker = ThresholdChecker(
        {
            "thresholds": {
                "cpu_monitor": cpu_rules,
                "memory_monitor": memory_rules,
                "disk_monitor": disk_rules,
            }
        }
    )

    # Every configured metric must be present under its dotted path.
    expected_paths = (
        "cpu_monitor.cpu_percent",
        "cpu_monitor.load_1min",
        "memory_monitor.percent",
        "memory_monitor.available_mb",
        "disk_monitor./.percent",
        "disk_monitor./home.percent",
    )
    for metric_path in expected_paths:
        assert metric_path in checker.thresholds

    # Operator strings must have been converted to the enum members.
    expected_operators = (
        ("cpu_monitor.cpu_percent", ComparisonOperator.GT),
        ("memory_monitor.available_mb", ComparisonOperator.LT),
    )
    for metric_path, operator in expected_operators:
        assert checker.thresholds[metric_path].operator == operator

    print(f" ✓ Parsed {len(checker.thresholds)} thresholds correctly")
|
||||
|
||||
|
||||
def test_check_plugin_data():
    """Walk one host through OK -> WARNING -> CRITICAL -> recovery.

    After each check cycle the returned state changes and the messages handed
    to the notification callback are inspected.
    """
    print("\nTest 6: Plugin data checking...")

    notifications = []
    checker = ThresholdChecker(
        {
            "thresholds": {
                "cpu_monitor": {
                    "cpu_percent": {"warning": 80.0, "critical": 90.0},
                    "load_1min": {"warning": 4.0, "critical": 8.0},
                },
            }
        },
        notification_callback=lambda msg: notifications.append(msg),
    )
    alert_states = {}

    def run_cycle(cpu_percent, load_1min):
        """Feed one set of readings for testhost/cpu_monitor to the checker."""
        return checker.check_plugin_data(
            host_name="testhost",
            plugin_name="cpu_monitor",
            data={"cpu_percent": cpu_percent, "load_1min": load_1min},
            alert_states=alert_states,
        )

    # Cycle 1: everything comfortably below the warning thresholds.
    state_changes = run_cycle(50.0, 2.0)
    assert len(state_changes) == 0, "No thresholds violated, no state changes"
    assert len(notifications) == 0, "No notifications should be sent"

    # Cycle 2: CPU crosses the warning threshold, load stays fine.
    state_changes = run_cycle(85.0, 2.0)
    assert len(state_changes) == 1, "One metric should change state"
    first_change = state_changes[0]
    assert first_change[0] == "cpu_monitor.cpu_percent"
    assert first_change[2] == AlertLevel.WARNING
    assert len(notifications) == 1, "One notification should be sent"
    assert "WARNING" in notifications[0]
    assert "testhost" in notifications[0]

    # Cycle 3: both metrics go critical.
    notifications.clear()
    state_changes = run_cycle(95.0, 9.0)
    assert len(state_changes) == 2, "Two metrics should change state"
    assert len(notifications) == 2, "Two notifications should be sent"

    # Cycle 4: both metrics recover.
    notifications.clear()
    state_changes = run_cycle(50.0, 1.0)
    assert len(state_changes) == 2, "Two metrics should recover"
    assert len(notifications) == 2, "Two recovery notifications"
    assert any("RECOVERED" in n for n in notifications), "Should have recovery notification"

    print(" ✓ Plugin data checking and notifications work")
|
||||
|
||||
|
||||
def test_nested_metrics():
    """Check thresholds on nested metrics, here per-partition disk usage."""
    print("\nTest 7: Nested metrics (partitions)...")

    partition_rules = {
        "/": {"percent": {"warning": 80.0, "critical": 90.0}},
        "/home": {"percent": {"warning": 85.0, "critical": 95.0}},
    }
    config = {"thresholds": {"disk_monitor": {"partitions": partition_rules}}}

    notifications = []

    def record(m):
        notifications.append(m)

    checker = ThresholdChecker(config, notification_callback=record)
    alert_states = {}

    # "/" stays under its warning level; "/home" at 88% exceeds its 85% warning.
    readings = {
        "partitions": {
            "/": {"percent": 75.0, "free_gb": 50.0},
            "/home": {"percent": 88.0, "free_gb": 100.0},
        },
    }

    state_changes = checker.check_plugin_data(
        host_name="testhost",
        plugin_name="disk_monitor",
        data=readings,
        alert_states=alert_states,
    )

    assert len(state_changes) == 1, "One partition should trigger alert"
    only_change = state_changes[0]
    assert "/home" in only_change[0], "Should be /home partition"
    assert only_change[2] == AlertLevel.WARNING
    assert len(notifications) == 1

    print(" ✓ Nested metric checking works")
|
||||
|
||||
|
||||
def test_alert_summary():
    """Check the per-level alert counts and the list of active alerts."""
    print("\nTest 8: Alert summaries...")

    checker = ThresholdChecker(
        {
            "thresholds": {
                "cpu_monitor": {
                    "cpu_percent": {"warning": 80.0, "critical": 90.0},
                    "load_1min": {"warning": 4.0, "critical": 8.0},
                },
                "memory_monitor": {
                    "percent": {"warning": 85.0, "critical": 95.0},
                },
            }
        }
    )
    alert_states = {}

    # Seed the shared state: one WARNING (cpu_percent) and two CRITICAL
    # (load_1min, memory percent).
    samples = (
        ("cpu_monitor", {"cpu_percent": 85.0, "load_1min": 9.0}),
        ("memory_monitor", {"percent": 96.0}),
    )
    for plugin_name, plugin_data in samples:
        checker.check_plugin_data("testhost", plugin_name, plugin_data, alert_states)

    # Counts per level.
    summary = checker.get_alert_summary(alert_states)
    assert summary["warning"] == 1, "Should have 1 warning"
    assert summary["critical"] == 2, "Should have 2 critical"

    # All three non-OK alerts are expected to be reported as active.
    active = checker.get_active_alerts(alert_states)
    assert len(active) == 3, "Should have 3 active alerts"

    print(" ✓ Alert summaries work correctly")
|
||||
|
||||
|
||||
def run_all_tests():
    """Run every threshold test in order and report the outcome.

    Returns:
        0 when all tests pass, 1 as soon as one raises. The first failure
        stops the run and its traceback is printed.
    """
    banner = "=" * 70
    print(banner)
    print("THRESHOLD SYSTEM TEST SUITE")
    print(banner)

    suite = (
        test_threshold_config_basic,
        test_threshold_operators,
        test_hysteresis,
        test_alert_state,
        test_threshold_checker_parsing,
        test_check_plugin_data,
        test_nested_metrics,
        test_alert_summary,
    )

    try:
        for test in suite:
            test()
    except Exception as e:
        import traceback

        # Assertion failures are test failures; anything else is unexpected.
        if isinstance(e, AssertionError):
            print(f"\n✗ TEST FAILED: {e}")
        else:
            print(f"\n✗ UNEXPECTED ERROR: {e}")
        traceback.print_exc()
        return 1

    print("\n" + banner)
    print("✓ ALL TESTS PASSED")
    print(banner)
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: the suite's return value becomes the exit status.
    exit_status = run_all_tests()
    sys.exit(exit_status)
|
||||
@@ -44,6 +44,6 @@ def test_handle_cmd_sends_command():
|
||||
handle_datagram(msg2, ("127.0.0.1", 50000), ftr, ctx)
|
||||
# should have sent ACK and the command; last send should be non-empty
|
||||
assert len(ftr.sent) >= 1
|
||||
# the command for cver 0 will be sent as raw cmd string
|
||||
# the command for ??0 will be sent as raw cmd string
|
||||
# so at least one send contains b'doit' or similar
|
||||
assert any(b"doit" in s[0] for s in ftr.sent)
|
||||
|
||||
+1
-1
@@ -9,6 +9,6 @@ def test_parse_message_uncompressed():
|
||||
|
||||
|
||||
def test_parse_message_compressed():
|
||||
raw = dicttos("ACK", {"time": 1}, compress=True)
|
||||
raw = dicttos("ACK", {"time": 1})
|
||||
m = parse_message(raw)
|
||||
assert "ID" in m
|
||||
|
||||
Reference in New Issue
Block a user