Compare commits

..

17 Commits

Author SHA1 Message Date
Andreas Wrede 8dd002d159 version 5.0.7
Release / release (push) Failing after 1s
2026-04-04 14:45:10 -04:00
Andreas Wrede 2373b55d8b fix actions host label. cp[e woth debian flavor sed 2026-04-04 14:43:07 -04:00
Andreas Wrede 81530636ec version 5.0.6
Release / release (push) Has been cancelled
2026-04-04 13:54:19 -04:00
Andreas Wrede 190199b36d Add a release workflow 2026-04-04 13:00:17 -04:00
Andreas Wrede 73aa89f8f4 fix web page issues 2026-04-04 12:43:30 -04:00
Andreas Wrede 941f3ea4b0 display and acknowledge alerts 2026-04-03 06:35:45 -04:00
Andreas Wrede c5770006f7 hbc proper termination, hbd config reloadable 2026-04-02 07:17:00 -04:00
Andreas Wrede 84c1aef51f removal of cver 2026-04-01 20:47:29 -04:00
Andreas Wrede 460d2be9e9 Fix rtt, including bug in time compute 2026-04-01 19:41:53 -04:00
Andreas Wrede 090d341244 per-client threshold config 2026-04-01 15:22:42 -04:00
Andreas Wrede 079e84f729 display tag fro alterts, cleanup udp 2026-04-01 11:49:55 -04:00
Andreas Wrede dd23d9d163 refactor monitor, add threshold rtesting 2026-03-31 12:22:03 -04:00
Andreas Wrede ad7178ebcb Move threshhold to server, move eventlog to notify 2026-03-29 20:29:33 -04:00
Andreas Wrede 0543266c92 Major refactoring of the codebase, including restructuring of files and directories, renaming of modules and classes, and improvements to the overall organization and readability of the code. This refactoring aims to enhance maintainability, scalability, and clarity of the codebase while preserving existing functionality. The changes include:
- Restructuring of the project directory into client and server components
- Renaming of modules and classes to better reflect their purpose and functionality
- Moving common utilities and configurations to a shared location
- Updating import statements to reflect the new structure
- Adding new documentation files for better clarity on various aspects of the project
- Removing deprecated or unused code to streamline the codebase
- Ensuring that all existing functionality is preserved and that the codebase remains functional after the refactoring.
2026-03-29 11:13:40 -04:00
Andreas Wrede 7e2038ecac version 5.0.5 2026-02-16 10:54:16 -05:00
Andreas Wrede 75e41eafc4 remove redundent code 2026-02-16 10:54:04 -05:00
Andreas Wrede 73b9d05357 fix letter of daemon option 2026-02-13 21:05:42 -05:00
80 changed files with 16147 additions and 2673 deletions
+38
View File
@@ -0,0 +1,38 @@
name: Release
on:
push:
tags:
- 'v*'
jobs:
release:
runs-on: FreeBSD
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.11'
- name: Install build tools
run: |
python -m pip install --upgrade pip
pip install build twine
- name: Build package
run: python -m build
- name: Extract version from tag
id: get_version
run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_OUTPUT
- name: Create release
uses: actions/gitea-release-action@v1
with:
files: |
dist/*.whl
dist/*.tar.gz
title: "Release ${{ steps.get_version.outputs.VERSION }}"
body: "Release version ${{ steps.get_version.outputs.VERSION }}"
+239 -29
View File
@@ -2,43 +2,253 @@
hb_port: 50003
hbd_host: ''
#logfile: "/home/andreas/public_html/messages/andreas"
logfile: "/Users/andreas/public_html/messages/andreas"
logfile: "/home/andreas/logs/heartbeat/heartbeat.log"
#logfile: "/Users/andreas/public_html/messages/andreas"
logfmt: "msg"
grace: 40
interval: 10
watchhosts:
# "localhost":
# "haschloss" :
# "cotgate":
"wentworth":
notify: +4915123456789
src: "signal"
"y":
notify: +4915123456789
src: "signal"
"winter":
notify: +14168226179
src: "signal"
dyndnshosts: {"haschloss", "wayback", "wertvoll", "weekend", "cotgate", "rvgate", "draper", "eris"}
# Notification Channels - Define notification providers centrally
# Each channel has a type (pushover, email, signal, mattermost) and type-specific configuration
notification_channels:
pushover_standard:
type: pushover
token: ac7NLX2rPjXFareeDgLpXNoDf4iFmf
user: uDhH33UjQQDYtNzJb1ThRiWb9ingGK
signal_andreas:
type: signal
cli_path: /usr/local/bin/signal-cli
user: +14168226179
recipient: +14168226179
email_andreas:
type: email
recipients: [aew.hbd.notify@wrede.ca]
sender: aew.hbd@wrede.ca
smtp_server: smtp.fastmail.com
smtp_port: 587
smtp_user: andreas@wrede.ca
smtp_password: pvtvefyp5gbhnch2
# Example additional channels (commented out)
# pushover_urgent:
# type: pushover
# token: your-app-token
# user: your-user-key
#
mattermost_devops:
type: mattermost
host: mattermost.example.com
token: webhook-token
channel: devops-alerts
username: heartbeat-bot
icon: https://example.com/heartbeat-icon.png
# Default notification channels (used if host doesn't specify channels)
default_notification_channels: [pushover_standard]
# Host definitions - combines threshold mapping, watch status, DNS updates, and notifications
hosts:
wentworth:
threshold_config: default
watch: true
notification_channels: [pushover_standard]
dyndns: false
y:
threshold_config: default
watch: true
notification_channels: [pushover_standard]
dyndns: false
winter:
threshold_config: default
watch: true
notification_channels: [pushover_standard]
dyndns: false
wally:
threshold_config: freebsd_server
watch: false
notification_channels: [pushover_standard]
dyndns: false
eris:
threshold_config: truenas_server
watch: false
notification_channels: [pushover_standard]
dyndns: false
haschloss:
threshold_config: default
watch: false
dyndns: true
wayback:
threshold_config: default
watch: false
notification_channels: [pushover_standard]
dyndns: true
wertvoll:
threshold_config: default
watch: false
notification_channels: [pushover_standard]
dyndns: true
weekend:
threshold_config: freebsd_server
watch: false
notification_channels: [pushover_standard]
dyndns: true
cotgate:
threshold_config: default
watch: false
dyndns: true
rvgate:
threshold_config: default
watch: false
dyndns: true
draper:
threshold_config: default
watch: false
notification_channels: [pushover_standard]
dyndns: true
# Hosts to drop/ignore
drophosts: {"unknown", "wookie15", "wort"}
nsupdate_bin: "/usr/local/bin/nsupdate"
pushover_token: "ac7NLX2rPjXFareeDgLpXNoDf4iFmf"
pushover_user: "uDhH33UjQQDYtNzJb1ThRiWb9ingGK"
pushsrv: "pushover"
dyndomains: {"wrede.org"}
toemail: ["aew.hbd.notify@wrede.ca"]
fromemail: "aew.hbd@wrede.ca"
smtpserver: "smtp.fastmail.com"
smtpuser: "andreas@wrede.ca"
smtppassword: "r8psra6wj6gcakkp"
smtpport: 587
ws_port: 50005
wss_port: 50006
cert_path: "/usr/local/etc/letsencrypt/live/hbd.wrede.ca/"
cert_path: "ssl/"
# wss_port: 50006 # Commented out - use plain WebSocket instead of secure WSS
# cert_path: "/usr/local/etc/letsencrypt/live/hbd.wrede.ca/"
# cert_path: "test/"
# CERT_PATH = "./test/"
wss_pem: "fullchain.pem"
wss_key: "privkey.pem"
# wss_pem: "fullchain.pem"
# wss_key: "privkey.pem"
journal_enabled: true # Enable/disable journaling
journal_dir: /home/andreas/logs/heartbeat # Journal directory
journal_file: messages.journal # Base filename
journal_max_size: 104857600 # Max size (100MB default)
journal_max_backups: 10 # Number of backups to keep
threshold_configs:
default:
thresholds:
cpu_monitor:
cpu_percent:
warning: 80.0
critical: 90.0
memory_monitor:
percent:
warning: 85.0
critical: 95.0
disk_monitor:
partitions:
/:
percent:
warning: 85.0
critical: 90.0
rtt:
warning: 50
critical: 250.0
freebsd_server:
thresholds:
cpu_monitor:
cpu_percent:
warning: 80.0
critical: 90.0
memory_monitor:
memory_percent:
warning: 97.0
critical: 100.0
disk_monitor:
partitions:
/:
percent:
warning: 85.0
critical: 90.0
nagios_runner:
# overall_status_code:
# warning: 1
# critical: 2
# operator: ">="
load_status:
warning: WARNING
critical: CRITICAL
operator: "=="
ups_load:
display: "load to high: {ups_output}"
warning: 70
critical: 80
operator: ">="
ups_status_code:
display: "{ups_output}"
warning: 1
critical: 2
operator: ">="
nextcloud_apps_status_code:
display: "{nextcloud_apps_output}"
warning: 1
critical: 2
operator: ">="
rtt:
warning: 50
critical: 250.0
truenas_server:
thresholds:
cpu_monitor:
cpu_percent:
warning: 80.0
critical: 90.0
memory_monitor:
percent:
warning: 3.0
critical: 95.0
disk_monitor:
partitions:
/:
percent:
warning: 85.0
critical: 90.0
nagios_runner:
# overall_status_code:
# warning: 1
# critical: 2
# operator: ">="
load_status:
warning: WARNING
critical: CRITICAL
operator: "=="
ups_load:
display: "load to high: {ups_output}"
WARNING: 70
CRITICAL: 80
OPERATOR: ">="
ups_status_code:
DISPLAY: "{ups_output}"
warning: 1
critical: 2
operator: ">="
nextcloud_apps_status_code:
display: "{nextcloud_apps_output}"
warning: 1
critical: 2
operator: ">="
rtt:
warning: 120
critical: 250.0
+3 -3
View File
@@ -8,8 +8,8 @@
"name": "Python: Run hbd (module)",
"type": "debugpy",
"request": "launch",
"module": "hbd.cli",
"args": ["-c", ".hb.yaml", "-f", "-v", "-x", "-x", "-x"],
"module": "hbd.server.cli",
"args": ["-c", "/home/andreas/git/heartbeat/.hb.yaml", "-f", "-v", "-x", "-x", "-x", "-x"],
"cwd": "${workspaceFolder}",
"env": {
"PYTHONPATH": "${workspaceFolder}"
@@ -32,7 +32,7 @@
"type": "debugpy",
"request": "launch",
"module": "debugpy",
"args": ["--listen", "5678", "--wait-for-client", "-m", "hbd.cli", "-c", ".hb.yaml", "-f", "-v"],
"args": ["--listen", "5678", "--wait-for-client", "-m", "hbd.server.cli", "-c", ".hb.yaml", "-f", "-v"],
"env": { "PYTHONPATH": "${workspaceFolder}" },
"console": "integratedTerminal",
"justMyCode": false
+4 -1
View File
@@ -2,5 +2,8 @@
"python.pythonPath": "/usr/bin/python3",
"python.linting.enabled": true,
"python.formatting.provider": "black",
"python.linting.flake8Enabled": true
"python.linting.flake8Enabled": true,
"chat.tools.terminal.autoApprove": {
"mv": true
}
}
+373 -1
View File
@@ -11,10 +11,338 @@ A lightweight daemon that listens for UDP heartbeat messages and acts on them: k
- Queue DNS updates via `nsupdate` and run them in a background thread ✅
- WebSocket API for live updates (hosts & messages) ✅
- Notification pipeline (email, Pushover, Mattermost, Signal) ✅
- **HTTP API & Web UI** ✅
- REST API for plugin data, alerts, and host information
- Live dashboard with WebSocket updates
- Interactive plugin metrics visualization
- Alerts dashboard with filtering and summaries
- **Message journal with automatic log rotation** ✅
- Logs all received messages in JSON format
- Size-based automatic rotation
- Configurable retention and backup management
- **Plugin system for extensible monitoring** ✅
- Collect system metrics (CPU, memory, disk, network)
- Execute existing Nagios monitoring plugins
- Create custom plugins with simple Python classes
- **Threshold alerting system** ✅
- Monitor metrics against configurable WARNING/CRITICAL thresholds
- Hysteresis to prevent alert flapping
- Automatic notifications on state changes
- Re-notification for ongoing alerts
- Modular codebase suitable for unit testing and CI ✅
---
## 🔌 Plugin System
Heartbeat includes a comprehensive plugin architecture that extends monitoring beyond simple heartbeats. The plugin system allows you to:
- **Collect system information**: OS details, hardware info, system configuration
- **Monitor resources**: CPU usage, memory, disk space, network statistics
- **Run Nagios plugins**: Execute thousands of existing Nagios monitoring plugins without modification
- **Create custom plugins**: Build your own monitoring logic with simple Python classes
### Plugin Types
- **InfoPlugin**: Collects static information once (e.g., OS version, hardware specs)
- **MonitorPlugin**: Collects metrics periodically (e.g., CPU usage every 30 seconds)
### Built-in Plugins
- `os_info`: Collects OS, kernel, distribution, and architecture information
- `cpu_monitor`: Monitors CPU usage, load average, frequency, and process counts
- `memory_monitor`: Monitors RAM and swap usage, available memory
- `disk_monitor`: Monitors disk usage, I/O statistics, and filesystem metrics
- `network_monitor`: Monitors network interface statistics, bandwidth, and connections
- `filesystem_info`: Collects mounted filesystem information (physical filesystems only by default)
- `nagios_runner`: Executes Nagios monitoring plugins (check_disk, check_load, check_http, etc.)
### Nagios Integration
The `nagios_runner` plugin provides seamless integration with the vast Nagios plugin ecosystem. You can run any Nagios-compatible plugin and have the results automatically parsed and stored:
- Executes plugins via subprocess with timeout protection
- Parses exit codes (OK/WARNING/CRITICAL/UNKNOWN)
- Extracts performance data with thresholds
- Reports aggregated status across all configured checks
See [docs/NAGIOS_INTEGRATION.md](docs/NAGIOS_INTEGRATION.md) for complete integration guide including configuration examples and custom plugin development.
### Creating Custom Plugins
```python
from hbd.plugin import MonitorPlugin
class DiskMonitorPlugin(MonitorPlugin):
name = "disk_monitor"
interval = 60 # Run every 60 seconds
async def collect(self):
return {
"disk_usage": get_disk_usage(),
"timestamp": time.time()
}
```
Place plugins in `hbd/plugins/` and they'll be automatically discovered and loaded by the client.
---
## 📝 Message Journal
Heartbeat includes a message journal that logs all received messages with automatic rotation.
### Features
- **JSON Format**: All messages logged in JSONL (JSON Lines) format for easy parsing
- **Automatic Rotation**: Size-based rotation with configurable thresholds
- **Backup Management**: Keeps configurable number of rotated log files
- **Non-blocking**: Async logging with minimal performance impact
### Configuration
```yaml
# Message journal settings
journal_enabled: true # Enable/disable journaling
journal_dir: /var/log/heartbeat # Journal directory
journal_file: messages.journal # Base filename
journal_max_size: 104857600 # Max size (100MB default)
journal_max_backups: 10 # Number of backups to keep
```
### Example Journal Entry
```json
{"timestamp":1711234567.123,"datetime":"2026-03-28T12:34:56","source_ip":"192.168.1.100","source_port":50003,"message":{"ID":"HTB","name":"webserver1","interval":30}}
```
### Analyzing Journal Files
```bash
# View recent messages
tail -100 /var/log/heartbeat/messages.journal | jq .
# Count messages by type
cat /var/log/heartbeat/messages.journal | jq -r '.message.ID' | sort | uniq -c
# Filter by hostname
cat /var/log/heartbeat/messages.journal | jq 'select(.message.name == "webserver1")'
```
See [docs/MESSAGE_JOURNAL.md](docs/MESSAGE_JOURNAL.md) for complete documentation including rotation behavior, integration with log management systems, and analysis examples.
---
## 🚨 Threshold Alerting
Heartbeat includes a sophisticated threshold alerting system that monitors plugin metrics and triggers notifications when values exceed configured limits.
### Features
- **Multi-level alerts**: WARNING and CRITICAL severity levels
- **Flexible operators**: Support for >, >=, <, <=, ==, != comparisons
- **Hysteresis**: Prevents alert flapping with configurable recovery thresholds
- **Smart notifications**: Alerts only on state changes, not every check
- **Re-notifications**: Periodic reminders for ongoing alerts
- **Journal integration**: All threshold events logged for audit trail
### Configuration
```yaml
thresholds:
# RTT (Round-Trip Time) thresholds for heartbeat monitoring
# These are checked on every HTB message arrival
rtt:
webserver01:
warning: 100.0 # Warn when RTT > 100ms
critical: 500.0 # Critical when RTT > 500ms
database01:
warning: 50.0
critical: 200.0
# Plugin metric thresholds
cpu_monitor:
cpu_percent:
warning: 80.0 # Warn when CPU > 80%
critical: 90.0 # Critical when CPU > 90%
operator: ">"
hysteresis: 0.1 # 10% hysteresis to prevent flapping
memory_monitor:
percent:
warning: 85.0
critical: 95.0
disk_monitor:
partitions:
/:
percent:
warning: 80.0
critical: 90.0
free_gb:
warning: 10.0 # Alert when < 10GB free
critical: 5.0
operator: "<" # Inverse threshold
# Global settings
threshold_renotify_interval: 3600 # Re-notify every hour for ongoing alerts
```
### RTT Monitoring
Heartbeat monitors network latency (Round-Trip Time) for each host's heartbeat messages. RTT thresholds are **fully integrated with the threshold alerting system**:
- **Per-host configuration**: Set different thresholds for each monitored host
- **Real-time checking**: Thresholds evaluated on every HTB message arrival
- **Alert state tracking**: RTT alerts use the same state management as plugin metrics
- **Hysteresis support**: Configurable hysteresis prevents rapid state transitions
- **Alerts dashboard**: RTT alerts visible on the `/alerts` web page alongside plugin alerts
- **Smart notifications**: Only triggers on state changes (OK → WARNING → CRITICAL)
- **Re-notification**: Periodic reminders for ongoing RTT issues
- **Event & journal logging**: All RTT events logged for audit trail
**Configuration format:**
```yaml
thresholds:
rtt:
<hostname>:
warning: <milliseconds> # Warn when RTT > this value
critical: <milliseconds> # Critical when RTT > this value
hysteresis: 0.1 # Optional: 10% hysteresis (default)
```
**Example alerts:**
```
WARNING: webserver01 - rtt.webserver01 = 125.3
CRITICAL: database01 - rtt.database01 = 520.1
RECOVERED: webserver01 - rtt.webserver01 = 45.2 (WARNING -> OK)
```
RTT alerts appear on the Alerts dashboard and can be filtered by severity level. The `metric_path` format is `rtt.<hostname>`, making it easy to distinguish from plugin metrics.
### Alert Behavior
1. **State Changes**: Notifications sent when crossing thresholds
- OK → WARNING: Early notification
- WARNING → CRITICAL: Escalation
- CRITICAL → OK: Recovery
2. **Hysteresis**: Prevents rapid state transitions
```
Critical threshold: 90%
Hysteresis: 10%
Recovery threshold: 81% (90 - 10% of 90)
Value 91% → CRITICAL (threshold crossed)
Value 85% → CRITICAL (still above 81%)
Value 79% → OK (below recovery threshold)
```
3. **Re-notifications**: Periodic reminders for ongoing alerts
- Default: Every 60 minutes
- Configurable via `threshold_renotify_interval`
### Example Notifications
```
WARNING: webserver01 - cpu_monitor.cpu_percent = 85.0
CRITICAL: webserver01 - memory_monitor.percent = 96.0
RECOVERED: database01 - disk_monitor./.percent = 75.0 (WARNING -> OK)
REMINDER (CRITICAL): mailserver - cpu_monitor.load_1min = 12.5 (ongoing for 3600s)
```
### Supported Metrics
All plugin metrics can be thresholded:
- **CPU**: cpu_percent, load_1min, load_5min, load_15min
- **Memory**: percent, available_mb, swap_percent
- **Disk**: Per-partition percent, free_gb, free_mb
- **Network**: errors_total, dropped packets, connection counts
- **Nagios**: exit_code mapping (0=OK, 1=WARNING, 2=CRITICAL)
See [docs/THRESHOLD_ALERTING.md](docs/THRESHOLD_ALERTING.md) for comprehensive documentation including best practices, troubleshooting, and advanced configuration.
---
## 🌐 HTTP API & Web UI
Heartbeat includes a built-in HTTP/WebSocket server that provides both a REST API and web-based dashboards for monitoring and visualization.
### Features
- **REST API**: JSON endpoints for accessing plugin data, alerts, and host information
- **Live Dashboard**: Real-time WebSocket-powered host status view
- **Plugin Metrics**: Interactive visualization of all plugin data with auto-refresh
- **Alerts Dashboard**: Comprehensive alert monitoring with filtering and summaries
- **CORS Support**: Configurable for integration with external applications
### Web Dashboards
- **Live View** (`/live`): Real-time host connectivity, latency, and messages
- **Plugin Metrics** (`/plugins`): Browse and visualize metrics from all plugins
- **Alerts Dashboard** (`/alerts`): Monitor active alerts with severity filtering
### API Endpoints
```bash
# List all monitored hosts
curl http://localhost:50004/api/0/hosts
# Get all plugin data for a host
curl http://localhost:50004/api/0/hosts/webserver01/plugins
# Get detailed plugin history (last 50 samples)
curl http://localhost:50004/api/0/hosts/webserver01/plugins/cpu_monitor?limit=50
# Get alert states for a specific host
curl http://localhost:50004/api/0/hosts/webserver01/alerts
# Get all active alerts across all hosts
curl http://localhost:50004/api/0/alerts
```
### Integration Examples
**Python Client:**
```python
import requests
# Monitor for critical alerts
response = requests.get('http://localhost:50004/api/0/alerts')
alerts = response.json()
if alerts['summary']['critical'] > 0:
print(f"⚠️ {alerts['summary']['critical']} CRITICAL alerts!")
for alert in alerts['alerts']:
if alert['level'] == 'CRITICAL':
print(f" {alert['hostname']}: {alert['metric_path']} = {alert['last_value']}")
```
**Bash Monitoring Script:**
```bash
#!/bin/bash
# Check for critical alerts
CRITICAL=$(curl -s http://localhost:50004/api/0/alerts | jq '.summary.critical')
if [ "$CRITICAL" -gt 0 ]; then
echo "CRITICAL: $CRITICAL critical alerts detected!"
# Send notification
fi
```
### Demo & Testing
Run the API demo script to test all endpoints:
```bash
python3 scripts/demo_http_api.py
```
See [docs/HTTP_API.md](docs/HTTP_API.md) for complete API documentation including response formats, error handling, and integration examples.
---
## ⚙️ Quickstart
Prerequisites:
@@ -43,6 +371,46 @@ You can also run it directly via the package entrypoint after installation:
python -m hbd.cli -c /path/to/config.yaml
```
### Running the Client
The heartbeat client (`hbc`) sends periodic heartbeats and plugin data to the server:
```bash
# Basic usage pointing to server
python -m hbd.hbc --server your-server.example.com
# With custom configuration
python -m hbd.hbc --server 192.168.1.100 --port 50003 --interval 30
# Run with specific plugins enabled/disabled
python -m hbd.hbc --server hbd.local --disable-plugin os_info
```
Client configuration can also be specified in YAML:
```yaml
server: hbd.example.com
port: 50003
interval: 30
plugins:
cpu_monitor:
interval: 300 # Check every 5 minutes (default)
per_core: true
memory_monitor:
interval: 300 # Check every 5 minutes (default)
disk_monitor:
interval: 300 # Check every 5 minutes (default)
network_monitor:
interval: 300 # Check every 5 minutes (default)
nagios_runner:
interval: 300 # Check every 5 minutes (default)
commands:
- /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
- /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
```
All monitoring plugins default to 5-minute (300 second) intervals, but can be customized as needed.
## 🐞 Debugging in VS Code
This repository includes a ready-to-use `.vscode/launch.json` with configurations to run or attach the VS Code debugger to `hbd`.
@@ -102,7 +470,7 @@ pushsrv: pushover
## 🔧 Architecture & Modules
- `hbd.proto` — serialization/deserialization of heartbeat messages (supports compressed payloads)
- `hbd.proto` — serialization/deserialization of heartbeat messages (supports compressed payloads and plugin data)
- `hbd.udp` — UDP parsing and `handle_datagram` implementation (main state machine)
- `hbd.dns` — `create_nsupdate_payload`, `nsupdate`, and an asyncio DNS worker (`start_dns_worker`).
The DNS worker now runs as an `asyncio` task and the package exposes a
@@ -112,6 +480,10 @@ pushsrv: pushover
- `hbd.notify` — email and push notification helpers
- `hbd.ws` — WebSocket server and thread-safe broadcast helpers
- `hbd.http` — HTTP handler factory for the status UI/API
- `hbd.journal` — message journal with size-based log rotation and backup management
- `hbd.plugin` — plugin framework with base classes, registry, and dynamic loader
- `hbd.plugins/` — built-in plugins (os_info, cpu_monitor, memory_monitor, disk_monitor, network_monitor, filesystem_info, nagios_runner)
- `hbd.hbc` — heartbeat client that sends heartbeats and plugin data to server
- `hbd.utils` — small utility helpers (`shortname`, `dur`, `initlog`)
- `hbd.cli` — CLI entrypoint and argument parsing
- `hbd.server` — async orchestration to run UDP/HTTP/WSS components
+234
View File
@@ -0,0 +1,234 @@
# HBD/HBC Separation Refactoring
## Overview
The heartbeat monitoring system has been refactored into a modular package structure with separate client and server components. This allows users to install only what they need and provides clear separation of concerns.
## New Package Structure
```
hbd/
├── __init__.py # Main package (minimal)
├── client/ # HBC - System monitoring client
│ ├── __init__.py
│ ├── main.py # Entry point (was hbc.py)
│ ├── config.py # Client-specific configuration
│ ├── plugin.py # Plugin framework
│ ├── threshold.py # Threshold checking
│ └── plugins/ # Monitoring plugins
│ ├── cpu_monitor.py
│ ├── disk_monitor.py
│ ├── memory_monitor.py
│ ├── network_monitor.py
│ ├── filesystem_info.py
│ ├── os_info.py
│ └── nagios_runner.py
├── server/ # HBD - Heartbeat daemon/server
│ ├── __init__.py
│ ├── main.py # Server runtime (was server.py)
│ ├── cli.py # Command-line interface
│ ├── config.py # Server-specific configuration
│ ├── http.py # HTTP/REST API
│ ├── ws.py # WebSocket server
│ ├── udp.py # UDP heartbeat listener
│ ├── dns.py # DNS update functionality
│ ├── notify.py # Notification handlers
│ ├── monitor.py # Host monitoring
│ ├── hbdclass.py # Host class definitions
│ ├── journal.py # Message journaling
│ ├── templates/ # Jinja2 web templates
│ └── static/ # Web UI assets
└── common/ # Shared utilities
├── __init__.py
├── proto.py # Protocol encoding/decoding
└── utils.py # Common utilities
## Configuration Files
### Client Configuration (hbd/client/config.py)
Client-specific defaults:
- `hb_port`: Port where hbd servers listen (default: 50003)
- `interval`: Heartbeat interval in seconds (default: 10)
- `plugins`: Per-plugin configuration
- `thresholds`: Threshold configuration for monitoring
### Server Configuration (hbd/server/config.py)
Server-specific defaults:
- `hb_port`: Port to listen for heartbeats (default: 50003)
- `hbd_port`: HTTP API port (default: 50004)
- `ws_port`: WebSocket port (default: 50005)
- `logfile`, `logfmt`: Logging configuration
- `pushsrv`, `pushover_token`, etc.: Notification settings
- `watchhosts`, `dyndnshosts`: Host monitoring
- `smtpserver`, etc.: Email settings
- `journal_*`: Message journaling settings
## Installation Options
### Install Core Only (minimal, PyYAML only)
```bash
pip install hbd
```
### Install Client Only (for monitoring)
```bash
pip install hbd[client]
# Installs: PyYAML, psutil
```
### Install Server Only (for daemon)
```bash
pip install hbd[server]
# Installs: PyYAML, websockets, mattermostdriver, aiohttp, Jinja2
```
### Install Everything
```bash
pip install hbd[all]
# Installs all dependencies for both client and server
```
### Development Installation
```bash
pip install -e ".[dev]"
# Includes all dependencies plus testing/linting tools
```
## Command-Line Interfaces
### HBC (Client)
```bash
hbc [options] host1 [host2 ...]
# Entry point: hbd.client.main:main
# Location: hbd/client/main.py
```
### HBD (Server)
```bash
hbd [options]
# Entry point: hbd.server.cli:main
# Location: hbd/server/cli.py → hbd/server/main.py
```
## Import Changes
### Client Code
```python
# Old imports
from .config import load_config
from .proto import dicttos, stodict
from .plugin import PluginRegistry
# New imports
from .config import load_config # Still in client/
from ..common.proto import dicttos # Moved to common/
from .plugin import PluginRegistry # Still in client/
```
### Server Code
```python
# Old imports
from .config import load_config
from .proto import stodict
from .threshold import AlertLevel
# New imports
from .config import load_config # Server-specific config
from ..common.proto import stodict # Moved to common/
from ..client.threshold import AlertLevel # Client module
```
### Plugin Code
```python
# Old import
from hbd.plugin import MonitorPlugin
# New import
from hbd.client.plugin import MonitorPlugin
```
## Benefits
1. **Modular Installation**: Install only what you need
- Client-only systems don't need web server dependencies
- Server-only systems don't need psutil
2. **Clearer Architecture**: Explicit separation of concerns
- Client: System monitoring and data collection
- Server: Heartbeat reception, web UI, notifications
- Common: Shared protocol and utilities
3. **Independent Evolution**: Client and server can evolve separately
- Different release cycles possible
- Clear API boundaries via common/
4. **Smaller Footprint**: Reduced dependency installation
- Client: ~1 dependency (psutil)
- Server: ~4 dependencies (websockets, aiohttp, Jinja2, mattermostdriver)
## Migration Guide
### For Existing Installations
1. **Reinstall the package**:
```bash
pip install -e ".[all]" # For development
# or
pip install hbd[all] # For production
```
2. **Configuration files remain unchanged**:
- Both client and server read from `~/.hb.yaml`
- All existing config keys are supported in both configs
- Server has additional keys (journal, websocket, email, etc.)
- Client has minimal keys (interval, plugins, thresholds)
3. **Commands remain the same**:
- `hbc` command works identically
- `hbd` command works identically
### For New Deployments
1. **Client-only system** (monitoring host):
```bash
pip install hbd[client]
hbc server1.example.com server2.example.com
```
2. **Server-only system** (monitoring daemon):
```bash
pip install hbd[server]
hbd -c /etc/hbd.yaml -f
```
3. **Combined system** (dev/test):
```bash
pip install hbd[all]
```
## Testing
All imports and entry points have been tested and validated:
- ✅ Package imports work correctly
- ✅ `hbc` command entry point functional
- ✅ `hbd` command entry point functional
- ✅ Optional dependencies properly configured
- ✅ All internal imports updated
## Files Archived
The following files were renamed to avoid conflicts:
- `hbd/config.py` → `hbd/config.py.old` (split into client/server configs)
- `hbd/hbc_old.py` → `hbd/hbc_old.py.bak` (backup file)
## Next Steps
1. Test client functionality with a monitoring host
2. Test server functionality with web UI and notifications
3. Update documentation (README.md) with new structure
4. Consider publishing to PyPI with new structure
5. Update any deployment scripts/Dockerfiles to use optional dependencies
+320
View File
@@ -0,0 +1,320 @@
#!/usr/bin/env python3
"""
Demonstration of the threshold alerting system.
This script shows how thresholds work by simulating plugin data
with values that cross various threshold boundaries.
"""
import sys
import time
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
from hbd.threshold import ThresholdChecker, AlertLevel
def demo_basic_thresholds():
"""Demonstrate basic threshold checking."""
print("=" * 70)
print("DEMO 1: Basic Threshold Checking")
print("=" * 70)
config = {
"thresholds": {
"cpu_monitor": {
"cpu_percent": {
"warning": 80.0,
"critical": 90.0,
"operator": ">",
"hysteresis": 0.1,
}
}
}
}
notifications = []
def notifier(msg):
notifications.append(msg)
print(f" 📧 NOTIFICATION: {msg}")
checker = ThresholdChecker(config, notification_callback=notifier)
alert_states = {}
# Simulate CPU values over time
test_values = [
(50.0, "Normal operation"),
(85.0, "Crosses WARNING threshold"),
(87.0, "Still in WARNING"),
(95.0, "Escalates to CRITICAL"),
(92.0, "Still CRITICAL (in hysteresis)"),
(85.0, "Still CRITICAL (above recovery threshold of 81)"),
(79.0, "Recovers to OK"),
(50.0, "Back to normal"),
]
print("\nSimulating CPU usage over time:")
print("-" * 70)
for value, description in test_values:
print(f"\n📊 CPU: {value}% - {description}")
plugin_data = {"cpu_percent": value}
state_changes = checker.check_plugin_data(
host_name="testhost",
plugin_name="cpu_monitor",
data=plugin_data,
alert_states=alert_states,
)
current_state = alert_states.get("cpu_monitor.cpu_percent")
if current_state:
print(f" Current state: {current_state.level.name}")
if state_changes:
for metric, old_level, new_level, val in state_changes:
print(f" ⚠️ State change: {old_level.name}{new_level.name}")
print(f"\n📈 Summary: {len(notifications)} notifications sent")
print("=" * 70)
def demo_multiple_metrics():
"""Demonstrate monitoring multiple metrics."""
print("\n\n" + "=" * 70)
print("DEMO 2: Multiple Metrics and Alert Summary")
print("=" * 70)
config = {
"thresholds": {
"cpu_monitor": {
"cpu_percent": {"warning": 80.0, "critical": 90.0},
"load_1min": {"warning": 4.0, "critical": 8.0},
},
"memory_monitor": {
"percent": {"warning": 85.0, "critical": 95.0},
"available_mb": {
"warning": 1000,
"critical": 500,
"operator": "<",
},
},
}
}
notifications = []
checker = ThresholdChecker(config, notification_callback=lambda m: notifications.append(m))
alert_states = {}
# Simulate problematic system state
print("\nSimulating a system under load:")
print("-" * 70)
scenarios = [
{
"name": "Initial state - all OK",
"cpu_monitor": {"cpu_percent": 50.0, "load_1min": 2.0},
"memory_monitor": {"percent": 60.0, "available_mb": 2000},
},
{
"name": "CPU spikes to WARNING",
"cpu_monitor": {"cpu_percent": 85.0, "load_1min": 2.0},
"memory_monitor": {"percent": 60.0, "available_mb": 2000},
},
{
"name": "Memory also reaches WARNING",
"cpu_monitor": {"cpu_percent": 85.0, "load_1min": 2.0},
"memory_monitor": {"percent": 88.0, "available_mb": 800},
},
{
"name": "CPU escalates to CRITICAL",
"cpu_monitor": {"cpu_percent": 95.0, "load_1min": 5.0},
"memory_monitor": {"percent": 88.0, "available_mb": 800},
},
{
"name": "System recovering",
"cpu_monitor": {"cpu_percent": 70.0, "load_1min": 2.0},
"memory_monitor": {"percent": 65.0, "available_mb": 1500},
},
]
for scenario in scenarios:
print(f"\n📍 {scenario['name']}")
# Check CPU metrics
checker.check_plugin_data(
"testhost",
"cpu_monitor",
scenario["cpu_monitor"],
alert_states
)
# Check memory metrics
checker.check_plugin_data(
"testhost",
"memory_monitor",
scenario["memory_monitor"],
alert_states
)
# Show alert summary
summary = checker.get_alert_summary(alert_states)
print(f" Alerts: OK={summary['ok']}, WARNING={summary['warning']}, CRITICAL={summary['critical']}")
# Show active alerts
active = checker.get_active_alerts(alert_states)
if active:
print(f" Active alerts:")
for alert in active:
print(f" - {alert.metric_path}: {alert.level.name} (value={alert.last_value})")
print(f"\n📈 Total notifications sent: {len(notifications)}")
print("=" * 70)
def demo_hysteresis():
"""Demonstrate hysteresis effect."""
print("\n\n" + "=" * 70)
print("DEMO 3: Hysteresis Prevents Flapping")
print("=" * 70)
config = {
"thresholds": {
"cpu_monitor": {
"cpu_percent": {
"warning": 80.0,
"critical": 90.0,
"hysteresis": 0.1, # 10% hysteresis
}
}
}
}
notifications = []
checker = ThresholdChecker(config, notification_callback=lambda m: notifications.append(m))
alert_states = {}
print("\nCritical threshold: 90%")
print("Hysteresis: 10%")
print("Recovery threshold: 81% (90 - 10% of 90)")
print("\nSimulating CPU fluctuating near CRITICAL threshold:")
print("-" * 70)
# Simulate fluctuating values
test_values = [
(75.0, "Normal"),
(92.0, "Crosses CRITICAL"),
(88.0, "Drops but still above 81% (stays CRITICAL)"),
(86.0, "Still above 81% (stays CRITICAL)"),
(83.0, "Still above 81% (stays CRITICAL)"),
(80.0, "Below 81% - recovers to OK"),
(88.0, "Rises again but below 90% (stays OK)"),
(91.0, "Crosses CRITICAL again"),
]
for value, description in test_values:
print(f"\n📊 CPU: {value:5.1f}% - {description}")
plugin_data = {"cpu_percent": value}
state_changes = checker.check_plugin_data(
"testhost",
"cpu_monitor",
plugin_data,
alert_states,
)
current_state = alert_states.get("cpu_monitor.cpu_percent")
print(f" State: {current_state.level.name}")
if state_changes:
print(f" 📧 Notification sent (state changed)")
else:
print(f" ✓ No notification (state unchanged - hysteresis working)")
print(f"\n📈 Notifications sent: {len(notifications)} (without hysteresis would be ≥6)")
print("=" * 70)
def demo_inverse_threshold():
"""Demonstrate inverse thresholds (less than)."""
print("\n\n" + "=" * 70)
print("DEMO 4: Inverse Thresholds (Alert When Low)")
print("=" * 70)
config = {
"thresholds": {
"memory_monitor": {
"available_mb": {
"warning": 1000, # Warn when < 1000 MB
"critical": 500, # Critical when < 500 MB
"operator": "<",
"hysteresis": 0.1,
}
}
}
}
notifications = []
checker = ThresholdChecker(config, notification_callback=lambda m: notifications.append(m))
alert_states = {}
print("\nMonitoring available memory (alert when LOW):")
print("WARNING when < 1000 MB, CRITICAL when < 500 MB")
print("-" * 70)
test_values = [
(2000, "Plenty of memory"),
(800, "Drops below 1000 MB - WARNING"),
(450, "Drops below 500 MB - CRITICAL"),
(520, "Rises but still in hysteresis zone - stays CRITICAL"),
(600, "Enough recovery - back to WARNING"),
(1200, "Fully recovered - OK"),
]
for value, description in test_values:
print(f"\n💾 Available: {value} MB - {description}")
plugin_data = {"available_mb": value}
state_changes = checker.check_plugin_data(
"testhost",
"memory_monitor",
plugin_data,
alert_states,
)
current_state = alert_states.get("memory_monitor.available_mb")
print(f" State: {current_state.level.name}")
if state_changes:
for metric, old_level, new_level, val in state_changes:
print(f" 📧 {old_level.name}{new_level.name}")
print(f"\n📈 Notifications sent: {len(notifications)}")
print("=" * 70)
if __name__ == "__main__":
print("\n")
print("" + "" * 68 + "")
print("" + " " * 15 + "THRESHOLD ALERTING DEMONSTRATION" + " " * 21 + "")
print("" + "" * 68 + "")
demo_basic_thresholds()
demo_multiple_metrics()
demo_hysteresis()
demo_inverse_threshold()
print("\n\n" + "=" * 70)
print("DEMONSTRATION COMPLETE")
print("=" * 70)
print("\nKey takeaways:")
print(" • Thresholds detect when metrics exceed configured limits")
print(" • Notifications sent only on state changes, not every check")
print(" • Hysteresis prevents alert flapping")
print(" • Supports both 'greater than' and 'less than' thresholds")
print(" • Multiple metrics can be monitored simultaneously")
print("\nFor full documentation, see docs/THRESHOLD_ALERTING.md")
print("=" * 70)
print()
+292
View File
@@ -0,0 +1,292 @@
# Configuration Reload
The heartbeat daemon (hbd) supports runtime configuration reloading without requiring a full restart. This allows you to update certain configuration settings while the service continues running.
## How to Reload Configuration
Send a SIGHUP signal to the running hbd process:
```bash
# Find the process ID
ps aux | grep hbd
# Or use pidof/pgrep
pidof hbd
pgrep -f hbd
# Send SIGHUP signal
kill -HUP <pid>
# Or if using systemd
systemctl reload heartbeat
```
## What Can Be Reloaded
The following configuration sections can be reloaded without restarting:
### ✅ Fully Reloadable
- **Notification Channels** (`notification_channels`)
- Add, remove, or modify notification channel definitions
- Update tokens, API keys, SMTP credentials
- Change recipient lists
- **Threshold Configurations** (`threshold_configs`)
- Modify warning and critical thresholds
- Add or remove threshold rules
- Change operators and hysteresis values
- Update display formats
- **Host Configuration** (`hosts`)
- Change watch status
- Update notification channel assignments
- Modify threshold config assignments
- Change dyndns status
- **Host Lists**
- `watchhosts` - hosts to monitor
- `dyndnshosts` - hosts with dynamic DNS
- `drophosts` - hosts to ignore
- **Runtime Settings**
- `grace` - grace period multiplier
- `interval` - expected heartbeat interval
- `threshold_renotify_interval` - re-notification interval
- `debug` - debug level
- `verbose` - verbose output
- **DNS Settings**
- `dyndomains` - dynamic DNS domains
- `nsupdate_bin` - nsupdate binary path
- `rndc_key` - RNDC key path
### ⚠️ Requires Restart
The following settings **cannot** be reloaded and require a service restart:
- **Network Ports**
- `hb_port` - UDP heartbeat port
- `hbd_port` - HTTP API port
- `ws_port` - WebSocket port
- `wss_port` - Secure WebSocket port
- **SSL/TLS Settings**
- `cert_path` - SSL certificate path
- `wss_pem` - SSL certificate file
- `wss_key` - SSL key file
- **Persistence**
- `pickfile` - Pickle file path
- **Logging**
- `logfile` - Log file path
- `logfmt` - Log format
- **Journal Settings**
- `journal_enabled` - Enable/disable journaling
- `journal_dir` - Journal directory
- `journal_file` - Journal filename
- `journal_max_size` - Maximum journal size
- `journal_max_backups` - Number of backup files
## Reload Process
When a SIGHUP signal is received:
1. **Configuration File Loading**
- The config file is re-read from disk
- YAML parsing is performed
- Validation checks are run
2. **Component Updates**
- Notification system is updated with new channel definitions
- Threshold checker reloads all threshold configurations
- Alert states are preserved to maintain hysteresis
3. **Error Handling**
- If reload fails, the previous configuration is kept
- Error messages are logged
- Service continues running with old configuration
4. **Logging**
- Reload start and completion are logged
- Each component reports its reload status
- Total number of thresholds is reported
## Example Reload Session
```bash
# Terminal 1: Watch the logs
tail -f /var/log/heartbeat.log
# Terminal 2: Edit configuration
vim /path/to/.hb.yaml
# Make changes to notification channels or thresholds
# Save the file
# Terminal 3: Trigger reload
kill -HUP $(pgrep -f hbd)
# Terminal 1: See reload messages
2026-04-01 12:34:56 INFO: Received SIGHUP, initiating config reload...
2026-04-01 12:34:56 INFO: ============================================================
2026-04-01 12:34:56 INFO: Starting configuration reload...
2026-04-01 12:34:56 INFO: ============================================================
2026-04-01 12:34:56 INFO: Configuration reloaded from /path/to/.hb.yaml
2026-04-01 12:34:56 INFO: Notification configuration reloaded
2026-04-01 12:34:56 INFO: Reloading threshold configuration...
2026-04-01 12:34:56 INFO: Threshold configuration reloaded: 42 total thresholds
2026-04-01 12:34:56 INFO: ============================================================
2026-04-01 12:34:56 INFO: Configuration reload completed successfully
2026-04-01 12:34:56 INFO: ============================================================
```
## Common Use Cases
### 1. Update Notification Credentials
If you need to rotate API keys or update SMTP passwords:
```yaml
notification_channels:
pushover_standard:
type: pushover
token: new-token-here # Updated
user: new-user-key-here # Updated
```
Just edit the config file and send SIGHUP - no restart needed.
### 2. Adjust Threshold Values
Fine-tune alerting thresholds based on observed behavior:
```yaml
threshold_configs:
default:
thresholds:
cpu_monitor:
cpu_percent:
warning: 85.0 # Increased from 80.0
critical: 95.0 # Increased from 90.0
```
Send SIGHUP to apply the new thresholds immediately.
### 3. Add New Notification Channels
Add a new notification destination:
```yaml
notification_channels:
email_oncall:
type: email
recipients: [oncall@example.com]
sender: alerts@example.com
smtp_server: smtp.example.com
hosts:
critical_server:
threshold_config: default
watch: true
notification_channels: [pushover_standard, email_oncall] # Added
```
The new channel becomes active immediately after SIGHUP.
### 4. Update Watch List
Start or stop monitoring hosts without restart:
```yaml
hosts:
new_server:
threshold_config: default
watch: true # Start watching
notification_channels: [pushover_standard]
```
## Best Practices
1. **Test Configuration Before Reload**
- Validate YAML syntax before sending SIGHUP
- Check for typos in channel names
- Verify threshold values are reasonable
2. **Monitor Reload Logs**
- Always check logs after reload to confirm success
- Look for error messages if reload fails
- Verify expected number of thresholds loaded
3. **Backup Before Changes**
- Keep a backup of working configuration
- Use version control (git) for config files
- Document why changes were made
4. **Gradual Rollout**
- Test changes on development server first
- Apply to one production server at a time
- Verify behavior before applying everywhere
5. **Plan for Restart-Required Changes**
- Schedule downtime for port or SSL changes
- Use blue-green deployment if possible
- Keep service downtime minimal
## Troubleshooting
### Reload Doesn't Apply Changes
**Check:**
- Is the config file path correct?
- Did you save the file after editing?
- Are there YAML syntax errors?
- Check the logs for error messages
**Solution:**
```bash
# Validate YAML syntax
python -c "import yaml; yaml.safe_load(open('.hb.yaml'))"
# Check file modification time
ls -l .hb.yaml
# View logs
journalctl -u heartbeat -f
```
### Partial Configuration Applied
**Cause:** Some sections reloaded, others didn't.
**Solution:** Check logs to see which components failed. Common issues:
- Invalid channel type
- Missing required threshold fields
- Invalid host references
### Service Becomes Unresponsive
**Cause:** Malformed configuration caused an exception.
**Solution:**
1. Revert to backup configuration
2. Send SIGHUP again to reload the good config
3. If service is completely stuck, restart it
## Implementation Details
The reload mechanism uses:
- **Signal Handling**: SIGHUP triggers reload event
- **Async-Safe Reloading**: Configuration is loaded asynchronously
- **Component Coordination**: All affected components are updated atomically
- **State Preservation**: Alert states and hysteresis information are maintained
- **Error Recovery**: Failed reloads don't affect running configuration
## See Also
- [NOTIFICATIONS.md](NOTIFICATIONS.md) - Notification channel configuration
- [THRESHOLD_ALERTING.md](THRESHOLD_ALERTING.md) - Threshold configuration details
- Configuration examples in `hbd/config_*.yaml`
+531
View File
@@ -0,0 +1,531 @@
# HTTP API and Web UI Documentation
## Overview
The Heartbeat Daemon provides a comprehensive HTTP API and web-based UI for monitoring plugin data and alert states. The API follows RESTful conventions and returns JSON responses.
## Base URL
All API endpoints are relative to the server base URL:
```
http://your-server:50004
```
Default port is `50004` (configurable via `hbd_port` in configuration).
---
## API Endpoints
### Host Management
#### GET /api/0/hosts
Get list of all monitored hosts with their state information.
**Response:**
```json
[
{
"name": "webserver01",
"dyn": false,
"connections": [...]
}
]
```
#### GET /api/0/messages
Get recent heartbeat messages (last 30).
**Response:**
```json
[
{
"time": 1711234567.123,
"host": "webserver01",
"msg": "heartbeat received"
}
]
```
---
### Plugin Data Endpoints
#### GET /api/0/hosts/{hostname}/plugins
Get all plugin data for a specific host.
**Parameters:**
- `hostname` (path): Name of the host
**Response:**
```json
{
"hostname": "webserver01",
"plugins": {
"cpu_monitor": {
"timestamp": 1711234567.123,
"data": {
"cpu_percent": 45.2,
"load_1min": 2.5,
"load_5min": 2.1,
"load_15min": 1.8
},
"sample_count": 100
},
"memory_monitor": {
"timestamp": 1711234568.456,
"data": {
"percent": 65.4,
"available_mb": 4096,
"total_mb": 16384
},
"sample_count": 100
}
}
}
```
**Example:**
```bash
curl http://localhost:50004/api/0/hosts/webserver01/plugins
```
#### GET /api/0/hosts/{hostname}/plugins/{plugin_name}
Get detailed historical data for a specific plugin.
**Parameters:**
- `hostname` (path): Name of the host
- `plugin_name` (path): Name of the plugin
- `limit` (query, optional): Number of recent samples to return (default: 10)
**Response:**
```json
{
"hostname": "webserver01",
"plugin": "cpu_monitor",
"samples": [
{
"timestamp": 1711234567.123,
"data": {
"cpu_percent": 45.2,
"load_1min": 2.5
}
},
{
"timestamp": 1711234267.123,
"data": {
"cpu_percent": 42.1,
"load_1min": 2.3
}
}
],
"sample_count": 2
}
```
**Examples:**
```bash
# Get last 1 sample (most recent)
curl http://localhost:50004/api/0/hosts/webserver01/plugins/cpu_monitor?limit=1
# Get last 50 samples
curl http://localhost:50004/api/0/hosts/webserver01/plugins/memory_monitor?limit=50
# Get disk monitor data
curl http://localhost:50004/api/0/hosts/database01/plugins/disk_monitor
```
---
### Alert Endpoints
#### GET /api/0/hosts/{hostname}/alerts
Get alert states for a specific host.
**Parameters:**
- `hostname` (path): Name of the host
**Response:**
```json
{
"hostname": "webserver01",
"alerts": [
{
"metric_path": "cpu_monitor.cpu_percent",
"level": "WARNING",
"since": 1711234000.0,
"last_value": 85.5,
"last_check": 1711234567.123,
"notification_count": 2
},
{
"metric_path": "disk_monitor./.percent",
"level": "OK",
"since": 1711230000.0,
"last_value": 65.0,
"last_check": 1711234567.123,
"notification_count": 0
}
],
"summary": {
"ok": 15,
"warning": 1,
"critical": 0,
"unknown": 0
}
}
```
**Example:**
```bash
curl http://localhost:50004/api/0/hosts/webserver01/alerts
```
#### GET /api/0/alerts
Get all active alerts across all monitored hosts.
**Response:**
```json
{
"alerts": [
{
"hostname": "webserver01",
"metric_path": "cpu_monitor.cpu_percent",
"level": "CRITICAL",
"since": 1711234000.0,
"last_value": 95.5,
"last_check": 1711234567.123,
"notification_count": 3
},
{
"hostname": "database01",
"metric_path": "memory_monitor.percent",
"level": "WARNING",
"since": 1711233000.0,
"last_value": 88.2,
"last_check": 1711234567.123,
"notification_count": 1
}
],
"summary": {
"critical": 1,
"warning": 1,
"unknown": 0,
"total": 2
},
"host_count": 5
}
```
**Example:**
```bash
curl http://localhost:50004/api/0/alerts | jq .
```
---
## Web UI Pages
### Live Dashboard
**URL:** `/live`
Real-time dashboard showing:
- Host connection states
- IPv4/IPv6 connectivity
- Latency metrics
- Recent messages
**Features:**
- WebSocket-powered live updates
- Sortable columns
- Color-coded status indicators
### Plugin Metrics
**URL:** `/plugins`
Interactive visualization of plugin metrics:
- Select host and plugin from dropdown
- View current metric values
- Automatic refresh every 30 seconds
- Support for nested metrics (e.g., per-partition disk stats)
**Features:**
- Card-based metric display
- Unit formatting (%, MB, GB)
- Nested object visualization
- Auto-refresh
**Screenshots of available data:**
- CPU usage, load average, frequency
- Memory usage, available memory, swap
- Disk usage per partition, I/O statistics
- Network interface statistics, connection counts
- Custom plugin data
### Alerts Dashboard
**URL:** `/alerts`
Comprehensive alert monitoring:
- Summary cards (Critical, Warning, Total Hosts)
- Filter by severity (All, Critical, Warning)
- Alert details with duration
- Auto-refresh every 15 seconds
**Features:**
- Color-coded alert levels
- Duration tracking
- Filterable list
- Real-time updates
- Summary statistics
---
## Integration Examples
### Monitoring Script
```bash
#!/bin/bash
# Check for critical alerts and send notification
RESPONSE=$(curl -s http://localhost:50004/api/0/alerts)
CRITICAL_COUNT=$(echo "$RESPONSE" | jq '.summary.critical')
if [ "$CRITICAL_COUNT" -gt 0 ]; then
echo "CRITICAL: $CRITICAL_COUNT critical alerts detected!"
echo "$RESPONSE" | jq '.alerts[] | select(.level=="CRITICAL")'
# Send notification
# mail -s "Critical Alerts" admin@example.com < alert_details.txt
fi
```
### Python Client
```python
import requests
import json
# Get all plugin data for a host
response = requests.get('http://localhost:50004/api/0/hosts/webserver01/plugins')
data = response.json()
print(f"Host: {data['hostname']}")
print(f"Plugins: {', '.join(data['plugins'].keys())}")
for plugin, info in data['plugins'].items():
print(f"\n{plugin}:")
for metric, value in info['data'].items():
print(f" {metric}: {value}")
# Check for alerts
response = requests.get('http://localhost:50004/api/0/alerts')
alerts = response.json()
if alerts['summary']['critical'] > 0:
print(f"\n⚠️ {alerts['summary']['critical']} CRITICAL ALERTS!")
for alert in alerts['alerts']:
if alert['level'] == 'CRITICAL':
print(f" - {alert['hostname']}: {alert['metric_path']} = {alert['last_value']}")
```
### Grafana Integration
The API endpoints can be used with Grafana's JSON datasource plugin:
1. Install the SimpleJSON datasource plugin
2. Configure datasource URL: `http://your-server:50004`
3. Create queries:
- Metrics: `/api/0/hosts/webserver01/plugins/cpu_monitor?limit=100`
- Alerts: `/api/0/alerts`
### Prometheus Integration
Export metrics in Prometheus format (future enhancement):
```python
# Example prometheus exporter
from prometheus_client import Gauge, generate_latest
import requests
cpu_usage = Gauge('heartbeat_cpu_percent', 'CPU usage percentage', ['hostname'])
memory_usage = Gauge('heartbeat_memory_percent', 'Memory usage percentage', ['hostname'])
def collect_metrics():
hosts = requests.get('http://localhost:50004/api/0/hosts').json()
for host in hosts:
hostname = host['name']
plugins = requests.get(f'http://localhost:50004/api/0/hosts/{hostname}/plugins').json()
if 'cpu_monitor' in plugins['plugins']:
cpu_data = plugins['plugins']['cpu_monitor']['data']
cpu_usage.labels(hostname=hostname).set(cpu_data.get('cpu_percent', 0))
if 'memory_monitor' in plugins['plugins']:
mem_data = plugins['plugins']['memory_monitor']['data']
memory_usage.labels(hostname=hostname).set(mem_data.get('percent', 0))
```
---
## Response Formats
### Success Response
All successful API calls return HTTP 200 with JSON body:
```json
{
"field": "value",
...
}
```
### Error Response
API errors return appropriate HTTP status codes with JSON:
```json
{
"error": "Host 'unknown-host' not found"
}
```
**Common Status Codes:**
- `200 OK` - Success
- `400 Bad Request` - Invalid parameters
- `404 Not Found` - Resource not found
- `500 Internal Server Error` - Server error
---
## WebSocket API
For real-time updates, connect to the WebSocket endpoint:
**URL:** `ws://your-server:50005/hbd` (or `wss://` for secure)
**Messages:**
```json
{
"type": "host",
"data": {
"name": "webserver01",
"state": "UP"
}
}
```
```json
{
"type": "plugin",
"data": {
"host": "webserver01",
"plugin": "cpu_monitor",
"data": {...},
"timestamp": 1711234567.123
}
}
```
---
## Configuration
### Enable HTTP Server
```yaml
# In your hbd configuration file
hbd_host: "" # Listen on all interfaces
hbd_port: 50004 # HTTP port
ws_port: 50005 # WebSocket port (optional)
# wss_port: 50006 # Secure WebSocket (requires SSL)
```
### SSL/TLS Configuration
For secure WebSocket connections:
```yaml
wss_port: 50006
cert_path: /etc/heartbeat/certs/
wss_pem: server.pem
wss_key: server.key
```
---
## Rate Limiting
The API currently does not implement rate limiting. For production use, consider:
- Placing behind a reverse proxy (nginx, Apache)
- Using API gateway for rate limiting
- Implementing caching for frequently accessed endpoints
---
## CORS Support
By default, CORS is not enabled. To enable for web applications:
```python
# In http.py, add CORS middleware
from aiohttp_cors import setup as cors_setup
app = web.Application()
cors = cors_setup(app)
# Configure CORS for all routes
for route in list(app.router.routes()):
cors.add(route, {
"*": aiohttp_cors.ResourceOptions(
allow_credentials=True,
expose_headers="*",
allow_headers="*",
)
})
```
---
## Performance Considerations
### Caching
- Plugin data is cached in memory (last 100 samples per plugin)
- No database queries required
- Responses are fast (<10ms typical)
### Scalability
- Each host stores its own data independently
- Memory usage: ~1KB per host + ~1KB per plugin sample
- For 100 hosts with 5 plugins: ~50MB memory
### Best Practices
1. Use `limit` parameter to control response size
2. Cache responses on client side when appropriate
3. Use WebSocket for real-time updates instead of polling
4. Consider pagination for large deployments (future enhancement)
---
## Troubleshooting
### API Returns 404
- Verify hostname in URL matches actual host name
- Check host is sending heartbeats: `curl http://localhost:50004/api/0/hosts`
### No Plugin Data
- Verify client is configured with plugins
- Check client logs for plugin errors
- Ensure plugins are sending data (check journal logs)
### Empty Alerts
- Verify thresholds are configured
- Check host is in `watchhosts` list
- Ensure plugins are collecting metrics
- Review server logs for threshold checker errors
---
## See Also
- [Plugin Development Guide](PLUGIN_DEVELOPMENT.md)
- [Threshold Alerting Documentation](THRESHOLD_ALERTING.md)
- [Message Journal Documentation](MESSAGE_JOURNAL.md)
- Configuration examples: `hbd/config_example.yaml`
+413
View File
@@ -0,0 +1,413 @@
# Message Journal
The message journal provides persistent logging of all received heartbeat messages with automatic size-based log rotation.
## Overview
The journal logs every message received by the heartbeat daemon (hbd) in JSON format, making it easy to:
- Audit message history
- Debug connection issues
- Analyze traffic patterns
- Replay messages for testing
- Create historical reports
## Features
- **JSON Format**: Each message is logged as a single JSON line for easy parsing
- **Size-Based Rotation**: Automatically rotates logs when size threshold is reached
- **Automatic Cleanup**: Keeps only a configurable number of backup files
- **Thread-Safe**: Safe for concurrent access from multiple async tasks
- **Configurable**: All settings controllable via configuration file
- **Performance**: Non-blocking async operation with minimal overhead
## Configuration
Add these settings to your hbd configuration file (e.g., `.hb.yaml`):
```yaml
# Message journal configuration
journal_enabled: true # Enable/disable journaling
journal_dir: /var/log/heartbeat # Directory for journal files
journal_file: messages.journal # Base filename
journal_max_size: 104857600 # Max size in bytes (100MB default)
journal_max_backups: 10 # Number of backup files to keep
```
### Configuration Options
| Option | Default | Description |
|--------|---------|-------------|
| `journal_enabled` | `true` | Enable or disable message journaling |
| `journal_dir` | `/var/log/heartbeat` | Directory where journal files are stored |
| `journal_file` | `messages.journal` | Base filename for the journal |
| `journal_max_size` | `104857600` (100MB) | Maximum file size before rotation |
| `journal_max_backups` | `10` | Number of rotated backup files to keep |
## File Format
Messages are logged in JSONL (JSON Lines) format - one JSON object per line:
```json
{"timestamp":1711234567.123,"datetime":"2026-03-28T12:34:56","source_ip":"192.168.1.100","source_port":50003,"message":{"ID":"HTB","name":"webserver1","interval":30}}
{"timestamp":1711234597.456,"datetime":"2026-03-28T12:35:37","source_ip":"192.168.1.101","source_port":50003,"message":{"ID":"PLG","plugin":"cpu_monitor","cpu_percent":45.2,"load_1min":1.5}}
```
### Entry Structure
Each journal entry contains:
| Field | Type | Description |
|-------|------|-------------|
| `timestamp` | float | Unix timestamp (seconds since epoch) |
| `datetime` | string | ISO 8601 formatted datetime |
| `source_ip` | string | Source IP address |
| `source_port` | integer | Source UDP port |
| `message` | object | Complete parsed message dictionary |
## Log Rotation
### How Rotation Works
1. Journal writes messages to the current file
2. When file size exceeds `journal_max_size`, rotation is triggered
3. Current file is renamed with timestamp: `messages.journal.YYYYMMDD-HHMMSS`
4. New empty file is created as the current journal
5. Old backup files exceeding `journal_max_backups` are deleted
### Example File Structure
```
/var/log/heartbeat/
├── messages.journal # Current active journal
├── messages.journal.20260328-120000 # Rotated backup
├── messages.journal.20260328-140000 # Rotated backup
└── messages.journal.20260328-160000 # Rotated backup (oldest)
```
### Rotation Behavior
- Rotation is triggered when the next message would exceed the size limit
- Rotation is automatic and requires no manual intervention
- Old backups are deleted in FIFO order (oldest first)
- Rotation is thread-safe and won't lose messages
## Usage Examples
### Reading Journal Files
#### Using Python
```python
import json
# Read all entries from current journal
with open('/var/log/heartbeat/messages.journal', 'r') as f:
for line in f:
entry = json.loads(line)
print(f"{entry['datetime']} - {entry['source_ip']} - {entry['message']['ID']}")
```
#### Using jq (command line)
```bash
# View all messages
cat /var/log/heartbeat/messages.journal | jq .
# Filter by message type
cat /var/log/heartbeat/messages.journal | jq 'select(.message.ID == "HTB")'
# Filter by hostname
cat /var/log/heartbeat/messages.journal | jq 'select(.message.name == "webserver1")'
# Count messages by type
cat /var/log/heartbeat/messages.journal | jq -r '.message.ID' | sort | uniq -c
# Extract timestamps and source IPs
cat /var/log/heartbeat/messages.journal | jq -r '[.datetime, .source_ip, .message.ID] | @tsv'
```
#### Using shell tools
```bash
# Count total messages
wc -l /var/log/heartbeat/messages.journal
# View recent messages
tail -n 100 /var/log/heartbeat/messages.journal | jq .
# Search for specific host
grep -F '"name":"webserver1"' /var/log/heartbeat/messages.journal
# Check journal file size
du -h /var/log/heartbeat/messages.journal
```
### Analyzing Historical Data
```bash
# Combine all journal files (current + backups)
cat /var/log/heartbeat/messages.journal* | jq . > all_messages.json
# Count messages per host
cat /var/log/heartbeat/messages.journal* | jq -r '.message.name // "unknown"' | sort | uniq -c
# Find all plugin messages
cat /var/log/heartbeat/messages.journal* | jq 'select(.message.ID == "PLG")'
# Extract CPU metrics from plugin messages
cat /var/log/heartbeat/messages.journal* | \
jq 'select(.message.plugin == "cpu_monitor") | {time: .datetime, host: .message.name, cpu: .message.cpu_percent}'
```
## Integration with Log Management
### Logrotate
While the journal has built-in rotation, you can also use logrotate for additional management:
```
/var/log/heartbeat/messages.journal.* {
daily
rotate 30
compress
delaycompress
missingok
notifempty
}
```
### Elasticsearch/OpenSearch
Import journal data into Elasticsearch for advanced analysis:
```python
from elasticsearch import Elasticsearch
import json
es = Elasticsearch(['localhost:9200'])
with open('/var/log/heartbeat/messages.journal', 'r') as f:
for line in f:
entry = json.loads(line)
es.index(index='heartbeat-messages', body=entry)
```
### Splunk
Create a Splunk input for the journal:
```ini
[monitor:///var/log/heartbeat/messages.journal*]
sourcetype = heartbeat_json
index = heartbeat
```
## Performance Considerations
### Overhead
- Journal writing is async and non-blocking
- Typical overhead: < 1ms per message
- Minimal impact on heartbeat processing
### Disk Usage
Calculate expected disk usage:
```
Messages per day = (86400 seconds / interval) * number_of_hosts
Average message size ≈ 200-500 bytes
Daily disk usage = Messages per day * Average message size
Example:
- 100 hosts
- 30 second interval
- 2880 messages/day per host
- 288,000 messages/day total
- ~60-140 MB/day
```
### Recommendations
- **Small deployments** (< 50 hosts): Default settings work well
- **Medium deployments** (50-500 hosts): Increase `journal_max_size` to 500MB, `journal_max_backups` to 20
- **Large deployments** (> 500 hosts): Consider 1GB+ journal files, 30+ backups, or external log aggregation
## Monitoring
### Check Journal Status
The journal exposes statistics that can be queried:
```python
from hbd.journal import get_journal
journal = get_journal()
stats = journal.get_stats()
print(f"Current size: {stats['current_size']:,} bytes")
print(f"Rotation threshold: {stats['rotation_threshold']}")
```
### Log Messages
Journal operations are logged at appropriate levels:
- `INFO`: Initialization, rotation events, cleanup
- `DEBUG`: Individual message logging
- `WARNING`: Non-critical issues
- `ERROR`: Critical failures
Check hbd logs for journal-related messages:
```bash
grep journal /var/log/heartbeat.log
```
## Troubleshooting
### Journal Files Not Created
**Problem**: No journal files appear in the configured directory.
**Solutions**:
- Check `journal_enabled: true` in configuration
- Verify directory exists and hbd has write permissions
- Check hbd logs for initialization errors
- Verify disk space is available
### Rotation Not Working
**Problem**: Journal file grows beyond `journal_max_size`.
**Solutions**:
- Check that `journal_max_size` is properly configured
- Verify hbd has permission to rename/create files
- Check for filesystem issues
- Review hbd logs for rotation errors
### Missing Messages
**Problem**: Some messages don't appear in journal.
**Solutions**:
- Verify `journal_enabled: true`
- Check for write errors in hbd logs
- Verify sufficient disk space
- Check if filesystem is read-only
### Performance Issues
**Problem**: Journal causing slow message processing.
**Solutions**:
- Use faster storage (SSD) for journal directory
- Increase `journal_max_size` to reduce rotation frequency
- Disable journal if not needed: `journal_enabled: false`
- Consider async syslog forwarding instead
## Security Considerations
### File Permissions
Ensure proper permissions on journal files:
```bash
# Journal directory
chmod 750 /var/log/heartbeat
chown hbd:hbd /var/log/heartbeat
# Journal files
chmod 640 /var/log/heartbeat/messages.journal*
```
### Sensitive Data
Journal files may contain:
- Hostnames and IP addresses
- System metrics
- Custom message content
**Recommendations**:
- Restrict read access to authorized users only
- Consider encryption for archived journals
- Implement log retention policies
- Sanitize data if sharing for debugging
## API Reference
### MessageJournal Class
```python
class MessageJournal:
def __init__(self, config: Dict[str, Any])
async def initialize(self) -> bool
async def log_message(self, msg: Dict, addr: tuple, timestamp: float)
async def close(self)
def get_stats(self) -> Dict[str, Any]
```
### Module Functions
```python
def get_journal(config: Dict = None) -> MessageJournal
async def log_message(msg: Dict, addr: tuple, timestamp: float = None)
```
## Example: Custom Message Processing
Process journal messages in real-time:
```python
import asyncio
import json
from pathlib import Path
async def tail_journal(journal_path):
"""Follow journal file and process new messages."""
path = Path(journal_path)
with open(path, 'r') as f:
# Jump to end
f.seek(0, 2)
while True:
line = f.readline()
if line:
entry = json.loads(line)
await process_message(entry)
else:
await asyncio.sleep(0.1)
async def process_message(entry):
"""Process a journal entry."""
msg = entry['message']
# Alert on boot messages
if msg.get('boot'):
print(f"ALERT: {msg['name']} rebooted at {entry['datetime']}")
# Track CPU usage
if msg.get('ID') == 'PLG' and msg.get('plugin') == 'cpu_monitor':
cpu = msg.get('cpu_percent', 0)
if cpu > 90:
print(f"WARNING: {entry['source_ip']} CPU usage: {cpu}%")
```
## Future Enhancements
Potential improvements for future versions:
- Compression of rotated logs (gzip)
- Time-based rotation in addition to size-based
- Filtering to exclude certain message types
- Structured logging output formats (CEF, GELF)
- Remote syslog forwarding
- Message deduplication
- Journal file encryption
- Signed journal entries
## See Also
- [Configuration Guide](../hbd/config.py) - Full configuration options
- [UDP Protocol](../hbd/udp.py) - Message handling
- [Server Architecture](../hbd/server.py) - Server initialization
+331
View File
@@ -0,0 +1,331 @@
# Nagios Plugin Integration Guide
The Heartbeat monitoring system now supports running existing Nagios-compatible monitoring plugins through the `nagios_runner` plugin. This allows you to leverage the thousands of existing Nagios plugins without modification.
## Quick Start
### 1. Install Nagios Plugins
**Debian/Ubuntu:**
```bash
sudo apt-get install nagios-plugins
```
**RHEL/CentOS/Fedora:**
```bash
sudo yum install nagios-plugins-all
# or
sudo dnf install nagios-plugins-all
```
**Arch Linux:**
```bash
sudo pacman -S monitoring-plugins
```
### 2. Configure Heartbeat
Add the `nagios_runner` section to your `~/.hb.yaml` config:
```yaml
nagios_runner:
interval: 60 # Run plugins every 60 seconds
timeout: 30 # Command timeout in seconds
commands:
- name: check_disk_root
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
- name: check_load
command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
- name: check_procs
command: /usr/lib/nagios/plugins/check_procs -w 250 -c 400
```
### 3. Start Heartbeat Client
```bash
hbc -v localhost
```
The client will now execute the configured Nagios plugins and send their results to the server.
## How It Works
### Nagios Plugin Standard
Nagios plugins follow a simple interface:
1. **Exit Codes:**
- `0` = OK
- `1` = WARNING
- `2` = CRITICAL
- `3` = UNKNOWN
2. **Output Format:**
```
STATUS - Message | performance_data
```
3. **Performance Data Format:**
```
'label'=value[UOM];[warn];[crit];[min];[max]
```
### Example Plugin Output
```bash
$ /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
DISK OK - free space: / 156 GB (78%); | /=44GB;127;142;0;159
```
This output includes:
- **Status:** `DISK OK`
- **Message:** `free space: / 156 GB (78%)`
- **Performance Data:** `/=44GB;127;142;0;159`
- Current value: 44GB
- Warning threshold: 127GB
- Critical threshold: 142GB
- Min: 0GB
- Max: 159GB
### Data Collected
The `nagios_runner` plugin collects:
**For each configured command:**
- `{name}_status` - Status string (OK, WARNING, CRITICAL, UNKNOWN)
- `{name}_status_code` - Numeric exit code (0-3)
- `{name}_output` - Status message
- `{name}_{metric}` - Each performance metric value
- `{name}_{metric}_uom` - Unit of measurement (if present)
- `{name}_{metric}_warn` - Warning threshold (if present)
- `{name}_{metric}_crit` - Critical threshold (if present)
- `{name}_{metric}_min` - Minimum value (if present)
- `{name}_{metric}_max` - Maximum value (if present)
**Overall:**
- `overall_status` - Worst status from all commands
- `overall_status_code` - Worst status code
- `plugin_count` - Number of Nagios plugins executed
## Configuration Options
```yaml
nagios_runner:
# Collection interval in seconds (default: 60)
interval: 60
# Command execution timeout in seconds (default: 30)
timeout: 30
# Execute commands via shell (default: true)
# Set to false for direct execution (more secure but less flexible)
shell: true
# List of Nagios plugins to run
commands:
- name: unique_name # Required: unique identifier
command: /path/to/plugin [args] # Required: full command to execute
```
## Common Nagios Plugins
### System Resources
**Disk Space:**
```yaml
- name: check_disk_root
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
```
**Load Average:**
```yaml
- name: check_load
command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
```
**Swap Usage:**
```yaml
- name: check_swap
command: /usr/lib/nagios/plugins/check_swap -w 20% -c 10%
```
**Process Count:**
```yaml
- name: check_procs
command: /usr/lib/nagios/plugins/check_procs -w 250 -c 400
```
**Users Logged In:**
```yaml
- name: check_users
command: /usr/lib/nagios/plugins/check_users -w 5 -c 10
```
### Network Services
**SSH:**
```yaml
- name: check_ssh
command: /usr/lib/nagios/plugins/check_ssh localhost
```
**HTTP:**
```yaml
- name: check_http_local
command: /usr/lib/nagios/plugins/check_http -H localhost
- name: check_http_ssl
command: /usr/lib/nagios/plugins/check_http -H example.com --ssl
```
**DNS:**
```yaml
- name: check_dns
command: /usr/lib/nagios/plugins/check_dns -H google.com
```
**Ping:**
```yaml
- name: check_ping_gateway
command: /usr/lib/nagios/plugins/check_ping -H 192.168.1.1 -w 100,20% -c 500,60%
```
### Databases
**MySQL:**
```yaml
- name: check_mysql
command: /usr/lib/nagios/plugins/check_mysql -H localhost -u user -p password
```
**PostgreSQL:**
```yaml
- name: check_pgsql
command: /usr/lib/nagios/plugins/check_pgsql -H localhost -d database
```
## Writing Custom Nagios Plugins
You can write your own Nagios-compatible plugins in any language. Here's a simple example:
**Bash:**
```bash
#!/bin/bash
# /usr/local/bin/check_example.sh
# Get the value to check
value=$(some_command)
# Define thresholds
warn=80
crit=90
# Check and output result
if [ $value -ge $crit ]; then
echo "CRITICAL - Value is $value | value=${value};${warn};${crit};0;100"
exit 2
elif [ $value -ge $warn ]; then
echo "WARNING - Value is $value | value=${value};${warn};${crit};0;100"
exit 1
else
echo "OK - Value is $value | value=${value};${warn};${crit};0;100"
exit 0
fi
```
**Python:**
```python
#!/usr/bin/env python3
# /usr/local/bin/check_example.py
import sys
def check_something():
value = get_value() # Your check logic here
warn = 80
crit = 90
perfdata = f"value={value};{warn};{crit};0;100"
if value >= crit:
print(f"CRITICAL - Value is {value} | {perfdata}")
sys.exit(2)
elif value >= warn:
print(f"WARNING - Value is {value} | {perfdata}")
sys.exit(1)
else:
print(f"OK - Value is {value} | {perfdata}")
sys.exit(0)
if __name__ == "__main__":
check_something()
```
Then configure in Heartbeat:
```yaml
nagios_runner:
commands:
- name: my_custom_check
command: /usr/local/bin/check_example.sh
```
## Troubleshooting
### Plugin not found
```
Error: Command not found
```
**Solution:** Use the full path to the plugin. Common locations:
- `/usr/lib/nagios/plugins/`
- `/usr/lib64/nagios/plugins/`
- `/usr/local/nagios/libexec/`
### Permission denied
```
Error: Permission denied
```
**Solution:** Ensure the plugin is executable:
```bash
chmod +x /path/to/plugin
```
### Timeout errors
```
Command timed out after 30s
```
**Solution:** Increase the timeout in config:
```yaml
nagios_runner:
timeout: 60 # Increase timeout
```
### No performance data
If performance data is not being parsed:
1. Check plugin output includes `|` separator
2. Verify performance data format: `'label'=value[UOM];...`
3. Enable debug logging: `hbc -v -x localhost`
## Benefits
1. **Massive Plugin Library:** Thousands of existing Nagios plugins available
2. **No Rewriting:** Use plugins as-is without modification
3. **Community Support:** Well-documented and maintained plugins
4. **Flexibility:** Mix Nagios plugins with native Heartbeat plugins
5. **Standard Interface:** Consistent exit codes and output format
6. **Performance Data:** Automatic extraction of metrics
## Resources
- [Nagios Plugin Development Guidelines](https://nagios-plugins.org/doc/guidelines.html)
- [Monitoring Plugins Project](https://www.monitoring-plugins.org/)
- [Nagios Exchange](https://exchange.nagios.org/) - Plugin repository
- [Check_MK Local Checks](https://docs.checkmk.com/latest/en/localchecks.html) - Compatible format
## Next Steps
- Configure threshold alerts based on Nagios plugin status codes
- View plugin data in the Heartbeat web UI
- Create custom plugins for your specific monitoring needs
- Integrate with existing Nagios/Icinga configurations
+533
View File
@@ -0,0 +1,533 @@
# Notification System
## Overview
The Heartbeat Monitoring System includes a flexible notification system that can send alerts through multiple channels including Email, Pushover, Signal, and Mattermost. The system supports centralized channel definitions with per-host routing, allowing fine-grained control over notification delivery.
## Architecture
### Components
1. **Notification Channels** (`notification_channels` in config)
- Centralized definitions of notification providers
- Each channel has a type and type-specific credentials
- Reusable across multiple hosts
2. **Channel Dispatcher** (`hbd/server/notify.py`)
- `pushmsg_for_host(hostname, message)`: Main entry point for host-specific notifications
- `_dispatch_to_channel(channel_name, channel_config, message)`: Routes to specific provider
- Provider functions: `pushover()`, `pushsignal()`, `pushmattermost()`, `send_email()`
3. **Configuration Utilities** (`hbd/server/config.py`)
- `get_notification_channels_for_host(config, hostname)`: Retrieves channel names for a host
- `get_notification_channels_config(config, hostname)`: Retrieves full channel configurations
- `get_channel_config(config, channel_name)`: Gets configuration for a specific channel
4. **Integration Points**
- **Threshold alerts**: `threshold.py` calls `notify_mod.pushmsg_for_host()`
- **Heartbeat events**: `udp.py` calls `notify_mod.pushmsg_for_host()` for boot/shutdown/overdue
- **Custom alerts**: Any code can call `notify_mod.pushmsg_for_host(hostname, message)`
## Configuration
### Centralized Channel Definitions
Define notification channels once in your configuration file:
```yaml
notification_channels:
# Signal notifications
signal_ops:
type: signal
cli_path: /usr/local/bin/signal-cli
user: +1234567890 # Your Signal number
recipient: +1234567890 # Recipient number
signal_oncall:
type: signal
cli_path: /usr/local/bin/signal-cli
user: +1234567890
recipient: +0987654321 # Different recipient
# Email notifications
email_ops:
type: email
recipients:
- ops@example.com
- alerts@example.com
sender: heartbeat@example.com
smtp_server: smtp.example.com
smtp_port: 587
smtp_user: heartbeat@example.com
smtp_password: your-smtp-password
email_devteam:
type: email
recipients: [dev-alerts@example.com]
sender: heartbeat-dev@example.com
smtp_server: smtp.example.com
smtp_port: 587
smtp_user: heartbeat-dev@example.com
smtp_password: your-smtp-password
# Pushover notifications
pushover_urgent:
type: pushover
token: your-pushover-app-token
user: your-pushover-user-key
pushover_normal:
type: pushover
token: your-pushover-app-token
user: another-user-key
# Mattermost notifications
mattermost_devops:
type: mattermost
host: mattermost.example.com
token: your-webhook-token
channel: devops-alerts
username: heartbeat-bot
icon: https://example.com/heartbeat-icon.png
```
### Default Notification Channels
Specify default channels for hosts that don't have specific channel assignments:
```yaml
default_notification_channels:
- email_ops
- mattermost_devops
```
Hosts without `notification_channels` defined will use these defaults.
### Per-Host Channel Assignment
Assign specific channels to each host in the `hosts` section:
```yaml
hosts:
# Critical production web server - multiple channels for redundancy
prod-web-01:
threshold_config: high_sensitivity
watch: true
notification_channels:
- signal_oncall # Immediate mobile notification
- pushover_urgent # Secondary mobile notification
- email_ops # Email for record keeping
dyndns: false
# Database server - ops team notifications only
prod-db-01:
threshold_config: database
watch: true
notification_channels:
- signal_ops
- email_ops
dyndns: false
# Development server - email only, no urgent notifications
dev-server-01:
threshold_config: low_sensitivity
watch: false
notification_channels:
- email_devteam
dyndns: false
# Test server - uses default_notification_channels
test-server-01:
threshold_config: default
watch: false
dyndns: false
# No notification_channels specified = uses default_notification_channels
```
## Channel Types
### Email
Sends notifications via SMTP.
**Configuration fields:**
```yaml
type: email
recipients: [email1@example.com, email2@example.com] # Required: List of recipients
sender: heartbeat@example.com # Required: From address
smtp_server: smtp.example.com # Required: SMTP server hostname
smtp_port: 587 # Optional: Default 587
smtp_user: heartbeat@example.com # Optional: For authenticated SMTP
smtp_password: your-password # Optional: For authenticated SMTP
```
**Features:**
- Supports multiple recipients
- TLS/STARTTLS support on port 587
- Authenticated and unauthenticated SMTP
**Example:**
```yaml
notification_channels:
email_critical:
type: email
recipients: [admin@example.com, oncall@example.com]
sender: alerts@example.com
smtp_server: smtp.fastmail.com
smtp_port: 587
smtp_user: alerts@example.com
smtp_password: app-specific-password
```
### Pushover
Sends push notifications to mobile devices via Pushover API.
**Configuration fields:**
```yaml
type: pushover
token: your-application-token # Required: Your Pushover app token
user: your-user-key # Required: Recipient's user key
```
**Features:**
- Instant mobile push notifications
- Works on iOS and Android
- Supports delivery confirmations
**Setup:**
1. Create a Pushover account at https://pushover.net
2. Create an application to get your app token
3. Note your user key from your account dashboard
**Example:**
```yaml
notification_channels:
pushover_admin:
type: pushover
token: azGDORePK8gMaC0QOYAMyEEuzJnyUi
user: uQiRzpo4DXghDmr9QzzfQu27cmVRsG
```
### Signal
Sends notifications via Signal messenger using signal-cli.
**Configuration fields:**
```yaml
type: signal
cli_path: /usr/local/bin/signal-cli # Optional: Path to signal-cli binary
user: +1234567890 # Required: Your Signal phone number
recipient: +0987654321 # Required: Recipient phone number
```
**Prerequisites:**
1. Install signal-cli: https://github.com/AsamK/signal-cli
2. Register signal-cli with your phone number:
```bash
signal-cli -u +1234567890 register
signal-cli -u +1234567890 verify CODE
```
3. Ensure signal-cli is in PATH or specify full path in config
**Features:**
- End-to-end encrypted messaging
- Works without phone being online
- No API fees or rate limits
**Example:**
```yaml
notification_channels:
signal_admin:
type: signal
cli_path: /usr/local/bin/signal-cli
user: +12025551234
recipient: +12025559999
```
### Mattermost
Sends notifications to Mattermost team chat via incoming webhooks.
**Configuration fields:**
```yaml
type: mattermost
host: mattermost.example.com # Required: Mattermost server hostname
token: your-webhook-token # Required: Incoming webhook token
channel: channel-name # Required: Target channel name
username: heartbeat-bot # Optional: Bot display name
icon: https://example.com/icon.png # Optional: Bot icon URL
```
**Prerequisites:**
1. Enable incoming webhooks in Mattermost
2. Create an incoming webhook for your team
3. Note the webhook token from the webhook URL
**Features:**
- Team-wide visibility
- Rich formatting support
- Message threading
**Example:**
```yaml
notification_channels:
mattermost_ops:
type: mattermost
host: chat.example.com
token: abc123def456ghi789
channel: infrastructure-alerts
username: heartbeat-monitor
icon: https://example.com/heartbeat-icon.png
```
## Notification Events
The system sends notifications for various events:
### Threshold Alerts
When monitored metrics exceed configured thresholds:
- **State changes**: OK → WARNING, WARNING → CRITICAL, CRITICAL → OK
- **Format**: `{LEVEL}: {hostname} - {metric_path} = {value} {threshold_info}`
- **Example**: `CRITICAL: prod-web-01 - cpu_monitor.cpu_percent = 95.2 (threshold: > 90.0)`
- **Re-notifications**: Periodic reminders for ongoing alerts (default: hourly)
### Heartbeat Events
Host lifecycle events:
- **Host boot**: `{hostname} booted`
- **Host shutdown**: `{hostname} {connection_type} shutdown`
- **Host recovery**: `{hostname} {connection_type} is back`
- **Connection issues**: `{hostname} {message}`
- **Host overdue**: `{hostname} {connection_type} overdue`
Only hosts with `watch: true` send heartbeat event notifications.
### Custom Alerts
Application code can send custom notifications:
```python
from hbd.server import notify as notify_mod
# Send to host-specific channels
notify_mod.pushmsg_for_host("prod-web-01", "Custom alert message")
# Send using global config
notify_mod.pushmsg_from_config("Global notification")
# Send to specific config
notify_mod.pushmsg(custom_config_dict, "Targeted notification")
```
## Design Principles
The notification system follows these core principles:
- **Centralization**: Define notification providers once, reference them by name
- **Flexibility**: Each host can use different channels for different notification needs
- **Redundancy**: Critical hosts can specify multiple channels for failover
- **Clarity**: Clean separation between channel definition and channel assignment
- **Type Safety**: Provider-specific validation at configuration time
## Best Practices
### Channel Organization
- **Create purpose-specific channels**: `email_ops`, `signal_oncall`, `pushover_urgent`
- **Separate by team/role**: `email_devteam`, `signal_dbateam`, `mattermost_security`
- **Use descriptive names**: Channel names appear in logs and debugging
### Redundancy
For critical hosts, use multiple notification channels:
```yaml
hosts:
critical-db:
notification_channels:
- signal_oncall # Primary: Mobile alert
- pushover_urgent # Backup: Different mobile platform
- email_ops # Tertiary: Email for record-keeping
```
### Notification Fatigue Prevention
- **Use `watch: false`** for non-critical hosts
- **Configure appropriate thresholds** to avoid false positives
- **Set different channels for different severities**
- **Use `default_notification_channels`** for baseline, add more for critical systems
### Security
- **Protect credentials**: Use file permissions to protect config files with passwords/tokens
- **Rotate tokens**: Periodically rotate API tokens and passwords
- **Use app-specific passwords**: For email, use app-specific passwords instead of main account password
- **Separate accounts**: Consider separate notification accounts for different environments (prod vs dev)
### Testing
Test notification channels before relying on them:
```bash
# Test signal-cli directly
signal-cli -u +1234567890 send -m "Test message" +0987654321
# Test SMTP
echo "Test" | mail -s "Test Subject" admin@example.com
# Test through heartbeat system (Python REPL)
from hbd.server import notify as notify_mod, config as config_mod
cfg = config_mod.load_config(".hb.yaml")
notify_mod.setup(cfg)
notify_mod.pushmsg_for_host("test-host", "Test notification")
```
## Troubleshooting
### Notifications Not Sending
1. **Check logs**: Look for "Failed to send notification" errors
2. **Verify host is watched**: Ensure `watch: true` in host definition
3. **Check channel configuration**: Verify credentials and settings
4. **Test channel directly**: Use command-line tools to test provider
5. **Check network**: Ensure server can reach notification endpoints
### Signal Issues
- **signal-cli not found**: Specify full path in `cli_path`
- **Not registered**: Run `signal-cli -u +NUMBER register` and verify
- **Trust issues**: Run `signal-cli -u +NUMBER receive` to sync trust store
- **Recipient not found**: Ensure recipient is in your Signal contacts
### Email Issues
- **Authentication failed**: Check SMTP username/password
- **TLS errors**: Verify SMTP port (587 for STARTTLS, 465 for SSL)
- **Relay denied**: Ensure SMTP server allows relay from your IP
- **Timeout**: Check firewall rules for SMTP ports
### Pushover Issues
- **Invalid token/user**: Verify token and user key from Pushover dashboard
- **API rate limits**: Pushover has monthly message limits on free tier
- **HTTP errors**: Check Pushover API status page
### Mattermost Issues
- **Webhook not found**: Verify webhook token and ensure webhook is enabled
- **Channel not found**: Check channel name spelling and permissions
- **Driver import error**: Install mattermostdriver: `pip install mattermostdriver`
## API Reference
### Main Functions
#### `pushmsg_for_host(hostname: str, msg: str, debug: int = 0) -> dict`
Send notification to host-specific channels.
**Parameters:**
- `hostname`: Name of the host (used to look up notification channels)
- `msg`: Message to send
- `debug`: Debug level (0=no debug, 1+=debug output)
**Returns:** Dictionary of results per channel: `{"signal_ops": True, "email_ops": False}`
**Example:**
```python
from hbd.server import notify as notify_mod
notify_mod.pushmsg_for_host("prod-web-01", "Server CPU at 95%")
```
**Behavior:**
1. Looks up notification channels configured for the host
2. If no host-specific channels, uses `default_notification_channels`
3. Dispatches to each channel in parallel
4. Returns dict of results keyed by channel name
5. Logs success/failure for each channel
## Examples
### Complete Configuration Example
```yaml
# Notification channel definitions
notification_channels:
signal_oncall:
type: signal
cli_path: /usr/local/bin/signal-cli
user: +12025551234
recipient: +12025555678
email_ops:
type: email
recipients: [ops@example.com, alerts@example.com]
sender: heartbeat@example.com
smtp_server: smtp.fastmail.com
smtp_port: 587
smtp_user: heartbeat@example.com
smtp_password: app-password-here
# Default channels
default_notification_channels: [email_ops]
# Host definitions with channel assignments
hosts:
prod-web-01:
threshold_config: high_sensitivity
watch: true
notification_channels: [signal_oncall, email_ops]
dyndns: false
dev-server-01:
threshold_config: low_sensitivity
watch: false
notification_channels: [email_ops]
dyndns: false
```
### Multiple Environments Example
```yaml
notification_channels:
# Production channels
signal_prod_oncall:
type: signal
user: +12025551234
recipient: +12025551111 # On-call phone
email_prod_ops:
type: email
recipients: [prod-ops@example.com]
sender: prod-heartbeat@example.com
smtp_server: smtp.example.com
# Staging channels
email_staging:
type: email
recipients: [staging-alerts@example.com]
sender: staging-heartbeat@example.com
smtp_server: smtp.example.com
# Development channels
mattermost_dev:
type: mattermost
host: chat.example.com
token: dev-webhook-token
channel: dev-alerts
hosts:
prod-api-01:
notification_channels: [signal_prod_oncall, email_prod_ops]
staging-api-01:
notification_channels: [email_staging]
dev-api-01:
notification_channels: [mattermost_dev]
```
+544
View File
@@ -0,0 +1,544 @@
# Plugin Development Guide
This guide explains how to create custom plugins for the Heartbeat monitoring system.
## Table of Contents
- [Plugin Architecture](#plugin-architecture)
- [Plugin Types](#plugin-types)
- [Creating a Plugin](#creating-a-plugin)
- [Plugin Lifecycle](#plugin-lifecycle)
- [Configuration](#configuration)
- [Best Practices](#best-practices)
- [Examples](#examples)
- [Testing](#testing)
## Plugin Architecture
Heartbeat's plugin system is designed to be simple yet powerful. Plugins are Python classes that inherit from one of the base plugin types and implement a few key methods.
### Key Concepts
- **Plugin Registry**: Central registry that manages all loaded plugins
- **Plugin Loader**: Automatically discovers and loads plugins from the `hbd/plugins/` directory
- **Plugin Types**: InfoPlugin (static data) and MonitorPlugin (periodic metrics)
- **Async/Await**: All plugin methods are async for non-blocking operation
## Plugin Types
### InfoPlugin
InfoPlugins collect static information that doesn't change frequently (OS version, hardware specs, etc.).
- **Runs once** at startup (interval = 0)
- **Cached** - data is collected once and reused
- **Lightweight** - no periodic overhead
**Use InfoPlugin for:**
- Operating system details
- Hardware information
- Software versions
- Configuration data
- Static inventory
### MonitorPlugin
MonitorPlugins collect metrics that change over time (CPU usage, memory, network traffic).
- **Runs periodically** based on configured interval
- **Scheduled** - collected at regular intervals
- **Dynamic** - captures changing system state
**Use MonitorPlugin for:**
- Resource usage (CPU, memory, disk, network)
- Performance metrics
- Counters and gauges
- Time-series data
## Creating a Plugin
### Step 1: Choose Plugin Type
Decide whether your plugin collects static information (InfoPlugin) or dynamic metrics (MonitorPlugin).
### Step 2: Create Plugin File
Create a new Python file in `hbd/plugins/` directory:
```python
"""
My awesome plugin for Heartbeat.
Brief description of what this plugin does.
"""
import logging
from typing import Dict, Any, Optional
# Import psutil or other dependencies if needed
try:
import psutil
except ImportError:
psutil = None
from hbd.plugin import MonitorPlugin # or InfoPlugin
logger = logging.getLogger(__name__)
class MyAwesomePlugin(MonitorPlugin): # or InfoPlugin
"""
One-line description of the plugin.
Collects:
- List of metrics/data collected
- Another metric
Configuration:
interval: Collection interval in seconds (default: 60)
option1: Description of option1 (default: value)
option2: Description of option2 (default: value)
"""
name = "my_awesome_plugin" # Unique plugin name
interval = 60 # For MonitorPlugin, use 0 for InfoPlugin
def __init__(self, config: Optional[Dict[str, Any]] = None):
"""Initialize the plugin with optional configuration."""
super().__init__(config)
# Extract configuration options
self.option1 = self.config.get('option1', 'default_value')
self.option2 = self.config.get('option2', True)
# Check dependencies
if psutil is None:
raise ImportError("psutil is required for my_awesome_plugin")
async def initialize(self):
"""
Initialize the plugin.
This is called once when the plugin is loaded.
Use this to verify dependencies, establish connections, etc.
Returns:
True if initialization successful, False otherwise
"""
logger.info(f"My awesome plugin initialized (option1: {self.option1})")
return True
async def collect(self) -> Dict[str, Any]:
"""
Collect data.
This is called periodically (MonitorPlugin) or once (InfoPlugin).
Returns:
Dictionary of collected data (will be sent to server)
"""
try:
data = await self._collect_metrics()
logger.debug(f"Collected {len(data)} metrics")
return data
except Exception as e:
logger.error(f"Error collecting data: {e}")
return {"error": str(e)}
async def _collect_metrics(self) -> Dict[str, Any]:
"""Internal method to collect actual metrics."""
metrics = {}
# Collect your data here
metrics['metric1'] = self._get_metric1()
metrics['metric2'] = self._get_metric2()
return metrics
def _get_metric1(self):
"""Helper method for metric collection."""
# Implementation here
return 42
def _get_metric2(self):
"""Helper method for metric collection."""
# Implementation here
return "hello"
async def cleanup(self):
"""
Cleanup resources.
This is called when the plugin is unloaded or the client shuts down.
Use this to close connections, release resources, etc.
"""
logger.info("My awesome plugin cleanup")
# Plugin instance for automatic discovery
plugin = MyAwesomePlugin
```
### Step 3: Test Your Plugin
Create a test script to verify your plugin works:
```python
#!/usr/bin/env python3
import asyncio
import sys
from pathlib import Path
# Add parent directory to path
sys.path.insert(0, str(Path(__file__).parent))
from hbd.plugins.my_awesome_plugin import MyAwesomePlugin
async def test():
# Create plugin instance
plugin = MyAwesomePlugin({'option1': 'test_value'})
# Initialize
if not await plugin.initialize():
print("Failed to initialize")
return False
# Collect data
data = await plugin.collect()
print(f"Collected data: {data}")
# Cleanup
await plugin.cleanup()
return True
if __name__ == '__main__':
success = asyncio.run(test())
sys.exit(0 if success else 1)
```
## Plugin Lifecycle
Understanding the plugin lifecycle helps you implement plugins correctly:
```
1. Plugin Discovery
└─> Loader scans hbd/plugins/ directory
└─> Finds Python files (except those starting with _)
└─> Imports modules
2. Plugin Instantiation
└─> Creates instance with configuration
└─> __init__() is called
3. Plugin Initialization
└─> initialize() is called
└─> Plugin verifies dependencies, establishes connections
└─> Returns True/False for success/failure
4. Plugin Registration
└─> If initialization succeeds, plugin is registered
└─> Plugin becomes active
5. Data Collection
└─> For InfoPlugin: collect() called once after initialization
└─> For MonitorPlugin: collect() called periodically based on interval
└─> Data is sent to server via PLG message
6. Plugin Shutdown
└─> cleanup() is called
└─> Plugin releases resources, closes connections
```
## Configuration
### Plugin-Specific Configuration
Plugins receive configuration through the `config` parameter in `__init__`:
```python
def __init__(self, config: Optional[Dict[str, Any]] = None):
super().__init__(config)
# Access configuration with defaults
self.interval = self.config.get('interval', 60)
self.threshold = self.config.get('threshold', 80)
self.enabled_features = self.config.get('features', ['feature1', 'feature2'])
```
### Client Configuration File
Users configure plugins in the client configuration YAML:
```yaml
plugins:
my_awesome_plugin:
enabled: true
interval: 120
option1: custom_value
option2: false
```
## Best Practices
### 1. Error Handling
Always handle errors gracefully:
```python
async def collect(self) -> Dict[str, Any]:
try:
return await self._collect_metrics()
except Exception as e:
logger.error(f"Error collecting metrics: {e}")
return {"error": str(e)}
```
### 2. Logging
Use appropriate log levels:
```python
logger.debug("Detailed information for debugging")
logger.info("Normal operation messages")
logger.warning("Warning messages for unusual but handled situations")
logger.error("Error messages for failures")
```
### 3. Dependencies
Check for optional dependencies:
```python
try:
import some_optional_library
except ImportError:
some_optional_library = None
# Later in __init__:
if some_optional_library is None:
raise ImportError("some_optional_library is required")
```
### 4. Performance
- Keep collection methods fast (< 1 second)
- Use async/await for I/O operations
- Cache expensive computations
- Don't block the event loop
### 5. Data Structure
Return clean, structured data:
```python
{
'metric_name': value,
'nested_data': {
'sub_metric': value
},
'list_data': [item1, item2],
'timestamp': time.time() # Optional timestamp
}
```
### 6. Documentation
Document your plugin thoroughly:
- Class docstring with description and configuration
- Method docstrings explaining purpose and return values
- Inline comments for complex logic
## Examples
### Example 1: Simple InfoPlugin
```python
from hbd.plugin import InfoPlugin
import platform
class SimpleInfoPlugin(InfoPlugin):
"""Collect basic system information."""
name = "simple_info"
interval = 0 # InfoPlugin
async def initialize(self):
return True
async def collect(self) -> Dict[str, Any]:
return {
'hostname': platform.node(),
'system': platform.system(),
'python_version': platform.python_version()
}
async def cleanup(self):
pass
plugin = SimpleInfoPlugin
```
### Example 2: MonitorPlugin with State
```python
from hbd.plugin import MonitorPlugin
import time
class CounterPlugin(MonitorPlugin):
"""Track a counter over time."""
name = "counter"
interval = 30
def __init__(self, config=None):
super().__init__(config)
self._counter = 0
self._start_time = time.time()
async def initialize(self):
return True
async def collect(self) -> Dict[str, Any]:
self._counter += 1
uptime = time.time() - self._start_time
return {
'count': self._counter,
'uptime': uptime,
'rate': self._counter / uptime
}
async def cleanup(self):
pass
plugin = CounterPlugin
```
### Example 3: Plugin with External Command
```python
from hbd.plugin import MonitorPlugin
import asyncio
class CommandPlugin(MonitorPlugin):
"""Execute external command and capture output."""
name = "command_executor"
interval = 60
def __init__(self, config=None):
super().__init__(config)
self.command = self.config.get('command', 'echo "no command"')
async def initialize(self):
return True
async def collect(self) -> Dict[str, Any]:
try:
process = await asyncio.create_subprocess_shell(
self.command,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE
)
stdout, stderr = await asyncio.wait_for(
process.communicate(),
timeout=30
)
return {
'exit_code': process.returncode,
'stdout': stdout.decode('utf-8'),
'stderr': stderr.decode('utf-8')
}
except Exception as e:
return {'error': str(e)}
async def cleanup(self):
pass
plugin = CommandPlugin
```
## Testing
### Unit Testing
Create unit tests for your plugins:
```python
import unittest
import asyncio
class TestMyPlugin(unittest.TestCase):
def setUp(self):
self.plugin = MyAwesomePlugin({'option1': 'test'})
def test_initialization(self):
result = asyncio.run(self.plugin.initialize())
self.assertTrue(result)
def test_collection(self):
asyncio.run(self.plugin.initialize())
data = asyncio.run(self.plugin.collect())
self.assertIsInstance(data, dict)
self.assertIn('metric1', data)
self.assertGreater(data['metric1'], 0)
def tearDown(self):
asyncio.run(self.plugin.cleanup())
if __name__ == '__main__':
unittest.main()
```
### Integration Testing
Test your plugin with the actual client:
```bash
# Create test configuration
cat > test_config.yaml <<EOF
server: localhost
plugins:
my_awesome_plugin:
enabled: true
interval: 10
option1: test_value
EOF
# Run client in test mode
python -m hbd.hbc -c test_config.yaml --verbose
```
## Troubleshooting
### My plugin isn't loading
1. Check filename doesn't start with underscore
2. Verify plugin class inherits from InfoPlugin or MonitorPlugin
3. Check `initialize()` returns True
4. Look for import errors in logs
### Plugin loads but doesn't collect data
1. Check `interval` is set correctly (0 for InfoPlugin, > 0 for MonitorPlugin)
2. Verify `collect()` returns a dictionary
3. Check for exceptions in `collect()` method
4. Enable DEBUG logging to see detailed errors
### Data isn't appearing on server
1. Verify client is connected to server
2. Check server logs for PLG message handling
3. Verify returned data is JSON-serializable
4. Check for large data sizes (may exceed UDP packet size)
## Further Reading
- [Plugin Framework Source](../hbd/plugin.py) - Core plugin implementation
- [Built-in Plugins](../hbd/plugins/) - Examples of working plugins
- [Nagios Integration](NAGIOS_INTEGRATION.md) - Running external plugins
- [Configuration Guide](../hbd/config_example.yaml) - Full configuration reference
File diff suppressed because it is too large Load Diff
+9
View File
@@ -0,0 +1,9 @@
Plan
Heartbeat is a client/server based network monitor and host observer. hbd, the server portion receives heartbeat and state messages from clients and maintaines state and hisgtory of the informations it receives.
hbc, the client portion gathers information on various aspects of the
system it is running on, and sends it to hbd. Initially this info is basic, like OS make and version, hardware info (CPU type, memory and disks), fileystem info and some resource info. hbc/hbd support a plugin system to extend the info gathered and stored.
hbd also can send notification based on missed hbc updates, and on violation of pre-set limits for various state paramaters.
+14 -8
View File
@@ -1,11 +1,17 @@
"""hbd package - scaffolding for heartbeat daemon
"""hbd package - heartbeat monitoring system
This package contains the refactored modules for the original monolithic
`hbd` script. The initial implementation contains small scaffolds so you can
start moving functionality into the package.
This package contains both the heartbeat client (hbc) and server (hbd) components,
organized into separate subpackages:
- hbd.client: Client component with system monitoring plugins
- hbd.server: Server/daemon component with web UI and notifications
- hbd.common: Shared utilities and protocol definitions
Install options:
- pip install hbd[client] # Client only
- pip install hbd[server] # Server only
- pip install hbd[all] # Both client and server
"""
__all__ = ["main", "__version__"]
__version__ = "5.0.4"
from .cli import main
__all__ = ["__version__"]
__version__ = "5.0.7"
+3
View File
@@ -0,0 +1,3 @@
"""HeartBeat Client (hbc) - System monitoring client."""
__version__ = "5.0.5"
+54
View File
@@ -0,0 +1,54 @@
"""Configuration loader and defaults for hbc (HeartBeat Client)."""
import logging
import os
try:
import yaml
except Exception:
yaml = None
CLIENT_DEFAULTS = {
# Network settings
"hb_port": 50003, # Port where hbd servers listen
"interval": 10, # Heartbeat interval in seconds
# Runtime flags
"foreground": False,
"verbose": False,
"debug": 0,
# Plugin configuration
"plugins": {}, # Per-plugin configuration
"thresholds": {}, # Threshold configuration for monitoring
}
def load_config(path=None):
"""Load configuration from a YAML file and merge with client defaults.
If YAML is not available or the file does not exist, defaults are returned.
Args:
path: Path to YAML config file (default: ~/.hb.yaml)
Returns:
Dictionary with configuration
"""
cfg = CLIENT_DEFAULTS.copy()
if not path:
# default path (~/.hb.yaml)
path = os.path.join(os.path.expanduser("~"), ".hb.yaml")
if os.path.exists(path):
if yaml:
with open(path) as fh:
data = yaml.safe_load(fh)
# Merge YAML data with defaults
# Keep all keys from YAML to support plugin configs and future extensions
for k, v in data.items():
cfg[k] = v
else:
# yaml not installed: do not attempt to parse; user must ensure defaults
pass
return cfg
+696
View File
@@ -0,0 +1,696 @@
#!/usr/bin/env python3
"""
HeartBeat Client (hbc) - Async version with plugin support.
Sends heartbeat messages to HeartBeat Daemon (hbd) servers and collects
system information via plugins.
"""
import argparse
import asyncio
import logging
import os
import signal
import socket
import sys
import time
from hashlib import md5
from pathlib import Path
from typing import Dict, List, Optional
# Import protocol and config
from .config import load_config
from ..common.proto import dicttos, stodict
# Import plugin system
from .plugin import PluginRegistry, PluginLoader, InfoPlugin, MonitorPlugin
# Constants
PORT = 50003
INTERVAL = 10
MAXRECV = 32767
# Global state
running = True
dorestart = False
shutdown_event: Optional[asyncio.Event] = None
active_tasks: List[asyncio.Task] = []
class AsyncConnection:
"""Async UDP connection to a heartbeat server."""
def __init__(self, conn_id: int, addr: str, port: int, af: int, name: str):
self.conn_id = conn_id
self.addr = addr
self.port = port
self.af = af
self.name = name
self.ackcount = 0
self.lastack = 0.0
self.send_count = 0
self.lastsend = 0.0
self.rtts = [0.0]
self.transport: Optional[asyncio.DatagramTransport] = None
self.protocol: Optional[asyncio.DatagramProtocol] = None
self.logger = logging.getLogger(f"hbc.conn.{addr}")
async def open(self) -> bool:
"""Open the UDP connection.
Returns:
True if successful, False otherwise
"""
try:
loop = asyncio.get_event_loop()
# Create datagram endpoint
self.transport, self.protocol = await loop.create_datagram_endpoint(
lambda: HeartbeatProtocol(self),
family=self.af
)
self.logger.debug(f"Opened connection to {self.addr}:{self.port}")
return True
except Exception as e:
self.logger.error(f"Failed to open connection: {e}")
return False
def close(self):
"""Close the connection."""
if self.transport:
self.transport.close()
self.transport = None
self.protocol = None
async def sendto(self, msg: dict, msg_id: str = "HTB"):
"""Send a message to the server.
Args:
msg: Message dictionary
msg_id: Message ID (HTB, PLG, etc.)
"""
if not self.transport:
await self.open()
if not self.transport:
self.logger.error("Cannot send - no transport")
return
# Add standard fields
msg["name"] = shortname(self.name)
msg["id"] = self.conn_id
msg["time"] = time.time()
# Encode message
data = dicttos(msg_id, msg)
# Send
self.transport.sendto(data, (self.addr, self.port))
self.send_count += 1
self.lastsend = time.time()
self.logger.debug(f"Sent {msg_id} message ({len(data)} bytes)")
def handle_ack(self, msg: dict, now: float):
"""Handle ACK message from server.
RTT is calculated as: (time ACK received) - (time HTB sent)
"""
self.lastack = now
# Calculate RTT: time ACK received minus time HTB sent
rtt = (now - self.lastsend) * 1000.0 # Convert to ms
self.rtts.append(rtt)
if len(self.rtts) > 10:
self.rtts.pop(0)
self.ackcount += 1
self.logger.debug(f"ACK received, RTT: {rtt:.1f}ms")
class HeartbeatProtocol(asyncio.DatagramProtocol):
"""Protocol handler for incoming UDP messages."""
def __init__(self, connection: AsyncConnection):
self.connection = connection
self.logger = logging.getLogger("hbc.protocol")
def datagram_received(self, data: bytes, addr):
"""Handle incoming datagram."""
try:
msg = stodict(data)
if not msg:
self.logger.warning(f"Failed to parse message from {addr}")
return
now = time.time()
msg_id = msg.get("ID")
if msg_id == "ACK":
self.connection.handle_ack(msg, now)
elif msg_id == "CMD":
# Command from server
asyncio.create_task(handle_command(self.connection, msg))
elif msg_id == "UPD":
# Update from server
asyncio.create_task(handle_update(self.connection, msg))
else:
self.logger.warning(f"Unknown message type: {msg_id}")
except Exception as e:
self.logger.error(f"Error processing datagram: {e}", exc_info=True)
def error_received(self, exc):
"""Handle protocol errors."""
self.logger.error(f"Protocol error: {exc}")
async def handle_command(conn: AsyncConnection, msg: dict):
"""Execute a command received from server."""
import subprocess
cmd = msg.get("cmd", "")
if not cmd:
return
logger = logging.getLogger("hbc.command")
logger.info(f"Executing command: {cmd}")
try:
result = subprocess.check_output(
cmd, shell=True, stderr=subprocess.STDOUT, timeout=30
).decode()
status = "OK"
except subprocess.CalledProcessError as e:
result = str(e)
status = "CalledProcessError"
except subprocess.TimeoutExpired:
result = "Command timed out"
status = "Timeout"
except Exception as e:
result = str(e)
status = "Error"
# Send response
response = {
"service": "command",
"msg": f"{status} {result}"
}
await conn.sendto(response)
async def handle_update(conn: AsyncConnection, msg: dict):
"""Handle self-update from server."""
import codecs
import shutil
logger = logging.getLogger("hbc.update")
try:
code = codecs.decode(msg["code"], "base64").decode()
csum = msg["csum"]
except Exception as e:
error = f"Missing code/csum: {e}"
logger.error(error)
await conn.sendto({"service": "update", "msg": error})
return
# Verify checksum
m = md5()
m.update(code.encode())
if m.hexdigest() != csum:
error = "Checksum mismatch"
logger.error(error)
await conn.sendto({"service": "update", "msg": error})
return
# Backup current file
fn = sys.argv[0]
ofn = f"{fn}.sav"
try:
shutil.copy2(fn, ofn)
except Exception as e:
error = f"Backup failed: {e}"
logger.error(error)
await conn.sendto({"service": "update", "msg": error})
return
# Write new code
try:
with open(fn, "w") as fh:
fh.write(code)
except Exception as e:
error = f"Write failed: {e}"
logger.error(error)
await conn.sendto({"service": "update", "msg": error})
return
logger.info("Update successful, restart required")
await conn.sendto({"service": "update", "msg": "OK"})
# Trigger restart
global dorestart
dorestart = True
stop()
async def heartbeat_sender(conn: AsyncConnection, interval: int):
"""Send periodic heartbeats.
Args:
conn: Connection to send on
interval: Heartbeat interval in seconds
"""
logger = logging.getLogger("hbc.heartbeat")
while running:
try:
msg = {
"acks": conn.ackcount,
"rtt": conn.rtts[-1],
"interval": interval
}
await conn.sendto(msg, "HTB")
except Exception as e:
logger.error(f"Error sending heartbeat: {e}", exc_info=True)
except asyncio.CancelledError:
logger.debug("Heartbeat sender cancelled")
raise
# Wait for next interval or shutdown event
try:
if shutdown_event:
await asyncio.wait_for(
shutdown_event.wait(),
timeout=interval
)
break
else:
await asyncio.sleep(interval)
except asyncio.TimeoutError:
pass # Normal timeout, continue loop
except asyncio.CancelledError:
logger.debug("Heartbeat sender cancelled during sleep")
raise
async def plugin_collector(conn: AsyncConnection, registry: PluginRegistry):
"""Collect and send plugin data.
Args:
conn: Connection to send on
registry: Plugin registry
"""
logger = logging.getLogger("hbc.plugins")
# Collect InfoPlugins once at startup
info_plugins = registry.get_by_type(InfoPlugin)
for plugin in info_plugins:
try:
data = await plugin.collect()
if data:
# Create PLG message with plugin name
plugin_msg = {"plugin": plugin.name, **data}
await conn.sendto(plugin_msg, "PLG")
logger.info(f"Sent {plugin.name} data")
except Exception as e:
logger.error(f"Error collecting {plugin.name}: {e}", exc_info=True)
# Schedule MonitorPlugins
# Group plugins by interval
from collections import defaultdict
by_interval = defaultdict(list)
monitor_plugins = registry.get_by_type(MonitorPlugin)
for plugin in monitor_plugins:
by_interval[plugin.interval].append(plugin)
# Create tasks for each interval
tasks = []
for interval, plugins in by_interval.items():
task = asyncio.create_task(
plugin_collector_interval(conn, plugins, interval)
)
tasks.append(task)
# Wait for all tasks
if tasks:
try:
await asyncio.gather(*tasks, return_exceptions=True)
except asyncio.CancelledError:
logger.debug("Plugin collector cancelled, cancelling sub-tasks")
for task in tasks:
if not task.done():
task.cancel()
raise
async def plugin_collector_interval(
conn: AsyncConnection,
plugins: List,
interval: int
):
"""Collect plugins on a specific interval.
Args:
conn: Connection to send on
plugins: List of plugins to collect
interval: Collection interval in seconds
"""
logger = logging.getLogger(f"hbc.plugins.{interval}s")
while running:
for plugin in plugins:
try:
data = await plugin.collect()
if data:
# Don't use encode_plugin_data - create dict directly
plugin_msg = {"plugin": plugin.name, **data}
await conn.sendto(plugin_msg, "PLG")
logger.debug(f"Sent {plugin.name} data")
except asyncio.CancelledError:
logger.debug("Plugin collector cancelled")
raise
except Exception as e:
logger.error(
f"Error collecting {plugin.name}: {e}",
exc_info=True
)
# Wait for next interval or shutdown event
try:
if shutdown_event:
await asyncio.wait_for(
shutdown_event.wait(),
timeout=interval
)
break
else:
await asyncio.sleep(interval)
except asyncio.TimeoutError:
pass # Normal timeout, continue loop
except asyncio.CancelledError:
logger.debug("Plugin collector cancelled during sleep")
raise
def shortname(name: str) -> str:
"""Extract short hostname."""
return name.split(".")[0]
def stop():
"""Stop the event loop."""
global running
running = False
# Set shutdown event to wake up sleeping tasks
if shutdown_event:
shutdown_event.set()
# Cancel all active tasks
for task in active_tasks:
if not task.done():
task.cancel()
async def cleanup(connections: List[AsyncConnection]):
"""Cleanup connections on shutdown."""
logger = logging.getLogger("hbc.cleanup")
logger.info("Cleaning up connections")
for conn in connections:
try:
msg = {
"shutdown": 1,
"acks": conn.ackcount
}
await conn.sendto(msg)
except Exception as e:
logger.error(f"Error sending shutdown: {e}")
conn.close()
# Give messages time to send
await asyncio.sleep(0.5)
async def async_main(args, config):
"""Async main function."""
global running, shutdown_event, active_tasks
# Create shutdown event
shutdown_event = asyncio.Event()
active_tasks = []
logger = logging.getLogger("hbc.main")
# Setup
iam = socket.gethostname()
if args.name:
iam = args.name
hb_hosts = args.hosts
hb_port = config.get("hb_port", PORT)
interval = config.get("interval", INTERVAL)
logger.info(f"Starting hbc for {iam} -> {hb_hosts}")
logger.info(f"Port: {hb_port}, Interval: {interval}s")
# Create connections
connections = []
conn_id = 1
for host in hb_hosts:
try:
addrs = socket.getaddrinfo(host, hb_port, 0, 0, socket.SOL_UDP)
except socket.gaierror as e:
logger.error(f"Cannot resolve {host}: {e}")
continue
for addr_info in addrs:
af = addr_info[0]
addr = addr_info[4][0]
conn = AsyncConnection(conn_id, addr, hb_port, af, iam)
if await conn.open():
connections.append(conn)
conn_id += 1
if not connections:
logger.error("No connections established")
return 1
logger.info(f"Created {len(connections)} connections")
# Send boot/message if requested
if args.boot or args.message:
boot_msg = {}
if args.boot:
boot_msg["boot"] = 1
if args.message:
boot_msg["service"] = "service"
boot_msg["msg"] = args.message
boot_msg["acks"] = 0
for conn in connections:
await conn.sendto(boot_msg)
if args.message and not args.daemon:
# Message-only mode
await cleanup(connections)
return 0
# Load plugins
registry = PluginRegistry()
loader = PluginLoader(registry)
plugin_dir = Path(__file__).parent / "plugins"
if plugin_dir.exists():
count = await loader.load_from_directory(plugin_dir, config)
logger.info(f"Loaded {count} plugins")
else:
logger.warning(f"Plugin directory not found: {plugin_dir}")
# Setup signal handlers
loop = asyncio.get_event_loop()
for sig in (signal.SIGTERM, signal.SIGINT):
loop.add_signal_handler(sig, stop)
# Start async tasks
# Heartbeat senders (one per connection)
for conn in connections:
task = asyncio.create_task(heartbeat_sender(conn, interval))
active_tasks.append(task)
# Plugin collector (uses all connections, but we'll use first one)
if connections and registry.get_enabled():
task = asyncio.create_task(plugin_collector(connections[0], registry))
active_tasks.append(task)
# Wait for stop or tasks to complete
try:
await asyncio.gather(*active_tasks, return_exceptions=True)
except asyncio.CancelledError:
logger.info("Tasks cancelled")
# Cleanup
logger.info("Shutting down...")
await cleanup(connections)
await loader.unload_all()
return 0
def daemonize(
working_dir="/",
stdin="/dev/zero",
stdout="/dev/null",
stderr="/dev/null"
):
"""UNIX double-fork daemonization."""
try:
pid = os.fork()
if pid > 0:
os._exit(0)
except OSError as e:
sys.stderr.write(f"fork #1 failed: {e}\n")
os._exit(1)
os.chdir(working_dir)
os.setsid()
os.umask(0)
try:
pid = os.fork()
if pid > 0:
os._exit(0)
except OSError as e:
sys.stderr.write(f"fork #2 failed: {e}\n")
sys.exit(1)
sys.stdout.flush()
sys.stderr.flush()
si = open(stdin, "r")
so = open(stdout, "a+")
se = open(stderr, "a+")
os.dup2(si.fileno(), sys.stdin.fileno())
os.dup2(so.fileno(), sys.stdout.fileno())
os.dup2(se.fileno(), sys.stderr.fileno())
def build_parser():
"""Build argument parser."""
parser = argparse.ArgumentParser(
prog="hbc",
description="HeartBeatClient - send heartbeat messages to HeartBeatDaemon",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument(
"-b", "--boot",
action="store_true",
help="Send a boot message"
)
parser.add_argument(
"-c", "--config",
dest="configfile",
help="Config file path (YAML)"
)
parser.add_argument(
"-m", "--message",
dest="message",
help="Send a message"
)
parser.add_argument(
"-n", "--name",
dest="name",
help="Name to use in heartbeat message"
)
parser.add_argument(
"-d", "--daemon",
action="store_true",
help="Run in daemon mode"
)
parser.add_argument(
"-v", "--verbose",
action="store_true",
help="Verbose output"
)
parser.add_argument(
"-x", "--debug",
action="count",
default=0,
help="Increase debug level"
)
parser.add_argument(
"hosts",
nargs="+",
help="Heartbeat daemon hosts to send to"
)
return parser
def main(argv=None):
"""Main entry point."""
global running, dorestart
parser = build_parser()
args = parser.parse_args(argv)
# Load config
config = load_config(args.configfile)
# Setup logging
log_level = logging.INFO
if args.verbose:
log_level = logging.DEBUG
if args.debug:
log_level = logging.DEBUG
logging.basicConfig(
level=log_level,
format="%(asctime)s %(name)s %(levelname)s: %(message)s",
datefmt="%Y-%m-%d %H:%M:%S"
)
# Daemonize if requested
if args.daemon:
print("Daemonizing...")
import syslog
syslog.openlog("hbc", syslog.LOG_PID, syslog.LOG_DAEMON)
syslog.syslog(syslog.LOG_INFO, f"Starting heartbeat to {', '.join(args.hosts)}")
daemonize()
# Reconfigure logging for syslog
logging.basicConfig(
level=log_level,
format="hbc[%(process)d]: %(name)s %(levelname)s: %(message)s"
)
# Run async main
try:
exit_code = asyncio.run(async_main(args, config))
except KeyboardInterrupt:
logging.info("Interrupted by user")
exit_code = 0
except Exception as e:
logging.error(f"Fatal error: {e}", exc_info=True)
exit_code = 1
# Handle restart
if dorestart:
logging.info("Restarting...")
os.execv(sys.argv[0], sys.argv)
sys.exit(exit_code)
if __name__ == "__main__":
main()
+410
View File
@@ -0,0 +1,410 @@
"""Plugin system for extending Heartbeat data collection and monitoring.
This module provides the base classes and infrastructure for the plugin system
that enables extending hbc (client) data collection and hbd (server) processing.
Plugin Types:
- InfoPlugin: Collects static or rarely-changing information (OS, hardware)
- MonitorPlugin: Collects periodic monitoring data (CPU, memory, disk usage)
Plugins run on the client (hbc) to gather data, which is then sent to the server
(hbd) for storage, threshold checking, and display.
"""
import importlib.util
import inspect
import logging
import sys
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Dict, List, Optional, Type
class Plugin(ABC):
"""Base class for all plugins.
Attributes:
name: Unique plugin identifier (e.g., "os_info", "cpu_monitor")
version: Plugin version string
description: Human-readable description
interval: Collection interval in seconds (0 for InfoPlugin = collect once)
enabled: Whether plugin is active (can be disabled via config)
"""
name: str = ""
version: str = "1.0.0"
description: str = ""
interval: int = 0
enabled: bool = True
def __init__(self, config: Optional[Dict[str, Any]] = None):
"""Initialize plugin with optional configuration.
Args:
config: Plugin-specific configuration from YAML (e.g., thresholds, paths)
"""
self.config = config or {}
self.logger = logging.getLogger(f"plugin.{self.name}")
self._initialized = False
@abstractmethod
async def initialize(self) -> bool:
"""Initialize plugin (load resources, check dependencies).
Called once when plugin is loaded. Plugins should validate dependencies
(e.g., check if psutil is available) and prepare any resources.
Returns:
True if initialization succeeded, False otherwise
"""
pass
@abstractmethod
async def collect(self) -> Dict[str, Any]:
"""Collect data from the system.
This is the main method called on each collection interval. Should return
a dictionary of key-value pairs representing the collected data.
Keys should be strings (metric names). Values can be:
- Scalars: int, float, str, bool
- Lists/dicts (will be serialized appropriately)
Returns:
Dictionary of collected metrics, or empty dict on error
"""
pass
async def cleanup(self) -> None:
"""Cleanup plugin resources before shutdown.
Called when plugin is being unloaded or on system shutdown.
Override to release resources, close connections, etc.
"""
pass
def validate_data(self, data: Dict[str, Any]) -> bool:
"""Validate collected data before sending to server.
Override to implement custom validation logic.
Args:
data: Data returned from collect()
Returns:
True if data is valid, False otherwise
"""
return isinstance(data, dict)
class InfoPlugin(Plugin):
"""Plugin for collecting static or rarely-changing information.
InfoPlugins collect data that doesn't change frequently:
- OS name and version
- Hardware specifications (CPU model, RAM size)
- Network interface MAC addresses
Characteristics:
- interval = 0 (collected once at startup by default)
- Can specify interval > 0 for periodic refresh (e.g., check for hardware changes)
- Data is cached and reused until next collection
"""
interval: int = 0 # Collect once at startup
def __init__(self, config: Optional[Dict[str, Any]] = None):
super().__init__(config)
self._cached_data: Optional[Dict[str, Any]] = None
async def get_cached_data(self) -> Optional[Dict[str, Any]]:
"""Get cached data if available (avoids re-collection).
Returns:
Cached data dict, or None if not yet collected
"""
return self._cached_data
async def collect(self) -> Dict[str, Any]:
"""Collect and cache static information."""
if self._cached_data is None:
self._cached_data = await self._collect_info()
return self._cached_data
@abstractmethod
async def _collect_info(self) -> Dict[str, Any]:
"""Internal method to perform actual data collection.
Override this method instead of collect() for InfoPlugins.
"""
pass
def invalidate_cache(self) -> None:
"""Force re-collection on next collect() call."""
self._cached_data = None
class MonitorPlugin(Plugin):
"""Plugin for collecting periodic monitoring data.
MonitorPlugins collect time-series metrics that change frequently:
- CPU usage percentage
- Memory consumption
- Disk I/O statistics
- Network traffic
Characteristics:
- interval > 0 (e.g., 30 seconds for CPU, 60 for disk)
- Collected continuously on schedule
- Data includes timestamps for time-series tracking
"""
interval: int = 30 # Default: collect every 30 seconds
def __init__(self, config: Optional[Dict[str, Any]] = None):
super().__init__(config)
self._last_reading: Optional[Dict[str, Any]] = None
def get_last_reading(self) -> Optional[Dict[str, Any]]:
"""Get the last collected reading.
Returns:
Last reading dict with timestamp, or None if not yet collected
"""
return self._last_reading
async def collect(self) -> Dict[str, Any]:
"""Collect monitoring data and store as last reading."""
data = await self._collect_metrics()
if data:
# Add collection timestamp
import time
data['_timestamp'] = time.time()
self._last_reading = data
return data
@abstractmethod
async def _collect_metrics(self) -> Dict[str, Any]:
"""Internal method to perform actual metric collection.
Override this method instead of collect() for MonitorPlugins.
"""
pass
class PluginRegistry:
"""Registry for managing loaded plugins.
Maintains a collection of loaded plugins and provides methods to
query plugins by name, type, or interval.
"""
def __init__(self):
self._plugins: Dict[str, Plugin] = {}
self.logger = logging.getLogger("plugin.registry")
def register(self, plugin: Plugin) -> bool:
"""Register a plugin instance.
Args:
plugin: Plugin instance to register
Returns:
True if registered successfully, False if name conflict
"""
if plugin.name in self._plugins:
self.logger.error(f"Plugin '{plugin.name}' already registered")
return False
self._plugins[plugin.name] = plugin
self.logger.info(f"Registered plugin: {plugin.name} v{plugin.version}")
return True
def unregister(self, name: str) -> bool:
"""Unregister a plugin by name.
Args:
name: Plugin name to unregister
Returns:
True if unregistered, False if not found
"""
if name in self._plugins:
del self._plugins[name]
self.logger.info(f"Unregistered plugin: {name}")
return True
return False
def get(self, name: str) -> Optional[Plugin]:
"""Get plugin by name.
Args:
name: Plugin name
Returns:
Plugin instance or None if not found
"""
return self._plugins.get(name)
def get_all(self) -> List[Plugin]:
"""Get all registered plugins."""
return list(self._plugins.values())
def get_enabled(self) -> List[Plugin]:
"""Get all enabled plugins."""
return [p for p in self._plugins.values() if p.enabled]
def get_by_type(self, plugin_type: Type[Plugin]) -> List[Plugin]:
"""Get all plugins of a specific type.
Args:
plugin_type: Plugin class (InfoPlugin or MonitorPlugin)
Returns:
List of plugins matching the type
"""
return [p for p in self._plugins.values() if isinstance(p, plugin_type)]
def get_by_interval(self, interval: int) -> List[Plugin]:
"""Get all plugins with a specific collection interval.
Args:
interval: Interval in seconds (0 for one-time collection)
Returns:
List of plugins with matching interval
"""
return [p for p in self._plugins.values() if p.interval == interval]
class PluginLoader:
"""Load plugins from filesystem and instantiate them.
Scans plugin directories for Python modules containing Plugin subclasses,
loads them dynamically, and registers them with the PluginRegistry.
"""
def __init__(self, registry: PluginRegistry):
self.registry = registry
self.logger = logging.getLogger("plugin.loader")
self._loaded_modules: Dict[str, Any] = {}
async def load_from_directory(
self,
directory: Path,
config: Optional[Dict[str, Any]] = None
) -> int:
"""Load all plugins from a directory.
Scans for .py files, imports them, finds Plugin subclasses,
instantiates them with config, initializes, and registers.
Args:
directory: Path to plugin directory
config: Configuration dict (may contain per-plugin config)
Returns:
Number of plugins successfully loaded
"""
if not directory.exists() or not directory.is_dir():
self.logger.warning(f"Plugin directory not found: {directory}")
return 0
loaded_count = 0
plugin_config = config or {}
# Scan for Python files
for plugin_file in directory.glob("*.py"):
if plugin_file.name.startswith("_"):
continue # Skip __init__.py and private modules
self.logger.debug(f"Processing plugin file: {plugin_file.name}")
try:
# Load module dynamically
module_name = f"plugins.{plugin_file.stem}"
spec = importlib.util.spec_from_file_location(module_name, plugin_file)
if not spec or not spec.loader:
self.logger.warning(f"Could not create spec for {plugin_file}")
continue
module = importlib.util.module_from_spec(spec)
sys.modules[module_name] = module
spec.loader.exec_module(module)
self._loaded_modules[module_name] = module
self.logger.debug(f"Loaded module: {module_name}")
# Track which plugin classes we've already processed to avoid duplicates
processed_classes = set()
# Find Plugin subclasses in module
for name, obj in inspect.getmembers(module, inspect.isclass):
# Skip base classes and non-Plugin classes
if obj in (Plugin, InfoPlugin, MonitorPlugin):
self.logger.debug(f"Skipping base class: {name}")
continue
if not issubclass(obj, Plugin):
self.logger.debug(f"Skipping non-Plugin class: {name}")
continue
# Skip if we've already processed this class (handles module-level aliases)
if id(obj) in processed_classes:
self.logger.debug(f"Skipping duplicate reference to: {obj.__name__}")
continue
processed_classes.add(id(obj))
self.logger.debug(f"Found plugin class: {name}")
# Instantiate plugin with config
plugin_instance_config = plugin_config.get(obj.name, {})
plugin = obj(config=plugin_instance_config)
# Initialize plugin
try:
initialized = await plugin.initialize()
if not initialized:
self.logger.warning(
f"Plugin {plugin.name} failed initialization, skipping"
)
continue
except Exception as e:
self.logger.error(
f"Error initializing plugin {plugin.name}: {e}",
exc_info=True
)
continue
# Register with registry
if self.registry.register(plugin):
loaded_count += 1
self.logger.info(
f"Loaded plugin: {plugin.name} v{plugin.version} "
f"(interval: {plugin.interval}s)"
)
except Exception as e:
self.logger.error(
f"Error loading plugin from {plugin_file}: {e}",
exc_info=True
)
return loaded_count
async def unload_all(self) -> None:
"""Unload all plugins and cleanup resources."""
for plugin in self.registry.get_all():
try:
await plugin.cleanup()
except Exception as e:
self.logger.error(
f"Error cleaning up plugin {plugin.name}: {e}",
exc_info=True
)
self.registry.unregister(plugin.name)
# Remove loaded modules
for module_name in self._loaded_modules:
if module_name in sys.modules:
del sys.modules[module_name]
self._loaded_modules.clear()
+129
View File
@@ -0,0 +1,129 @@
"""CPU Monitoring Plugin for Heartbeat.
Collects CPU usage statistics including overall CPU percentage, per-core usage,
load average, and process counts.
"""
from typing import Any, Dict, Optional
import sys
from pathlib import Path
# Import from parent package
from hbd.client.plugin import MonitorPlugin
class CPUMonitorPlugin(MonitorPlugin):
"""Monitor CPU usage and load.
Collects:
- Overall CPU usage percentage
- Per-core CPU usage (if enabled in config)
- Load average (1min, 5min, 15min)
- Process count
- CPU frequency (if available)
"""
name = "cpu_monitor"
version = "1.0.0"
description = "CPU usage and load monitoring"
interval = 300 # MonitorPlugin: collect every 5 minutes by default
def __init__(self, config: Optional[Dict[str, Any]] = None):
super().__init__(config)
self.psutil = None
self.per_core = config.get("per_core", False) if config else False
self.interval = config.get("interval", 300) if config else 300
async def initialize(self) -> bool:
"""Initialize the CPU monitor plugin.
Checks if psutil is available.
Returns:
True if psutil is available, False otherwise
"""
self.logger.info(f"Initializing {self.name} plugin")
try:
import psutil
self.psutil = psutil
self.logger.info(f"{self.name} initialized successfully")
return True
except ImportError:
self.logger.error(
"psutil module not available. Install with: pip install psutil"
)
return False
async def _collect_metrics(self) -> Dict[str, Any]:
"""Collect CPU metrics.
Returns:
Dictionary with CPU metrics
"""
if not self.psutil:
return {}
try:
data = {}
# Overall CPU usage percentage (non-blocking, interval=1 for accuracy)
# Note: first call to cpu_percent() returns 0.0, subsequent calls work correctly
data["cpu_percent"] = self.psutil.cpu_percent(interval=1)
# Per-core CPU usage (if enabled)
if self.per_core:
per_core_percents = self.psutil.cpu_percent(interval=0, percpu=True)
data["cpu_per_core"] = per_core_percents
data["cpu_core_count"] = len(per_core_percents)
else:
# Just report core count
data["cpu_core_count"] = self.psutil.cpu_count()
# Load average (Unix-like systems only)
try:
load_avg = self.psutil.getloadavg()
data["load_1min"] = round(load_avg[0], 2)
data["load_5min"] = round(load_avg[1], 2)
data["load_15min"] = round(load_avg[2], 2)
except (AttributeError, OSError):
# Not available on Windows
pass
# Process count
try:
data["process_count"] = len(self.psutil.pids())
except Exception as e:
self.logger.warning(f"Could not get process count: {e}")
# CPU frequency (if available)
try:
freq = self.psutil.cpu_freq()
if freq:
data["cpu_freq_current"] = round(freq.current, 2)
data["cpu_freq_min"] = round(freq.min, 2)
data["cpu_freq_max"] = round(freq.max, 2)
except (AttributeError, OSError, RuntimeError, SystemError) as e:
# Not available on all systems, or may fail on FreeBSD with sysctl issues
self.logger.debug(f"CPU frequency not available: {e}")
pass
# CPU times (user, system, idle, etc.)
try:
cpu_times = self.psutil.cpu_times_percent(interval=0)
data["cpu_user"] = round(cpu_times.user, 1)
data["cpu_system"] = round(cpu_times.system, 1)
data["cpu_idle"] = round(cpu_times.idle, 1)
if hasattr(cpu_times, "iowait"):
data["cpu_iowait"] = round(cpu_times.iowait, 1)
except Exception as e:
self.logger.debug(f"Could not get CPU times: {e}")
self.logger.debug(
f"Collected CPU metrics: {data.get('cpu_percent', 'N/A')}% usage"
)
return data
except Exception as e:
self.logger.error(f"Error collecting CPU metrics: {e}", exc_info=True)
return {}
+199
View File
@@ -0,0 +1,199 @@
"""
Disk monitoring plugin for Heartbeat.
Collects disk usage and I/O statistics using psutil.
"""
import logging
from typing import Dict, Any, Optional, List
try:
import psutil
except ImportError:
psutil = None
from hbd.client.plugin import MonitorPlugin
logger = logging.getLogger(__name__)
class DiskMonitorPlugin(MonitorPlugin):
"""
Monitor disk usage and I/O statistics.
Collects:
- Disk partition information
- Disk usage per partition (total, used, free, percent)
- Disk I/O counters (read/write bytes, read/write count)
- Disk I/O time statistics
Configuration:
interval: Collection interval in seconds (default: 300)
partitions: List of mount points to monitor (default: all)
include_io: Include disk I/O statistics (default: True)
exclude_types: List of filesystem types to exclude (default: tmpfs, devtmpfs, squashfs)
"""
name = "disk_monitor"
interval = 300 # Collect every 5 minutes by default
def __init__(self, config: Optional[Dict[str, Any]] = None):
"""
Initialize the disk monitor plugin.
Args:
config: Optional configuration dict with keys:
- interval: Collection interval in seconds (default: 300)
- partitions: List of specific mount points to monitor
- include_io: Include I/O statistics (default: True)
- exclude_types: List of filesystem types to exclude
"""
super().__init__(config)
self.partitions = self.config.get('partitions', None) # None = all partitions
self.include_io = self.config.get('include_io', True)
self.exclude_types = set(self.config.get('exclude_types', ['tmpfs', 'devtmpfs', 'squashfs']))
self.interval = self.config.get('interval', 300)
if psutil is None:
raise ImportError("psutil library is required for disk_monitor plugin")
# Store previous I/O counters for delta calculation
self._prev_io = {}
async def initialize(self):
"""Initialize the plugin (check psutil availability)."""
if psutil is None:
logger.error("psutil not available - disk_monitor cannot run")
return False
logger.info(f"Disk monitor initialized (interval: {self.interval}s, io: {self.include_io})")
# Initialize I/O counters if available
if self.include_io:
try:
self._prev_io = psutil.disk_io_counters(perdisk=True)
except Exception as e:
logger.warning(f"Could not initialize disk I/O counters: {e}")
return True
async def collect(self) -> Dict[str, Any]:
"""
Collect current disk statistics.
Returns:
Dictionary with disk metrics organized by partition:
- partitions: Dict of partition data, keyed by mount point
- device: Device name (e.g., /dev/sda1)
- fstype: Filesystem type (e.g., ext4)
- total: Total space in bytes
- used: Used space in bytes
- free: Free space in bytes
- percent: Usage percentage
- io_counters: Dict of I/O statistics, keyed by disk name (if include_io)
- read_count: Number of reads
- write_count: Number of writes
- read_bytes: Bytes read
- write_bytes: Bytes written
- read_time: Time spent reading in ms
- write_time: Time spent writing in ms
- read_bytes_delta: Bytes read since last collection
- write_bytes_delta: Bytes written since last collection
"""
if psutil is None:
logger.error("psutil not available")
return {}
try:
data = await self._collect_metrics()
logger.debug(f"Collected disk metrics: {len(data.get('partitions', {}))} partitions")
return data
except Exception as e:
logger.error(f"Error collecting disk metrics: {e}")
return {"error": str(e)}
async def _collect_metrics(self) -> Dict[str, Any]:
"""Collect disk metrics from psutil."""
metrics = {}
# Collect partition usage
partitions_data = {}
partitions = psutil.disk_partitions(all=False)
for partition in partitions:
# Skip unwanted filesystem types
if partition.fstype in self.exclude_types:
continue
# Skip if we're only monitoring specific partitions
if self.partitions and partition.mountpoint not in self.partitions:
continue
try:
usage = psutil.disk_usage(partition.mountpoint)
partitions_data[partition.mountpoint] = {
'device': partition.device,
'fstype': partition.fstype,
'total': usage.total,
'used': usage.used,
'free': usage.free,
'percent': usage.percent
}
except PermissionError:
logger.debug(f"Permission denied accessing {partition.mountpoint}")
continue
except Exception as e:
logger.warning(f"Error reading {partition.mountpoint}: {e}")
continue
metrics['partitions'] = partitions_data
# Collect I/O statistics
if self.include_io:
try:
io_counters = psutil.disk_io_counters(perdisk=True)
io_data = {}
for disk_name, counters in io_counters.items():
disk_stats = {
'read_count': counters.read_count,
'write_count': counters.write_count,
'read_bytes': counters.read_bytes,
'write_bytes': counters.write_bytes,
}
# Add time statistics if available
if hasattr(counters, 'read_time'):
disk_stats['read_time'] = counters.read_time
if hasattr(counters, 'write_time'):
disk_stats['write_time'] = counters.write_time
if hasattr(counters, 'busy_time'):
disk_stats['busy_time'] = counters.busy_time
# Calculate deltas from previous collection
if disk_name in self._prev_io:
prev = self._prev_io[disk_name]
disk_stats['read_bytes_delta'] = counters.read_bytes - prev.read_bytes
disk_stats['write_bytes_delta'] = counters.write_bytes - prev.write_bytes
disk_stats['read_count_delta'] = counters.read_count - prev.read_count
disk_stats['write_count_delta'] = counters.write_count - prev.write_count
io_data[disk_name] = disk_stats
metrics['io_counters'] = io_data
# Store current counters for next delta calculation
self._prev_io = io_counters
except Exception as e:
logger.warning(f"Could not collect disk I/O statistics: {e}")
return metrics
async def cleanup(self):
"""Cleanup (nothing to do for this plugin)."""
logger.info("Disk monitor cleanup")
# Plugin instance for automatic discovery
plugin = DiskMonitorPlugin
+168
View File
@@ -0,0 +1,168 @@
"""
Filesystem information plugin for Heartbeat.
Collects static filesystem and partition information using psutil.
"""
import logging
from typing import Dict, Any, Optional
try:
import psutil
except ImportError:
psutil = None
from hbd.client.plugin import InfoPlugin
logger = logging.getLogger(__name__)
class FilesystemInfoPlugin(InfoPlugin):
"""
Collect filesystem and partition information.
This is an InfoPlugin that collects static information once during startup.
By default, only reports physical mounted filesystems (e.g., ext4, xfs, btrfs).
Set include_pseudo=True to also include pseudo filesystems (proc, sysfs, tmpfs, etc.).
Collects:
- List of mounted filesystems
- Partition details (device, mount point, filesystem type, options)
- Filesystem capabilities and features
Configuration:
include_pseudo: Include pseudo/virtual filesystems (default: False)
exclude_types: List of additional filesystem types to exclude (default: [])
"""
name = "filesystem_info"
interval = 0 # InfoPlugin - collect once
def __init__(self, config: Optional[Dict[str, Any]] = None):
"""
Initialize the filesystem info plugin.
Args:
config: Optional configuration dict with keys:
- include_pseudo: Include pseudo/virtual filesystems (default: False)
- exclude_types: List of filesystem types to exclude (default: [])
"""
super().__init__(config)
self.include_pseudo = self.config.get('include_pseudo', False)
# By default, no exclusions since all=False filters most pseudo filesystems
# Users can add specific types to exclude if needed
self.exclude_types = set(self.config.get('exclude_types', []))
if psutil is None:
raise ImportError("psutil library is required for filesystem_info plugin")
async def initialize(self):
"""Initialize the plugin (check psutil availability)."""
if psutil is None:
logger.error("psutil not available - filesystem_info cannot run")
return False
logger.info(f"Filesystem info initialized (pseudo: {self.include_pseudo})")
return True
async def collect(self) -> Dict[str, Any]:
"""
Collect filesystem information.
Returns only physical mounted filesystems by default.
Returns:
Dictionary with filesystem data:
- filesystems: List of filesystem dictionaries:
- device: Device name (e.g., /dev/sda1)
- mountpoint: Mount point path
- fstype: Filesystem type (e.g., ext4, xfs, btrfs)
- opts: Mount options (comma-separated string)
- maxfile: Maximum filename length
- maxpath: Maximum path length
- filesystem_types: List of unique filesystem types found
- mount_count: Total number of mounted filesystems
"""
if psutil is None:
logger.error("psutil not available")
return {}
try:
data = await self._collect_info()
logger.info(f"Collected filesystem info: {len(data.get('filesystems', []))} filesystems")
return data
except Exception as e:
logger.error(f"Error collecting filesystem info: {e}")
return {"error": str(e)}
async def _collect_info(self) -> Dict[str, Any]:
"""Collect filesystem information from psutil."""
info = {}
filesystems = []
filesystem_types = set()
# Get mounted disk partitions
# all=False returns only physical devices (real mounted filesystems)
# all=True would include pseudo filesystems (proc, sysfs, etc.)
partitions = psutil.disk_partitions(all=self.include_pseudo)
for partition in partitions:
# Additional filtering if exclude_types is specified
if partition.fstype in self.exclude_types:
continue
fs_info = {
'device': partition.device,
'mountpoint': partition.mountpoint,
'fstype': partition.fstype,
'opts': partition.opts,
}
# Try to get filesystem capabilities
try:
# Get path configuration for this mount point
import os
if hasattr(os, 'pathconf'):
try:
# Maximum filename length
max_name = os.pathconf(partition.mountpoint, 'PC_NAME_MAX')
if max_name:
fs_info['maxfile'] = max_name
except (OSError, ValueError):
pass
try:
# Maximum path length
max_path = os.pathconf(partition.mountpoint, 'PC_PATH_MAX')
if max_path:
fs_info['maxpath'] = max_path
except (OSError, ValueError):
pass
except Exception as e:
logger.debug(f"Could not get pathconf for {partition.mountpoint}: {e}")
filesystems.append(fs_info)
filesystem_types.add(partition.fstype)
info['filesystems'] = filesystems
info['filesystem_types'] = sorted(list(filesystem_types))
info['mount_count'] = len(filesystems)
# Add some additional filesystem statistics
try:
# Get boot time (useful for determining filesystem mount times)
boot_time = psutil.boot_time()
info['boot_time'] = boot_time
except Exception as e:
logger.debug(f"Could not get boot time: {e}")
return info
async def cleanup(self):
"""Cleanup (nothing to do for this plugin)."""
logger.info("Filesystem info cleanup")
# Plugin instance for automatic discovery
plugin = FilesystemInfoPlugin
+147
View File
@@ -0,0 +1,147 @@
"""
Memory monitoring plugin for Heartbeat.
Collects memory and swap usage statistics using psutil.
"""
import logging
from typing import Dict, Any, Optional
try:
import psutil
except ImportError:
psutil = None
from hbd.client.plugin import MonitorPlugin
logger = logging.getLogger(__name__)
class MemoryMonitorPlugin(MonitorPlugin):
"""
Monitor memory and swap usage.
Collects:
- Physical memory (RAM) usage and statistics
- Virtual memory details
- Swap memory usage and statistics
- Memory available for applications
Configuration:
interval: Collection interval in seconds (default: 300)
include_swap: Include swap statistics (default: True)
"""
name = "memory_monitor"
interval = 300 # Collect every 5 minutes by default
def __init__(self, config: Optional[Dict[str, Any]] = None):
"""
Initialize the memory monitor plugin.
Args:
config: Optional configuration dict with keys:
- interval: Collection interval in seconds (default: 300)
- include_swap: Include swap statistics (default: True)
"""
super().__init__(config)
self.include_swap = self.config.get('include_swap', True)
self.interval = self.config.get('interval', 300)
if psutil is None:
raise ImportError("psutil library is required for memory_monitor plugin")
async def initialize(self):
"""Initialize the plugin (check psutil availability)."""
if psutil is None:
logger.error("psutil not available - memory_monitor cannot run")
return False
logger.info(f"Memory monitor initialized (interval: {self.interval}s, swap: {self.include_swap})")
return True
async def collect(self) -> Dict[str, Any]:
"""
Collect current memory statistics.
Returns:
Dictionary with memory metrics:
- memory_total: Total physical RAM in bytes
- memory_available: Available memory in bytes
- memory_used: Used memory in bytes
- memory_free: Free memory in bytes
- memory_percent: Memory usage percentage
- memory_active: Active memory (Unix)
- memory_inactive: Inactive memory (Unix)
- memory_buffers: Buffers (Linux)
- memory_cached: Cached (Linux)
- memory_shared: Shared (Linux)
- swap_total: Total swap in bytes (if include_swap)
- swap_used: Used swap in bytes (if include_swap)
- swap_free: Free swap in bytes (if include_swap)
- swap_percent: Swap usage percentage (if include_swap)
- swap_sin: Bytes swapped in from disk (if include_swap)
- swap_sout: Bytes swapped out to disk (if include_swap)
"""
if psutil is None:
logger.error("psutil not available")
return {}
try:
data = await self._collect_metrics()
logger.debug(f"Collected memory metrics: {len(data)} fields")
return data
except Exception as e:
logger.error(f"Error collecting memory metrics: {e}")
return {"error": str(e)}
async def _collect_metrics(self) -> Dict[str, Any]:
"""Collect memory metrics from psutil."""
metrics = {}
# Virtual (physical) memory statistics
vmem = psutil.virtual_memory()
metrics['memory_total'] = vmem.total
metrics['memory_available'] = vmem.available
metrics['memory_used'] = vmem.used
metrics['memory_free'] = vmem.free
metrics['memory_percent'] = vmem.percent
# Platform-specific memory details
if hasattr(vmem, 'active'):
metrics['memory_active'] = vmem.active
if hasattr(vmem, 'inactive'):
metrics['memory_inactive'] = vmem.inactive
if hasattr(vmem, 'buffers'):
metrics['memory_buffers'] = vmem.buffers
if hasattr(vmem, 'cached'):
metrics['memory_cached'] = vmem.cached
if hasattr(vmem, 'shared'):
metrics['memory_shared'] = vmem.shared
# Swap memory statistics
if self.include_swap:
try:
swap = psutil.swap_memory()
metrics['swap_total'] = swap.total
metrics['swap_used'] = swap.used
metrics['swap_free'] = swap.free
metrics['swap_percent'] = swap.percent
# Swap in/out counters (may not be available on all platforms)
if hasattr(swap, 'sin'):
metrics['swap_sin'] = swap.sin
if hasattr(swap, 'sout'):
metrics['swap_sout'] = swap.sout
except Exception as e:
logger.warning(f"Could not collect swap statistics: {e}")
return metrics
async def cleanup(self):
"""Cleanup (nothing to do for this plugin)."""
logger.info("Memory monitor cleanup")
# Plugin instance for automatic discovery
plugin = MemoryMonitorPlugin
+283
View File
@@ -0,0 +1,283 @@
"""Nagios Plugin Runner for Heartbeat.
Executes Nagios-compatible monitoring plugins and parses their output.
Nagios Plugin Standard:
- Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
- Output format: Single line status message, optional performance data
- Performance data format: 'label'=value[UOM];[warn];[crit];[min];[max]
Example configuration in ~/.hb.yaml:
```yaml
nagios_runner:
interval: 60
commands:
- name: check_disk_root
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
- name: check_procs
command: /usr/lib/nagios/plugins/check_procs -w 250 -c 400
- name: check_load
command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
```
"""
import re
import subprocess
from typing import Any, Dict, List, Optional, Tuple
from hbd.client.plugin import MonitorPlugin
# Nagios exit codes
NAGIOS_OK = 0
NAGIOS_WARNING = 1
NAGIOS_CRITICAL = 2
NAGIOS_UNKNOWN = 3
STATUS_NAMES = {
NAGIOS_OK: "OK",
NAGIOS_WARNING: "WARNING",
NAGIOS_CRITICAL: "CRITICAL",
NAGIOS_UNKNOWN: "UNKNOWN"
}
class NagiosRunnerPlugin(MonitorPlugin):
"""Run Nagios-compatible monitoring plugins.
This plugin executes external Nagios plugins and collects their output,
including status codes, messages, and performance data.
Configuration:
interval: Collection interval in seconds (default: 300)
commands: List of command definitions with 'name' and 'command' keys
timeout: Command execution timeout in seconds (default: 30)
shell: Whether to execute commands via shell (default: True)
Example:
nagios_runner:
interval: 300 # Check every 5 minutes
timeout: 30
commands:
- name: check_disk
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10%
- name: check_load
command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
"""
name = "nagios_runner"
version = "1.0.0"
description = "Execute Nagios-compatible monitoring plugins"
interval = 300 # MonitorPlugin: collect every 5 minutes by default
def __init__(self, config: Optional[Dict[str, Any]] = None):
super().__init__(config)
# Extract configuration
self.commands: List[Dict[str, str]] = config.get("commands", []) if config else []
self.timeout: int = config.get("timeout", 30) if config else 30
self.shell: bool = config.get("shell", True) if config else True
self.interval = config.get("interval", 300) if config else 300
# Validate commands
if not self.commands:
self.logger.warning(
"No Nagios commands configured. Add 'nagios_runner.commands' to config."
)
async def initialize(self) -> bool:
"""Initialize the Nagios runner plugin.
Returns:
True if at least one command is configured, False otherwise
"""
self.logger.info(f"Initializing {self.name} plugin")
if not self.commands:
self.logger.error("No Nagios commands configured")
return False
self.logger.info(f"Configured to run {len(self.commands)} Nagios plugin(s)")
for cmd_config in self.commands:
name = cmd_config.get("name", "unnamed")
self.logger.info(f" - {name}: {cmd_config.get('command', 'N/A')}")
return True
async def _collect_metrics(self) -> Dict[str, Any]:
"""Collect metrics from all configured Nagios plugins.
Returns:
Dictionary with results from all plugins
"""
results = {}
# Track overall status (worst status wins)
worst_status = NAGIOS_OK
for cmd_config in self.commands:
name = cmd_config.get("name")
command = cmd_config.get("command")
if not name or not command:
self.logger.warning("Skipping command with missing name or command")
continue
# Execute plugin
try:
status_code, output, perfdata = await self._run_nagios_plugin(command)
# Store results
results[f"{name}_status"] = STATUS_NAMES.get(status_code, "UNKNOWN")
results[f"{name}_status_code"] = status_code
results[f"{name}_output"] = output
# Track worst status
if status_code > worst_status:
worst_status = status_code
# Parse and add performance data
if perfdata:
for metric_name, metric_value in perfdata.items():
results[f"{name}_{metric_name}"] = metric_value
self.logger.debug(
f"Executed {name}: {STATUS_NAMES.get(status_code, 'UNKNOWN')} - {output[:50]}"
)
except Exception as e:
self.logger.error(f"Error running {name}: {e}", exc_info=True)
results[f"{name}_status"] = "ERROR"
results[f"{name}_status_code"] = NAGIOS_UNKNOWN
results[f"{name}_output"] = str(e)
worst_status = NAGIOS_UNKNOWN
# Add overall status
results["overall_status"] = STATUS_NAMES.get(worst_status, "UNKNOWN")
results["overall_status_code"] = worst_status
results["plugin_count"] = len(self.commands)
return results
async def _run_nagios_plugin(
self,
command: str
) -> Tuple[int, str, Dict[str, Any]]:
"""Execute a Nagios plugin and parse its output.
Args:
command: Command string to execute
Returns:
Tuple of (status_code, output_message, performance_data_dict)
"""
try:
# Run command
result = subprocess.run(
command,
shell=self.shell,
capture_output=True,
timeout=self.timeout,
text=True
)
status_code = result.returncode
output = result.stdout.strip()
# Nagios plugins can return codes > 3, treat as UNKNOWN
if status_code > 3:
status_code = NAGIOS_UNKNOWN
# Parse performance data
perfdata = self._parse_perfdata(output)
# Extract just the status message (before the pipe if present)
if '|' in output:
output_msg = output.split('|')[0].strip()
else:
output_msg = output
return status_code, output_msg, perfdata
except subprocess.TimeoutExpired:
self.logger.error(f"Command timed out: {command}")
return NAGIOS_UNKNOWN, f"Command timed out after {self.timeout}s", {}
except Exception as e:
self.logger.error(f"Error executing command: {e}")
return NAGIOS_UNKNOWN, f"Execution error: {str(e)}", {}
def _parse_perfdata(self, output: str) -> Dict[str, Any]:
"""Parse Nagios performance data from plugin output.
Nagios performance data format:
'label'=value[UOM];[warn];[crit];[min];[max]
Multiple metrics separated by spaces.
Args:
output: Plugin output string
Returns:
Dictionary of metric_name: value
"""
perfdata = {}
# Performance data comes after the pipe character
if '|' not in output:
return perfdata
perf_section = output.split('|', 1)[1].strip()
# Regex to match performance data format
# Matches: 'label'=value or label=value
perf_regex = r"'?([^'=]+)'?=([\d.]+)([a-zA-Z%]*);?([\d.]*);?([\d.]*);?([\d.]*);?([\d.]*)"
for match in re.finditer(perf_regex, perf_section):
label = match.group(1).strip()
value_str = match.group(2)
uom = match.group(3) or ""
warn = match.group(4)
crit = match.group(5)
min_val = match.group(6)
max_val = match.group(7)
# Convert value to float
try:
value = float(value_str)
except ValueError:
continue
# Store the value
perfdata[label] = value
# Optionally store UOM as separate field
if uom:
perfdata[f"{label}_uom"] = uom
# Store thresholds if present
if warn:
try:
perfdata[f"{label}_warn"] = float(warn)
except ValueError:
pass
if crit:
try:
perfdata[f"{label}_crit"] = float(crit)
except ValueError:
pass
if min_val:
try:
perfdata[f"{label}_min"] = float(min_val)
except ValueError:
pass
if max_val:
try:
perfdata[f"{label}_max"] = float(max_val)
except ValueError:
pass
return perfdata
+240
View File
@@ -0,0 +1,240 @@
"""
Network monitoring plugin for Heartbeat.
Collects network interface statistics and connection information using psutil.
"""
import logging
from typing import Dict, Any, Optional, List
try:
import psutil
except ImportError:
psutil = None
from hbd.client.plugin import MonitorPlugin
logger = logging.getLogger(__name__)
class NetworkMonitorPlugin(MonitorPlugin):
"""
Monitor network interface statistics and connections.
Collects:
- Network interface I/O counters (bytes sent/received, packets, errors, drops)
- Per-interface statistics
- Network connection counts by state
- Interface addresses and configuration
Configuration:
interval: Collection interval in seconds (default: 300)
interfaces: List of interfaces to monitor (default: all)
include_connections: Include connection statistics (default: True)
include_addresses: Include interface addresses (default: False)
"""
name = "network_monitor"
interval = 300 # Collect every 5 minutes by default
def __init__(self, config: Optional[Dict[str, Any]] = None):
"""
Initialize the network monitor plugin.
Args:
config: Optional configuration dict with keys:
- interval: Collection interval in seconds (default: 300)
- interfaces: List of specific interfaces to monitor
- include_connections: Include connection stats (default: True)
- include_addresses: Include interface addresses (default: False)
"""
super().__init__(config)
self.interfaces = self.config.get('interfaces', None) # None = all interfaces
self.include_connections = self.config.get('include_connections', True)
self.include_addresses = self.config.get('include_addresses', False)
self.interval = self.config.get('interval', 300)
if psutil is None:
raise ImportError("psutil library is required for network_monitor plugin")
# Store previous I/O counters for delta calculation
self._prev_io = {}
async def initialize(self):
"""Initialize the plugin (check psutil availability)."""
if psutil is None:
logger.error("psutil not available - network_monitor cannot run")
return False
logger.info(f"Network monitor initialized (interval: {self.interval}s, "
f"connections: {self.include_connections})")
# Initialize I/O counters
try:
self._prev_io = psutil.net_io_counters(pernic=True)
except Exception as e:
logger.warning(f"Could not initialize network I/O counters: {e}")
return True
async def collect(self) -> Dict[str, Any]:
"""
Collect current network statistics.
Returns:
Dictionary with network metrics:
- interfaces: Dict of interface statistics, keyed by interface name
- bytes_sent: Total bytes sent
- bytes_recv: Total bytes received
- packets_sent: Total packets sent
- packets_recv: Total packets received
- errin: Total incoming errors
- errout: Total outgoing errors
- dropin: Total incoming packets dropped
- dropout: Total outgoing packets dropped
- bytes_sent_delta: Bytes sent since last collection
- bytes_recv_delta: Bytes received since last collection
- packets_sent_delta: Packets sent since last collection
- packets_recv_delta: Packets received since last collection
- connections: Connection statistics by state (if include_connections)
- ESTABLISHED: Count of established connections
- LISTEN: Count of listening sockets
- TIME_WAIT: Count of TIME_WAIT connections
- etc.
- addresses: Interface address information (if include_addresses)
- Dict keyed by interface name with address details
"""
if psutil is None:
logger.error("psutil not available")
return {}
try:
data = await self._collect_metrics()
logger.debug(f"Collected network metrics: {len(data.get('interfaces', {}))} interfaces")
return data
except Exception as e:
logger.error(f"Error collecting network metrics: {e}")
return {"error": str(e)}
async def _collect_metrics(self) -> Dict[str, Any]:
"""Collect network metrics from psutil."""
metrics = {}
# Collect per-interface I/O counters
try:
io_counters = psutil.net_io_counters(pernic=True)
interfaces_data = {}
for iface_name, counters in io_counters.items():
# Skip if we're only monitoring specific interfaces
if self.interfaces and iface_name not in self.interfaces:
continue
iface_stats = {
'bytes_sent': counters.bytes_sent,
'bytes_recv': counters.bytes_recv,
'packets_sent': counters.packets_sent,
'packets_recv': counters.packets_recv,
'errin': counters.errin,
'errout': counters.errout,
'dropin': counters.dropin,
'dropout': counters.dropout,
}
# Calculate deltas from previous collection
if iface_name in self._prev_io:
prev = self._prev_io[iface_name]
iface_stats['bytes_sent_delta'] = counters.bytes_sent - prev.bytes_sent
iface_stats['bytes_recv_delta'] = counters.bytes_recv - prev.bytes_recv
iface_stats['packets_sent_delta'] = counters.packets_sent - prev.packets_sent
iface_stats['packets_recv_delta'] = counters.packets_recv - prev.packets_recv
interfaces_data[iface_name] = iface_stats
metrics['interfaces'] = interfaces_data
# Store current counters for next delta calculation
self._prev_io = io_counters
except Exception as e:
logger.warning(f"Could not collect network I/O counters: {e}")
# Collect connection statistics
if self.include_connections:
try:
connections = psutil.net_connections(kind='inet')
conn_stats = {}
# Count connections by state
for conn in connections:
state = conn.status
conn_stats[state] = conn_stats.get(state, 0) + 1
metrics['connections'] = conn_stats
except (PermissionError, psutil.AccessDenied):
logger.debug("Permission denied for net_connections (requires root/admin)")
except Exception as e:
logger.warning(f"Could not collect connection statistics: {e}")
# Collect interface addresses
if self.include_addresses:
try:
addresses = psutil.net_if_addrs()
addr_data = {}
for iface_name, addrs in addresses.items():
# Skip if we're only monitoring specific interfaces
if self.interfaces and iface_name not in self.interfaces:
continue
iface_addrs = []
for addr in addrs:
addr_info = {
'family': str(addr.family),
'address': addr.address,
}
if addr.netmask:
addr_info['netmask'] = addr.netmask
if addr.broadcast:
addr_info['broadcast'] = addr.broadcast
iface_addrs.append(addr_info)
addr_data[iface_name] = iface_addrs
metrics['addresses'] = addr_data
except Exception as e:
logger.warning(f"Could not collect interface addresses: {e}")
# Add interface stats (up/down status, speed, mtu)
try:
if_stats = psutil.net_if_stats()
stats_data = {}
for iface_name, stats in if_stats.items():
# Skip if we're only monitoring specific interfaces
if self.interfaces and iface_name not in self.interfaces:
continue
stats_data[iface_name] = {
'isup': stats.isup,
'duplex': str(stats.duplex) if hasattr(stats, 'duplex') else None,
'speed': stats.speed,
'mtu': stats.mtu,
}
metrics['interface_stats'] = stats_data
except Exception as e:
logger.warning(f"Could not collect interface stats: {e}")
return metrics
async def cleanup(self):
"""Cleanup (nothing to do for this plugin)."""
logger.info("Network monitor cleanup")
# Plugin instance for automatic discovery
plugin = NetworkMonitorPlugin
+136
View File
@@ -0,0 +1,136 @@
"""OS Information Plugin for Heartbeat.
Collects static operating system information including OS name, version,
kernel, architecture, and distribution details.
"""
import platform
import sys
from pathlib import Path
from typing import Any, Dict, Optional
# Import from parent package
from hbd.client.plugin import InfoPlugin
class OSInfoPlugin(InfoPlugin):
"""Collect operating system information.
This plugin gathers static OS information that rarely changes:
- OS name and version
- Kernel version
- Architecture (x86_64, arm64, etc.)
- Distribution details (for Linux)
- Python version (used by hbc)
"""
name = "os_info"
version = "1.0.0"
description = "Operating system and platform information"
interval = 0 # InfoPlugin: collect once at startup
def __init__(self, config: Optional[Dict[str, Any]] = None):
super().__init__(config)
async def initialize(self) -> bool:
"""Initialize the OS info plugin.
Returns:
True (always succeeds - platform module is stdlib)
"""
self.logger.info(f"Initializing {self.name} plugin")
return True
async def _collect_info(self) -> Dict[str, Any]:
"""Collect OS information.
Returns:
Dictionary with OS details
"""
try:
data = {
"system": platform.system(), # e.g., "Linux", "Darwin", "Windows"
"node": platform.node(), # hostname
"release": platform.release(), # kernel version
"version": platform.version(), # detailed version
"machine": platform.machine(), # e.g., "x86_64", "arm64"
"processor": platform.processor(), # processor name
"architecture": platform.architecture()[0], # e.g., "64bit"
"python_version": platform.python_version(),
"python_implementation": platform.python_implementation(),
}
# Add Linux-specific distribution info
if platform.system() == "Linux":
data.update(self._get_linux_distro())
# Add macOS-specific info
elif platform.system() == "Darwin":
data["macos_version"] = platform.mac_ver()[0]
# Add Windows-specific info
elif platform.system() == "Windows":
win_ver = platform.win32_ver()
data["windows_release"] = win_ver[0]
data["windows_version"] = win_ver[1]
data["windows_sp"] = win_ver[2]
data["windows_type"] = win_ver[3]
self.logger.debug(f"Collected OS info: {data['system']} {data['release']}")
return data
except Exception as e:
self.logger.error(f"Error collecting OS info: {e}", exc_info=True)
return {}
def _get_linux_distro(self) -> Dict[str, str]:
"""Get Linux distribution information.
Returns:
Dictionary with distribution details
"""
distro_info = {}
# Try reading /etc/os-release (standard on modern Linux)
os_release = Path("/etc/os-release")
if os_release.exists():
try:
with open(os_release) as f:
for line in f:
line = line.strip()
if "=" in line and not line.startswith("#"):
key, value = line.split("=", 1)
# Remove quotes from value
value = value.strip('"').strip("'")
# Map common keys
if key == "NAME":
distro_info["distro_name"] = value
elif key == "VERSION":
distro_info["distro_version"] = value
elif key == "ID":
distro_info["distro_id"] = value
elif key == "VERSION_ID":
distro_info["distro_version_id"] = value
elif key == "PRETTY_NAME":
distro_info["distro_pretty_name"] = value
except Exception as e:
self.logger.warning(f"Could not read /etc/os-release: {e}")
# Fallback: try lsb_release (older systems)
elif Path("/etc/lsb-release").exists():
try:
with open("/etc/lsb-release") as f:
for line in f:
line = line.strip()
if "=" in line:
key, value = line.split("=", 1)
if key == "DISTRIB_ID":
distro_info["distro_id"] = value
elif key == "DISTRIB_RELEASE":
distro_info["distro_version"] = value
elif key == "DISTRIB_DESCRIPTION":
distro_info["distro_name"] = value
except Exception as e:
self.logger.warning(f"Could not read /etc/lsb-release: {e}")
return distro_info
+3
View File
@@ -0,0 +1,3 @@
"""Common utilities shared between hbc and hbd."""
__version__ = "5.0.5"
+157
View File
@@ -0,0 +1,157 @@
"""Message encoding/decoding utilities for hbd protocol.
Message Types:
HTB: Heartbeat message (client -> server)
ACK: Acknowledgment (server -> client)
CMD: Command message (server -> client)
UPD: Update message (server -> client)
PLG: Plugin data message (client -> server)
"""
from typing import Dict, Any, Union
import json
import zlib
def encode_value(v: Any) -> str:
"""Encode a value for protocol transmission.
Args:
v: Value to encode (int, float, str, bool, list, dict, etc.)
Returns:
String representation suitable for protocol
"""
if isinstance(v, float):
return f"{v:0.5f}"
elif isinstance(v, (list, dict)):
# Use JSON encoding for complex types, prefixed with @
return "@" + json.dumps(v)
elif isinstance(v, bool):
return str(int(v)) # True->1, False->0
else:
return str(v)
def decode_value(val: str) -> Any:
"""Decode a value from protocol format.
Args:
val: String value from protocol
Returns:
Decoded Python object
"""
if not val:
return val
# Check for JSON-encoded complex types
if val.startswith("@"):
try:
return json.loads(val[1:])
except Exception:
return val[1:] # Return as string without @
# Try numeric evaluation (original behavior)
if val[0].isdigit() or (val[0] == '-' and len(val) > 1 and val[1].isdigit()):
try:
return eval(val)
except Exception:
return val
return val
def dicttos(ID: str, d: Dict[str, Any]):
"""Serialize a dict to protocol message bytes.
If compress is True, the payload is zlib-compressed and the message is
prefixed with `!ID:` as the original script did. Otherwise the format is
`ID:key=value;...` (bytes).
"""
s = []
for k in d:
v = d[k]
encoded_val = encode_value(v)
s.append(f"{k}={encoded_val}")
pk = ";".join(s)
zpk = zlib.compress(pk.encode(), 6)
hdr = ("!" + ID + ":").encode()
return hdr + zpk
def stodict(msg: bytes):
"""Deserialize a protocol message into a dict.
Mirrors original behaviour: detects compressed messages starting with
'!' and decodes accordingly. Returns a dict with key 'ID' set to the
message ID and the parsed key/value pairs.
"""
d = {}
if len(msg) > 0 and chr(msg[0]) == "!":
# message is: b'!ID:' + compressed_payload
# original code used msg[1:4].decode() for ID (3 bytes including colon)
try:
pk = zlib.decompress(msg[5:]).decode()
except Exception:
# malformed compressed payload
return {}
d["ID"] = msg[1:4].decode()
else:
try:
r0 = msg.split(b":", 1)
pk = r0[1].decode()
d["ID"] = r0[0].decode()
except Exception:
return {}
if not pk:
return d
parts = pk.split(";")
for v in parts:
if not v:
continue
vr = v.split("=", 1)
k = vr[0].strip()
if len(vr) == 1:
d[k] = None
else:
val = vr[1].strip()
d[k] = decode_value(val)
return d
def oldmtodict(msg: bytes):
"""Compatibility wrapper for old-style messages (no ID prefix).
The original implementation prefixed with 'HTB:' and called stodict.
"""
return stodict(b"HTB:" + msg)
def encode_plugin_data(plugin_name: str, data: Dict[str, Any]) -> bytes:
"""Encode plugin data into a PLG message.
Args:
plugin_name: Name of the plugin (e.g., "os_info", "cpu_monitor")
data: Plugin data dictionary
compress: Whether to compress the payload
Returns:
Encoded message bytes
"""
# Add plugin name to data
full_data = {"plugin": plugin_name, **data}
return dicttos("PLG", full_data)
def decode_plugin_data(msg: bytes) -> Dict[str, Any]:
"""Decode a PLG message into plugin data.
Args:
msg: Raw message bytes
Returns:
Dictionary with 'ID', 'plugin', and plugin data fields
"""
return stodict(msg)
-68
View File
@@ -1,68 +0,0 @@
"""Configuration loader and defaults for hbd."""
import logging
import os
try:
import yaml
except Exception:
yaml = None
DEFAULTS = {
"hb_port": 50003,
"hbd_port": 50004,
"hbd_host": "",
"pickfile": "/tmp/hb.pick",
"logfile": "/var/log/heartbeat.log",
"logfmt": "text",
"pushsrv": "pushover",
"pushover_token": "",
"pushover_user": "",
"interval": 20,
"grace": 2,
"dyndomains": ["wrede.org"],
"watchhosts": [],
"dyndnshosts": [],
"drophosts": [],
"nsupdate_bin": "/usr/bin/nsupdate",
"foreground": False,
"verbose": False,
"debug": 0,
"smtpserver": "smtp.fastmail.com",
"smtpuser": "andreas@wrede.ca",
"smtppassword": "pvtvefyp5gbhnch2",
"smtpport": 587,
"toemail": ["aew.hbd.notify@wrede.ca"],
"fromemail": "aew.hbd@wrede.ca",
"ws_port": 50005,
"wss_port": None,
"cert_path": "/usr/local/etc/ssl/",
"wss_pem": "fullchain.pem",
"wss_key": "privkey.pem",
}
def load_config(path=None):
"""Load configuration from a YAML file and merge with defaults.
If YAML is not available or the file does not exist, defaults are returned.
"""
cfg = DEFAULTS.copy()
if not path:
# default path (~/.hb.yaml)
path = os.path.join(os.path.expanduser("~"), ".hb.yaml")
if os.path.exists(path):
if yaml:
with open(path) as fh:
data = yaml.safe_load(fh)
# only keep known keys
for k, v in data.items():
if k in cfg:
cfg[k] = v
else:
logging.warning("unknown config key %s in %s", k, path)
else:
# yaml not installed: do not attempt to parse; user must ensure defaults
pass
return cfg
+196
View File
@@ -0,0 +1,196 @@
# Example Heartbeat Client Configuration
# This file demonstrates all available configuration options for the heartbeat client (hbc)
# and its plugin system.
# ==============================================================================
# Server Configuration
# ==============================================================================
server: hbd.example.com # Heartbeat server hostname or IP
port: 50003 # Server UDP port (default: 50003)
interval: 30 # Heartbeat interval in seconds (default: 30)
# ==============================================================================
# Plugin Configuration
# ==============================================================================
# Plugins are configured under the "plugins" section. Each plugin can be enabled/disabled
# and configured with plugin-specific settings.
plugins:
# --------------------------------------------------------------------------
# OS Information Plugin (InfoPlugin - runs once at startup)
# --------------------------------------------------------------------------
os_info:
enabled: true
# No additional configuration needed
# --------------------------------------------------------------------------
# CPU Monitor Plugin (MonitorPlugin - periodic collection)
# --------------------------------------------------------------------------
cpu_monitor:
enabled: true
interval: 300 # Collection interval in seconds (default: 300 = 5 minutes)
per_core: false # Collect per-core CPU statistics (default: false)
# When per_core is true, will report CPU usage for each core separately
# --------------------------------------------------------------------------
# Memory Monitor Plugin (MonitorPlugin)
# --------------------------------------------------------------------------
memory_monitor:
enabled: true
interval: 300 # Collection interval in seconds (default: 300 = 5 minutes)
include_swap: true # Include swap memory statistics (default: true)
# --------------------------------------------------------------------------
# Disk Monitor Plugin (MonitorPlugin)
# --------------------------------------------------------------------------
disk_monitor:
enabled: true
interval: 300 # Collection interval in seconds (default: 300 = 5 minutes)
include_io: true # Include I/O statistics (default: true)
# Optional: Monitor only specific partitions
# partitions:
# - /
# - /home
# - /var
# Optional: Exclude specific filesystem types
exclude_types:
- tmpfs
- devtmpfs
- squashfs
# --------------------------------------------------------------------------
# Network Monitor Plugin (MonitorPlugin)
# --------------------------------------------------------------------------
network_monitor:
enabled: true
interval: 300 # Collection interval in seconds (default: 300 = 5 minutes)
include_connections: true # Include connection statistics (default: true)
include_addresses: false # Include interface addresses (default: false)
# Optional: Monitor only specific interfaces
# interfaces:
# - eth0
# - wlan0
# --------------------------------------------------------------------------
# Filesystem Info Plugin (InfoPlugin - runs once at startup)
# --------------------------------------------------------------------------
filesystem_info:
enabled: true
include_pseudo: false # Include pseudo/virtual filesystems (default: false)
# When false (default), only reports physical mounted filesystems (ext4, zfs, xfs, etc.)
# When true, also includes pseudo filesystems (proc, sysfs, tmpfs, devtmpfs, etc.)
# Optional: Exclude additional specific filesystem types
# exclude_types:
# - squashfs
# - iso9660
# --------------------------------------------------------------------------
# Nagios Runner Plugin (MonitorPlugin)
# --------------------------------------------------------------------------
nagios_runner:
enabled: true
interval: 300 # Collection interval in seconds (default: 300 = 5 minutes)
timeout: 30 # Plugin execution timeout in seconds (default: 30)
# List of Nagios plugins to execute
# Each command is executed as-is, so provide full paths and arguments
commands:
# System load monitoring
- /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
# Disk space monitoring
- /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
- /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /home
# Process monitoring
- /usr/lib/nagios/plugins/check_procs -w 250 -c 400 -s RSZDT
# Swap usage
- /usr/lib/nagios/plugins/check_swap -w 20% -c 10%
# Custom script example
# - /usr/local/bin/check_my_app.sh
# ==============================================================================
# Advanced Options
# ==============================================================================
# These options control client behavior
# Compression: Enable zlib compression for heartbeat messages (default: true)
compress: true
# Hostname: Override the system hostname (default: auto-detect)
# hostname: myhost.example.com
# Message: Custom message included in heartbeat (optional)
# message: "Production web server"
# Logging
log_level: INFO # Log level: DEBUG, INFO, WARNING, ERROR (default: INFO)
# logfile: /var/log/hbc.log # Optional log file path
# ==============================================================================
# Example Profiles
# ==============================================================================
# Below are example configuration profiles for different use cases
# Minimal Configuration (default settings):
# -----------------------------------------
# server: hbd.example.com
# interval: 30
# Monitoring Server (comprehensive metrics):
# ------------------------------------------
# server: monitoring.example.com
# interval: 30
# plugins:
# cpu_monitor:
# enabled: true
# interval: 15
# per_core: true
# memory_monitor:
# enabled: true
# interval: 15
# disk_monitor:
# enabled: true
# interval: 60
# network_monitor:
# enabled: true
# interval: 30
# include_connections: true
# Nagios Integration (leverage existing plugins):
# -----------------------------------------------
# server: hbd.example.com
# plugins:
# nagios_runner:
# enabled: true
# interval: 300 # Check every 5 minutes
# commands:
# - /usr/lib/nagios/plugins/check_http -H localhost -p 80
# - /usr/lib/nagios/plugins/check_mysql -H localhost -u monitor -p password
# - /usr/lib/nagios/plugins/check_smtp -H mail.example.com
# ==============================================================================
# Threshold Configuration (for Heartbeat Daemon)
# ==============================================================================
# NOTE: Thresholds are configured on the SERVER side (hbd), not the client (hbc).
# This is just an example - see config_thresholds_example.yaml for comprehensive examples.
#
# Basic threshold example:
# thresholds:
# cpu_monitor:
# cpu_percent:
# warning: 80.0
# critical: 90.0
# memory_monitor:
# percent:
# warning: 85.0
# critical: 95.0
# disk_monitor:
# partitions:
# /:
# percent:
# warning: 80.0
# critical: 90.0
+296
View File
@@ -0,0 +1,296 @@
# ==============================================================================
# Heartbeat Daemon Multi-Threshold Configuration Example
# ==============================================================================
# This file demonstrates the new multi-threshold configuration feature that allows
# different threshold settings for different hosts/clients.
#
# Features:
# - Define multiple named threshold configurations
# - Map specific hosts to specific threshold configurations
# - Set a default configuration for unmapped hosts
# - Backward compatible with single threshold configuration
# ==============================================================================
# Global threshold settings
threshold_renotify_interval: 3600 # Re-notify every hour for ongoing alerts (seconds)
# Optional: Set default threshold config (defaults to "default" if not specified)
default_threshold_config: "default"
# ----------------------------------------------------------------------------
# Multiple Named Threshold Configurations
# ----------------------------------------------------------------------------
# Define multiple threshold configurations with different sensitivity levels
threshold_configs:
# Default configuration - moderate thresholds for most servers
default:
thresholds:
cpu_monitor:
cpu_percent:
warning: 80.0
critical: 90.0
operator: ">"
load_1min:
warning: 4.0
critical: 8.0
operator: ">"
memory_monitor:
percent:
warning: 85.0
critical: 95.0
operator: ">"
disk_monitor:
partitions:
/:
percent:
warning: 85.0
critical: 95.0
operator: ">"
rtt:
# RTT thresholds (applies to all hosts)
warning: 50.0 # ms
critical: 200.0
# High sensitivity configuration - lower thresholds for critical systems
high_sensitivity:
thresholds:
cpu_monitor:
cpu_percent:
warning: 60.0 # Alert earlier
critical: 75.0
operator: ">"
hysteresis: 0.15 # More hysteresis to reduce flapping
load_1min:
warning: 2.0
critical: 4.0
operator: ">"
memory_monitor:
percent:
warning: 75.0 # Alert at lower memory usage
critical: 85.0
operator: ">"
display: "(threshold: {op_symbol} {threshold_value}%, total: {total_gb} GB)"
disk_monitor:
partitions:
/:
percent:
warning: 75.0
critical: 85.0
operator: ">"
/var:
percent:
warning: 80.0
critical: 90.0
operator: ">"
rtt:
warning: 30.0
critical: 100.0
# Low sensitivity configuration - higher thresholds for development/test systems
low_sensitivity:
thresholds:
cpu_monitor:
cpu_percent:
warning: 90.0 # Only alert at very high usage
critical: 95.0
operator: ">"
memory_monitor:
percent:
warning: 90.0
critical: 98.0
operator: ">"
disk_monitor:
partitions:
/:
percent:
warning: 90.0
critical: 95.0
operator: ">"
rtt:
warning: 100.0
critical: 500.0
# Production database servers - specialized thresholds
database:
thresholds:
cpu_monitor:
cpu_percent:
warning: 70.0
critical: 85.0
operator: ">"
memory_monitor:
percent:
warning: 90.0 # Databases can use high memory
critical: 97.0
operator: ">"
display: "(threshold: {op_symbol} {threshold_value}%, total: {total_gb} GB, available: {available_gb} GB)"
disk_monitor:
partitions:
/:
percent:
warning: 80.0
critical: 90.0
operator: ">"
/var/lib/mysql: # Database data partition
percent:
warning: 75.0 # Alert earlier for DB partition
critical: 85.0
operator: ">"
rtt:
warning: 20.0 # Stricter latency requirements
critical: 50.0
# ----------------------------------------------------------------------------
# Host to Threshold Configuration Mapping
# ----------------------------------------------------------------------------
# Map specific hosts to specific threshold configurations
# ----------------------------------------------------------------------------
# Notification Channels
# ----------------------------------------------------------------------------
# Define notification providers centrally with their credentials
# Each channel has a type (pushover, email, signal, mattermost) and type-specific config
notification_channels:
# Signal notifications
signal_ops:
type: signal
cli_path: /usr/local/bin/signal-cli
user: +1234567890
recipient: +1234567890
signal_oncall:
type: signal
cli_path: /usr/local/bin/signal-cli
user: +1234567890
recipient: +0987654321
# Email notifications
email_ops:
type: email
recipients: [ops@example.com, alerts@example.com]
sender: heartbeat@example.com
smtp_server: smtp.example.com
smtp_port: 587
smtp_user: heartbeat@example.com
smtp_password: your-smtp-password
# Pushover notifications
pushover_urgent:
type: pushover
token: your-pushover-app-token
user: your-pushover-user-key
# Mattermost notifications
mattermost_devops:
type: mattermost
host: mattermost.example.com
token: your-webhook-token
channel: devops-alerts
username: heartbeat-bot
icon: https://example.com/heartbeat-icon.png
# Default notification channels (used if host doesn't specify channels)
default_notification_channels: [email_ops]
# ----------------------------------------------------------------------------
# Host Definitions (New Unified Format)
# ----------------------------------------------------------------------------
# Define hosts with threshold configs, monitoring, DNS, and notification settings
hosts:
# Critical production servers - high sensitivity, multiple notification channels
prod-web-01:
threshold_config: high_sensitivity
watch: true
notification_channels: [signal_oncall, pushover_urgent, email_ops]
dyndns: false
prod-web-02:
threshold_config: high_sensitivity
watch: true
notification_channels: [signal_oncall, pushover_urgent, email_ops]
dyndns: false
prod-api-01:
threshold_config: high_sensitivity
watch: true
notification_channels: [signal_oncall, email_ops]
dyndns: false
# Database servers - database-specific thresholds
prod-db-01:
threshold_config: database
watch: true
notification_channels: [signal_ops, email_ops]
dyndns: false
prod-db-02:
threshold_config: database
watch: true
notification_channels: [signal_ops, email_ops]
dyndns: false
prod-db-replica:
threshold_config: database
watch: true
notification_channels: [email_ops] # Replica gets email only
dyndns: false
# Development servers - low sensitivity, minimal notifications
dev-server-01:
threshold_config: low_sensitivity
watch: false # Don't monitor dev servers closely
notification_channels: [email_ops]
dyndns: false
dev-server-02:
threshold_config: low_sensitivity
watch: false
notification_channels: [email_ops]
dyndns: false
# Test servers
test-server-01:
threshold_config: low_sensitivity
watch: false
dyndns: false
# No notification channels - uses default_notification_channels
# Home server with dynamic DNS
home-server:
threshold_config: default
watch: true
notification_channels: [signal_ops]
dyndns: true # Update DNS when IP changes
# Hosts not listed in the hosts section will use:
# - default_threshold_config for thresholds (falls back to "default")
# - default_notification_channels for notifications
# ----------------------------------------------------------------------------
# Notes on Configuration Structure
# ----------------------------------------------------------------------------
#
# All configuration is centralized in the hosts section. Each host can specify:
# - threshold_config: Name of threshold configuration to use
# - watch: Whether to monitor this host actively (send notifications)
# - notification_channels: List of channels to use for this host
# - dyndns: Whether to update DNS when IP address changes
#
# Notification channels are defined once at the top level and referenced
# by name in host definitions, allowing easy reuse and updates.
#
# For hosts not explicitly listed, the system will still accept heartbeats
# and track their state, but won't apply thresholds or send notifications
# unless default settings are configured.
+111
View File
@@ -0,0 +1,111 @@
# Heartbeat Configuration Example with Nagios Plugin Runner
# This example shows how to configure the Nagios Runner plugin
# to execute existing Nagios-compatible monitoring plugins
# Basic server settings (existing config)
hb_port: 50003
hbd_port: 50004
interval: 20
grace: 2
# Plugin configuration
# Each plugin can have its own configuration section
# CPU Monitor Plugin
cpu_monitor:
interval: 300 # Collect every 5 minutes (default)
per_core: false # Set to true to get per-core CPU usage
# Nagios Runner Plugin
nagios_runner:
interval: 300 # Run Nagios plugins every 5 minutes (default)
timeout: 30 # Command execution timeout in seconds
shell: true # Execute commands via shell
# List of Nagios plugins to run
commands:
# Example 1: Check disk space
- name: check_disk_root
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
# Example 2: Check disk space for /home
- name: check_disk_home
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /home
# Example 3: Check system load
- name: check_load
command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
# Example 4: Check process count
- name: check_procs
command: /usr/lib/nagios/plugins/check_procs -w 250 -c 400
# Example 5: Check SSH service
- name: check_ssh
command: /usr/lib/nagios/plugins/check_ssh localhost
# Example 6: Check HTTP service
- name: check_http
command: /usr/lib/nagios/plugins/check_http -H localhost
# Example 7: Check swap usage
- name: check_swap
command: /usr/lib/nagios/plugins/check_swap -w 20% -c 10%
# Example 8: Custom script (Nagios plugin format)
- name: check_custom
command: /usr/local/bin/my_custom_check.sh
# Example 9: Check specific log file
- name: check_logs
command: /usr/lib/nagios/plugins/check_log -F /var/log/syslog -O /var/tmp/check_log.old -q "ERROR"
# Notes:
#
# 1. Nagios Plugin Output Format:
# - Single line: STATUS - Message | performance_data
# - Performance data format: 'label'=value[UOM];[warn];[crit];[min];[max]
#
# 2. Exit Codes:
# - 0 = OK
# - 1 = WARNING
# - 2 = CRITICAL
# - 3 = UNKNOWN
#
# 3. Performance Data:
# - Automatically parsed and included in heartbeat data
# - Metrics are stored as: {plugin_name}_{metric_name}
# - Example: check_disk_root_/ will contain the disk usage percentage
#
# 4. Overall Status:
# - The plugin reports the worst status from all commands
# - Useful for quick health checks
#
# 5. Plugin Paths:
# Common Nagios plugin directories:
# - Debian/Ubuntu: /usr/lib/nagios/plugins/
# - RHEL/CentOS: /usr/lib64/nagios/plugins/
# - Custom installs: /usr/local/nagios/libexec/
#
# 6. Installing Nagios Plugins:
# Debian/Ubuntu: sudo apt-get install nagios-plugins
# RHEL/CentOS: sudo yum install nagios-plugins-all
# Arch Linux: sudo pacman -S monitoring-plugins
#
# 7. Writing Custom Nagios Plugins:
# Any script can be a Nagios plugin if it:
# - Returns appropriate exit codes (0-3)
# - Prints status message to stdout
# - Optionally includes performance data after "|"
#
# Example custom plugin (save as /usr/local/bin/check_example.sh):
# #!/bin/bash
# if [ $(uptime | awk '{print $1}') -gt 50 ]; then
# echo "CRITICAL - Too many users | users=52;40;50;0"
# exit 2
# else
# echo "OK - Normal user count | users=25;40;50;0"
# exit 0
# fi
+254
View File
@@ -0,0 +1,254 @@
# ==============================================================================
# Heartbeat Daemon Threshold Configuration Example
# ==============================================================================
# This file demonstrates threshold configuration for the Heartbeat monitoring system.
# Thresholds can be defined for any metric collected by monitoring plugins.
#
# Threshold levels:
# - WARNING: First level of concern, typically for early notification
# - CRITICAL: Severe condition requiring immediate attention
#
# Alert notifications are sent when:
# - A metric crosses from OK to WARNING or CRITICAL
# - A metric crosses from WARNING to CRITICAL
# - A metric recovers (returns to a lower severity level)
#
# Re-notifications are sent for ongoing alerts based on threshold_renotify_interval.
# ==============================================================================
# Global threshold settings
threshold_renotify_interval: 3600 # Re-notify every hour for ongoing alerts (seconds)
# Threshold definitions per plugin
thresholds:
# ----------------------------------------------------------------------------
# CPU Monitor Thresholds
# ----------------------------------------------------------------------------
cpu_monitor:
# Overall CPU usage percentage (0-100)
cpu_percent:
warning: 80.0 # Warn when CPU usage exceeds 80%
critical: 90.0 # Critical when CPU usage exceeds 90%
operator: ">" # Alert when value is GREATER than threshold
hysteresis: 0.1 # 10% hysteresis to prevent flapping
enabled: true
# 1-minute load average
load_1min:
warning: 4.0 # Warn when 1-min load exceeds 4.0
critical: 8.0 # Critical when 1-min load exceeds 8.0
operator: ">"
hysteresis: 0.15 # 15% hysteresis
enabled: true
# 5-minute load average
load_5min:
warning: 3.0
critical: 6.0
operator: ">"
hysteresis: 0.15
enabled: true
# 15-minute load average
load_15min:
warning: 2.0
critical: 4.0
operator: ">"
hysteresis: 0.15
enabled: true
# ----------------------------------------------------------------------------
# Memory Monitor Thresholds
# ----------------------------------------------------------------------------
memory_monitor:
# Memory usage percentage
percent:
warning: 85.0 # Warn at 85% memory usage
critical: 95.0 # Critical at 95% memory usage
operator: ">"
hysteresis: 0.1
enabled: true
# Available memory in MB (inverse threshold - alert when LOW)
available_mb:
warning: 1000 # Warn when less than 1GB available
critical: 500 # Critical when less than 500MB available
operator: "<" # Alert when value is LESS than threshold
hysteresis: 0.1
enabled: true
# Swap usage percentage
swap_percent:
warning: 50.0 # Warn at 50% swap usage
critical: 80.0 # Critical at 80% swap usage
operator: ">"
hysteresis: 0.1
enabled: true
# ----------------------------------------------------------------------------
# Disk Monitor Thresholds
# ----------------------------------------------------------------------------
disk_monitor:
# Partition-specific thresholds
# Use the mount point as the key
partitions:
# Root filesystem
/:
percent:
warning: 80.0 # Warn at 80% disk usage
critical: 90.0 # Critical at 90% disk usage
operator: ">"
hysteresis: 0.05 # 5% hysteresis for disk (more stable)
enabled: true
free_gb:
warning: 10.0 # Warn when less than 10GB free
critical: 5.0 # Critical when less than 5GB free
operator: "<"
hysteresis: 0.1
enabled: true
# Home filesystem (if separate partition)
/home:
percent:
warning: 85.0
critical: 95.0
operator: ">"
hysteresis: 0.05
enabled: true
# Var filesystem (logs, etc.)
/var:
percent:
warning: 80.0
critical: 90.0
operator: ">"
hysteresis: 0.05
enabled: true
free_gb:
warning: 5.0 # Var needs space for logs
critical: 2.0
operator: "<"
hysteresis: 0.1
enabled: true
# ----------------------------------------------------------------------------
# Network Monitor Thresholds
# ----------------------------------------------------------------------------
network_monitor:
# Total error count across all interfaces
errors_total:
warning: 100 # Warn at 100 errors
critical: 1000 # Critical at 1000 errors
operator: ">"
hysteresis: 0.2 # 20% hysteresis for counters
enabled: true
# Total dropped packets
dropin_total:
warning: 50
critical: 200
operator: ">"
hysteresis: 0.2
enabled: true
dropout_total:
warning: 50
critical: 200
operator: ">"
hysteresis: 0.2
enabled: true
# TCP connections in TIME_WAIT state
connections_TIME_WAIT:
warning: 1000 # Warn at 1000 TIME_WAIT connections
critical: 5000 # Critical at 5000 TIME_WAIT connections
operator: ">"
hysteresis: 0.2
enabled: true
# Total established connections
connections_ESTABLISHED:
warning: 500
critical: 1000
operator: ">"
hysteresis: 0.1
enabled: true
# ----------------------------------------------------------------------------
# Nagios Plugin Thresholds (if using nagios_runner)
# ----------------------------------------------------------------------------
nagios_runner:
# Nagios plugins report exit codes:
# 0 = OK, 1 = WARNING, 2 = CRITICAL, 3 = UNKNOWN
# We can threshold on the exit_code directly
exit_code:
warning: 1 # Map Nagios WARNING to our WARNING
critical: 2 # Map Nagios CRITICAL to our CRITICAL
operator: ">=" # Alert when exit code >= threshold
hysteresis: 0.0 # No hysteresis for exit codes
enabled: true
# ==============================================================================
# Notification Configuration
# ==============================================================================
# Configure notification methods (email, pushover, etc.)
# These are used when threshold violations occur
# Email notifications
toemail:
- admin@example.com
- oncall@example.com
fromemail: heartbeat@example.com
smtpserver: smtp.example.com
smtpport: 587
smtpuser: heartbeat@example.com
smtppassword: your-password-here
# Pushover notifications (optional)
# pushover_token: your-pushover-app-token
# pushover_user: your-pushover-user-key
# Mattermost webhook (optional)
# mattermost_url: https://mattermost.example.com/hooks/your-webhook-id
# ==============================================================================
# Watched Hosts
# ==============================================================================
# Hosts in this list will trigger notifications for:
# - Heartbeat timeouts/overdue
# - Threshold violations
# - Boot messages
watchhosts:
- webserver01
- database01
- mailserver
- critical-app
# ==============================================================================
# Additional Server Settings
# ==============================================================================
hb_port: 50003 # UDP port for heartbeat messages
hbd_port: 50004 # HTTP port for web interface
grace: 10 # Grace period for overdue detection (seconds)
debug: 0 # Debug level (0-3)
verbose: false # Verbose output
# Journal settings (message logging)
journal_enabled: true
journal_path: /var/log/heartbeat/messages.journal
journal_max_size: 104857600 # 100MB before rotation
journal_max_backups: 10
# ==============================================================================
# Example: Production Configuration with Conservative Thresholds
# ==============================================================================
# For production systems, consider:
# - Higher warning thresholds to reduce alert fatigue
# - Appropriate hysteresis values (5-15% typical)
# - Re-notification intervals matching on-call rotation
# - Multiple escalation contacts
# - Integration with incident management systems
# ==============================================================================
-602
View File
@@ -1,602 +0,0 @@
#!/usr/bin/env python3
# $Id: hbc,v 1.9 2012/03/29 02:08:36 andreas Exp $
# NEW
import argparse
import sys
import time
import socket
import os
import signal
import select
import traceback
from hashlib import md5
import shutil
import zlib
import subprocess
import syslog
import codecs
from .config import load_config
PORT = 50003
INTERVAL = 10
REOPENC = 6
PIDFILE = "/tmp/hbc.pid"
VER = 6
MAXRECV = 32767
running = True
dorestart = False
warned1 = False
msgonly = False
helpflag = False
verbose = False
fdaemon = False
daemonized = False
msgboot = {}
home = os.environ["HOME"]
configfile = "%s/.hbrc" % home
cmdargs = []
iam = socket.gethostname()
def log(msg):
if fdaemon:
syslog.syslog(syslog.LOG_ERR, msg)
else:
print(msg)
def handler(signum, frame):
if signum == signal.SIGTERM:
cleanup()
class NullDevice:
def write(self, s):
pass
class Conn:
def __init__(self, conId, addr, port, af):
self.conId = conId
self.addr = addr
self.port = port
self.af = af
self.ackcount = 0 # num of accks received
self.lastack = 0 # time() last ACK was received
self.send = 0
self.lastsend = 0 # time() last msg was sent
self.rtts = [0]
self.sock = None
def __str__(self):
return "Con(%s, %s %s)" % (self.addr, self.port, self.af)
def open(self):
self.sock = socket.socket(self.af, socket.SOCK_DGRAM)
self.sock.setsockopt(
socket.SOL_SOCKET,
socket.SO_REUSEADDR,
self.sock.getsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR) | 1,
)
def sendto(self, msg, ID="HTB"): # default ID is HearTBeat
global warned1
if self.send % REOPENC == 0:
self.close()
if not self.sock:
self.open()
msg["name"] = shortname(iam)
msg["id"] = self.conId
msg["ver"] = VER
msg["time"] = time.time()
m = dicttos(ID, msg) # always compress
if verbose:
log("conn.send('%s', (%s:%s) %s)" % (msg, self.addr, self.port, len(m)))
try:
self.sock.sendto(m, (self.addr, self.port))
except socket.error as e:
if not warned1:
log("socket error: %s %s:%s" % (e, self.addr, self.port))
warned1 = True
self.close()
return
self.send += 1
self.lastsend = time.time()
def ack(self, msgDict, now):
try:
self.lastack = msgDict["time"]
mul = 2
except Exception:
self.lastack = now
mul = 1
rtt = (self.lastack - self.lastsend) * mul
if verbose:
log("ack RTT: %0.1f ms (now %s)" % (rtt * 1000.0, now))
self.rtts.append(rtt * 1000.0)
if len(self.rtts) > 10:
del self.rtts[0]
self.ackcount += 1
def close(self):
if self.sock:
self.sock.close()
self.sock = None
def shortname(name):
r = name.split(".")
return r[0]
def dicttos(ID, d):
s = []
for k in d:
if isinstance(d[k], float):
s.append("%s=%0.5f" % (k, d[k]))
else:
s.append("%s=%s" % (k, d[k]))
pk = ";".join(s)
zpk = zlib.compress(pk.encode(), 6)
ID = "!" + ID + ":"
return ID.encode() + zpk
def stodict(msg):
d = {}
if len(msg) > 0 and chr(msg[0]) == "!":
pk = zlib.decompress(msg[5:]).decode()
d["ID"] = msg[1:4].decode()
else:
r0 = msg.split(":", 1)
pk = r0[1]
d["ID"] = r0[0]
r = pk.split(";")
for v in r:
vr = v.split("=", 1)
k = vr[0].strip()
if len(vr) == 1:
d[k] = None
else:
v = vr[1].strip()
try:
v = eval(v)
except Exception:
pass
d[k] = v
if verbose:
print("msg is %s" % d)
return d
def XXstodict(msg):
d = {}
r0 = msg.split(":", 1)
if len(r0) == 1:
return None
if r0[0][0] == "!": # compressed
pk = zlib.decompress(msg[len(r0[0]) + 1 :])
d["ID"] = r0[0][1:]
else:
pk = r0[1]
d["ID"] = r0[0]
r = pk.split(";")
for v in r:
vr = v.split("=", 1)
k = vr[0].strip()
if len(vr) == 1:
d[k] = None
else:
v = vr[1].strip()
try:
if v[0].isdigit():
v = eval(v)
except Exception:
pass
d[k] = v
return d
def syslogtrace(note):
logm = "%s hbc died: \n%s" % (note, traceback.format_exc())
log(logm)
for line in logm.split("\n"):
syslog.syslog(syslog.LOG_ERR, " tb: %s" % line)
if verbose:
print(logm)
conId = 1
def createConnections(hosts):
global conId
for host in hosts:
if verbose:
log("createConnections for %s" % host)
try:
rs = socket.getaddrinfo(host, hb_port, 0, 0, socket.SOL_UDP)
except socket.gaierror:
logm = "%s hbc died: \n%s" % ("createConnections", traceback.format_exc())
if verbose:
log(logm)
return None
for r in rs:
if verbose:
log("address %s" % str(r))
if r[0] in [10, 24, 28, 30]: # for Linux, NetBSD, FreeBSD
af = socket.AF_INET6
elif r[0] == 2:
af = socket.AF_INET
else:
print("dont know this net type: %s" % r[0][0])
sys.exit(1)
addr = r[4][0]
conns[conId] = Conn(conId, addr, hb_port, af)
if verbose:
print("cons[%s] = %s" % (conId, str(conns[conId])))
conId += 1
def doexec(conn, data):
try:
ro = subprocess.check_output(
data, stderr=subprocess.STDOUT, shell=True
).decode()
fail = "OK"
except subprocess.CalledProcessError as e:
ro = str(e)
fail = "CalledProcessError"
except Exception as e:
syslogtrace("System")
ro = "N/A"
fail = "cmd failed: %s" % e
msg = {"service": "command", "msg": fail + " " + ro}
conns[conn].sendto(msg)
def doupdate(conn, msgDict):
fail = None
try:
code = codecs.decode(msgDict["code"], "base64").decode()
csum = msgDict["csum"]
except Exception as e:
fail = "csum/code missing: %s" % e
if not fail:
fail = doupdateone(code, csum)
msg = {"service": "update", "msg": fail if fail else "OK"}
conns[conn].sendto(msg)
if not fail:
log("hc updates, fs = %s" % (len(code)))
return fail
def doupdateone(code, csum):
m = md5()
m.update(code.encode())
icsum = m.hexdigest()
if icsum != csum:
return "checksum error"
fn = sys.argv[0]
ofn = "%s.sav" % fn
try:
shutil.copy2(fn, ofn)
except Exception as e:
return "cannot make backup copy: %s" % e
try:
fh = open(fn, "w")
fh.write(code)
fh.close()
except Exception as e:
return "cannot write new code: %s" % e
return None
def restart():
if verbose:
print("restart: execv %s %s" % (sys.argv[0], [sys.argv[0]] + cmdargs))
syslog.syslog(syslog.LOG_ERR, "restart %s" % (sys.argv[0]))
e = "fallthrough"
try:
os.execv(sys.argv[0], [sys.argv[0]] + cmdargs)
except Exception:
pass
print("should not be here:", str(e))
log("restart failed: %s" % e)
def process():
global running, dorestart
nextReport = time.time()
while running:
while time.time() < nextReport:
ifiles = {}
conIds = {}
for conn in conns:
if conns[conn].sock:
ifiles[conns[conn].sock.fileno()] = conns[conn].sock
conIds[conns[conn].sock.fileno()] = conn
sleep = nextReport - time.time()
if sleep <= 0:
break
try:
r = select.select(list(ifiles.keys()), [], [], sleep)
now = (
time.time()
) # nb: delay from actual packet arrival to select is ca. 105ms!
except KeyboardInterrupt:
running = False
break
except SystemExit:
log("daemon exit, running was %s" % running)
if running:
running = False
break
except Exception:
if running:
syslogtrace("select")
running = False
break
for rfh in r[0]:
conn = conIds[rfh]
data, addr = ifiles[rfh].recvfrom(MAXRECV)
if verbose:
print("sock.recvfrom: %s (%s) %s" % (addr, len(data), data[:4]))
try:
msgDict = stodict(data)
except Exception as e:
print(
"failed to parse incoming data from %s: %s (%s)"
% (addr, data, e)
)
continue
if verbose:
print(
"sock.recvfrom: %s (%s) %s"
% (addr, len(data), str(msgDict)[:80])
)
if msgDict is None:
print("bad backet from %s (%s) %s" % (addr, len(data), data))
elif msgDict["ID"] == "ACK":
conns[conn].ack(msgDict, now)
elif msgDict["ID"] == "UPD":
if doupdate(conn, msgDict) is None:
if verbose:
print("process: restart after update")
dorestart = True
break
elif msgDict["ID"] == "CMD":
doexec(conn, msgDict["cmd"])
else:
doexec(conn, data) # deprecated until no more VER - hbc
if dorestart:
running = False
break
if not running:
break
for conn in conns:
msg = {"acks": conns[conn].ackcount, "rtt": conns[conn].rtts[-1]}
conns[conn].sendto(msg)
time.sleep(
0.1
) # N.B. Linux (i.e. Rasperry Pi 3 drops the second pkg unless delayed
if nextReport + interval >= time.time():
nextReport += interval
else:
nextReport = time.time() + interval
if verbose:
log("process: done running")
def cleanup():
global running
if not running:
return
if verbose:
log("cleanup")
running = False
for conn in conns:
msg = {"shutdown": 1, "acks": conns[conn].ackcount}
conns[conn].sendto(msg)
conns[conn].close()
time.sleep(1)
closeall()
def closeall():
if verbose:
syslog.syslog(syslog.LOG_ERR, "closecall")
for conn in conns:
conns[conn].close()
def daemonize(
working_dir="/", stdin="/dev/zero", stdout="/dev/null", stderr="/dev/null"
):
"""
Does the UNIX double-fork magic, see Stevens' "Advanced Programming in the
UNIX Environment" for details (ISBN 0201563177)
http://www.yendor.com/programming/unix/apue/proc/fork2.c
"""
try:
# first fork
pid = os.fork()
if pid > 0:
# exit from first parent
os._exit(0)
except OSError as e:
sys.stderr.write("fork #1 failed: %d (%s)\n" % (e.errno, e.strerror))
os._exit(1)
# decouple from parent environment
os.chdir(working_dir)
os.setsid()
os.umask(0)
# second fork
try:
pid = os.fork()
if pid > 0:
# exit from second parent
os._exit(0)
except OSError as e:
sys.stderr.write("fork #2 failed: %d (%s)\n" % (e.errno, e.strerror))
sys.exit(1)
# redirects standard file descriptors
sys.stdout.flush()
sys.stderr.flush()
si = open(stdin, "r")
so = open(stdout, "a+")
se = open(stderr, "a+")
os.dup2(si.fileno(), sys.stdin.fileno())
os.dup2(so.fileno(), sys.stdout.fileno())
os.dup2(se.fileno(), sys.stderr.fileno())
#
# Main program
#
def build_parser():
parser = argparse.ArgumentParser(
prog="hbc",
description="HeartBeatClient - send a heatbeat message to a HeartBeatDaemon",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument("-b", "--boot", action="store_true", help="Send a boot message")
parser.add_argument(
"-c", "--config", dest="configfile", help="Config file path (YAML)"
)
parser.add_argument("-m", "--message", dest="message", help="Send a message")
parser.add_argument(
"-n", "--name", dest="name", help="Name to use in heartbeat message"
)
parser.add_argument(
"-f", "--daemon", action="store_true", help="Run in daemon mode"
)
parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output")
parser.add_argument(
"-x", "--debug", action="count", default=0, help="Increase debug level"
)
parser.add_argument("hosts", nargs="+", help="Heartbeat daemon hosts to send to")
return parser
def main(argv=None):
global msgonly, verbose, fdaemon, daemonized, cmdargs, iam, hb_port, conns, interval, hb_hosts
parser = build_parser()
args = parser.parse_args(argv)
config = load_config(args.configfile)
# Apply CLI overrides
if args.boot:
msgboot["boot"] = 1
if args.message:
msgboot["service"] = "service"
msgboot["msg"] = args.message
msgonly = True
if args.name:
iam = args.name
cmdargs += ["-n", iam]
if args.daemon:
fdaemon = True
if args.verbose:
verbose = True
cmdargs.append("--verbose")
if args.debug:
config.setdefault("debug", 0)
config["debug"] += args.debug
cmdargs.append("-" + "x" * args.debug)
if verbose:
print("cmdargs for restart are %s" % cmdargs)
#
# set defaults
hb_hosts = args.hosts
hb_port = config.get("hb_port", PORT)
interval = config.get("interval", INTERVAL)
#
if verbose:
print("notice: hb_hosts: %s" % str(hb_hosts))
print("notice: hb_port: %s" % hb_port)
print("notice: interval: %s" % interval)
print("notice: iam: %s" % iam)
print("notice: msgonly: %s" % msgonly)
print("notice: msgboot: %s" % msgboot)
if not msgonly:
msgboot["interval"] = interval
conns = {}
while True:
if verbose:
log("create connections")
createConnections(hb_hosts)
if len(conns) != 0:
break
if verbose:
log("no connections yet, sleep a bit")
time.sleep(2)
if verbose:
log("%s connections created" % (len(conns)))
if len(msgboot) > 0:
if verbose:
print("on boot")
msgboot["acks"] = 0
for conn in conns:
conns[conn].sendto(msgboot)
if msgonly:
if verbose:
print("msgboot done msgonly=%s" % msgonly)
closeall()
sys.exit(0)
#
syslog.openlog("hbc", syslog.LOG_PID, syslog.LOG_DAEMON)
if fdaemon:
print("daemoinizing.")
daemonize()
daemonized = True
syslog.syslog(syslog.LOG_ERR, "starting heartbeat to %s" % ",".join(hb_hosts))
signal.signal(signal.SIGTERM, handler)
try:
process()
except Exception as e:
syslogtrace("process")
if verbose:
print("err: process exit: %s" % e)
if verbose:
log("main: cleanup")
cleanup()
if dorestart:
restart()
if __name__ == "__main__":
main()
-381
View File
@@ -1,381 +0,0 @@
"""
host and connection class shared between hbd and
the websit's heartbeat.py
"""
import time
import json
import copy
import queue
num = 0
MAXRTTS = 10
DEBUG = 2
def log(host, m):
if DEBUG:
print("class log: %s %s" % (host, m))
class Connection:
# map of addrs to names
htab = {}
UNKNOWN = "unknown"
UP = "up"
DOWN = "down"
OVERDUE = "overdue"
def __init__(self, host, cid, addr, afam):
self.host = host
self.cid = cid
if addr[0:7] == "::ffff:":
addr = addr[7:]
self.addr = addr
self.afam = afam
self.rtts = [0]
self.lastbeat = time.time()
self.statetime = self.lastbeat
self.deltastatetime = "computed"
self.state = Connection.UNKNOWN
if host:
Connection.htab[addr] = self.host.name
if self.host.isDynDns():
log(self.host.name, "dns update %s" % self.addr)
Host.dnsQ.put((self.host.name, self.addr))
def registerDns(self):
Host.dnsQ.put((self.host.name, self.addr))
def clearstate(self):
d = {}
d["addr"] = ""
d["rtt"] = ""
d["lastbeat"] = ""
d["state"] = ""
d["statetime"] = ""
d["deltastatetime"] = ""
d["rttstate"] = ""
return d
def statedict(self, Null=False):
d = self.clearstate()
now = time.time()
if not Null:
d["addr"] = self.addr
if self.rtts[-1]:
d["rtt"] = "%0.1f" % self.rtts[-1]
elif self.state == Connection.UNKNOWN:
d["rtt"] = ""
else:
d["rtt"] = "?"
d["lastbeat"] = self.lastbeat
if self.state == Connection.OVERDUE:
d["state"] = "<b>%s</b>" % self.state
else:
d["state"] = self.state
if self.state == Connection.UP:
d["rttstate"] = d["rtt"]
elif self.state == Connection.OVERDUE:
d["rttstate"] = ""
else:
d["rttstate"] = d["state"]
d["statetime"] = time.strftime(
"%Y-%m-%d %H:%M:%S", time.localtime(self.statetime)
)
delta = now - self.statetime
if self.state == Connection.UNKNOWN:
d["deltastatetime"] = ""
elif delta > 86400:
# d['deltastatetime'] = time.strftime("%d %H:%M:%S", time.gmtime(delta))
d["deltastatetime"] = "%0.1f days" % (delta / 86400.0)
elif delta > 3600:
# d['deltastatetime'] = time.strftime("%H:%M:%S", time.gmtime(delta))
d["deltastatetime"] = time.strftime("%k:%M hrs", time.gmtime(delta))
# d['deltastatetime'] = "%0.1f hrs" % (delta / 3600.)
elif delta > 60:
# d['deltastatetime'] = time.strftime("%M:%S", time.gmtime(delta))
d["deltastatetime"] = time.strftime("%M:%S mins", time.gmtime(delta))
# d['deltastatetime'] = "%0.1f mins" % (delta / 60.)
else:
# d['deltastatetime'] = time.strftime("%S", time.gmtime(delta))
d["deltastatetime"] = "%i secs" % (delta)
if self.state == Connection.UNKNOWN and now - self.lastbeat > 86400 * 10:
d = self.clearstate()
return d
def headerdict(self, afam):
d = {}
d["addr"] = "%s Addr" % afam
d["rtt"] = "Latencey"
d["lastbeat"] = "Last Contact"
d["state"] = "State"
d["statetime"] = "Last State"
d["rttstate"] = "Reach"
d["deltastatetime"] = "Last State"
return d
def jsons(self):
return json.dumps(self.__dict__)
# set new state, return number of secs in previous state
def newstate(self, state, now, when=0):
self.state = state
delta = now - when
s = delta - self.statetime
self.statetime = delta
return s
def getstate(self):
return self.state
def newaddr(self, addr, rtt, now):
self.lastbeat = now
self.rtts.append(rtt)
if len(self.rtts) > MAXRTTS:
del self.rtts[0]
if self.addr == addr:
r = None
else:
r = "changed from %s to %s" % (self.addr, addr)
try:
del Connection.htab[self.addr]
except Exception:
pass
self.addr = addr
Connection.htab[addr] = self.host.name
if self.host.isDynDns():
Host.dnsQ.put((self.host.name, self.addr))
return r
#
class Host:
# Table of Hosts
hosts = {}
dnsQ = queue.Queue()
def __init__(self, name):
global num
self.name = name
if name:
num += 1
Host.hosts[name] = self
self.num = num
self.dyn = False
self.watched = False
self.upcount = 0
self.interval = 0
self.doesack = -1
self.cmds = []
self.cver = 0
self.connections = {}
self.hdwcounts = [[0, 0], [0, 0], [0, 0]]
def statedict(self):
d = {}
d["name"] = self.name
if self.dyn:
d["name"] += "*"
if self.watched:
d["name"] = "<b>%s</b>" % d["name"]
d["dyn"] = str(self.dyn)
d["ver"] = str(self.cver)
d["num"] = self.num
for c in ["IPv4", "IPv6"]:
if c in self.connections:
cs = self.connections[c].statedict()
else:
cs = ubConnection.statedict(True)
for csv in cs:
d["%s.%s" % (c, csv)] = cs[csv]
return d
def headerdict(self):
d = {}
d["name"] = "Name"
d["dyn"] = "Dyn"
d["ver"] = "Ver"
d["num"] = "??"
for c in ["IPv4", "IPv6"]:
cs = ubConnection.headerdict(c)
for csv in cs:
d["%s.%s" % (c, csv)] = cs[csv]
return d
def registerDns(self):
for af in self.connections:
self.connections[af].registerDns()
def stateinfo(self):
ddict = {}
for d in self.__dict__:
if d == "connections":
cl = []
for c in ["IPv4", "IPv6"]:
if c not in self.connections:
continue
# dirty ugly hack: fix conn to host backpointer
cld = copy.deepcopy(self.connections[c].__dict__)
cld["host"] = cld["host"].name
cl.append(cld)
ddict[d] = cl
else:
ddict[d] = self.__dict__[d]
return ddict
def jsons(self):
return json.dumps(self.stateinfo())
def setcver(self, cver):
self.cver = cver
def isDynDns(self):
return self.dyn
def isIPv4(self, addr):
if isinstance(addr, tuple):
return addr[0].find(".") > 0
else:
return addr.find(".") > 0
def conndata(self, cid, addr, rtt, now):
if addr[0:7] == "::ffff:":
addr = addr[7:]
if self.isIPv4(addr):
afam = "IPv4"
else:
afam = "IPv6"
if afam not in self.connections:
self.connections[afam] = Connection(self, cid, addr, afam)
conn = self.connections[afam]
res = conn.newaddr(addr, rtt, now)
return conn, res
# called when reloading class from pickle, add new fields here
def fixup(self):
for c in ["IPv4", "IPv6"]:
if c in self.connections:
addr = self.connections[c].addr
if addr[0:7] == "::ffff:":
addr = addr[7:]
self.connections[c].addr = addr
pass
# def dispstate(self):
# if self.state in ["down", "overdue"]:
# state = "<b>%s</b>" % self.state
# elif self.state in ["up", "UP"]:
# state = ""
# for x in list(self.connections.keys()):
# try:
# state += " %5.1f" % (self.connections[x].rtts[-1])
# except:
# state += " %5s" % (self.connections[x].rtts[-1])
# elif self.state in ["unknown", "UNKNOWN"]:
# state = ""
# else:
# state = "%s" % self.state
# return state
def dispstats(self):
if self.doesack != -1:
if self.upcount > 0:
r = ""
for v in range(3):
a, u = self.hdwcounts[v]
if (self.upcount - u) != 0:
vs = "%0.0f" % (
100.0 - (((self.doesack - a) * 100.0) / (self.upcount - u))
)
if vs == "0":
vs = ""
else:
vs = "-"
r += '<td align="right">%s</td>' % vs
return r
else:
return "<td>(%s)</td><td></td><td></td>" % (self.doesack)
return '<td align="right">N/A</td><td></td<td></td>>'
hostfields_long = [
"name",
"IPv4.addr",
"IPv4.state",
("IPv4.rtt", 'style="text-align: right;"'),
("IPv4.statetime", 'style="text-align: right;"'),
"IPv6.addr",
"IPv6.state",
("IPv6.rtt", 'style="text-align: right;"'),
("IPv6.statetime", 'style="text-align: right;"'),
"ver",
]
hostfields_short = [
"name",
("IPv4.rttstate", 'style="text-align: right;"'),
("IPv4.deltastatetime", 'style="text-align: right;"'),
("IPv6.rttstate", 'style="text-align: right;"'),
("IPv6.deltastatetime", 'style="text-align: right;"'),
]
def gene(self, tag, v, attrib=None):
if attrib:
a = " %s" % attrib
else:
a = ""
return "<%s%s>%s</%s>" % (tag, a, v, tag)
def htmltable(self, tag, hd, short):
if short:
hostfields = Host.hostfields_short
else:
hostfields = Host.hostfields_long
h = []
for f in hostfields:
if isinstance(f, tuple):
h.append(self.gene(tag, hd[f[0]], f[1]))
else:
h.append(self.gene(tag, hd[f]))
return self.gene("tr", "\n".join(h))
def buildhosttable(self, short=False):
if DEBUG > 1:
print("DBG buildhosttable: start")
res = []
res.append('<table id="ntable" class="sortable">')
res.append(ubHost.htmltable("th", ubHost.headerdict(), short))
hosts_sorted = list(Host.hosts.keys())
if len(hosts_sorted):
hosts_sorted.sort()
for h in hosts_sorted:
res.append(ubHost.htmltable("td", Host.hosts[h].statedict(), short))
res.append("</table>")
if DEBUG > 1:
print("DBG buildhosttable: %s" % res)
return res
def buildmsgtable(self, msgs):
res = []
le = max(40 - len(Host.hosts), 3)
res.append("<h4>Log of Events</h4>")
for m in msgs[len(msgs) - le :]:
res.append("%s<BR>" % m)
return res
# create fake "unbound objects", remove in Python 3.0
ubHost = Host(None)
ubConnection = Connection(None, "", "", "")
-221
View File
@@ -1,221 +0,0 @@
"""HTTP server implementation using aiohttp and jinja2."""
import asyncio
import json
import time
import urllib.parse
import os
import logging
from aiohttp import web
import jinja2
logger = logging.getLogger(__name__)
def _render_template(html_str: str, **context) -> str:
tmpl = jinja2.Template(html_str)
return tmpl.render(**context)
async def start(
host: str,
port: int,
config,
hbdclass,
msgs_getter,
log=None,
email=None,
pushmsg=None,
msg_to_websockets=None,
tcss=None,
DEBUG=0,
verbose=False,
get_now=None,
VER="",
):
"""Start an aiohttp web server and block until cancelled.
This function is intended to be awaited inside the main asyncio event loop.
"""
get_now = get_now or (lambda: time.time())
async def index(request):
res = []
res.append('<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">')
res.append("<html>")
res.append("<head>")
res.append("<title>Heartbeat</title>")
if tcss:
res.append(tcss)
res.append("</head>")
res.append('<body BGCOLOR = "#FFFFFF" LINK = "#008000" VLINK = "#008000">')
res.append(f"<H2>Heartbeat status {VER}</h2>")
res += hbdclass.ubHost.buildhosttable()
res += hbdclass.ubHost.buildmsgtable(msgs_getter())
res.append(
"<p> %s (%s)</p>"
% (
time.strftime("%H:%M:%S", time.localtime(get_now())),
config.get("tz", "CET-1CDT"),
)
)
res.append("</body></html>")
body = "\n".join(res)
return web.Response(text=body, content_type="text/html")
async def api_hosts(request):
lst = [hbdclass.Host.hosts[h].jsons() for h in hbdclass.Host.hosts]
return web.json_response(json.loads("[" + ",".join(lst) + "]"))
async def api_messages(request):
lst = msgs_getter()[-30:]
return web.json_response(lst)
async def cmd(request):
qa = request.rel_url.query
uname = qa.get("h")
ucmd = qa.get("c")
if not ucmd or not uname:
return web.Response(status=400, text="need h= and c= arguments")
if uname not in hbdclass.Host.hosts:
return web.Response(status=400, text=f"h={uname} not found")
hbdclass.Host.hosts[uname].cmds.append(
("CMD", {"cmd": urllib.parse.unquote(ucmd)})
)
return web.Response(text=f"cmd {uname} queued")
async def drop(request):
qa = request.rel_url.query
uname = qa.get("h")
if not uname:
return web.Response(status=400, text="need h= argument")
if uname not in hbdclass.Host.hosts:
return web.Response(status=400, text=f"h={uname} not found")
if log:
log(uname, "dropped")
del hbdclass.Host.hosts[uname]
return web.Response(text="Done")
async def register(request):
qa = request.rel_url.query
uname = qa.get("h")
if not uname:
return web.Response(status=400, text="need h= argument")
if uname not in hbdclass.Host.hosts:
return web.Response(status=400, text=f"h={uname} not found")
ll = hbdclass.Host.hosts[uname].registerDns()
if log:
log(uname, ll)
return web.Response(text=str(ll))
async def update(request):
qa = request.rel_url.query
uname = urllib.parse.unquote(qa.get("h", ""))
ucode = qa.get("c")
if not ucode or not uname:
return web.Response(status=400, text="need h= and c= arguments")
if uname != "All" and uname not in hbdclass.Host.hosts:
return web.Response(status=400, text=f"h={uname} not found")
if uname != "All":
names = [uname]
else:
names = [n for n in hbdclass.Host.hosts if hbdclass.Host.hosts[n].cver >= 2]
out = []
for n in names:
err = None
try:
r = {"csum": None, "code": ucode}
hbdclass.Host.hosts[n].cmds.append(("UPD", r))
except Exception as e:
err = str(e)
out.append(f"update started for {n}: {err if err else 'OK'}")
return web.Response(text="\n".join(out))
async def restart(request):
# signal main application to perform restart if needed
# not implemented here - return OK
if log:
log(None, "restart request")
return web.Response(text="restart request")
async def live(request):
# render template from hbd/templates/live.html using Jinja2
# Resolve templates directory relative to the hbd package
pkg_dir = os.path.dirname(__file__)
templates_dir = config.get("templates_dir", os.path.join(pkg_dir, "templates"))
env = jinja2.Environment(loader=jinja2.FileSystemLoader(templates_dir))
host = config.get("hb_host", "localhost")
extra_scripts = config.get("http_extra_scripts", "")
host = request.host.split(":")[0]
if config.get("wss_port"):
heartbeat_ws_url = f"wss://{host}:{config['wss_port']}/hbd"
else:
heartbeat_ws_url = f"ws://{host}:{config.get('ws_port', 50005)}/hbd"
tmpl = env.get_template("live.html")
body = tmpl.render(
title="Heartbeat",
header="Heartbeat",
request=request,
heartbeat_ws_url=heartbeat_ws_url,
extra_scripts=extra_scripts,
hosts=[
hbdclass.Host.hosts[h].stateinfo() for h in sorted(hbdclass.Host.hosts)
],
messages=msgs_getter()[-30:],
)
return web.Response(text=body, content_type="text/html")
async def static(request):
"""Serve files from the package static directory.
URL form: /static/<path>
"""
p = request.match_info.get("path", "")
logger.debug("static file requested: %s", p)
base = os.path.abspath(os.path.join(os.path.dirname(__file__), "static"))
# normalize and prevent directory traversal
target = os.path.abspath(os.path.normpath(os.path.join(base, p)))
if not target.startswith(base + os.sep) and target != base:
return web.Response(status=403, text="Forbidden")
if not os.path.exists(target) or not os.path.isfile(target):
return web.Response(status=404, text="Not Found")
logger.info("serving static file: %s", target)
return web.FileResponse(path=target)
async def favicon(request):
"""Serve favicon.ico from the package static directory."""
base = os.path.abspath(os.path.join(os.path.dirname(__file__), "static/images"))
target = os.path.join(base, "favicon.ico")
if not os.path.exists(target) or not os.path.isfile(target):
return web.Response(status=404, text="Not Found")
return web.FileResponse(path=target)
app = web.Application()
app.add_routes(
[
web.get("/", index),
web.get("/api/0/hosts", api_hosts),
web.get("/api/0/messages", api_messages),
web.get("/c", cmd),
web.get("/d", drop),
web.get("/n", register),
web.get("/u", update),
web.get("/r", restart),
web.get("/live", live),
web.get("/static/{path:.*}", static),
web.get("/favicon.ico", favicon),
]
)
runner = web.AppRunner(app)
await runner.setup()
site = web.TCPSite(runner, host, port)
await site.start()
if verbose:
print(f"HTTP server started on {host}:{port}")
try:
await asyncio.Future()
finally:
await runner.cleanup()
-50
View File
@@ -1,50 +0,0 @@
"""monitor helper and thread for heartbeat daemon."""
from __future__ import annotations
import asyncio
import time
DROPOVERDUE = 7 * 24 * 3600
def checkoverdue(
config: dict,
hbdclass,
log: callable,
pushmsg: callable,
msg_to_websockets: callable,
):
now = time.time()
for h in list(hbdclass.Host.hosts.keys()):
pmsg = []
for c in hbdclass.Host.hosts[h].connections:
conn = hbdclass.Host.hosts[h].connections[c]
if conn.state == hbdclass.Connection.DOWN:
continue
timeout = hbdclass.Host.hosts[h].interval + config.get("grace", 10)
if conn.state == hbdclass.Connection.UP and (now - conn.lastbeat) > timeout:
conn.newstate(hbdclass.Connection.OVERDUE, now, config.get("grace", 10))
pmsg.append(conn.afam)
if (
conn.state == hbdclass.Connection.OVERDUE
and (now - conn.lastbeat) > DROPOVERDUE
):
conn.newstate(hbdclass.Connection.UNKNOWN, conn.lastbeat)
if pmsg != []:
if h in config.get("watchhosts", []):
pushmsg("%s %s overdue" % (h, " and ".join(pmsg)))
log(h, "%s overdue" % " and ".join(pmsg))
msg_to_websockets("host", hbdclass.Host.hosts[h].stateinfo())
async def start(
config: dict,
hbdclass: callable,
log=None,
pushmsg=None,
msg_to_websockets=None,
):
"""start a monitor loop that checks for overdue hosts every minute"""
while True:
await asyncio.sleep(15) # 15 seconds between checks
checkoverdue(config, hbdclass, log, pushmsg, msg_to_websockets)
-202
View File
@@ -1,202 +0,0 @@
"""Notification helpers: email, pushover, mattermost, signal and dispatcher."""
import logging
from typing import Optional
import http.client
import urllib.parse
import subprocess
import smtplib
import time
DEFAULT_PUSHPROVIDERS = ["all", "pushover", "mattermost", "signal"]
# module-level configuration set via setup()
_config = {}
logger = logging.getLogger(__name__)
def setup(cfg: dict):
"""Initialize notifier defaults from a configuration dict."""
global _config
_config = dict(cfg)
def send_email(toaddrs, smtpserver, sender, subject, body, debug=0):
"""Send a plain email via SMTP. Returns True on success."""
try:
smtpport = _config.get("smtpport", 587)
server = smtplib.SMTP(smtpserver, smtpport)
if debug > 0:
server.set_debuglevel(1)
if smtpport == 587:
server.starttls()
server.ehlo()
smtpuser = _config.get("smtpuser", None)
smtppassword = _config.get("smtppassword", None)
if smtpuser and smtppassword:
server.login(smtpuser, smtppassword)
server.sendmail(sender, toaddrs, body)
except Exception as e:
logger.warning("email send failed: %s", e)
try:
server.quit()
except Exception:
pass
return False
try:
server.quit()
except Exception:
pass
return True
def email(subject: str, msg: str, debug: int = 0) -> bool:
"""Convenience wrapper exposed to the rest of the application.
Uses module-level configuration to supply recipient list, smtp server
and sender address.
"""
toaddrs = _config.get("toemail")
fromemail = _config.get("fromemail")
smtpserver = _config.get("smtpserver")
if not toaddrs or not fromemail or not smtpserver:
logger.warning(
"email config incomplete: toemail=%s, fromemail=%s, smtpserver=%s",
toaddrs,
fromemail,
smtpserver,
)
return False
date = time.strftime("%a, %d %b %Y %H:%M:%S %z", time.localtime())
body = "To: %s\nFrom: %s\nSubject: %s\nDate: %s\n\n%s" % (
toaddrs[0] if toaddrs else "",
fromemail,
subject,
date,
msg,
)
return send_email(toaddrs, smtpserver, fromemail, subject, body, debug=debug)
def pushover(token: str, user: str, msg: str, debug: int = 0) -> bool:
"""Send message via Pushover API."""
conn = http.client.HTTPSConnection("api.pushover.net:443")
try:
conn.request(
"POST",
"/1/messages.json",
urllib.parse.urlencode({"token": token, "user": user, "message": msg}),
{"Content-type": "application/x-www-form-urlencoded"},
)
r = conn.getresponse()
logger.debug("pushover response: %s %s", r.status, r.reason)
return r.status == 200
except Exception as e:
logger.error("pushover error: %s", e)
return False
def pushmattermost(
host: str,
token: str,
channel: str,
msg: str,
username: str = "hbd",
icon: Optional[str] = None,
debug: int = 0,
) -> bool:
"""Send a message to Mattermost via simple webhook driver if available.
This helper tries to import mattermostdriver.Driver and uses webhooks if present.
If the import fails it returns False.
"""
try:
from mattermostdriver import Driver
except Exception:
return False
ses = {"url": host, "scheme": "http", "basepath": "/api/v4", "port": 8065}
mm = Driver(ses)
payload = {"text": msg, "channel": channel, "username": username}
if icon:
payload["icon_url"] = icon
try:
rc = mm.webhooks.call_webhook(token, payload)
logger.debug("mattermost rc: %s", rc)
return bool(rc is None or rc == "")
except Exception as e:
logger.error("mattermost error: %s", e)
return False
def pushsignal(
signal_cli_bin: str, user: str, recipient: str, msg: str, debug: int = 0
) -> bool:
"""Send a message via signal-cli (requires local installation).
Uses subprocess to call signal-cli. Returns True if the command succeeded.
"""
CLI = [signal_cli_bin, "-u", user, "send", "-m", msg, recipient]
logger.debug("signal cli: %s", CLI)
try:
res = subprocess.run(CLI, capture_output=True)
if res.returncode != 0:
logger.error("signal failed: %s".res.stderr.decode())
return False
logger.debug("signal sent: %s", res.stdout.decode())
return True
except Exception as e:
logger.exception("signal exception: %s", e)
return False
def pushmsg(cfg: dict, msg: str, debug: int = 0):
"""Dispatch push notifications according to `cfg['pushsrv']`.
cfg is expected to contain keys for different services when needed, e.g.
- cfg['pushsrv'] : one of 'all', 'pushover', 'mattermost', 'signal'
- cfg['pushover_token'], cfg['pushover_user']
- cfg['matter_host'], cfg['matter_token'], cfg['matter_channel']
- cfg['signal_cli'], cfg['signal_user'], cfg['signal_recipient']
Returns a dict of results per provider.
"""
results = {}
p = cfg.get("pushsrv", "pushover")
if p in ("all", "pushover"):
ok = pushover(
cfg.get("pushover_token", ""),
cfg.get("pushover_user", ""),
msg,
debug=debug,
)
results["pushover"] = ok
if p in ("all", "mattermost"):
ok = pushmattermost(
cfg.get("matter_host", ""),
cfg.get("matter_token", ""),
cfg.get("matter_channel", ""),
msg,
username=cfg.get("matter_username", "hbd"),
icon=cfg.get("matter_icon"),
debug=debug,
)
results["mattermost"] = ok
if p in ("all", "signal"):
ok = pushsignal(
cfg.get("signal_cli", "/usr/local/bin/signal-cli"),
cfg.get("signal_user", ""),
cfg.get("signal_recipient", ""),
msg,
debug=debug,
)
results["signal"] = ok
if p in ("all", "email"):
ok = email("Heartbeat notification", msg, debug=debug)
results["email"] = ok
logger.debug("push results: %s", results)
return results
def pushmsg_from_config(msg: str, debug: int = 0) -> dict:
"""Use the module-level configuration dict to dispatch a push message."""
return pushmsg(_config, msg, debug=debug)
-82
View File
@@ -1,82 +0,0 @@
"""Message encoding/decoding utilities for hbd protocol."""
from typing import Dict, Any
import zlib
def dicttos(ID: str, d: Dict[str, Any], compress: bool = False):
"""Serialize a dict to protocol message bytes.
If compress is True, the payload is zlib-compressed and the message is
prefixed with `!ID:` as the original script did. Otherwise the format is
`ID:key=value;...` (bytes).
"""
s = []
for k in d:
v = d[k]
if isinstance(v, float):
s.append(f"{k}={v:0.5f}")
else:
s.append(f"{k}={v}")
pk = ";".join(s)
if compress:
zpk = zlib.compress(pk.encode(), 6)
hdr = ("!" + ID + ":").encode()
return hdr + zpk
else:
return (ID + ":" + pk).encode()
def stodict(msg: bytes):
"""Deserialize a protocol message into a dict.
Mirrors original behaviour: detects compressed messages starting with
'!' and decodes accordingly. Returns a dict with key 'ID' set to the
message ID and the parsed key/value pairs.
"""
d = {}
if len(msg) > 0 and chr(msg[0]) == "!":
# message is: b'!ID:' + compressed_payload
# original code used msg[1:4].decode() for ID (3 bytes including colon)
try:
pk = zlib.decompress(msg[5:]).decode()
except Exception:
# malformed compressed payload
return {}
d["ID"] = msg[1:4].decode()
else:
try:
r0 = msg.split(b":", 1)
pk = r0[1].decode()
d["ID"] = r0[0].decode()
except Exception:
return {}
if not pk:
return d
parts = pk.split(";")
for v in parts:
if not v:
continue
vr = v.split("=", 1)
k = vr[0].strip()
if len(vr) == 1:
d[k] = None
else:
val = vr[1].strip()
if val and val[0].isdigit():
try:
val_e = eval(val)
except Exception:
val_e = val
d[k] = val_e
else:
d[k] = val
return d
def oldmtodict(msg: bytes):
"""Compatibility wrapper for old-style messages (no ID prefix).
The original implementation prefixed with 'HTB:' and called stodict.
"""
return stodict(b"HTB:" + msg)
+3
View File
@@ -0,0 +1,3 @@
"""HeartBeat Daemon (hbd) - Server/daemon component."""
__version__ = "5.0.5"
+5 -5
View File
@@ -3,7 +3,7 @@
import argparse
from .config import load_config
from .server import run as run_server
from .main import run as run_server
PUSHSRVS = ["all", "pushover", "mattermost"]
@@ -43,11 +43,11 @@ def main(argv=None):
config["verbose"] = True
if args.pushsrv:
config["pushsrv"] = args.pushsrv
if args.debug:
config.setdefault("debug", 0)
config["debug"] += args.debug
if args.debug > 0:
config["debug"] = args.debug
run_server(config)
# Pass config_path for reloading support
run_server(config, config_path=args.configfile)
if __name__ == "__main__":
+340
View File
@@ -0,0 +1,340 @@
"""Configuration loader and defaults for hbd (HeartBeat Daemon/Server)."""
import asyncio
import logging
import os
try:
import yaml
except Exception:
yaml = None
SERVER_DEFAULTS = {
# Network settings
"hb_port": 50003, # Port to listen for heartbeats
"hbd_port": 50004, # HTTP API port
"hbd_host": "", # Bind address (empty = all interfaces)
# Persistence
"pickfile": "/tmp/hb.pick",
# Logging
"logfile": "/var/log/heartbeat.log",
"logfmt": "text", # text or msg or json
# Notification channels
"notification_channels": {}, # Named channels with type and credentials
"default_notification_channels": [], # Default channels if host doesn't specify
# Monitoring settings
"interval": 20, # Expected heartbeat interval (for server checks)
"grace": 2, # Grace multiplier (interval * grace = timeout)
"threshold_renotify_interval": 3600, # Seconds between threshold re-notifications
# Host management
"hosts": {}, # New unified host definitions (optional)
"watchhosts": [], # Hosts to monitor and notify about (legacy)
"dyndnshosts": [], # Hosts with dynamic DNS (legacy)
"drophosts": [], # Hosts to ignore
"dyndomains": ["wrede.org"],
# DNS updates
"nsupdate_bin": "/usr/bin/nsupdate",
# WebSocket settings
"ws_port": 50005,
"wss_port": None,
"cert_path": "/usr/local/etc/ssl/",
"wss_pem": "fullchain.pem",
"wss_key": "privkey.pem",
# Message journal configuration
"journal_enabled": True,
"journal_dir": "/var/log/heartbeat",
"journal_file": "messages.journal",
"journal_max_size": 100 * 1024 * 1024, # 100MB
"journal_max_backups": 10,
# Runtime flags
"foreground": False,
"verbose": False,
"debug": 0,
# Plugin/threshold configs (for clients reporting to this server)
"plugins": {},
"thresholds": {},
}
def load_config(path=None):
"""Load configuration from a YAML file and merge with server defaults.
If YAML is not available or the file does not exist, defaults are returned.
Args:
path: Path to YAML config file (default: ~/.hb.yaml)
Returns:
Dictionary with configuration
"""
cfg = SERVER_DEFAULTS.copy()
if not path:
# default path (~/.hb.yaml)
path = os.path.join(os.path.expanduser("~"), ".hb.yaml")
if os.path.exists(path):
if yaml:
with open(path) as fh:
data = yaml.safe_load(fh)
# Merge YAML data with defaults
# Keep all keys from YAML to support plugin configs and future extensions
for k, v in data.items():
cfg[k] = v
else:
# yaml not installed: do not attempt to parse; user must ensure defaults
pass
return cfg
class ReloadableConfig:
"""Thread-safe/async-safe configuration wrapper that supports runtime reloading.
This class wraps the configuration dictionary and provides:
- Thread-safe config reloading via SIGHUP
- Backward-compatible dict-like access
- Async lock to prevent concurrent reloads
"""
def __init__(self, initial_config, config_path=None):
"""Initialize with initial configuration.
Args:
initial_config: Initial configuration dictionary
config_path: Path to config file for reloading (optional)
"""
self._config = initial_config
self._config_path = config_path
self._lock = asyncio.Lock()
self._logger = logging.getLogger(__name__)
async def reload(self, config_path=None):
"""Reload configuration from file.
Args:
config_path: Path to config file (uses stored path if not provided)
Returns:
New configuration dictionary
Raises:
Exception if reload fails (keeps existing config)
"""
path = config_path or self._config_path
if not path:
raise ValueError("No config path specified for reload")
async with self._lock:
try:
# Load new config
new_config = load_config(path)
# Store old config for rollback if needed
old_config = self._config
# Update config
self._config = new_config
self._logger.info(f"Configuration reloaded from {path}")
return new_config
except Exception as e:
self._logger.error(f"Failed to reload config from {path}: {e}", exc_info=True)
# Keep existing config on error
raise
def get(self, key, default=None):
"""Get a config value (dict-compatible)."""
return self._config.get(key, default)
def __getitem__(self, key):
"""Get a config value via subscript (dict-compatible)."""
return self._config[key]
def __contains__(self, key):
"""Check if key exists (dict-compatible)."""
return key in self._config
def keys(self):
"""Return config keys (dict-compatible)."""
return self._config.keys()
def items(self):
"""Return config items (dict-compatible)."""
return self._config.items()
def values(self):
"""Return config values (dict-compatible)."""
return self._config.values()
@property
def config(self):
"""Get the underlying config dict (for components that need full dict)."""
return self._config
def get_watchhosts(config):
"""Extract watchhosts from config, supporting both new and legacy formats.
Args:
config: Configuration dictionary
Returns:
List of hostnames to watch
"""
watchhosts = []
# New format: hosts section with watch attribute
if "hosts" in config:
hosts_config = config["hosts"]
if isinstance(hosts_config, dict):
for host_name, host_attrs in hosts_config.items():
if isinstance(host_attrs, dict) and host_attrs.get("watch", False):
watchhosts.append(host_name)
# Legacy format: watchhosts list
if "watchhosts" in config:
legacy_watchhosts = config.get("watchhosts", [])
if isinstance(legacy_watchhosts, (list, set)):
watchhosts.extend(legacy_watchhosts)
elif isinstance(legacy_watchhosts, dict):
# Old dict format: {"host1": {attrs}, "host2": {attrs}}
watchhosts.extend(legacy_watchhosts.keys())
return list(set(watchhosts)) # Remove duplicates
def get_dyndnshosts(config):
"""Extract dyndnshosts from config, supporting both new and legacy formats.
Args:
config: Configuration dictionary
Returns:
List of hostnames with dynamic DNS
"""
dyndnshosts = []
# New format: hosts section with dyndns attribute
if "hosts" in config:
hosts_config = config["hosts"]
if isinstance(hosts_config, dict):
for host_name, host_attrs in hosts_config.items():
if isinstance(host_attrs, dict) and host_attrs.get("dyndns", False):
dyndnshosts.append(host_name)
# Legacy format: dyndnshosts list/set
if "dyndnshosts" in config:
legacy_dyndnshosts = config.get("dyndnshosts", [])
if isinstance(legacy_dyndnshosts, (list, set)):
dyndnshosts.extend(legacy_dyndnshosts)
return list(set(dyndnshosts)) # Remove duplicates
def get_host_config(config, hostname):
"""Get configuration for a specific host.
Args:
config: Configuration dictionary
hostname: Host name
Returns:
Dictionary with host attributes or empty dict
"""
if "hosts" in config:
hosts_config = config.get("hosts", {})
if isinstance(hosts_config, dict) and hostname in hosts_config:
return hosts_config[hostname] if isinstance(hosts_config[hostname], dict) else {}
# Check legacy watchhosts for notification settings
if "watchhosts" in config:
watchhosts = config.get("watchhosts", {})
if isinstance(watchhosts, dict) and hostname in watchhosts:
legacy_attrs = watchhosts[hostname]
if isinstance(legacy_attrs, dict):
# Convert legacy format to new format
return {
"watch": True,
"notify": legacy_attrs.get("notify"),
"notify_src": legacy_attrs.get("src"),
}
return {}
def get_notification_channels_for_host(config, hostname):
"""Get notification channels configured for a specific host.
Args:
config: Configuration dictionary
hostname: Host name
Returns:
List of channel names to use for this host
"""
host_config = get_host_config(config, hostname)
# Check if host specifies notification channels
channels = host_config.get("notification_channels", [])
if channels:
if isinstance(channels, str):
return [channels]
elif isinstance(channels, list):
return channels
# Fall back to default channels
default_channels = config.get("default_notification_channels", [])
if default_channels:
if isinstance(default_channels, str):
return [default_channels]
elif isinstance(default_channels, list):
return default_channels
# No channels configured, return empty list (will use legacy global config)
return []
def get_channel_config(config, channel_name):
"""Get configuration for a specific notification channel.
Args:
config: Configuration dictionary
channel_name: Name of the notification channel
Returns:
Dictionary with channel configuration or None if not found
"""
channels = config.get("notification_channels", {})
if isinstance(channels, dict) and channel_name in channels:
return channels[channel_name]
return None
def get_notification_channels_config(config, hostname):
"""Get list of notification channel configurations for a host.
Args:
config: Configuration dictionary
hostname: Host name
Returns:
List of (channel_name, channel_config) tuples
"""
channel_names = get_notification_channels_for_host(config, hostname)
channels = []
for channel_name in channel_names:
channel_config = get_channel_config(config, channel_name)
if channel_config and channel_config.get("type"):
channels.append((channel_name, channel_config))
return channels
+12
View File
@@ -0,0 +1,12 @@
msgs = [] # in-memory list of recent messages for new websocket clients; also logged to file via notify.eventlog
class Data:
def __init__(self, config):
self.config = config
self.data = {}
def update(self, new_data):
self.data.update(new_data)
def get(self, key, default=None):
return self.data.get(key, default)
+2 -12
View File
@@ -136,16 +136,7 @@ async def dns_update_worker(
)
if err:
m += f", DNS update failed: {err}"
if pushmsg:
try:
await loop.run_in_executor(
None,
pushmsg,
"error: nsupdate failed",
f"{name}.dy.{dyndomain}: {m}",
)
except Exception:
pass
logger.error("DNS update failed for %s: %s", name, err)
else:
m += ", DNS updated."
@@ -171,7 +162,6 @@ def start_dns_worker(
hbdclass,
cfg: dict,
log: Optional[callable] = None,
pushmsg: Optional[callable] = None,
loop: Optional[asyncio.AbstractEventLoop] = None,
):
"""Start the async DNS worker and return the Task.
@@ -218,7 +208,7 @@ def start_dns_worker(
task = loop.create_task(
dns_update_worker(
hbdclass, cfg, async_queue=async_q, log=log, pushmsg=pushmsg, loop=loop
hbdclass, cfg, async_queue=async_q, log=log, loop=loop
)
)
return task
+587
View File
@@ -0,0 +1,587 @@
"""
host and connection class shared between hbd and
the websit's heartbeat.py
"""
import time
import json
import copy
import queue
num = 0
MAXRTTS = 10
DEBUG = 2
def log(host, m):
if DEBUG:
print("class log: %s %s" % (host, m))
class Connection:
# map of addrs to names
htab = {}
UNKNOWN = "unknown"
UP = "up"
DOWN = "down"
OVERDUE = "overdue"
def __init__(self, host, cid, addr, afam):
self.host = host
self.cid = cid
if addr[0:7] == "::ffff:":
addr = addr[7:]
self.addr = addr
self.afam = afam
self.rtts = [0]
self.lastbeat = time.time()
self.statetime = self.lastbeat
self.deltastatetime = "computed"
self.state = Connection.UNKNOWN
# Timer-based reachability monitoring
self.overdue_timer = None
self.overdue_callback = None
self.timeout_duration = None
if host:
Connection.htab[addr] = self.host.name
if self.host.isDynDns():
log(self.host.name, "dns update %s" % self.addr)
Host.dnsQ.put((self.host.name, self.addr))
def __getstate__(self):
"""Prepare Connection for pickling by excluding non-serializable timer objects."""
state = self.__dict__.copy()
# Remove asyncio timer objects that can't be pickled
# These will be recreated when the next HTB arrives after unpickling
state['overdue_timer'] = None
state['overdue_callback'] = None
state['timeout_duration'] = None
return state
def __setstate__(self, state):
"""Restore Connection from pickle, reinitializing timer fields."""
self.__dict__.update(state)
# Ensure timer fields are initialized (they'll be recreated when HTB arrives)
if not hasattr(self, 'overdue_timer'):
self.overdue_timer = None
if not hasattr(self, 'overdue_callback'):
self.overdue_callback = None
if not hasattr(self, 'timeout_duration'):
self.timeout_duration = None
def registerDns(self):
Host.dnsQ.put((self.host.name, self.addr))
def clearstate(self):
d = {}
d["addr"] = ""
d["rtt"] = ""
d["lastbeat"] = ""
d["state"] = ""
d["statetime"] = ""
d["deltastatetime"] = ""
d["rttstate"] = ""
return d
def statedict(self, Null=False):
d = self.clearstate()
now = time.time()
if not Null:
d["addr"] = self.addr
if self.rtts[-1]:
d["rtt"] = "%0.1f" % self.rtts[-1]
elif self.state == Connection.UNKNOWN:
d["rtt"] = ""
else:
d["rtt"] = "?"
d["lastbeat"] = self.lastbeat
if self.state == Connection.OVERDUE:
d["state"] = "<b>%s</b>" % self.state
else:
d["state"] = self.state
if self.state == Connection.UP:
d["rttstate"] = d["rtt"]
elif self.state == Connection.OVERDUE:
d["rttstate"] = ""
else:
d["rttstate"] = d["state"]
d["statetime"] = time.strftime(
"%Y-%m-%d %H:%M:%S", time.localtime(self.statetime)
)
delta = now - self.statetime
if self.state == Connection.UNKNOWN:
d["deltastatetime"] = ""
elif delta > 86400:
# d['deltastatetime'] = time.strftime("%d %H:%M:%S", time.gmtime(delta))
d["deltastatetime"] = "%0.1f days" % (delta / 86400.0)
elif delta > 3600:
# d['deltastatetime'] = time.strftime("%H:%M:%S", time.gmtime(delta))
d["deltastatetime"] = time.strftime("%k:%M hrs", time.gmtime(delta))
# d['deltastatetime'] = "%0.1f hrs" % (delta / 3600.)
elif delta > 60:
# d['deltastatetime'] = time.strftime("%M:%S", time.gmtime(delta))
d["deltastatetime"] = time.strftime("%M:%S mins", time.gmtime(delta))
# d['deltastatetime'] = "%0.1f mins" % (delta / 60.)
else:
# d['deltastatetime'] = time.strftime("%S", time.gmtime(delta))
d["deltastatetime"] = "%i secs" % (delta)
if self.state == Connection.UNKNOWN and now - self.lastbeat > 86400 * 10:
d = self.clearstate()
return d
def headerdict(self, afam):
d = {}
d["addr"] = "%s Addr" % afam
d["rtt"] = "Latencey"
d["lastbeat"] = "Last Contact"
d["state"] = "State"
d["statetime"] = "Last State"
d["rttstate"] = "Reach"
d["deltastatetime"] = "Last State"
return d
def jsons(self):
"""Serialize connection to JSON, excluding non-serializable timer objects."""
data = {}
for key, value in self.__dict__.items():
# Skip timer-related fields that can't be serialized
if key in ['overdue_timer', 'overdue_callback', 'timeout_duration']:
continue
# Handle host backpointer by converting to name
if key == 'host':
data[key] = value.name if value else None
else:
data[key] = value
return json.dumps(data)
# set new state, return number of secs in previous state
def newstate(self, state, now, when=0):
self.state = state
delta = now - when
s = delta - self.statetime
self.statetime = delta
return s
def getstate(self):
return self.state
def newaddr(self, addr, rtt, now):
self.lastbeat = now
if rtt is not None:
self.rtts.append(rtt)
if len(self.rtts) > MAXRTTS:
del self.rtts[0]
if self.addr == addr:
r = None
else:
r = "changed from %s to %s" % (self.addr, addr)
try:
del Connection.htab[self.addr]
except Exception:
pass
self.addr = addr
Connection.htab[addr] = self.host.nameconnection_count
if self.host.isDynDns():
Host.dnsQ.put((self.host.name, self.addr))
return r
def reset_overdue_timer(self, timeout_seconds, callback):
"""Reset the overdue timer for this connection.
Cancels any existing timer and sets a new one that will mark
the connection as overdue if no heartbeat arrives before timeout.
Args:
timeout_seconds: Seconds before marking as overdue
callback: Async function to call when timer expires
"""
import asyncio
# Cancel existing timer if any
if self.overdue_timer and not self.overdue_timer.cancelled():
self.overdue_timer.cancel()
# Store parameters for later reference
self.timeout_duration = timeout_seconds
self.overdue_callback = callback
# Create new timer
async def timer_expired():
await callback(self)
try:
loop = asyncio.get_event_loop()
self.overdue_timer = loop.call_later(timeout_seconds,
lambda: asyncio.create_task(timer_expired()))
except RuntimeError:
# No event loop running yet
pass
def cancel_overdue_timer(self):
"""Cancel the overdue timer if it exists and clear all timer references."""
if self.overdue_timer:
try:
if not self.overdue_timer.cancelled():
self.overdue_timer.cancel()
except Exception:
pass
# Clear all timer-related references
self.overdue_timer = None
self.overdue_callback = None
self.timeout_duration = None
def get_avg_rtt(self):
"""Get average RTT from recent samples."""
valid_rtts = [r for r in self.rtts if r > 0]
if valid_rtts:
return sum(valid_rtts) / len(valid_rtts)
return 0
def get_current_rtt(self):
"""Get most recent RTT value."""
return self.rtts[-1] if self.rtts else 0
def check_rtt_threshold(self, warning_threshold=None, critical_threshold=None):
"""Check if RTT exceeds thresholds.
Args:
warning_threshold: RTT in ms for warning level
critical_threshold: RTT in ms for critical level
Returns:
Tuple of (level, rtt_value) where level is None, 'WARNING', or 'CRITICAL'
"""
rtt = self.get_current_rtt()
if rtt <= 0:
return (None, rtt)
if critical_threshold and rtt > critical_threshold:
return ('CRITICAL', rtt)
elif warning_threshold and rtt > warning_threshold:
return ('WARNING', rtt)
return (None, rtt)
#
class Host:
# Table of Hosts
hosts = {}
dnsQ = queue.Queue()
def __init__(self, name):
global num
self.name = name
if name:
num += 1
Host.hosts[name] = self
self.num = num
self.dyn = False
self.watched = False
self.upcount = 0
self.interval = 0
self.doesack = -1
self.cmds = []
self.connections = {}
# Plugin data storage: {plugin_name: [(timestamp, data), ...]}
self.plugin_data = {}
self.plugin_retention = 100 # Keep last N samples per plugin
# Alert state tracking: {metric_path: AlertState}
self.alert_states = {}
def statedict(self):
d = {}
d["name"] = self.name
if self.dyn:
d["name"] += "*"
if self.watched:
d["name"] = "<b>%s</b>" % d["name"]
d["dyn"] = str(self.dyn)
d["num"] = self.num
# Add alert counts (split by acknowledged status)
warning_unacked = 0
warning_acked = 0
critical_unacked = 0
critical_acked = 0
for metric_path, alert_state in self.alert_states.items():
# Import AlertLevel here to avoid circular imports
from .threshold import AlertLevel
if alert_state.level == AlertLevel.WARNING:
if alert_state.acknowledged:
warning_acked += 1
else:
warning_unacked += 1
elif alert_state.level == AlertLevel.CRITICAL:
if alert_state.acknowledged:
critical_acked += 1
else:
critical_unacked += 1
d["alert_warning_unacked"] = warning_unacked
d["alert_warning_acked"] = warning_acked
d["alert_critical_unacked"] = critical_unacked
d["alert_critical_acked"] = critical_acked
for c in ["IPv4", "IPv6"]:
if c in self.connections:
cs = self.connections[c].statedict()
else:
cs = ubConnection.statedict(True)
for csv in cs:
d["%s.%s" % (c, csv)] = cs[csv]
return d
def headerdict(self):
d = {}
d["name"] = "Name"
d["dyn"] = "Dyn"
d["num"] = "??"
for c in ["IPv4", "IPv6"]:
cs = ubConnection.headerdict(c)
for csv in cs:
d["%s.%s" % (c, csv)] = cs[csv]
return d
def registerDns(self):
for af in self.connections:
self.connections[af].registerDns()
def stateinfo(self):
ddict = {}
for d in self.__dict__:
if d in ["alert_states", "plugin_data"]:
continue
if d == "connections":
cl = []
for c in ["IPv4", "IPv6"]:
if c not in self.connections:
continue
# Create connection dict, excluding non-serializable timer objects
conn = self.connections[c]
cld = {}
for key, value in conn.__dict__.items():
# Skip timer-related fields that can't be serialized
if key in ['overdue_timer', 'overdue_callback', 'timeout_duration']:
continue
# Handle host backpointer by converting to name
if key == 'host':
cld[key] = value.name if value else None
else:
# Safe copy for serializable values
try:
cld[key] = copy.deepcopy(value)
except Exception:
# If deepcopy fails, use shallow copy
cld[key] = value
cl.append(cld)
ddict[d] = cl
else:
ddict[d] = self.__dict__[d]
# Add alert counts (computed from alert_states)
warning_unacked = 0
warning_acked = 0
critical_unacked = 0
critical_acked = 0
if hasattr(self, 'alert_states'):
from .threshold import AlertLevel
for metric_path, alert_state in self.alert_states.items():
if alert_state.level == AlertLevel.WARNING:
if alert_state.acknowledged:
warning_acked += 1
else:
warning_unacked += 1
elif alert_state.level == AlertLevel.CRITICAL:
if alert_state.acknowledged:
critical_acked += 1
else:
critical_unacked += 1
ddict["alert_warning_unacked"] = warning_unacked
ddict["alert_warning_acked"] = warning_acked
ddict["alert_critical_unacked"] = critical_unacked
ddict["alert_critical_acked"] = critical_acked
return ddict
def jsons(self):
return json.dumps(self.stateinfo())
def isDynDns(self):
return self.dyn
def isIPv4(self, addr):
if isinstance(addr, tuple):
return addr[0].find(".") > 0
else:
return addr.find(".") > 0
def conndata(self, cid, addr, rtt, now):
if addr[0:7] == "::ffff:":
addr = addr[7:]
if self.isIPv4(addr):
afam = "IPv4"
else:
afam = "IPv6"
if afam not in self.connections:
self.connections[afam] = Connection(self, cid, addr, afam)
conn = self.connections[afam]
res = conn.newaddr(addr, rtt, now)
return conn, res
# called when reloading class from pickle, add new fields here
def fixup(self):
for c in ["IPv4", "IPv6"]:
if c in self.connections:
addr = self.connections[c].addr
if addr[0:7] == "::ffff:":
addr = addr[7:]
self.connections[c].addr = addr
# Add plugin_data if missing (for backward compatibility)
if not hasattr(self, "plugin_data"):
self.plugin_data = {}
if not hasattr(self, "plugin_retention"):
self.plugin_retention = 100
if not hasattr(self, "alert_states"):
self.alert_states = {}
pass
def add_plugin_data(self, plugin_name, data, timestamp=None):
"""Store plugin data with timestamp.
Args:
plugin_name: Name of the plugin (e.g., "cpu_monitor")
data: Dict of plugin data
timestamp: Optional timestamp (default: current time)
"""
if timestamp is None:
timestamp = time.time()
if plugin_name not in self.plugin_data:
self.plugin_data[plugin_name] = []
# Add new data
self.plugin_data[plugin_name].append((timestamp, data))
# Enforce retention limit (keep last N samples)
if len(self.plugin_data[plugin_name]) > self.plugin_retention:
self.plugin_data[plugin_name] = self.plugin_data[plugin_name][-self.plugin_retention:]
def get_plugin_data(self, plugin_name, limit=None):
"""Retrieve plugin data for a specific plugin.
Args:
plugin_name: Name of the plugin
limit: Optional limit on number of recent samples to return
Returns:
List of (timestamp, data) tuples, most recent last
"""
data = self.plugin_data.get(plugin_name, [])
if limit and len(data) > limit:
return data[-limit:]
return data
def get_latest_plugin_data(self, plugin_name):
"""Get the most recent plugin data for a plugin.
Args:
plugin_name: Name of the plugin
Returns:
(timestamp, data) tuple or None if no data
"""
data = self.plugin_data.get(plugin_name, [])
return data[-1] if data else None
def get_all_plugin_data(self):
"""Get all plugin data for this host.
Returns:
Dict of {plugin_name: [(timestamp, data), ...]}
"""
return self.plugin_data
hostfields_long = [
"name",
"IPv4.addr",
"IPv4.state",
("IPv4.rtt", 'style="text-align: right;"'),
("IPv4.statetime", 'style="text-align: right;"'),
"IPv6.addr",
"IPv6.state",
("IPv6.rtt", 'style="text-align: right;"'),
("IPv6.statetime", 'style="text-align: right;"'),
]
hostfields_short = [
"name",
("IPv4.rttstate", 'style="text-align: right;"'),
("IPv4.deltastatetime", 'style="text-align: right;"'),
("IPv6.rttstate", 'style="text-align: right;"'),
("IPv6.deltastatetime", 'style="text-align: right;"'),
]
def gene(self, tag, v, attrib=None):
if attrib:
a = " %s" % attrib
else:
a = ""
return "<%s%s>%s</%s>" % (tag, a, v, tag)
def htmltable(self, tag, hd, short):
if short:
hostfields = Host.hostfields_short
else:
hostfields = Host.hostfields_long
h = []
for f in hostfields:
if isinstance(f, tuple):
h.append(self.gene(tag, hd[f[0]], f[1]))
else:
h.append(self.gene(tag, hd[f]))
return self.gene("tr", "\n".join(h))
def buildhosttable(self, short=False):
if DEBUG > 1:
print("DBG buildhosttable: start")
res = []
res.append('<table id="ntable" class="sortable">')
res.append(ubHost.htmltable("th", ubHost.headerdict(), short))
hosts_sorted = list(Host.hosts.keys())
if len(hosts_sorted):
hosts_sorted.sort()
for h in hosts_sorted:
res.append(ubHost.htmltable("td", Host.hosts[h].statedict(), short))
res.append("</table>")
if DEBUG > 1:
print("DBG buildhosttable: %s" % res)
return res
def buildmsgtable(self, msgs):
res = []
le = max(40 - len(Host.hosts), 3)
res.append("<h4>Log of Events</h4>")
for m in msgs[len(msgs) - le :]:
res.append("%s<BR>" % m)
return res
# create fake "unbound objects", remove in Python 3.0
ubHost = Host(None)
ubConnection = Connection(None, "", "", "")
+445
View File
@@ -0,0 +1,445 @@
"""HTTP server implementation using aiohttp and jinja2."""
import asyncio
import json
import time
import urllib.parse
import os
import logging
from aiohttp import web
import jinja2
from . import data
from . import notify as notify_mod
logger = logging.getLogger(__name__)
eventlog = notify_mod.eventlog
def _render_template(html_str: str, **context) -> str:
tmpl = jinja2.Template(html_str)
return tmpl.render(**context)
async def start(
host: str,
port: int,
config,
hbdclass,
tcss=None,
verbose=False,
get_now=None,
VER="",
threshold_checker=None,
):
"""Start an aiohttp web server and block until cancelled.
This function is intended to be awaited inside the main asyncio event loop.
"""
get_now = get_now or (lambda: time.time())
async def index(request):
res = []
res.append('<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">')
res.append("<html>")
res.append("<head>")
res.append("<title>Heartbeat</title>")
if tcss:
res.append(tcss)
res.append("</head>")
res.append('<body BGCOLOR = "#FFFFFF" LINK = "#008000" VLINK = "#008000">')
res.append(f"<H2>Heartbeat status {VER}</h2>")
res += hbdclass.ubHost.buildhosttable()
res += hbdclass.ubHost.buildmsgtable(data.msgs)
res.append(
"<p> %s (%s)</p>"
% (
time.strftime("%H:%M:%S", time.localtime(get_now())),
config.get("tz", "CET-1CDT"),
)
)
res.append("</body></html>")
body = "\n".join(res)
return web.Response(text=body, content_type="text/html")
async def api_hosts(request):
lst = [hbdclass.Host.hosts[h].jsons() for h in hbdclass.Host.hosts]
return web.json_response(json.loads("[" + ",".join(lst) + "]"))
async def api_messages(request):
lst = data.msgs[-30:]
return web.json_response(lst)
async def cmd(request):
qa = request.rel_url.query
uname = qa.get("h")
ucmd = qa.get("c")
if not ucmd or not uname:
return web.Response(status=400, text="need h= and c= arguments")
if uname not in hbdclass.Host.hosts:
return web.Response(status=400, text=f"h={uname} not found")
hbdclass.Host.hosts[uname].cmds.append(
("CMD", {"cmd": urllib.parse.unquote(ucmd)})
)
return web.Response(text=f"cmd {uname} queued")
async def drop(request):
qa = request.rel_url.query
uname = qa.get("h")
if not uname:
return web.Response(status=400, text="need h= argument")
if uname not in hbdclass.Host.hosts:
return web.Response(status=400, text=f"h={uname} not found")
eventlog(uname, "INFO", "dropped")
del hbdclass.Host.hosts[uname]
return web.Response(text="Done")
async def register(request):
qa = request.rel_url.query
uname = qa.get("h")
if not uname:
return web.Response(status=400, text="need h= argument")
if uname not in hbdclass.Host.hosts:
return web.Response(status=400, text=f"h={uname} not found")
ll = hbdclass.Host.hosts[uname].registerDns()
eventlog(uname, "INFO", ll)
return web.Response(text=str(ll))
async def update(request):
qa = request.rel_url.query
uname = urllib.parse.unquote(qa.get("h", ""))
ucode = qa.get("c")
if not ucode or not uname:
return web.Response(status=400, text="need h= and c= arguments")
if uname != "All" and uname not in hbdclass.Host.hosts:
return web.Response(status=400, text=f"h={uname} not found")
if uname != "All":
names = [uname]
else:
names = [n for n in hbdclass.Host.hosts]
out = []
for n in names:
err = None
try:
r = {"csum": None, "code": ucode}
hbdclass.Host.hosts[n].cmds.append(("UPD", r))
except Exception as e:
err = str(e)
out.append(f"update started for {n}: {err if err else 'OK'}")
return web.Response(text="\n".join(out))
async def live(request):
# render template from hbd/templates/live.html using Jinja2
# Resolve templates directory relative to the hbd package
pkg_dir = os.path.dirname(__file__)
templates_dir = config.get("templates_dir", os.path.join(pkg_dir, "templates"))
env = jinja2.Environment(loader=jinja2.FileSystemLoader(templates_dir))
host = config.get("hb_host", "localhost")
extra_scripts = config.get("http_extra_scripts", "")
host = request.host.split(":")[0]
if config.get("wss_port"):
heartbeat_ws_url = f"wss://{host}:{config['wss_port']}/hbd"
else:
heartbeat_ws_url = f"ws://{host}:{config.get('ws_port', 50005)}/hbd"
tmpl = env.get_template("live.html")
body = tmpl.render(
title="Heartbeat",
header="Heartbeat",
request=request,
heartbeat_ws_url=heartbeat_ws_url,
extra_scripts=extra_scripts,
hosts=[
hbdclass.Host.hosts[h].stateinfo() for h in sorted(hbdclass.Host.hosts)
],
messages=data.msgs[-30:],
)
return web.Response(text=body, content_type="text/html")
async def static(request):
"""Serve files from the package static directory.
URL form: /static/<path>
"""
p = request.match_info.get("path", "")
logger.debug("static file requested: %s", p)
base = os.path.abspath(os.path.join(os.path.dirname(__file__), "static"))
# normalize and prevent directory traversal
target = os.path.abspath(os.path.normpath(os.path.join(base, p)))
if not target.startswith(base + os.sep) and target != base:
return web.Response(status=403, text="Forbidden")
if not os.path.exists(target) or not os.path.isfile(target):
return web.Response(status=404, text="Not Found")
logger.info("serving static file: %s", target)
return web.FileResponse(path=target)
async def favicon(request):
"""Serve favicon.ico from the package static directory."""
base = os.path.abspath(os.path.join(os.path.dirname(__file__), "static/images"))
target = os.path.join(base, "favicon.ico")
if not os.path.exists(target) or not os.path.isfile(target):
return web.Response(status=404, text="Not Found")
return web.FileResponse(path=target)
# -------------------------------------------------------------------------
# Plugin Data API Endpoints
# -------------------------------------------------------------------------
async def api_host_plugins(request):
"""Get all plugin data for a specific host."""
hostname = request.match_info.get("hostname")
if hostname not in hbdclass.Host.hosts:
return web.json_response(
{"error": f"Host '{hostname}' not found"},
status=404
)
host = hbdclass.Host.hosts[hostname]
# Get plugin data with most recent sample for each plugin
plugins_summary = {}
for plugin_name, samples in host.plugin_data.items():
if samples:
# Get most recent sample
timestamp, data = samples[-1]
plugins_summary[plugin_name] = {
"timestamp": timestamp,
"data": data,
"sample_count": len(samples),
}
return web.json_response({
"hostname": hostname,
"plugins": plugins_summary,
})
async def api_host_plugin_detail(request):
"""Get detailed data for a specific plugin on a host."""
hostname = request.match_info.get("hostname")
plugin_name = request.match_info.get("plugin_name")
if hostname not in hbdclass.Host.hosts:
return web.json_response(
{"error": f"Host '{hostname}' not found"},
status=404
)
host = hbdclass.Host.hosts[hostname]
# Get limit from query parameter
limit = request.rel_url.query.get("limit", "10")
try:
limit = int(limit)
except ValueError:
limit = 10
# Get plugin data
samples = host.get_plugin_data(plugin_name, limit=limit)
if not samples:
return web.json_response(
{"error": f"No data for plugin '{plugin_name}' on host '{hostname}'"},
status=404
)
# Format samples
formatted_samples = [
{
"timestamp": ts,
"data": data,
}
for ts, data in samples
]
return web.json_response({
"hostname": hostname,
"plugin": plugin_name,
"samples": formatted_samples,
"sample_count": len(formatted_samples),
})
async def api_host_alerts(request):
"""Get alert states for a specific host."""
hostname = request.match_info.get("hostname")
if hostname not in hbdclass.Host.hosts:
return web.json_response(
{"error": f"Host '{hostname}' not found"},
status=404
)
host = hbdclass.Host.hosts[hostname]
# Get alert states
alerts = []
for metric_path, alert_state in host.alert_states.items():
alerts.append(alert_state.to_dict())
# Get summary if threshold_checker available
summary = {"ok": 0, "warning": 0, "critical": 0, "unknown": 0}
if threshold_checker:
summary = threshold_checker.get_alert_summary(host.alert_states)
return web.json_response({
"hostname": hostname,
"alerts": alerts,
"summary": summary,
})
async def api_all_alerts(request):
"""Get all active alerts across all hosts."""
all_alerts = []
for hostname, host in hbdclass.Host.hosts.items():
if threshold_checker:
active_alerts = threshold_checker.get_active_alerts(host.alert_states)
else:
# Fallback if no threshold checker
from hbd.server.threshold import AlertLevel
active_alerts = [
state for state in host.alert_states.values()
if state.level != AlertLevel.OK
]
for alert in active_alerts:
alert_dict = alert.to_dict()
alert_dict["hostname"] = hostname
all_alerts.append(alert_dict)
# Sort by level (critical first) then by hostname
level_order = {"CRITICAL": 0, "WARNING": 1, "UNKNOWN": 2, "OK": 3}
all_alerts.sort(
key=lambda a: (level_order.get(a["level"], 99), a["hostname"], a["metric_path"])
)
# Get summary counts
summary = {"critical": 0, "warning": 0, "unknown": 0, "total": len(all_alerts)}
for alert in all_alerts:
level = alert["level"].lower()
if level in summary:
summary[level] += 1
return web.json_response({
"alerts": all_alerts,
"summary": summary,
"host_count": len(hbdclass.Host.hosts),
})
async def api_acknowledge_alert(request):
"""Acknowledge an alert to stop reminder notifications."""
try:
data = await request.json()
except Exception:
return web.json_response(
{"error": "Invalid JSON in request body"},
status=400
)
hostname = data.get("hostname")
metric_path = data.get("metric_path")
if not hostname or not metric_path:
return web.json_response(
{"error": "Missing required fields: hostname and metric_path"},
status=400
)
if hostname not in hbdclass.Host.hosts:
return web.json_response(
{"error": f"Host '{hostname}' not found"},
status=404
)
host = hbdclass.Host.hosts[hostname]
if metric_path not in host.alert_states:
return web.json_response(
{"error": f"Alert '{metric_path}' not found for host '{hostname}'"},
status=404
)
alert_state = host.alert_states[metric_path]
alert_state.acknowledge()
return web.json_response({
"success": True,
"hostname": hostname,
"metric_path": metric_path,
"acknowledged_at": alert_state.acknowledged_at,
})
# -------------------------------------------------------------------------
# UI Pages
# -------------------------------------------------------------------------
async def plugins_page(request):
"""Render the plugin metrics visualization page."""
pkg_dir = os.path.dirname(__file__)
templates_dir = config.get("templates_dir", os.path.join(pkg_dir, "templates"))
env = jinja2.Environment(loader=jinja2.FileSystemLoader(templates_dir))
# Collect all hosts with plugin data
hosts_with_plugins = []
for hostname in sorted(hbdclass.Host.hosts.keys()):
host = hbdclass.Host.hosts[hostname]
if host.plugin_data:
hosts_with_plugins.append({
"name": hostname,
"plugins": list(host.plugin_data.keys()),
})
tmpl = env.get_template("plugins.html")
body = tmpl.render(
title="Plugin Metrics - Heartbeat",
header="Plugin Metrics",
hosts=hosts_with_plugins,
)
return web.Response(text=body, content_type="text/html")
async def alerts_page(request):
"""Render the alerts dashboard page."""
pkg_dir = os.path.dirname(__file__)
templates_dir = config.get("templates_dir", os.path.join(pkg_dir, "templates"))
env = jinja2.Environment(loader=jinja2.FileSystemLoader(templates_dir))
tmpl = env.get_template("alerts.html")
body = tmpl.render(
title="Alerts Dashboard - Heartbeat",
header="Alerts Dashboard",
)
return web.Response(text=body, content_type="text/html")
app = web.Application()
app.add_routes(
[
web.get("/", index),
web.get("/api/0/hosts", api_hosts),
web.get("/api/0/messages", api_messages),
web.get("/api/0/hosts/{hostname}/plugins", api_host_plugins),
web.get("/api/0/hosts/{hostname}/plugins/{plugin_name}", api_host_plugin_detail),
web.get("/api/0/hosts/{hostname}/alerts", api_host_alerts),
web.get("/api/0/alerts", api_all_alerts),
web.post("/api/0/alerts/acknowledge", api_acknowledge_alert),
web.get("/c", cmd),
web.get("/d", drop),
web.get("/n", register),
web.get("/u", update),
web.get("/live", live),
web.get("/plugins", plugins_page),
web.get("/alerts", alerts_page),
web.get("/static/{path:.*}", static),
web.get("/favicon.ico", favicon),
]
)
runner = web.AppRunner(app)
await runner.setup()
site = web.TCPSite(runner, host, port)
await site.start()
if verbose:
print(f"HTTP server started on {host}:{port}")
try:
await asyncio.Future()
finally:
await runner.cleanup()
+342
View File
@@ -0,0 +1,342 @@
"""
Journal logging for heartbeat messages.
Provides size-based rotating log files for all received heartbeat messages.
Messages are logged in JSON format for easy parsing and analysis.
"""
import json
import logging
import os
import asyncio
from datetime import datetime
from pathlib import Path
from typing import Dict, Any, Optional
logger = logging.getLogger(__name__)
class MessageJournal:
"""
Journal logger for heartbeat messages with size-based rotation.
Features:
- Logs all received messages in JSON format
- Automatic rotation when file size exceeds threshold
- Keeps configurable number of rotated logs
- Thread-safe and async-safe operation
- Configurable log directory and file naming
Configuration:
journal_dir: Directory for journal files (default: /var/log/heartbeat/)
journal_file: Base filename (default: messages.journal)
max_size: Maximum file size in bytes before rotation (default: 100MB)
max_backups: Number of backup files to keep (default: 10)
enabled: Enable/disable journaling (default: True)
"""
def __init__(self, config: Optional[Dict[str, Any]] = None):
"""
Initialize the message journal.
Args:
config: Configuration dictionary with journal settings
"""
self.config = config or {}
# Configuration options
self.journal_dir = Path(self.config.get('journal_dir', '/var/log/heartbeat'))
self.journal_file = self.config.get('journal_file', 'messages.journal')
self.max_size = self.config.get('journal_max_size', 100 * 1024 * 1024) # 100MB default
self.max_backups = self.config.get('journal_max_backups', 10)
self.enabled = self.config.get('journal_enabled', True)
# Runtime state
self._file_handle = None
self._current_size = 0
self._lock = asyncio.Lock()
self._initialized = False
# Full path to current journal file
self.journal_path = self.journal_dir / self.journal_file
async def initialize(self) -> bool:
"""
Initialize the journal.
Creates journal directory if needed and opens the journal file.
Returns:
True if initialization successful, False otherwise
"""
if not self.enabled:
logger.info("Message journal disabled in configuration")
return True
try:
# Create journal directory if it doesn't exist
self.journal_dir.mkdir(parents=True, exist_ok=True)
# Open journal file in append mode
self._file_handle = open(self.journal_path, 'a', encoding='utf-8')
# Get current file size
try:
self._current_size = os.path.getsize(self.journal_path)
except OSError:
self._current_size = 0
self._initialized = True
logger.info(f"Message journal initialized: {self.journal_path} "
f"(current size: {self._current_size:,} bytes, "
f"max: {self.max_size:,} bytes)")
return True
except Exception as e:
logger.error(f"Failed to initialize message journal: {e}")
self.enabled = False
return False
async def log_message(
self,
msg: Dict[str, Any],
addr: tuple,
timestamp: Optional[float] = None
):
"""
Log a received message to the journal.
Args:
msg: Parsed message dictionary
addr: Source address (ip, port) tuple
timestamp: Message timestamp (defaults to current time)
"""
if not self.enabled or not self._initialized:
return
# Skip HTB (heartbeat) messages - too verbose
msg_id = msg.get('ID', '')
if msg_id == 'HTB':
return
async with self._lock:
try:
# Prepare journal entry
if timestamp is None:
import time
timestamp = time.time()
entry = {
'timestamp': timestamp,
'datetime': datetime.fromtimestamp(timestamp).isoformat(),
'source_ip': addr[0] if isinstance(addr, (tuple, list)) else str(addr),
'source_port': addr[1] if isinstance(addr, (tuple, list)) and len(addr) > 1 else None,
'message': msg
}
# Serialize to JSON (one line per entry)
json_line = json.dumps(entry, separators=(',', ':')) + '\n'
json_bytes = json_line.encode('utf-8')
# Check if rotation is needed
if self._current_size + len(json_bytes) > self.max_size:
await self._rotate()
# Write to journal
if self._file_handle:
self._file_handle.write(json_line)
self._file_handle.flush() # Ensure data is written
self._current_size += len(json_bytes)
logger.debug(f"Logged message from {addr[0]}: {msg.get('ID', 'UNKNOWN')}")
except Exception as e:
logger.error(f"Error writing to journal: {e}")
async def _rotate(self):
"""
Rotate the journal file.
Renames current file with timestamp, opens new file, and removes
old backups exceeding max_backups limit.
"""
try:
# Close current file
if self._file_handle:
self._file_handle.close()
self._file_handle = None
# Generate backup filename with timestamp
timestamp_str = datetime.now().strftime('%Y%m%d-%H%M%S')
backup_name = f"{self.journal_file}.{timestamp_str}"
backup_path = self.journal_dir / backup_name
# Rename current file to backup
if self.journal_path.exists():
self.journal_path.rename(backup_path)
logger.info(f"Rotated journal: {backup_path} "
f"(size: {self._current_size:,} bytes)")
# Open new journal file
self._file_handle = open(self.journal_path, 'a', encoding='utf-8')
self._current_size = 0
# Clean up old backups
await self._cleanup_old_backups()
except Exception as e:
logger.error(f"Error rotating journal: {e}")
# Try to reopen the file even if rotation failed
try:
self._file_handle = open(self.journal_path, 'a', encoding='utf-8')
except Exception as e2:
logger.error(f"Failed to reopen journal after rotation error: {e2}")
self.enabled = False
async def _cleanup_old_backups(self):
"""
Remove old backup files exceeding max_backups limit.
Keeps only the most recent backups based on filename (which includes timestamp).
"""
try:
# Find all backup files
backup_pattern = f"{self.journal_file}.*"
backup_files = sorted(self.journal_dir.glob(backup_pattern))
# Remove oldest backups if we have too many
if len(backup_files) > self.max_backups:
files_to_remove = backup_files[:len(backup_files) - self.max_backups]
for backup_file in files_to_remove:
try:
backup_file.unlink()
logger.info(f"Removed old backup: {backup_file.name}")
except Exception as e:
logger.warning(f"Failed to remove old backup {backup_file}: {e}")
except Exception as e:
logger.error(f"Error cleaning up old backups: {e}")
async def log_threshold_event(
self,
host_name: str,
metric_path: str,
old_level: str,
new_level: str,
value: Any,
timestamp: Optional[float] = None
):
"""
Log a threshold state change event.
Args:
host_name: Name of the host
metric_path: Full metric path (e.g., "cpu_monitor.cpu_percent")
old_level: Previous alert level
new_level: New alert level
value: Current metric value
timestamp: Event timestamp (default: current time)
"""
if not self.enabled or not self._initialized:
return
try:
if timestamp is None:
timestamp = __import__('time').time()
event = {
'timestamp': timestamp,
'iso_time': datetime.fromtimestamp(timestamp).isoformat(),
'event_type': 'threshold',
'host': host_name,
'metric': metric_path,
'old_level': old_level,
'new_level': new_level,
'value': value,
}
async with self._lock:
if not self._file_handle:
return
# Check if rotation is needed
if self._current_size >= self.max_size:
await self._rotate()
# Write event
line = json.dumps(event) + '\n'
self._file_handle.write(line)
self._file_handle.flush()
# Update size
self._current_size += len(line.encode('utf-8'))
except Exception as e:
logger.error(f"Error logging threshold event: {e}")
async def close(self):
"""
Close the journal and release resources.
Should be called during shutdown.
"""
async with self._lock:
if self._file_handle:
try:
self._file_handle.close()
logger.info("Message journal closed")
except Exception as e:
logger.error(f"Error closing journal: {e}")
finally:
self._file_handle = None
self._initialized = False
def get_stats(self) -> Dict[str, Any]:
"""
Get journal statistics.
Returns:
Dictionary with journal stats
"""
return {
'enabled': self.enabled,
'initialized': self._initialized,
'current_file': str(self.journal_path),
'current_size': self._current_size,
'max_size': self.max_size,
'max_backups': self.max_backups,
'rotation_threshold': f"{(self._current_size / self.max_size * 100):.1f}%"
}
# Global journal instance
_journal_instance: Optional[MessageJournal] = None
def get_journal(config: Optional[Dict[str, Any]] = None) -> MessageJournal:
"""
Get or create the global journal instance.
Args:
config: Configuration dictionary (only used on first call)
Returns:
MessageJournal instance
"""
global _journal_instance
if _journal_instance is None:
_journal_instance = MessageJournal(config)
return _journal_instance
async def log_message(msg: Dict[str, Any], addr: tuple, timestamp: Optional[float] = None):
"""
Convenience function to log a message using the global journal.
Args:
msg: Parsed message dictionary
addr: Source address (ip, port) tuple
timestamp: Message timestamp (defaults to current time)
"""
journal = get_journal()
await journal.log_message(msg, addr, timestamp)
+169 -85
View File
@@ -13,80 +13,141 @@ from . import udp
from . import hbdclass
from . import ws as ws_mod
from . import notify as notify_mod
from . import data
logger = logging.getLogger(__name__)
msg_to_websockets = ws_mod.broadcast
logf = None
lastfm = ["", "", ""]
eventlog = notify_mod.eventlog
# shared runtime collections and helpers
msgs = []
def initlog(logfile):
try:
return open(logfile, "a+")
except Exception as e:
import sys
print("cannot open loffile %s, using STDERR: %s" % (logfile, e))
return sys.stderr
def log(host, m, service=None):
ts = time.time()
s = f"{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ts))} {host or ''} {m}"
msgs.append(s)
logger.info(s)
if logf:
try:
logf.write(s + "\n")
logf.flush()
except Exception as e:
logger.warning("failed to write to logfile: %s", e)
msg_to_websockets("message", s)
def cleanup_function(config):
def cleanup_function(config, hbdclass):
"""This function will be executed upon program exit."""
logger.info("Running cleanup function...")
import pickle
# Ensure all timer references are cleared before pickling
for hostname, host in list(hbdclass.Host.hosts.items()):
for conn_type, conn in host.connections.items():
if hasattr(conn, 'cancel_overdue_timer'):
conn.cancel_overdue_timer()
if hasattr(conn, 'overdue_timer'):
conn.overdue_timer = None
if hasattr(conn, 'overdue_callback'):
conn.overdue_callback = None
if hasattr(conn, 'timeout_duration'):
conn.timeout_duration = None
pickfile = config.get("pickfile", "hbd.pickle")
pickf = open(pickfile, "wb")
pick = pickle.Pickler(pickf)
pick.dump(hbdclass.Host.hosts)
pick.dump(msgs)
pick.dump(lastfm)
pick.dump(data.msgs)
pickf.close()
logger.info("Cleanup complete.")
async def _run_async(config):
async def reload_configuration(config_obj, config_path, components):
"""Reload configuration and update all components.
Args:
config_obj: ReloadableConfig instance
config_path: Path to config file
components: Dict with threshold_checker and other components
Returns:
True if reload succeeded, False otherwise
"""
try:
logger.info("=" * 60)
logger.info("Starting configuration reload...")
logger.info("=" * 60)
# Reload config file
new_config = await config_obj.reload(config_path)
# Update notify module
notify_mod.reload_config(new_config)
# Reload threshold checker
if 'threshold_checker' in components:
components['threshold_checker'].reload(new_config)
# Note: Changes to the following require restart:
# - hb_port, hbd_port, ws_port (already bound)
# - SSL certificates (already loaded)
# - pickfile (already opened)
# - journal settings (journal already initialized)
# These are reloadable and effective immediately:
# - notification_channels
# - threshold_configs
# - hosts (watchhosts, dyndnshosts, notification_channels)
# - grace period (used on next heartbeat)
# - debug/verbose flags (used on next message)
logger.info("=" * 60)
logger.info("Configuration reload completed successfully")
logger.info("=" * 60)
return True
except Exception as e:
logger.error("=" * 60)
logger.error(f"Failed to reload configuration: {e}", exc_info=True)
logger.error("Keeping previous configuration")
logger.error("=" * 60)
return False
async def _run_async(config, config_path=None):
loop = asyncio.get_running_loop()
shutdown_event = asyncio.Event()
reload_event = asyncio.Event()
# Signal handlers for graceful shutdown
# Signal handlers for graceful shutdown and reload
def signal_handler(signum, frame):
sig_name = signal.Signals(signum).name if hasattr(signal, "Signals") else signum
logger.info(f"Received {sig_name}, initiating shutdown...")
loop.call_soon_threadsafe(shutdown_event.set)
def reload_handler(signum, frame):
sig_name = signal.Signals(signum).name if hasattr(signal, "Signals") else signum
logger.info(f"Received {sig_name}, initiating config reload...")
loop.call_soon_threadsafe(reload_event.set)
# Register signal handlers
loop.add_signal_handler(signal.SIGINT, signal_handler, signal.SIGINT, None)
loop.add_signal_handler(signal.SIGTERM, signal_handler, signal.SIGTERM, None)
loop.add_signal_handler(signal.SIGHUP, reload_handler, signal.SIGHUP, None)
from . import http as http_mod
from . import dns as dns_mod
from . import notify as notify_mod
from . import monitor as monitor_mod
from . import journal as journal_mod
from . import threshold as threshold_mod
notify_mod.setup(config)
pushmsg = notify_mod.pushmsg_from_config
# Initialize message journal
msg_journal = journal_mod.get_journal(config)
await msg_journal.initialize()
# Initialize threshold checker
threshold_checker = threshold_mod.ThresholdChecker(
config=config,
renotify_interval=config.get("threshold_renotify_interval", 3600),
journal=msg_journal,
)
logger.info("Threshold checker initialized")
# Components dict for reload orchestration
components = {
'threshold_checker': threshold_checker,
'msg_journal': msg_journal,
}
sock = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM)
# Disable IPV6_V6ONLY option to enable dual-stack (listen on IPv4 as well)
@@ -110,9 +171,10 @@ async def _run_async(config):
ctx = dict(
config=config,
hbdclass=hbdclass,
log=log,
pushmsg=pushmsg,
log=eventlog,
msg_to_websockets=msg_to_websockets,
msg_journal=msg_journal,
threshold_checker=threshold_checker,
DEBUG=config.get("debug", 0),
verbose=config.get("verbose", False),
)
@@ -131,12 +193,7 @@ async def _run_async(config):
port=config.get("hbd_port", 50004),
config=config,
hbdclass=hbdclass,
msgs_getter=lambda: msgs,
log=log,
pushmsg=pushmsg,
msg_to_websockets=msg_to_websockets,
tcss=None,
DEBUG=config.get("debug", 0),
verbose=config.get("verbose", False),
get_now=lambda: time.time(),
VER="",
@@ -154,7 +211,7 @@ async def _run_async(config):
dns_task = None
try:
dns_task = dns_mod.start_dns_worker(
hbdclass, config, log=log, pushmsg=pushmsg, loop=loop
hbdclass, config, log=eventlog, loop=loop
)
logger.info("dns update worker started")
except Exception as e:
@@ -180,43 +237,63 @@ async def _run_async(config):
ssl_context = None
try:
ws_port = config.get("ws_port", 50005)
logger.info("Starting WebSocket server on port %s", ws_port)
ws_task = asyncio.create_task(
ws_mod.start(
host=config.get("hbd_host", ""),
ws_port=config.get("ws_port", None),
ws_port=ws_port,
wss_port=config.get("wss_port", None),
ssl_context=ssl_context,
get_hosts=lambda: [
hbdclass.Host.hosts[h].stateinfo()
for h in sorted(hbdclass.Host.hosts)
],
get_msgs=lambda: msgs,
verbose=config.get("verbose", False),
# get_msgs=lambda: msgs,
config=config,
)
)
logger.info("WebSocket task started")
except Exception as e:
logger.exception("websocket server failed to start: %s", e)
# Start the monitor thread as a background task
# Main event loop - monitor shutdown and reload events
try:
monitor_task = asyncio.create_task(
monitor_mod.start(
config=config,
hbdclass=hbdclass,
log=log,
pushmsg=pushmsg,
msg_to_websockets=msg_to_websockets,
while True:
# Wait for either shutdown or reload event
done, pending = await asyncio.wait(
[
asyncio.create_task(shutdown_event.wait()),
asyncio.create_task(reload_event.wait()),
],
return_when=asyncio.FIRST_COMPLETED
)
)
logger.info("Monitor task started")
except Exception as e:
logger.exception("monitor task failed to start: %s", e)
try:
# run forever until shutdown event is set
await shutdown_event.wait()
logger.info("Shutdown signal received, stopping services...")
# Check which event was triggered
if shutdown_event.is_set():
logger.info("Shutdown signal received, stopping services...")
# Cancel pending wait tasks
for task in pending:
task.cancel()
break
if reload_event.is_set():
# Clear the event for next reload
reload_event.clear()
# Cancel pending wait tasks
for task in pending:
task.cancel()
# Perform reload if config_path is available
if config_path:
await reload_configuration(config, config_path, components)
else:
logger.warning("Cannot reload: no config path available")
# Continue main loop
continue
except Exception as e:
logger.exception("Error in main loop: %s", e)
finally:
@@ -227,7 +304,7 @@ async def _run_async(config):
except Exception as e:
logger.warning("Error closing UDP transport: %s", e)
tasks_to_cancel = [http_task, ws_task, monitor_task]
tasks_to_cancel = [http_task, ws_task]
for task in tasks_to_cancel:
if task:
try:
@@ -248,6 +325,12 @@ async def _run_async(config):
logger.warning("Timeout waiting for tasks to cancel")
except Exception as e:
logger.debug("Exception during task cancellation: %s", e)
# Close message journal
try:
await msg_journal.close()
except Exception as e:
logger.warning("Error closing message journal: %s", e)
# Signal DNS worker to exit and await it
try:
@@ -277,13 +360,13 @@ async def _run_async(config):
def load_pickled_hosts(config, hbdclass):
"""Load pickled hosts from file, if available."""
global lastfm, msgs
import os
import pickle
from . import config as config_mod
pickfile = config.get("pickfile", "hbd.pickle")
dyndnshosts = config.get("dyndnshosts", [])
watchhosts = config.get("watchhosts", [])
dyndnshosts = config_mod.get_dyndnshosts(config)
watchhosts = config_mod.get_watchhosts(config)
drophosts = config.get("drophosts", [])
if 1 and os.path.exists(pickfile):
if config.get("verbose", False):
@@ -292,11 +375,7 @@ def load_pickled_hosts(config, hbdclass):
pick = pickle.Unpickler(pickf)
try:
hbdclass.Host.hosts = pick.load()
msgs = pick.load()
try:
lastfm = pick.load()
except Exception:
lastfm = ["", "", ""]
data.msgs = pick.load()
pickf.close()
except Exception as e:
logger.exception("load pickled failed: %s", e)
@@ -316,12 +395,15 @@ def load_pickled_hosts(config, hbdclass):
logger.info("no pickled data")
def run(config):
def run(config, config_path=None):
"""Start the hbd service (blocking).
Manually manages the event loop to ensure clean shutdown.
Args:
config: Configuration dictionary
config_path: Path to config file (for reload support)
"""
global logf
import os
logging.basicConfig(
@@ -329,27 +411,29 @@ def run(config):
)
load_pickled_hosts(config, hbdclass)
logf = initlog(logfile=config.get("logfile", "messages.log"))
log(None, f"hbd version {__version__} starting up")
notify_mod.initlog(logfile=config.get("logfile", "messages.log"))
eventlog(None, "INFO", f"hbd version {__version__} starting up")
if config_path:
logger.info(f"Config file: {config_path} (reload with SIGHUP)")
else:
logger.warning("No config path provided - reload via SIGHUP disabled")
# Create and set the event loop manually
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
loop.run_until_complete(_run_async(config))
loop.run_until_complete(_run_async(config, config_path=config_path))
except KeyboardInterrupt:
logger.info("Received KeyboardInterrupt, shutting down...")
except Exception as e:
logger.exception("Unhandled exception in main: %s", e)
finally:
cleanup_function(config)
cleanup_function(config, hbdclass)
logger.info("hbd shutdown complete")
if logf and logf != sys.stderr:
try:
logf.close()
except Exception:
pass
eventlog(None, "INFO", f"hbd version {__version__} shutdown")
notify_mod.closelog()
# Explicitly close the loop
try:
# Cancel all remaining tasks
+28
View File
@@ -0,0 +1,28 @@
"""Monitor helper for heartbeat daemon.
This module provides monitoring tasks for the heartbeat daemon.
The primary reachability monitoring is now event-driven (timers set/reset
on HTB arrival in udp.py) rather than periodic polling.
This module can be extended for additional monitoring tasks.
"""
from __future__ import annotations
import asyncio
import time
from . import notify as notify_mod
DROPOVERDUE = 7 * 24 * 3600
eventlog = notify_mod.eventlog
async def cleanup_connections(hbdclass):
"""Clean up connection timers on shutdown.
Cancels all active overdue timers to prevent callbacks after shutdown.
"""
for hostname, host in list(hbdclass.Host.hosts.items()):
for conn_type, conn in host.connections.items():
if hasattr(conn, 'cancel_overdue_timer'):
conn.cancel_overdue_timer()
+326
View File
@@ -0,0 +1,326 @@
"""Notification helpers: email, pushover, mattermost, signal and dispatcher."""
import logging
from typing import Optional
import http.client
import urllib.parse
import subprocess
import smtplib
import time
import sys
from . import data
from . import ws as ws_mod
from . import main as main_mod
DEFAULT_PUSHPROVIDERS = ["all", "pushover", "mattermost", "signal"]
msg_to_websockets = ws_mod.broadcast
# module-level configuration set via setup()
_config = {}
logger = logging.getLogger(__name__)
logf = None
def initlog(logfile):
global logf
try:
logf = open(logfile, "a+")
except Exception as e:
import sys
print("cannot open logfile %s, using STDERR: %s" % (logfile, e))
logf = sys.stderr
return logf
def closelog():
global logf
if logf and logf != sys.stderr:
try:
logf.close()
except Exception:
pass
def eventlog(host, lvl, m, service=None):
ts = time.time()
s = f"{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ts))} {lvl} "
if host:
s += f"{host} "
s += m
data.msgs.append(s)
logger.info(s)
if logf:
try:
logf.write(s + "\n")
logf.flush()
except Exception as e:
logger.warning("failed to write to logfile: %s", e)
msg_to_websockets("message", s)
def setup(cfg: dict):
"""Initialize notifier defaults from a configuration dict."""
global _config
_config = dict(cfg)
def reload_config(cfg: dict):
"""Reload notification configuration.
This function updates the module-level notification configuration
during runtime config reloads.
Args:
cfg: New configuration dictionary
"""
global _config
_config = dict(cfg)
logger.info("Notification configuration reloaded")
def send_email(toaddrs, smtpserver, sender, subject, body, debug=0):
"""Send a plain email via SMTP. Returns True on success."""
try:
smtpport = _config.get("smtpport", 587)
server = smtplib.SMTP(smtpserver, smtpport)
if debug > 0:
server.set_debuglevel(1)
if smtpport == 587:
server.starttls()
server.ehlo()
smtpuser = _config.get("smtpuser", None)
smtppassword = _config.get("smtppassword", None)
if smtpuser and smtppassword:
server.login(smtpuser, smtppassword)
server.sendmail(sender, toaddrs, body)
except Exception as e:
logger.warning("email send failed: %s", e)
try:
server.quit()
except Exception:
pass
return False
try:
server.quit()
except Exception:
pass
return True
def email(subject: str, msg: str, debug: int = 0) -> bool:
"""Convenience wrapper exposed to the rest of the application.
Uses module-level configuration to supply recipient list, smtp server
and sender address.
"""
toaddrs = _config.get("toemail")
fromemail = _config.get("fromemail")
smtpserver = _config.get("smtpserver")
if not toaddrs or not fromemail or not smtpserver:
logger.warning(
"email config incomplete: toemail=%s, fromemail=%s, smtpserver=%s",
toaddrs,
fromemail,
smtpserver,
)
return False
date = time.strftime("%a, %d %b %Y %H:%M:%S %z", time.localtime())
body = "To: %s\nFrom: %s\nSubject: %s\nDate: %s\n\n%s" % (
toaddrs[0] if toaddrs else "",
fromemail,
subject,
date,
msg,
)
return send_email(toaddrs, smtpserver, fromemail, subject, body, debug=debug)
def pushover(token: str, user: str, msg: str, debug: int = 0) -> bool:
"""Send message via Pushover API."""
conn = http.client.HTTPSConnection("api.pushover.net:443")
try:
conn.request(
"POST",
"/1/messages.json",
urllib.parse.urlencode({"token": token, "user": user, "message": msg}),
{"Content-type": "application/x-www-form-urlencoded"},
)
r = conn.getresponse()
logger.debug("pushover response: %s %s", r.status, r.reason)
return r.status == 200
except Exception as e:
logger.error("pushover error: %s", e)
return False
def pushmattermost(
host: str,
token: str,
channel: str,
msg: str,
username: str = "hbd",
icon: Optional[str] = None,
debug: int = 0,
) -> bool:
"""Send a message to Mattermost via simple webhook driver if available.
This helper tries to import mattermostdriver.Driver and uses webhooks if present.
If the import fails it returns False.
"""
try:
from mattermostdriver import Driver
except Exception:
return False
ses = {"url": host, "scheme": "http", "basepath": "/api/v4", "port": 8065}
mm = Driver(ses)
payload = {"text": msg, "channel": channel, "username": username}
if icon:
payload["icon_url"] = icon
try:
rc = mm.webhooks.call_webhook(token, payload)
logger.debug("mattermost rc: %s", rc)
return bool(rc is None or rc == "")
except Exception as e:
logger.error("mattermost error: %s", e)
return False
def pushsignal(
signal_cli_bin: str, user: str, recipient: str, msg: str, debug: int = 0
) -> bool:
"""Send a message via signal-cli (requires local installation).
Uses subprocess to call signal-cli. Returns True if the command succeeded.
"""
CLI = [signal_cli_bin, "-u", user, "send", "-m", msg, recipient]
logger.debug("signal cli: %s", CLI)
try:
res = subprocess.run(CLI, capture_output=True)
if res.returncode != 0:
logger.error("signal failed: %s".res.stderr.decode())
return False
logger.debug("signal sent: %s", res.stdout.decode())
return True
except Exception as e:
logger.exception("signal exception: %s", e)
return False
def _dispatch_to_channel(channel_name: str, channel_config: dict, msg: str, debug: int = 0) -> bool:
"""Dispatch a message to a specific notification channel.
Args:
channel_name: Name of the channel (for logging)
channel_config: Channel configuration dictionary with 'type' and type-specific fields
msg: Message to send
debug: Debug level
Returns:
True if notification sent successfully, False otherwise
"""
channel_type = channel_config.get("type")
if channel_type == "pushover":
return pushover(
channel_config.get("token", ""),
channel_config.get("user", ""),
msg,
debug=debug
)
elif channel_type == "email":
# Build email from channel config
recipients = channel_config.get("recipients", [])
sender = channel_config.get("sender", "")
smtp_server = channel_config.get("smtp_server", "")
smtp_port = channel_config.get("smtp_port", 587)
smtp_user = channel_config.get("smtp_user")
smtp_password = channel_config.get("smtp_password")
if not recipients or not sender or not smtp_server:
logger.warning(
"Email channel '%s' missing required fields: recipients=%s, sender=%s, smtp_server=%s",
channel_name, recipients, sender, smtp_server
)
return False
# Temporarily update _config for email() function
old_config = dict(_config)
_config["toemail"] = recipients
_config["fromemail"] = sender
_config["smtpserver"] = smtp_server
_config["smtpport"] = smtp_port
if smtp_user:
_config["smtpuser"] = smtp_user
if smtp_password:
_config["smtppassword"] = smtp_password
result = email("Heartbeat notification", msg, debug=debug)
# Restore config
_config.clear()
_config.update(old_config)
return result
elif channel_type == "signal":
return pushsignal(
channel_config.get("cli_path", "/usr/local/bin/signal-cli"),
channel_config.get("user", ""),
channel_config.get("recipient", ""),
msg,
debug=debug
)
elif channel_type == "mattermost":
return pushmattermost(
channel_config.get("host", ""),
channel_config.get("token", ""),
channel_config.get("channel", ""),
msg,
username=channel_config.get("username", "hbd"),
icon=channel_config.get("icon"),
debug=debug
)
else:
logger.warning("Unknown channel type '%s' for channel '%s'", channel_type, channel_name)
return False
def pushmsg_for_host(hostname: str, msg: str, debug: int = 0) -> dict:
"""Send notification for a specific host using its configured channels.
This function looks up the host's notification channels from the config
and sends the message to those channels.
Args:
hostname: Name of the host to send notification for
msg: Message to send
debug: Debug level
Returns:
Dictionary of results per channel: {"channel_name": True/False}
"""
from . import config as config_mod
# Get notification channels for this host
channels = config_mod.get_notification_channels_config(_config, hostname)
if not channels:
logger.warning("No notification channels configured for host '%s'", hostname)
return {}
# Dispatch to each channel
results = {}
for channel_name, channel_config in channels:
try:
success = _dispatch_to_channel(channel_name, channel_config, msg, debug=debug)
results[channel_name] = success
if success:
logger.info("Notification sent to channel '%s': %s", channel_name, msg)
else:
logger.warning("Failed to send notification to channel '%s'", channel_name)
except Exception as e:
logger.error("Error sending to channel '%s': %s", channel_name, e)
results[channel_name] = False
return results

Before

Width:  |  Height:  |  Size: 5.3 KiB

After

Width:  |  Height:  |  Size: 5.3 KiB

+585
View File
@@ -0,0 +1,585 @@
<!DOCTYPE html>
<html>
{% include 'head.html' %}
<style>
body {
margin: 20px;
background: #f5f5f5;
}
.nav {
background: #fff;
padding: 15px;
margin-bottom: 20px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
border-radius: 4px;
}
.nav a {
margin-right: 20px;
text-decoration: none;
color: #0066cc;
font-weight: 500;
}
.nav a:hover {
text-decoration: underline;
}
.nav a.active {
color: #333;
font-weight: bold;
}
.container {
max-width: 1400px;
margin: 0 auto;
}
h1 {
color: #333;
margin-bottom: 10px;
}
.subtitle {
color: #666;
margin-bottom: 30px;
}
.summary-cards {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 20px;
margin-bottom: 30px;
}
.summary-card {
background: white;
border-radius: 8px;
padding: 20px;
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
text-align: center;
}
.summary-card.critical {
border-left: 5px solid #f44336;
}
.summary-card.warning {
border-left: 5px solid #ff9800;
}
.summary-card.ok {
border-left: 5px solid #4caf50;
}
.summary-number {
font-size: 3em;
font-weight: bold;
margin: 10px 0;
}
.summary-number.critical {
color: #f44336;
}
.summary-number.warning {
color: #ff9800;
}
.summary-number.ok {
color: #4caf50;
}
.summary-label {
color: #666;
text-transform: uppercase;
font-size: 0.9em;
letter-spacing: 1px;
}
.filters {
background: white;
border-radius: 8px;
padding: 15px;
margin-bottom: 20px;
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
display: flex;
gap: 15px;
align-items: center;
}
.filter-label {
font-weight: bold;
color: #555;
}
.filter-button {
padding: 8px 16px;
border: 2px solid #ddd;
background: white;
border-radius: 20px;
cursor: pointer;
transition: all 0.2s;
font-size: 0.9em;
}
.filter-button:hover {
border-color: #2196f3;
}
.filter-button.active {
background: #2196f3;
color: white;
border-color: #2196f3;
}
.alerts-container {
background: white;
border-radius: 8px;
padding: 20px;
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
}
.alert-item {
border-left: 5px solid #ddd;
padding: 15px;
margin-bottom: 15px;
background: #fafafa;
border-radius: 4px;
display: flex;
justify-content: space-between;
align-items: center;
transition: all 0.2s;
}
.alert-item.acknowledged {
opacity: 0.6;
background: #f0f0f0;
}
.alert-item:hover {
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
transform: translateX(5px);
}
.alert-item.critical {
border-left-color: #f44336;
background: #ffebee;
}
.alert-item.warning {
border-left-color: #ff9800;
background: #fff3e0;
}
.alert-item.unknown {
border-left-color: #9e9e9e;
background: #f5f5f5;
}
.alert-main {
flex: 1;
}
.alert-header {
display: flex;
align-items: center;
gap: 15px;
margin-bottom: 8px;
}
.alert-level {
padding: 4px 12px;
border-radius: 12px;
font-size: 0.75em;
font-weight: bold;
text-transform: uppercase;
letter-spacing: 0.5px;
}
.alert-level.critical {
background: #f44336;
color: white;
}
.alert-level.warning {
background: #ff9800;
color: white;
}
.alert-level.unknown {
background: #9e9e9e;
color: white;
}
.alert-hostname {
font-weight: bold;
color: #333;
font-size: 1.1em;
}
.alert-metric {
color: #666;
font-family: 'Courier New', monospace;
font-size: 0.9em;
}
.alert-details {
display: flex;
gap: 20px;
color: #666;
font-size: 0.9em;
}
.alert-value {
font-weight: bold;
color: #333;
}
.alert-duration {
color: #999;
font-size: 0.85em;
}
.alert-actions {
display: flex;
flex-direction: column;
gap: 8px;
margin-left: 15px;
}
.acknowledge-btn {
padding: 8px 16px;
background: #2196f3;
color: white;
border: none;
border-radius: 4px;
cursor: pointer;
font-size: 0.85em;
transition: all 0.2s;
white-space: nowrap;
}
.acknowledge-btn:hover {
background: #1976d2;
transform: scale(1.05);
}
.acknowledge-btn:disabled {
background: #ccc;
cursor: not-allowed;
transform: none;
}
.acknowledged-badge {
padding: 4px 8px;
background: #4caf50;
color: white;
border-radius: 4px;
font-size: 0.75em;
text-align: center;
white-space: nowrap;
}
.no-alerts {
text-align: center;
padding: 60px 20px;
color: #999;
}
.no-alerts-icon {
font-size: 4em;
margin-bottom: 20px;
}
.loading {
text-align: center;
padding: 40px;
color: #666;
}
.error {
background: #ffebee;
border-left: 4px solid #f44336;
padding: 20px;
margin: 20px 0;
border-radius: 4px;
color: #c62828;
}
.refresh-info {
text-align: center;
color: #999;
font-size: 0.85em;
margin-top: 20px;
padding-top: 20px;
border-top: 1px solid #e0e0e0;
}
.last-update {
color: #666;
font-size: 0.9em;
text-align: right;
margin-bottom: 15px;
}
</style>
<body>
<div class="nav">
<a href="/live">Live Dashboard</a>
<a href="/plugins">Plugin Metrics</a>
<a href="/alerts" class="active">Alerts</a>
</div>
<div class="container">
<h1>{{ header }}</h1>
<p class="subtitle">Real-time monitoring alerts and threshold violations</p>
<div class="summary-cards" id="summary-cards">
<div class="summary-card critical">
<div class="summary-label">Critical</div>
<div class="summary-number critical" id="critical-count">-</div>
</div>
<div class="summary-card warning">
<div class="summary-label">Warning</div>
<div class="summary-number warning" id="warning-count">-</div>
</div>
<div class="summary-card ok">
<div class="summary-label">Total Hosts</div>
<div class="summary-number ok" id="host-count">-</div>
</div>
</div>
<div class="filters">
<span class="filter-label">Show:</span>
<button class="filter-button active" onclick="filterAlerts('all')">All</button>
<button class="filter-button" onclick="filterAlerts('critical')">Critical Only</button>
<button class="filter-button" onclick="filterAlerts('warning')">Warning Only</button>
</div>
<div class="alerts-container">
<div class="last-update">Last updated: <span id="last-update-time">Never</span></div>
<div id="alerts-list">
<div class="loading">Loading alerts...</div>
</div>
<div class="refresh-info">
Auto-refreshing every 15 seconds
</div>
</div>
</div>
<script>
let currentFilter = 'all';
let allAlerts = [];
async function loadAlerts() {
try {
const response = await fetch('/api/0/alerts');
if (!response.ok) {
throw new Error(`HTTP ${response.status}`);
}
const data = await response.json();
allAlerts = data.alerts;
// Update summary cards
document.getElementById('critical-count').textContent = data.summary.critical || 0;
document.getElementById('warning-count').textContent = data.summary.warning || 0;
document.getElementById('host-count').textContent = data.host_count || 0;
// Update last update time
document.getElementById('last-update-time').textContent = new Date().toLocaleTimeString();
// Render alerts
renderAlerts(allAlerts);
} catch (error) {
document.getElementById('alerts-list').innerHTML =
`<div class="error">Failed to load alerts: ${error.message}</div>`;
}
}
function renderAlerts(alerts) {
const container = document.getElementById('alerts-list');
// Filter alerts based on current filter
let filteredAlerts = alerts;
if (currentFilter !== 'all') {
filteredAlerts = alerts.filter(alert =>
alert.level.toLowerCase() === currentFilter
);
}
if (filteredAlerts.length === 0) {
if (currentFilter === 'all' && alerts.length === 0) {
container.innerHTML = `
<div class="no-alerts">
<div class="no-alerts-icon">✓</div>
<h2>All Systems Normal</h2>
<p>No active alerts at this time</p>
</div>
`;
} else {
container.innerHTML = `
<div class="no-alerts">
<p>No ${currentFilter} alerts</p>
</div>
`;
}
return;
}
let html = '';
for (const alert of filteredAlerts) {
html += renderAlert(alert);
}
container.innerHTML = html;
}
function renderAlert(alert) {
const level = alert.level.toLowerCase();
const duration = getDuration(alert.since);
const acknowledged = alert.acknowledged || false;
// Use formatted message if available, otherwise build from individual fields
let valueText = `Value: <span class="alert-value">${formatValue(alert.last_value)}</span>`;
if (alert.formatted_message) {
valueText += ` <span class="threshold-info">${alert.formatted_message}</span>`;
} else if (alert.threshold_value !== undefined && alert.threshold_value !== null && alert.operator) {
valueText += ` <span class="threshold-info">(threshold: ${alert.operator} ${formatValue(alert.threshold_value)})</span>`;
}
// Build actions section
let actionsHtml = '';
if (acknowledged) {
actionsHtml = `
<div class="alert-actions">
<div class="acknowledged-badge">✓ Acknowledged</div>
</div>
`;
} else {
actionsHtml = `
<div class="alert-actions">
<button class="acknowledge-btn" onclick="acknowledgeAlert('${alert.hostname}', '${alert.metric_path}', event)">
Acknowledge
</button>
</div>
`;
}
return `
<div class="alert-item ${level} ${acknowledged ? 'acknowledged' : ''}">
<div class="alert-main">
<div class="alert-header">
<span class="alert-level ${level}">${alert.level}</span>
<span class="alert-hostname">${alert.hostname}</span>
</div>
<div class="alert-metric">${alert.metric_path}</div>
<div class="alert-details">
<span>${valueText}</span>
<span class="alert-duration">Active for ${duration}</span>
</div>
</div>
${actionsHtml}
</div>
`;
}
function formatValue(value) {
if (typeof value === 'number') {
if (value > 1000) {
return value.toLocaleString();
}
return value.toFixed(2);
}
return value;
}
function getDuration(timestamp) {
const now = Date.now() / 1000;
const seconds = Math.floor(now - timestamp);
if (seconds < 60) {
return `${seconds}s`;
} else if (seconds < 3600) {
return `${Math.floor(seconds / 60)}m`;
} else if (seconds < 86400) {
const hours = Math.floor(seconds / 3600);
const minutes = Math.floor((seconds % 3600) / 60);
return `${hours}h ${minutes}m`;
} else {
const days = Math.floor(seconds / 86400);
const hours = Math.floor((seconds % 86400) / 3600);
return `${days}d ${hours}h`;
}
}
function filterAlerts(filter) {
currentFilter = filter;
// Update active button
document.querySelectorAll('.filter-button').forEach(btn => {
btn.classList.remove('active');
});
event.target.classList.add('active');
// Re-render with new filter
renderAlerts(allAlerts);
}
async function acknowledgeAlert(hostname, metricPath, event) {
// Prevent event bubbling
if (event) {
event.stopPropagation();
}
// Disable the button
const button = event.target;
button.disabled = true;
button.textContent = 'Acknowledging...';
try {
const response = await fetch('/api/0/alerts/acknowledge', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({
hostname: hostname,
metric_path: metricPath,
}),
});
if (!response.ok) {
throw new Error(`HTTP ${response.status}`);
}
const result = await response.json();
// Update the alert in our local data
const alert = allAlerts.find(a => a.hostname === hostname && a.metric_path === metricPath);
if (alert) {
alert.acknowledged = true;
alert.acknowledged_at = result.acknowledged_at;
}
// Re-render alerts
renderAlerts(allAlerts);
} catch (error) {
alert(`Failed to acknowledge alert: ${error.message}`);
button.disabled = false;
button.textContent = 'Acknowledge';
}
}
// Auto-refresh every 15 seconds
setInterval(loadAlerts, 15000);
// Initial load
loadAlerts();
</script>
</body>
</html>
@@ -1,5 +1,5 @@
<footer>
<div id="copyright">
&copy;2002-2021 <A HREF="mailto:andreas@wrede.ca">Andreas Wrede</A> All Rights Reserved.</p>
&copy;2002-2026 <A HREF="mailto:andreas@wrede.ca">Andreas Wrede</A> All Rights Reserved.</p>
</div>
</footer>
+506
View File
@@ -0,0 +1,506 @@
<!DOCTYPE html>
<html>
{% include 'head.html' %}
<style>
body {
margin: 10px;
background: #f5f5f5;
overflow: hidden;
}
.nav {
background: #fff;
padding: 10px 15px;
margin-bottom: 10px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
border-radius: 4px;
}
.nav a {
margin-right: 20px;
text-decoration: none;
color: #0066cc;
font-weight: 500;
font-size: 0.9em;
}
.nav a:hover {
text-decoration: underline;
}
.nav a.active {
color: #333;
font-weight: bold;
}
.container {
max-width: 1600px;
margin: 0 auto;
max-height: calc(100vh - 120px);
overflow-y: auto;
padding-right: 10px;
}
h1 {
color: #333;
margin-bottom: 5px;
font-size: 1.5em;
}
h2 {
color: #333;
margin-bottom: 10px;
font-size: 1.2em;
padding: 10px 15px;
background: white;
border-radius: 6px;
box-shadow: 0 1px 4px rgba(0,0,0,0.1);
}
.subtitle {
color: #666;
margin-bottom: 15px;
font-size: 0.9em;
}
.content {
display: flex;
flex-direction: column;
gap: 15px;
}
.table-section {
background: white;
border-radius: 6px;
padding: 15px;
box-shadow: 0 1px 4px rgba(0,0,0,0.1);
}
.log-section {
background: white;
border-radius: 6px;
padding: 15px;
box-shadow: 0 1px 4px rgba(0,0,0,0.1);
max-height: 400px;
overflow-y: auto;
}
#ntable {
border-collapse: collapse;
width: 100%;
font-size: 0.9em;
}
#ntable td,
#ntable th {
border: 1px solid #e0e0e0;
text-align: left;
padding: 8px 10px;
}
#ntable tr:nth-child(even) {
background-color: #fafafa;
}
#ntable tr:hover {
background-color: #e3f2fd;
}
#ntable th {
padding: 12px 10px;
background-color: #2196f3;
color: white;
font-weight: 600;
position: sticky;
top: 0;
z-index: 10;
}
#ntable
th:not(.sorttable_sorted):not(.sorttable_sorted_reverse):not(.sorttable_nosort):after {
content: " ⇅";
opacity: 0.5;
}
/* Alert count column styling */
#ntable td.alert-warning {
color: #ff9800;
font-weight: bold;
text-align: center;
}
#ntable td.alert-critical {
color: #f44336;
font-weight: bold;
text-align: center;
}
/* Scrollbar styling */
.container::-webkit-scrollbar,
.log-section::-webkit-scrollbar {
width: 8px;
}
.container::-webkit-scrollbar-track,
.log-section::-webkit-scrollbar-track {
background: #f1f1f1;
border-radius: 4px;
}
.container::-webkit-scrollbar-thumb,
.log-section::-webkit-scrollbar-thumb {
background: #888;
border-radius: 4px;
}
.container::-webkit-scrollbar-thumb:hover,
.log-section::-webkit-scrollbar-thumb:hover {
background: #555;
}
/* Message styling */
#messages {
font-size: 0.85em;
line-height: 1.6;
}
#messages div {
padding: 5px 0;
border-bottom: 1px solid #f0f0f0;
}
/* Modal for connection status messages */
.connection-modal {
display: none;
position: fixed;
z-index: 1000;
left: 0;
top: 0;
width: 100%;
height: 100%;
background-color: rgba(0, 0, 0, 0.5);
}
.connection-modal.show {
display: flex;
justify-content: center;
align-items: center;
}
.connection-modal-content {
background-color: white;
padding: 30px 40px;
border-radius: 8px;
text-align: center;
box-shadow: 0 4px 20px rgba(0, 0, 0, 0.3);
min-width: 300px;
}
.connection-modal-content p {
margin: 0;
font-size: 16px;
color: #333;
}
/* State indicators */
.state-up {
color: #4caf50;
font-weight: 600;
}
.state-down {
color: #f44336;
font-weight: 700;
}
.state-overdue {
color: #ff9800;
font-weight: 700;
}
</style>
<script type="text/javascript">
var cnt = 0;
var nTable = document;
var name_idx = {};
var c = 0;
function setup() {
name_idx = {};
nTable = document.getElementById("ntable");
for (var i = 0, row; (row = nTable.rows[i]); i++) {
if (i == 0) continue;
name = nTable.rows[i].cells[0].innerText;
name_idx[name] = nTable.rows[i];
/* console.log("name_Id[" + name + "]: " + name_idx[name].innerText); */
}
}
function createRow(data) {
var row = document.createElement("tr");
var c_name = document.createElement("td");
var c_warning = document.createElement("td");
c_warning.style.textAlign = "center";
c_warning.style.color = "#ff9800";
c_warning.style.fontWeight = "bold";
var c_critical = document.createElement("td");
c_critical.style.textAlign = "center";
c_critical.style.color = "#f44336";
c_critical.style.fontWeight = "bold";
var c_ipv4addr = document.createElement("td");
var c_ipv4state = document.createElement("td");
var c_ipv4latency = document.createElement("td");
c_ipv4latency.style.textAlign = "right";
var c_ipv4statets = document.createElement("td");
c_ipv4statets.style.textAlign = "right";
var c_ipv6addr = document.createElement("td");
var c_ipv6state = document.createElement("td");
var c_ipv6latency = document.createElement("td");
c_ipv6latency.style.textAlign = "right";
var c_ipv6statets = document.createElement("td");
c_ipv6statets.style.textAlign = "right";
row.appendChild(c_name);
row.appendChild(c_warning);
row.appendChild(c_critical);
row.appendChild(c_ipv4addr);
row.appendChild(c_ipv4state);
row.appendChild(c_ipv4latency);
row.appendChild(c_ipv4statets);
row.appendChild(c_ipv6addr);
row.appendChild(c_ipv6state);
row.appendChild(c_ipv6latency);
row.appendChild(c_ipv6statets);
if (data.dyn) {
c_name.innerHTML = "<b>" + data.name + "</b>";
} else {
c_name.innerHTML = data.name;
}
// Set alert counts in "x/y" format (unacked/acked)
var warningUnacked = data.alert_warning_unacked || 0;
var warningAcked = data.alert_warning_acked || 0;
var criticalUnacked = data.alert_critical_unacked || 0;
var criticalAcked = data.alert_critical_acked || 0;
if (warningUnacked > 0 || warningAcked > 0) {
c_warning.innerHTML = warningAcked > 0 ? warningUnacked + "/" + warningAcked : warningUnacked;
} else {
c_warning.innerHTML = "";
}
if (criticalUnacked > 0 || criticalAcked > 0) {
c_critical.innerHTML = criticalAcked > 0 ? criticalUnacked + "/" + criticalAcked : criticalUnacked;
} else {
c_critical.innerHTML = "";
}
c_ipv4addr.innerHTML = data.connections[0].addr;
c_ipv4state.innerHTML = data.connections[0].state;
if (data.connections.length > 1) {
c_ipv6addr.innerHTML = data.connections[1].addr;
c_ipv6state.innerHTML = data.connections[1].state;
}
var table = document.getElementById("ntablebody"); // find table to append to
table.appendChild(row); // append row to table
name_idx[c_name] = row;
}
function formatTS(ts) {
const milliseconds = ts * 1000;
const dateObject = new Date(milliseconds);
return dateObject.toLocaleString("de-DE");
}
function update_table(data) {
if (!(data.name in name_idx)) {
createRow(data);
setup();
}
// Update warning and critical counts in "x/y" format (unacked/acked)
var warningUnacked = data.alert_warning_unacked || 0;
var warningAcked = data.alert_warning_acked || 0;
var criticalUnacked = data.alert_critical_unacked || 0;
var criticalAcked = data.alert_critical_acked || 0;
if (warningUnacked > 0 || warningAcked > 0) {
name_idx[data.name].cells[1].innerHTML = warningAcked > 0 ? warningUnacked + "/" + warningAcked : warningUnacked;
} else {
name_idx[data.name].cells[1].innerHTML = "";
}
if (criticalUnacked > 0 || criticalAcked > 0) {
name_idx[data.name].cells[2].innerHTML = criticalAcked > 0 ? criticalUnacked + "/" + criticalAcked : criticalUnacked;
} else {
name_idx[data.name].cells[2].innerHTML = "";
}
for (var i = 0; i < data.connections.length; i++) {
// Offset by 2 for the warning/critical count columns
name_idx[data.name].cells[3 + i * 4].innerHTML = data.connections[i].addr;
name_idx[data.name].cells[6 + i * 4].innerHTML = formatTS(
data.connections[i].statetime
);
if (data.connections[i].state == "up") {
state = '<span class="state-up">up</span>';
latency = Number.parseFloat(data.connections[i].rtts[0]).toFixed(2);
} else {
if (data.connections[i].state == "unknown") {
state = "";
latency = "";
name_idx[data.name].cells[3 + i * 4].innerHTML = "";
name_idx[data.name].cells[6 + i * 4].innerHTML = "";
} else if (data.connections[i].state == "down") {
state = '<span class="state-down">down</span>';
latency = "-";
} else if (data.connections[i].state == "overdue") {
state = '<span class="state-overdue">overdue</span>';
latency = "-";
} else {
state = "<b>" + data.connections[i].state + "</b>";
latency = "-";
}
}
name_idx[data.name].cells[4 + i * 4].innerHTML = state;
name_idx[data.name].cells[5 + i * 4].innerHTML = latency;
}
}
function WS_Connect() {
if ("WebSocket" in window) {
//N.B: subprotocol field causes chrome to error 1006
var ws_hbd = new WebSocket("{{heartbeat_ws_url}}", /* "hdb" */ );
ws_hbd.onopen = function () {
// Web Socket is connected, send data using send()
console.log("ws connect {{heartbeat_ws_url}}");
// Hide modal window if visible
var modal = document.getElementById("connectionModal");
if (modal) {
modal.classList.remove("show");
}
ws_hbd.send("heartbeat_web");
};
ws_hbd.onerror = function (event) {
console.log(event);
};
ws_hbd.onmessage = function (event) {
/* console.log(event.data); */
var state = JSON.parse(event.data);
/* console.log("State: " + state.type); */
if (state.type == "host") {
update_table(state.data);
} else if (state.type == "message") {
var msgs = document.getElementById("messages");
msgs.insertAdjacentHTML("afterbegin", "<div>" + state.data + "</div>");
}
cnt++;
};
ws_hbd.onclose = function (event) {
/* console.log(event); */
console.log("Connection is closed, reopening");
// Show modal window
var modal = document.getElementById("connectionModal");
if (modal) {
modal.classList.add("show");
}
setTimeout(function () {
WS_Connect();
}, 3000);
};
} else {
// The browser doesn't support WebSocket
console.log("WebSocket NOT supported by your Browser!");
}
}
WS_Connect();
</script>
<body>
<div class="nav">
<a href="/live" class="active">Live Dashboard</a>
<a href="/plugins">Plugin Metrics</a>
<a href="/alerts">Alerts</a>
</div>
{% include 'menu.html' %}
<div class="container">
<h1>{{ header }}</h1>
<p class="subtitle">Real-time host monitoring and event log</p>
<div class="table-section">
<table id="ntable" class="sortable">
<thead>
<tr>
<th>Name</th>
<th style="text-align: center" title="Warning Alerts">⚠️</th>
<th style="text-align: center" title="Critical Alerts">🔴</th>
<th>IPv4 Addr</th>
<th>State</th>
<th style="text-align: right">Latency</th>
<th style="text-align: right">Last State</th>
<th>IPv6 Addr</th>
<th>State</th>
<th style="text-align: right">Latency</th>
<th style="text-align: right">Last State</th>
</tr>
</thead>
<tbody id="ntablebody">
{% for host in hosts %}
<tr>
<td>{{ host.name }}</td>
<td style="text-align: center; color: #ff9800; font-weight: bold;">
{%- set warning_unacked = host.alert_warning_unacked -%}
{%- set warning_acked = host.alert_warning_acked -%}
{%- if warning_unacked > 0 or warning_acked > 0 -%}
{{ warning_unacked }}{% if warning_acked > 0 %}/{{ warning_acked }}{% endif %}
{%- endif -%}
</td>
<td style="text-align: center; color: #f44336; font-weight: bold;">
{%- set critical_unacked = host.alert_critical_unacked -%}
{%- set critical_acked = host.alert_critical_acked -%}
{%- if critical_unacked > 0 or critical_acked > 0 -%}
{{ critical_unacked }}{% if critical_acked > 0 %}/{{ critical_acked }}{% endif %}
{%- endif -%}
</td>
{% for conn in host.connections %}
<td>{{ conn.addr if conn.addr else '' }}</td>
<td>{{ conn.state if conn.state else '' }}</td>
<td style="text-align: right">{{ conn.latency if conn.latency else '' }}</td>
<td style="text-align: right">{{ conn.last_state_ts if conn.last_state_ts else '' }}</td>
{% endfor %}
{% if host.connections|length == 0 %}
<td></td><td></td><td></td><td></td>
<td></td><td></td><td></td><td></td>
{% elif host.connections|length == 1 %}
<td></td><td></td><td></td><td></td>
{% endif %}
</tr>
{% endfor %}
</tbody>
</table>
</div>
<div class="log-section">
<h2>Log of Events</h2>
<div id="messages"></div>
</div>
</div>
{% include 'foot.html' %}
<!-- Connection status modal -->
<div id="connectionModal" class="connection-modal">
<div class="connection-modal-content">
<p>⚠️ Connection is closed, reopening...</p>
</div>
</div>
<script>
setup();
</script>
</body>
</html>
+3
View File
@@ -0,0 +1,3 @@
<!-- <label for="drawer-toggle" id="drawer-toggle-label"></label>
s<header>{{ header }}</header> -->
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+316
View File
@@ -0,0 +1,316 @@
"""UDP listener and datagram processing."""
import asyncio
import zlib
import logging
from ..common.proto import stodict, oldmtodict
from ..common.utils import dur
from . import notify as notify_mod
logger = logging.getLogger(__name__)
eventlog = notify_mod.eventlog
class EchoServerProtocol(asyncio.DatagramProtocol):
def __init__(self, config=None, handler=None):
super().__init__()
self.config = config or {}
self.handler = handler
def connection_made(self, transport):
self.transport = transport
logger.info("UDP Server listening...")
def datagram_received(self, data, addr):
logger.debug("Received from %s", addr)
try:
msg = parse_message(data)
if self.handler:
# handler can be a callable provided by the application
# pass the transport so handlers can send replies (ACKs/commands)
self.handler(msg, addr, self.transport)
except Exception:
logger.exception("Error while processing datagram from %s", addr)
def parse_message(data: bytes):
"""Parse a raw datagram into a message dict.
Uses the protocol decoding helpers and falls back to old format when
decoding returns an empty dict (compat with older clients).
"""
msg = stodict(data)
if not msg:
# fallback to old format
msg = oldmtodict(data)
return msg
def dicttos(ID, d):
s = []
for k in d:
if isinstance(d[k], float):
s.append("%s=%0.5f" % (k, d[k]))
else:
s.append("%s=%s" % (k, d[k]))
pk = ";".join(s)
zpk = zlib.compress(pk.encode(), 6)
ID = "!" + ID + ":"
opk = ID.encode() + zpk
return opk
def handle_datagram(msg: dict, addr, transport, ctx: dict):
"""Handle a parsed datagram message.
ctx is a dictionary with runtime dependencies:
- config: dict of configuration
- hbdclass: module providing Host/Connection classes
- log: callable(loghost, message)
- msg_to_websockets: callable(typ, data)
- msg_journal: MessageJournal instance for logging all messages
- DEBUG, verbose
"""
if not msg:
return
now = __import__("time").time()
# Log message to journal
msg_journal = ctx.get("msg_journal")
if msg_journal:
# Create async task to log message (non-blocking)
import asyncio
try:
loop = asyncio.get_event_loop()
loop.create_task(msg_journal.log_message(msg, addr, now))
except Exception as e:
logger.debug(f"Failed to log message to journal: {e}")
cfg = ctx.get("config", {})
hbdcls = ctx.get("hbdclass")
log = ctx.get("log")
msg_to_websockets = ctx.get("msg_to_websockets")
DEBUG = ctx.get("DEBUG", 0)
verbose = ctx.get("verbose", False)
# normalize addr (ip, port)
ip = addr[0] if isinstance(addr, (list, tuple)) else addr
name = msg.get("name", "unknown")
from ..common.utils import shortname
from . import config as config_mod
uname = shortname(name)
if uname not in hbdcls.Host.hosts:
host = hbdcls.Host(uname)
# Use new config function to check dyndns
dyndnshosts = config_mod.get_dyndnshosts(cfg)
host.dyn = uname in dyndnshosts
if verbose:
print(("XX: New host, num now %s" % (len(hbdcls.Host.hosts))))
newh = True
else:
host = hbdcls.Host.hosts[uname]
newh = False
# Get watchhosts once for use throughout message handling
watchhosts = config_mod.get_watchhosts(cfg)
cid = msg.get("id", 0)
try:
rtt = float(msg.get("rtt"))
except TypeError:
rtt = None
if msg.get("ID") == "HTB":
host.doesack = msg.get("acks", -1)
# send ACK back
rmsg = {"time": __import__("time").time()}
opkt = dicttos("ACK", rmsg)
try:
transport.sendto(opkt, addr)
except Exception as e:
if DEBUG > 0:
print(("cannot send ack: %s" % e))
elif msg.get("ID") == "PLG":
# Handle plugin data message
plugin_name = msg.get("plugin")
if plugin_name:
# Extract all fields except ID and plugin name
plugin_data = {k: v for k, v in msg.items() if k not in ["ID", "plugin"]}
# Store plugin data with timestamp
host.add_plugin_data(plugin_name, plugin_data, timestamp=now)
if DEBUG > 1:
print(f"Stored plugin data for {uname}: {plugin_name}")
# Check thresholds if checker is available
threshold_checker = ctx.get("threshold_checker")
if threshold_checker:
try:
state_changes = threshold_checker.check_plugin_data(
host_name=uname,
plugin_name=plugin_name,
data=plugin_data,
alert_states=host.alert_states,
)
if DEBUG > 1 and state_changes:
print(f"Threshold state changes for {uname}: {state_changes}")
except Exception as e:
logger.error(f"Error checking thresholds for {uname}.{plugin_name}: {e}")
# Notify websockets of plugin update
if msg_to_websockets:
try:
msg_to_websockets("plugin", {
"host": uname,
"plugin": plugin_name,
"data": plugin_data,
"timestamp": now
})
except Exception:
pass
try:
conn, res = host.conndata(cid, ip, rtt, now)
except Exception as e:
if DEBUG > 0:
print("conndata failed: %s" % e)
return
if res:
eventlog(uname, "WARNING", res)
if uname in watchhosts:
notify_mod.pushmsg_for_host(uname, "%s %s" % (host.name, res))
interval = int(msg.get("interval", 0) or 0)
shutdown = msg.get("shutdown", 0)
service = msg.get("service", "unknown")
message = msg.get("msg", None)
boot = msg.get("boot", 0)
if boot:
eventlog(uname, "INFO", "booted")
if uname in watchhosts:
m = "%s booted" % (host.name)
notify_mod.pushmsg_for_host(uname, m)
if message:
eventlog(uname, "INFO", "msg: %s" % message, service=service)
if uname in watchhosts:
notify_mod.pushmsg_for_host(uname, message)
if conn.getstate() != hbdcls.Connection.UP:
lasts = conn.state
d = conn.newstate(hbdcls.Connection.UP, now)
if d == 0 or lasts == "unknown":
m = "%s is up" % (conn.afam)
else:
m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d))
eventlog(uname, "RECOVER", m)
if uname in watchhosts:
notify_mod.pushmsg_for_host(uname, "%s %s is back" % (uname, conn.afam))
if boot or newh:
host.upcount = host.doesack
else:
host.upcount += 1
if shutdown:
eventlog(uname, "INFO", "%s shutdown" % conn.afam)
if uname in watchhosts:
notify_mod.pushmsg_for_host(uname, "%s %s shutdown" % (uname, conn.afam))
conn.newstate(hbdcls.Connection.DOWN, now)
if interval > 0:
host.interval = interval
# Timer-based reachability monitoring
# Reset overdue timer on every heartbeat
if interval > 0 and conn.getstate() != hbdcls.Connection.DOWN:
grace = cfg.get("grace", 2)
timeout_seconds = (interval + grace) if interval > 0 else 30
# Create callback for timer expiration
async def on_overdue(connection):
"""Called when connection timer expires (no heartbeat received)."""
import time
now = time.time()
# Only mark as overdue if still in UP state (not already marked)
if connection.getstate() == hbdcls.Connection.UP:
connection.newstate(hbdcls.Connection.OVERDUE, now, cfg.get("grace", 2))
msg = f"{connection.afam} overdue"
eventlog(uname, "CRITICAL" if uname in watchhosts else "WARNING", msg)
if uname in watchhosts:
notify_mod.pushmsg_for_host(uname, f"{uname} {msg}")
# Check RTT thresholds with infinite RTT for overdue hosts
threshold_checker = ctx.get("threshold_checker")
if threshold_checker:
metric_path = "rtt"
threshold_checker.check_value(
host_name=uname,
metric_path=metric_path,
value=float('inf'),
alert_states=host.alert_states
)
# Notify websockets
if msg_to_websockets:
msg_to_websockets("host", host.stateinfo())
# Set a longer timer for marking as UNKNOWN (7 days)
DROPOVERDUE = 7 * 24 * 3600
async def on_unknown(connection):
"""Mark connection as unknown after extended absence."""
connection.newstate(hbdcls.Connection.UNKNOWN, connection.lastbeat)
if msg_to_websockets:
msg_to_websockets("host", host.stateinfo())
connection.reset_overdue_timer(DROPOVERDUE, on_unknown)
# Reset the timer
conn.reset_overdue_timer(timeout_seconds, on_overdue)
# Check RTT thresholds using the threshold checker
threshold_checker = ctx.get("threshold_checker")
if threshold_checker and rtt and rtt > 0:
# Metric path for RTT is simply "rtt"
metric_path = "rtt"
# Check against configured thresholds (handles alerts, notifications, etc.)
threshold_checker.check_value(
host_name=uname,
metric_path=metric_path,
value=rtt,
alert_states=host.alert_states
)
# send any commands we have queued
while len(host.cmds):
op, rmsg = host.cmds[0]
if op == "CMD":
del host.cmds[0]
if log:
log(uname, "command sent")
elif op == "UPD":
del host.cmds[0]
if log:
log(uname, "update initiated")
opkt = dicttos(op, rmsg)
try:
transport.sendto(opkt, addr)
except Exception as e:
if DEBUG > 0:
print(("cannot send cmd/update: %s" % e))
if msg_to_websockets:
try:
msg_to_websockets("host", host.stateinfo())
except Exception as e:
if DEBUG > 0:
print(("cannot send websocket message: %s" % e))
+36 -24
View File
@@ -8,15 +8,16 @@ import asyncio
import json
import logging
from typing import Callable, Iterable, Optional
from . import data
import websockets
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
_connections = set()
_loop: Optional[asyncio.AbstractEventLoop] = None
_get_hosts: Optional[Callable[[], Iterable]] = None
_get_msgs: Optional[Callable[[], Iterable]] = None
#_get_msgs: Optional[Callable[[], Iterable]] = None
_verbose = False
@@ -25,19 +26,28 @@ async def _handler(websocket, path=None):
remote_address = websocket.remote_address
if path is None:
path = getattr(websocket, "path", None)
if _verbose:
logger.info("DBG ws_serve: %s: %s", remote_address, path)
logger.info("WebSocket connection from %s: %s", remote_address, path)
try:
# send initial hosts
if _get_hosts:
for h in _get_hosts():
jmsg = json.dumps({"type": "host", "data": h})
await websocket.send(jmsg)
try:
hosts = list(_get_hosts())
logger.debug("Sending %d hosts to new WebSocket client", len(hosts))
for h in hosts:
jmsg = json.dumps({"type": "host", "data": h})
await websocket.send(jmsg)
except Exception as e:
logger.error("Error sending initial hosts: %s", e, exc_info=True)
# send recent messages
if _get_msgs:
for m in list(_get_msgs())[-100:]:
jmsg = json.dumps({"type": "message", "data": m})
await websocket.send(jmsg)
if data.msgs:
try:
# msgs = list(_get_msgs())[-100:]
logger.debug("Sending %d recent messages to new WebSocket client", len(data.msgs))
for m in data.msgs:
jmsg = json.dumps({"type": "message", "data": m})
await websocket.send(jmsg)
except Exception as e:
logger.error("Error sending initial messages: %s", e, exc_info=True)
# keep connection open until client disconnects
async for _ in websocket:
@@ -50,11 +60,11 @@ async def _handler(websocket, path=None):
websockets.exceptions.ConnectionClosedOK,
websockets.exceptions.ConnectionClosedError,
) as e:
if _verbose:
logger.info("ws closed: %r", e)
logger.info("WebSocket closed from %s: %r", remote_address, e)
except Exception as e:
logger.exception("ws handler exception: %s", e)
logger.exception("WebSocket handler exception from %s: %s", remote_address, e)
finally:
logger.debug("Removing WebSocket connection from %s", remote_address)
try:
_connections.remove(websocket)
except KeyError:
@@ -68,8 +78,8 @@ async def start(
wss_port: Optional[int] = None,
ssl_context=None,
get_hosts: Optional[Callable] = None,
get_msgs: Optional[Callable] = None,
verbose: bool = False,
# get_msgs: Optional[Callable] = None,
config: dict = {},
):
"""Start WebSocket servers and block until cancelled.
@@ -77,16 +87,19 @@ async def start(
If `wss_port` and `ssl_context` are provided, a WSS server will also be
started.
"""
global _loop, _get_hosts, _get_msgs, _verbose
global _loop, _get_hosts, _verbose
_loop = asyncio.get_running_loop()
_get_hosts = get_hosts
_get_msgs = get_msgs
_verbose = verbose
_verbose = config.get("verbose", False),
_debug = config.get("debug", 0),
servers = []
# plain WebSocket
websockets_logger = logging.getLogger("websockets.server")
websockets_logger.setLevel(logging.DEBUG if verbose else logging.INFO)
#if _debug > 2:
# websockets_logger.setLevel(logging.DEBUG)
#else:
# websockets_logger.setLevel(logging.INFO)
# regular WebSocket
ws_server = websockets.serve(_handler, host, ws_port) # , subprotocols=["hbd"])
servers.append(ws_server)
@@ -101,10 +114,9 @@ async def start(
for srv in servers:
await srv
if _verbose:
logger.info(
"WebSocket server(s) started on port %s (wss %s)", ws_port, wss_port
)
logger.info(
"WebSocket server(s) started on port %s (wss %s)", ws_port, wss_port
)
# block forever (until loop is stopped or cancelled)
await asyncio.Future()
-281
View File
@@ -1,281 +0,0 @@
<!DOCTYPE html>
<html>
{% include 'head.html' %}
<style>
.content {
display: flex;
flex-direction: column;
}
.table {
/* flex: 1; */
flex-grow: none;
}
.log {
flex: 2;
flex-grow: 1;
}
#ntable {
border-collapse: collapse;
font-size: 95%;
/* width: 100%; */
}
#ntable td,
#ntable th {
border: 1px solid #ddd;
text-align: left;
padding: 0px;
}
#ntable tr:nth-child(even) {
background-color: #f2f2f2;
}
#ntable tr:hover {
background-color: #ddd;
}
#ntable th {
padding-top: 12px;
padding-bottom: 12px;
background-color: #9d9d9d;
color: white;
}
#ntable
th:not(.sorttable_sorted):not(.sorttable_sorted_reverse):not(.sorttable_nosort):after {
content: " \2195";
}
/* Modal for connection status messages */
.connection-modal {
display: none;
position: fixed;
z-index: 1000;
left: 0;
top: 0;
width: 100%;
height: 100%;
background-color: rgba(0, 0, 0, 0.4);
}
.connection-modal.show {
display: flex;
justify-content: center;
align-items: center;
}
.connection-modal-content {
background-color: #f9f9f9;
padding: 20px;
border: 1px solid #888;
border-radius: 5px;
text-align: center;
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
min-width: 300px;
}
.connection-modal-content p {
margin: 10px 0;
font-size: 16px;
color: #333;
}
</style>
<script type="text/javascript">
var cnt = 0;
var nTable = document;
var name_idx = {};
var c = 0;
function setup() {
name_idx = {};
nTable = document.getElementById("ntable");
for (var i = 0, row; (row = nTable.rows[i]); i++) {
if (i == 0) continue;
name = nTable.rows[i].cells[0].innerText;
name_idx[name] = nTable.rows[i];
/* console.log("name_Id[" + name + "]: " + name_idx[name].innerText); */
}
}
function createRow(data) {
var row = document.createElement("tr");
var c_name = document.createElement("td");
var c_ver = document.createElement("td");
var c_ipv4addr = document.createElement("td");
var c_ipv4state = document.createElement("td");
var c_ipv4latency = document.createElement("td");
c_ipv4latency.style.textAlign = "right";
var c_ipv4statets = document.createElement("td");
c_ipv4statets.style.textAlign = "right";
var c_ipv6addr = document.createElement("td");
var c_ipv6state = document.createElement("td");
var c_ipv6latency = document.createElement("td");
c_ipv6latency.style.textAlign = "right";
var c_ipv6statets = document.createElement("td");
c_ipv6statets.style.textAlign = "right";
row.appendChild(c_name);
row.appendChild(c_ver);
row.appendChild(c_ipv4addr);
row.appendChild(c_ipv4state);
row.appendChild(c_ipv4latency);
row.appendChild(c_ipv4statets);
row.appendChild(c_ipv6addr);
row.appendChild(c_ipv6state);
row.appendChild(c_ipv6latency);
row.appendChild(c_ipv6statets);
if (data.dyn) {
c_name.innerHTML = "<b>" + data.name + "</b>";
} else {
c_name.innerHTML = data.name;
}
c_ver.innerHTML = data.cver;
c_ipv4addr.innerHTML = data.connections[0].addr;
c_ipv4state.innerHTML = data.connections[0].state;
if (data.connections.length > 1) {
c_ipv6addr.innerHTML = data.connections[1].addr;
c_ipv6state.innerHTML = data.connections[1].state;
}
var table = document.getElementById("ntablebody"); // find table to append to
table.appendChild(row); // append row to table
name_idx[c_name] = row;
}
function formatTS(ts) {
const milliseconds = ts * 1000;
const dateObject = new Date(milliseconds);
return dateObject.toLocaleString("de-DE");
}
function update_table(data) {
if (!(data.name in name_idx)) {
createRow(data);
setup();
}
for (var i = 0; i < data.connections.length; i++) {
name_idx[data.name].cells[2 + i * 4].innerHTML = data.connections[i].addr;
name_idx[data.name].cells[5 + i * 4].innerHTML = formatTS(
data.connections[i].statetime
);
if (data.connections[i].state == "up") {
state = "up";
latency = Number.parseFloat(data.connections[i].rtts[0]).toFixed(2);
} else {
if (data.connections[i].state == "unknown") {
state = "";
latency = "";
name_idx[data.name].cells[2 + i * 4].innerHTML = "";
name_idx[data.name].cells[5 + i * 4].innerHTML = "";
} else {
state = "<b>" + data.connections[i].state + "</b>";
latency = "-";
}
}
name_idx[data.name].cells[3 + i * 4].innerHTML = state;
name_idx[data.name].cells[4 + i * 4].innerHTML = latency;
}
}
function WS_Connect() {
if ("WebSocket" in window) {
//N.B: subprotocol field causes chrome to error 1006
var ws_hbd = new WebSocket("{{heartbeat_ws_url}}", /* "hdb" */ );
ws_hbd.onopen = function () {
// Web Socket is connected, send data using send()
console.log("ws connect {{heartbeat_ws_url}}");
// Hide modal window if visible
var modal = document.getElementById("connectionModal");
if (modal) {
modal.classList.remove("show");
}
ws_hbd.send("heartbeat_web");
};
ws_hbd.onerror = function (event) {
console.log(event);
};
ws_hbd.onmessage = function (event) {
/* console.log(event.data); */
var state = JSON.parse(event.data);
/* console.log("State: " + state.type); */
if (state.type == "host") {
update_table(state.data);
} else if (state.type == "message") {
var msgs = document.getElementById("messages");
msgs.insertAdjacentHTML("afterbegin", state.data + "<br>");
}
cnt++;
};
ws_hbd.onclose = function (event) {
/* console.log(event); */
console.log("Connection is closed, reopening");
// Show modal window
var modal = document.getElementById("connectionModal");
if (modal) {
modal.classList.add("show");
}
setTimeout(function () {
WS_Connect();
}, 3000);
};
} else {
// The browser doesn't support WebSocket
console.log("WebSocket NOT supported by your Browser!");
}
}
WS_Connect();
</script>
<body>
{% include 'menu.html' %}
<div id="content" class="content" style="overflow: hidden">
<div id="table" class="table" style="overflow: hidden">
<!-- <h2>{{title}}</h2> -->
<table id="ntable" class="sortable">
<thead>
<tr>
<th>Name</th>
<th>Ver</th>
<th>IPv4 Addr</th>
<th>State</th>
<th style="text-align: right">Latencey</th>
<th style="text-align: right">Last State</th>
<th>IPv6 Addr</th>
<th>State</th>
<th style="text-align: right">Latencey</th>
<th style="text-align: right">Last State</th>
</tr>
</thead>
<tbody id="ntablebody"></tbody>
</table>
</div>
<div id="log" class="log" style="overflow: auto;">
<h2>Log of Events</h2>
<div id="messages">
</div>
</div>
</div>
{% include 'foot.html' %}
<!-- Connection status modal -->
<div id="connectionModal" class="connection-modal">
<div class="connection-modal-content">
<p>⚠️ Connection is closed, reopening...</p>
</div>
</div>
<script>
setup();
</script>
</body>
</html>
-3
View File
@@ -1,3 +0,0 @@
<label for="drawer-toggle" id="drawer-toggle-label"></label>
<header>{{ header }}</header>
-220
View File
@@ -1,220 +0,0 @@
"""UDP listener and datagram processing."""
import asyncio
import zlib
import logging
from .proto import stodict, oldmtodict
from hbd.utils import dur
logger = logging.getLogger(__name__)
class EchoServerProtocol(asyncio.DatagramProtocol):
def __init__(self, config=None, handler=None):
super().__init__()
self.config = config or {}
self.handler = handler
def connection_made(self, transport):
self.transport = transport
logger.info("UDP Server listening...")
def datagram_received(self, data, addr):
logger.debug("Received from %s", addr)
try:
msg = parse_message(data)
if self.handler:
# handler can be a callable provided by the application
# pass the transport so handlers can send replies (ACKs/commands)
self.handler(msg, addr, self.transport)
except Exception:
logger.exception("Error while processing datagram from %s", addr)
def parse_message(data: bytes):
"""Parse a raw datagram into a message dict.
Uses the protocol decoding helpers and falls back to old format when
decoding returns an empty dict (compat with older clients).
"""
msg = stodict(data)
if not msg:
# fallback to old format
msg = oldmtodict(data)
return msg
def dicttos(ID, d, compress=False):
s = []
for k in d:
if isinstance(d[k], float):
s.append("%s=%0.5f" % (k, d[k]))
else:
s.append("%s=%s" % (k, d[k]))
pk = ";".join(s)
if compress:
zpk = zlib.compress(pk.encode(), 6)
ID = "!" + ID + ":"
opk = ID.encode() + zpk
else:
zpk = pk
opk = ID + ":" + zpk
return opk
def handle_datagram(msg: dict, addr, transport, ctx: dict):
"""Handle a parsed datagram message.
ctx is a dictionary with runtime dependencies:
- config: dict of configuration
- hbdclass: module providing Host/Connection classes
- log: callable(loghost, message)
- pushmsg: callable(message)
- msg_to_websockets: callable(typ, data)
- DEBUG, verbose
"""
if not msg:
return
now = __import__("time").time()
cfg = ctx.get("config", {})
hbdcls = ctx.get("hbdclass")
log = ctx.get("log")
pushmsg = ctx.get("pushmsg")
msg_to_websockets = ctx.get("msg_to_websockets")
DEBUG = ctx.get("DEBUG", 0)
verbose = ctx.get("verbose", False)
# normalize addr (ip, port)
ip = addr[0] if isinstance(addr, (list, tuple)) else addr
name = msg.get("name", "unknown")
from hbd.utils import shortname
uname = shortname(name)
if uname not in hbdcls.Host.hosts:
host = hbdcls.Host(uname)
host.dyn = uname in cfg.get("dyndnshosts", [])
if verbose:
print(("XX: New host, num now %s" % (len(hbdcls.Host.hosts))))
newh = True
else:
host = hbdcls.Host.hosts[uname]
newh = False
cid = msg.get("id", 0)
try:
rtt = float(msg.get("rtt", None))
except Exception:
rtt = None
if msg.get("ID") == "HTB":
host.doesack = msg.get("acks", -1)
host.setcver(msg.get("ver", 0))
try:
conn, res = host.conndata(cid, ip, rtt, now)
except Exception as e:
if DEBUG > 0:
print("conndata failed: %s" % e)
return
if res:
if log:
log(uname, res)
if uname in cfg.get("watchhosts", []):
if pushmsg:
pushmsg("%s %s" % (host.name, res))
interval = int(msg.get("interval", 0) or 0)
shutdown = msg.get("shutdown", 0)
service = msg.get("service", "unknown")
message = msg.get("msg", None)
boot = msg.get("boot", 0)
if boot:
if log:
log(uname, "booted")
if uname in cfg.get("watchhosts", []):
m = "%s booted" % (host.name)
if pushmsg:
pushmsg(m)
if message:
if log:
log(uname, "msg: %s" % message, service=service)
if uname in cfg.get("watchhosts", []):
if pushmsg:
pushmsg(message)
if conn.getstate() != hbdcls.Connection.UP:
lasts = conn.state
d = conn.newstate(hbdcls.Connection.UP, now)
m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d))
if log:
log(uname, m)
if uname in cfg.get("watchhosts", []):
if pushmsg:
pushmsg("%s %s is back" % (uname, conn.afam))
if boot or newh:
host.upcount = host.doesack
else:
host.upcount += 1
if shutdown:
if log:
log(uname, "%s shutdown" % conn.afam)
if uname in cfg.get("watchhosts", []):
if pushmsg:
pushmsg("%s %s shutdown" % (uname, conn.afam))
conn.newstate(hbdcls.Connection.DOWN, now)
if interval > 0:
host.interval = interval
# send ACK back
rmsg = {"time": __import__("time").time()}
if host.cver < 1:
opkt = b"ACK"
else:
opkt = dicttos("ACK", rmsg, host.cver > 1)
try:
transport.sendto(opkt, addr)
except Exception as e:
if DEBUG > 0:
print(("cannot send ack: %s" % e))
# send any commands we have queued
while len(host.cmds):
op, rmsg = host.cmds[0]
if op == "CMD":
del host.cmds[0]
if log:
log(uname, "command sent")
if host.cver < 1:
rmsg = rmsg["cmd"]
elif op == "UPD":
del host.cmds[0]
if log:
log(uname, "update initiated")
if host.cver < 1:
if log:
log(uname, " ver 0 does not support UPD")
continue
if host.cver < 1:
opkt = rmsg if isinstance(rmsg, (bytes, str)) else str(rmsg)
if isinstance(opkt, str):
opkt = opkt.encode()
else:
opkt = dicttos(op, rmsg, True)
try:
transport.sendto(opkt, addr)
except Exception as e:
if DEBUG > 0:
print(("cannot send cmd/update: %s" % e))
if msg_to_websockets:
try:
msg_to_websockets("host", host.stateinfo())
except Exception:
pass
-380
View File
@@ -1,380 +0,0 @@
"""
host and connection class shared between hbd and
the websit's heartbeat.py
"""
import time
import json
import copy
import queue
num = 0
MAXRTTS = 10
DEBUG = 2
def log(host, m):
if DEBUG:
print("class log: %s %s" % (host, m))
class Connection:
# map of addrs to names
htab = {}
UNKNOWN = "unknown"
UP = "up"
DOWN = "down"
OVERDUE = "overdue"
def __init__(self, host, cid, addr, afam):
self.host = host
self.cid = cid
if addr[0:7] == "::ffff:":
addr = addr[7:]
self.addr = addr
self.afam = afam
self.rtts = [0]
self.lastbeat = time.time()
self.statetime = self.lastbeat
self.deltastatetime = "computed"
self.state = Connection.UNKNOWN
if host:
Connection.htab[addr] = self.host.name
if self.host.isDynDns():
log(self.host.name, "dns update %s" % self.addr)
Host.dnsQ.put((self.host.name, self.addr))
def registerDns(self):
Host.dnsQ.put((self.host.name, self.addr))
def clearstate(self):
d = {}
d["addr"] = ""
d["rtt"] = ""
d["lastbeat"] = ""
d["state"] = ""
d["statetime"] = ""
d["deltastatetime"] = ""
d["rttstate"] = ""
return d
def statedict(self, Null=False):
d = self.clearstate()
now = time.time()
if not Null:
d["addr"] = self.addr
if self.rtts[-1]:
d["rtt"] = "%0.1f" % self.rtts[-1]
elif self.state == Connection.UNKNOWN:
d["rtt"] = ""
else:
d["rtt"] = "?"
d["lastbeat"] = self.lastbeat
if self.state == Connection.OVERDUE:
d["state"] = "<b>%s</b>" % self.state
else:
d["state"] = self.state
if self.state == Connection.UP:
d["rttstate"] = d["rtt"]
elif self.state == Connection.OVERDUE:
d["rttstate"] = ""
else:
d["rttstate"] = d["state"]
d["statetime"] = time.strftime(
"%Y-%m-%d %H:%M:%S", time.localtime(self.statetime)
)
delta = now - self.statetime
if self.state == Connection.UNKNOWN:
d["deltastatetime"] = ""
elif delta > 86400:
# d['deltastatetime'] = time.strftime("%d %H:%M:%S", time.gmtime(delta))
d["deltastatetime"] = "%0.1f days" % (delta / 86400.0)
elif delta > 3600:
# d['deltastatetime'] = time.strftime("%H:%M:%S", time.gmtime(delta))
d["deltastatetime"] = time.strftime("%k:%M hrs", time.gmtime(delta))
# d['deltastatetime'] = "%0.1f hrs" % (delta / 3600.)
elif delta > 60:
# d['deltastatetime'] = time.strftime("%M:%S", time.gmtime(delta))
d["deltastatetime"] = time.strftime("%M:%S mins", time.gmtime(delta))
# d['deltastatetime'] = "%0.1f mins" % (delta / 60.)
else:
# d['deltastatetime'] = time.strftime("%S", time.gmtime(delta))
d["deltastatetime"] = "%i secs" % (delta)
if self.state == Connection.UNKNOWN and now - self.lastbeat > 86400 * 10:
d = self.clearstate()
return d
def headerdict(self, afam):
d = {}
d["addr"] = "%s Addr" % afam
d["rtt"] = "Latencey"
d["lastbeat"] = "Last Contact"
d["state"] = "State"
d["statetime"] = "Last State"
d["rttstate"] = "Reach"
d["deltastatetime"] = "Last State"
return d
def jsons(self):
return json.dumps(self.__dict__)
# set new state, return number of secs in previous state
def newstate(self, state, now, when=0):
self.state = state
delta = now - when
s = delta - self.statetime
self.statetime = delta
return s
def getstate(self):
return self.state
def newaddr(self, addr, rtt, now):
self.lastbeat = now
self.rtts.append(rtt)
if len(self.rtts) > MAXRTTS:
del self.rtts[0]
if self.addr == addr:
r = None
else:
r = "changed from %s to %s" % (self.addr, addr)
try:
del Connection.htab[self.addr]
except:
pass
self.addr = addr
Connection.htab[addr] = self.host.name
if self.host.isDynDns():
Host.dnsQ.put((self.host.name, self.addr))
return r
#
class Host:
# Table of Hosts
hosts = {}
dnsQ = queue.Queue()
def __init__(self, name):
global num
self.name = name
if name:
num += 1
Host.hosts[name] = self
self.num = num
self.dyn = False
self.watched = False
self.upcount = 0
self.interval = 0
self.doesack = -1
self.cmds = []
self.cver = 0
self.connections = {}
self.hdwcounts = [[0, 0], [0, 0], [0, 0]]
def statedict(self):
d = {}
d["name"] = self.name
if self.dyn:
d["name"] += "*"
if self.watched:
d["name"] = "<b>%s</b>" % d["name"]
d["dyn"] = str(self.dyn)
d["ver"] = str(self.cver)
d["num"] = self.num
for c in ["IPv4", "IPv6"]:
if c in self.connections:
cs = self.connections[c].statedict()
else:
cs = ubConnection.statedict(True)
for csv in cs:
d["%s.%s" % (c, csv)] = cs[csv]
return d
def headerdict(self):
d = {}
d["name"] = "Name"
d["dyn"] = "Dyn"
d["ver"] = "Ver"
d["num"] = "??"
for c in ["IPv4", "IPv6"]:
cs = ubConnection.headerdict(c)
for csv in cs:
d["%s.%s" % (c, csv)] = cs[csv]
return d
def registerDns(self):
for af in self.connections:
self.connections[af].registerDns()
def stateinfo(self):
ddict = {}
for d in self.__dict__:
if d == "connections":
cl = []
for c in self.connections:
# dirty ugly hack: fix conn to host backpointer
cld = copy.deepcopy(self.connections[c].__dict__)
cld["host"] = cld["host"].name
cl.append(cld)
ddict[d] = cl
else:
ddict[d] = self.__dict__[d]
return ddict
def jsons(self):
return json.dumps(self.stateinfo())
def setcver(self, cver):
self.cver = cver
def isDynDns(self):
return self.dyn
def isIPv4(self, addr):
if isinstance(addr, tuple):
return addr[0].find(".") > 0
else:
return addr.find(".") > 0
def conndata(self, cid, addr, rtt, now):
if addr[0:7] == "::ffff:":
addr = addr[7:]
if self.isIPv4(addr):
afam = "IPv4"
else:
afam = "IPv6"
if afam not in self.connections:
self.connections[afam] = Connection(self, cid, addr, afam)
conn = self.connections[afam]
res = conn.newaddr(addr, rtt, now)
return conn, res
# called when reloading class from pickle, add new fields here
def fixup(self):
for c in ["IPv4", "IPv6"]:
if c in self.connections:
addr = self.connections[c].addr
if addr[0:7] == "::ffff:":
addr = addr[7:]
self.connections[c].addr = addr
pass
# def dispstate(self):
# if self.state in ["down", "overdue"]:
# state = "<b>%s</b>" % self.state
# elif self.state in ["up", "UP"]:
# state = ""
# for x in list(self.connections.keys()):
# try:
# state += " %5.1f" % (self.connections[x].rtts[-1])
# except:
# state += " %5s" % (self.connections[x].rtts[-1])
# elif self.state in ["unknown", "UNKNOWN"]:
# state = ""
# else:
# state = "%s" % self.state
# return state
def dispstats(self):
if self.doesack != -1:
if self.upcount > 0:
# return "(%0.1f%%) %s %s %s " % ((self.doesack * 100.0) / self.upcount, self.doesack, self.upcount, self.hdwcounts)
r = ""
for v in range(3):
a, u = self.hdwcounts[v]
if (self.upcount - u) != 0:
vs = "%0.0f" % (
100.0 - (((self.doesack - a) * 100.0) / (self.upcount - u))
)
if vs == "0":
vs = ""
else:
vs = "-"
r += '<td align="right">%s</td>' % vs
return r
else:
return "<td>(%s)</td><td></td><td></td>" % (self.doesack)
return '<td align="right">N/A</td><td></td<td></td>>'
hostfields_long = [
"name",
"IPv4.addr",
"IPv4.state",
("IPv4.rtt", 'style="text-align: right;"'),
("IPv4.statetime", 'style="text-align: right;"'),
"IPv6.addr",
"IPv6.state",
("IPv6.rtt", 'style="text-align: right;"'),
("IPv6.statetime", 'style="text-align: right;"'),
"ver",
]
hostfields_short = [
"name",
("IPv4.rttstate", 'style="text-align: right;"'),
("IPv4.deltastatetime", 'style="text-align: right;"'),
("IPv6.rttstate", 'style="text-align: right;"'),
("IPv6.deltastatetime", 'style="text-align: right;"'),
]
def gene(self, tag, v, attrib=None):
if attrib:
a = " %s" % attrib
else:
a = ""
return "<%s%s>%s</%s>" % (tag, a, v, tag)
def htmltable(self, tag, hd, short):
if short:
hostfields = Host.hostfields_short
else:
hostfields = Host.hostfields_long
h = []
for f in hostfields:
if isinstance(f, tuple):
h.append(self.gene(tag, hd[f[0]], f[1]))
else:
h.append(self.gene(tag, hd[f]))
return self.gene("tr", "\n".join(h))
def buildhosttable(self, short=False):
if DEBUG > 1:
print("DBG buildhosttable: start")
res = []
res.append('<table id="ntable" class="sortable">')
res.append(ubHost.htmltable("th", ubHost.headerdict(), short))
hosts_sorted = list(Host.hosts.keys())
if len(hosts_sorted):
hosts_sorted.sort()
for h in hosts_sorted:
res.append(ubHost.htmltable("td", Host.hosts[h].statedict(), short))
res.append("</table>")
if DEBUG > 1:
print("DBG buildhosttable: %s" % res)
return res
def buildmsgtable(self, msgs):
res = []
le = max(40 - len(Host.hosts), 3)
res.append("<h4>Log of Events</h4>")
for m in msgs[len(msgs) - le:]:
res.append("%s<BR>" % m)
return res
# create fake "unbound objects", remove in Python 3.0
ubHost = Host(None)
ubConnection = Connection(None, "", "", "")
Executable
+4
View File
@@ -0,0 +1,4 @@
#!/bin/sh
#echo "OK - all is well"
echo "WARNING - 12 apps require update: calendar->6.2.2 ,call_summary_bot->3.3.0 ,collectives->4.2.0 ,contacts->8.3.7 ,forum->0.36.0 ,mail->5.7.6 ,news->28.1.0 ,notes->4.13.1 ,notify_push->1.3.1 ,ownershiptransfer->1.4.0 ,richdocuments->9.0.5 ,spreed->22.0.10"
+27 -11
View File
@@ -4,26 +4,41 @@ build-backend = "setuptools.build_meta"
[project]
name = "hbd"
version = "5.0.4"
description = "Heartbeat daemon (hbd) — receive heartbeats and act on them"
version = "5.0.7"
description = "Heartbeat monitoring system — client (hbc) and server (hbd)"
readme = "README.md"
requires-python = ">=3.11"
license = "MIT"
keywords = ["heartbeat", "monitoring", "dns", "websocket"]
keywords = ["heartbeat", "monitoring", "dns", "websocket", "system-monitoring"]
authors = [
{ name = "heartbeat contributors" }
]
# Core dependencies (required for both client and server)
dependencies = [
"websockets>=13.2",
"mattermostdriver>=7.3.0",
"PyYAML>=6.0",
"aiohttp>=3.11",
"Jinja2>=3.1.6",
"fastapi>=0.128.0",
]
[project.optional-dependencies]
# Client-only dependencies (hbc - system monitoring client)
client = [
"psutil>=5.9.0",
]
# Server-only dependencies (hbd - heartbeat daemon/server)
server = [
"websockets>=13.2",
"mattermostdriver>=7.3.0",
"aiohttp>=3.11",
"Jinja2>=3.1.6",
]
# Install both client and server
all = [
"hbd[client,server]",
]
# Development dependencies
dev = [
"pytest>=7.0",
"pytest-cov>=4.0",
@@ -35,15 +50,16 @@ dev = [
]
[project.scripts]
hbd = "hbd.cli:main"
hbc = "hbd.hbc:main"
hbd = "hbd.server.cli:main"
hbc = "hbd.client.main:main"
[tool.setuptools.packages.find]
where = ["."]
include = ["hbd*"]
[tool.setuptools.package-data]
"hbd" = ["*.yaml", "static/*", "static/*/*", "templates/*"]
"hbd.server" = ["*.yaml", "static/*", "static/*/*", "templates/*"]
"hbd.client" = ["*.yaml"]
[tool.black]
+3 -1
View File
@@ -3,7 +3,7 @@
set -e
uv version --bump patch
VER=$(uv version --short)
sed -i "" "s/__version__ = \"[0-9.]*\"\(.*\)$/__version__ = \"$VER\"\1/" hbd/__init__.py
sed -i".bak" "s/__version__ = \"[0-9.]*\"\(.*\)$/__version__ = \"$VER\"\1/" hbd/__init__.py
# commit pyproject.toml
git commit -m "version $VER" pyproject.toml hbd/__init__.py
@@ -11,3 +11,5 @@ git push
# tag version
git tag -a v$VER -m "Version $VER"
git push --tags
rm hbd/__init__.py.bak
+390
View File
@@ -0,0 +1,390 @@
#!/usr/bin/env python3
"""
Demo script for HTTP API endpoints.
Tests and demonstrates the plugin data and alert APIs.
"""
import requests
import json
import sys
from datetime import datetime
from time import sleep
BASE_URL = "http://localhost:50004"
def print_section(title):
"""Print a formatted section header."""
print(f"\n{'=' * 70}")
print(f" {title}")
print('=' * 70)
def format_timestamp(timestamp):
"""Convert Unix timestamp to readable format."""
return datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')
def format_duration(seconds):
"""Format duration in human-readable format."""
if seconds < 60:
return f"{int(seconds)}s"
elif seconds < 3600:
minutes = int(seconds / 60)
secs = int(seconds % 60)
return f"{minutes}m {secs}s"
elif seconds < 86400:
hours = int(seconds / 3600)
minutes = int((seconds % 3600) / 60)
return f"{hours}h {minutes}m"
else:
days = int(seconds / 86400)
hours = int((seconds % 86400) / 3600)
return f"{days}d {hours}h"
def test_hosts_api():
"""Test GET /api/0/hosts endpoint."""
print_section("1. List All Monitored Hosts")
try:
response = requests.get(f"{BASE_URL}/api/0/hosts", timeout=5)
response.raise_for_status()
hosts = response.json()
print(f"Found {len(hosts)} hosts:\n")
for host in hosts:
name = host.get('name', 'unknown')
dyn = host.get('dyn', False)
conn_count = len(host.get('connections', []))
print(f"{name}")
print(f" - Protocol: IPv{ver}")
print(f" - Dynamic: {dyn}")
print(f" - Connections: {conn_count}")
return hosts
except requests.RequestException as e:
print(f"❌ Error: {e}")
return []
def test_host_plugins_api(hostname):
"""Test GET /api/0/hosts/{hostname}/plugins endpoint."""
print_section(f"2. Get All Plugins for Host: {hostname}")
try:
response = requests.get(f"{BASE_URL}/api/0/hosts/{hostname}/plugins", timeout=5)
response.raise_for_status()
data = response.json()
plugins = data.get('plugins', {})
print(f"Found {len(plugins)} plugins:\n")
for plugin_name, plugin_data in plugins.items():
timestamp = plugin_data.get('timestamp', 0)
sample_count = plugin_data.get('sample_count', 0)
metrics = plugin_data.get('data', {})
print(f" 📦 {plugin_name}")
print(f" Last update: {format_timestamp(timestamp)}")
print(f" Samples: {sample_count}")
print(f" Metrics: {len(metrics)}")
# Show first few metrics
for i, (metric, value) in enumerate(metrics.items()):
if i < 3: # Show only first 3 metrics
if isinstance(value, float):
print(f" - {metric}: {value:.2f}")
elif isinstance(value, dict):
print(f" - {metric}: [nested data, {len(value)} keys]")
else:
print(f" - {metric}: {value}")
if len(metrics) > 3:
print(f" ... and {len(metrics) - 3} more")
print()
return list(plugins.keys())
except requests.RequestException as e:
print(f"❌ Error: {e}")
return []
def test_plugin_detail_api(hostname, plugin_name, limit=5):
"""Test GET /api/0/hosts/{hostname}/plugins/{plugin_name} endpoint."""
print_section(f"3. Get Detailed Data: {hostname}/{plugin_name}")
try:
url = f"{BASE_URL}/api/0/hosts/{hostname}/plugins/{plugin_name}"
params = {'limit': limit}
response = requests.get(url, params=params, timeout=5)
response.raise_for_status()
data = response.json()
samples = data.get('samples', [])
print(f"Retrieved {len(samples)} samples (limit={limit}):\n")
for i, sample in enumerate(samples):
timestamp = sample.get('timestamp', 0)
metrics = sample.get('data', {})
print(f" [{i+1}] {format_timestamp(timestamp)}")
for metric, value in sorted(metrics.items())[:5]: # Show first 5 metrics
if isinstance(value, float):
print(f" {metric}: {value:.2f}")
elif isinstance(value, dict):
print(f" {metric}: [nested: {len(value)} keys]")
else:
print(f" {metric}: {value}")
print()
return samples
except requests.RequestException as e:
print(f"❌ Error: {e}")
return []
def test_host_alerts_api(hostname):
"""Test GET /api/0/hosts/{hostname}/alerts endpoint."""
print_section(f"4. Get Alerts for Host: {hostname}")
try:
response = requests.get(f"{BASE_URL}/api/0/hosts/{hostname}/alerts", timeout=5)
response.raise_for_status()
data = response.json()
alerts = data.get('alerts', [])
summary = data.get('summary', {})
print(f"Summary:")
print(f" ✓ OK: {summary.get('ok', 0)}")
print(f" ⚠️ Warning: {summary.get('warning', 0)}")
print(f" 🔴 Critical: {summary.get('critical', 0)}")
print(f" ❓ Unknown: {summary.get('unknown', 0)}")
print()
# Show non-OK alerts
active_alerts = [a for a in alerts if a.get('level') != 'OK']
if active_alerts:
print(f"Active Alerts ({len(active_alerts)}):")
for alert in active_alerts:
metric = alert.get('metric_path', 'unknown')
level = alert.get('level', 'UNKNOWN')
value = alert.get('last_value', 0)
since = alert.get('since', 0)
duration = datetime.now().timestamp() - since
icon = '⚠️' if level == 'WARNING' else '🔴'
print(f" {icon} {metric}")
print(f" Level: {level}")
print(f" Value: {value:.2f}" if isinstance(value, float) else f" Value: {value}")
print(f" Duration: {format_duration(duration)}")
print()
else:
print("✓ No active alerts - all systems normal!")
return data
except requests.RequestException as e:
print(f"❌ Error: {e}")
return {}
def test_all_alerts_api():
"""Test GET /api/0/alerts endpoint."""
print_section("5. Get All Active Alerts Across All Hosts")
try:
response = requests.get(f"{BASE_URL}/api/0/alerts", timeout=5)
response.raise_for_status()
data = response.json()
alerts = data.get('alerts', [])
summary = data.get('summary', {})
host_count = data.get('host_count', 0)
print(f"Monitoring {host_count} hosts")
print(f"Active Alerts: {summary.get('total', 0)}")
print(f" 🔴 Critical: {summary.get('critical', 0)}")
print(f" ⚠️ Warning: {summary.get('warning', 0)}")
print()
if alerts:
print("Alert Details:")
for alert in alerts:
hostname = alert.get('hostname', 'unknown')
metric = alert.get('metric_path', 'unknown')
level = alert.get('level', 'UNKNOWN')
value = alert.get('last_value', 0)
since = alert.get('since', 0)
duration = datetime.now().timestamp() - since
notification_count = alert.get('notification_count', 0)
icon = '⚠️' if level == 'WARNING' else '🔴'
print(f" {icon} {hostname} / {metric}")
print(f" Level: {level}")
print(f" Value: {value:.2f}" if isinstance(value, float) else f" Value: {value}")
print(f" Duration: {format_duration(duration)}")
print(f" Notifications: {notification_count}")
print()
else:
print("✅ All systems normal - no active alerts!")
return data
except requests.RequestException as e:
print(f"❌ Error: {e}")
return {}
def test_messages_api():
"""Test GET /api/0/messages endpoint."""
print_section("6. Get Recent Messages")
try:
response = requests.get(f"{BASE_URL}/api/0/messages", timeout=5)
response.raise_for_status()
messages = response.json()
print(f"Last {len(messages)} messages:\n")
for msg in messages[-5:]: # Show last 5
timestamp = msg.get('time', 0)
host = msg.get('host', 'unknown')
text = msg.get('msg', '')
print(f" [{format_timestamp(timestamp)}] {host}: {text}")
return messages
except requests.RequestException as e:
print(f"❌ Error: {e}")
return []
def test_error_handling():
"""Test API error handling."""
print_section("7. Error Handling Tests")
# Test non-existent host
print("Testing non-existent host...")
try:
response = requests.get(f"{BASE_URL}/api/0/hosts/nonexistenthost/plugins", timeout=5)
if response.status_code == 404:
error_data = response.json()
print(f" ✓ Correctly returned 404: {error_data.get('error', 'No error message')}")
else:
print(f" ⚠️ Unexpected status code: {response.status_code}")
except Exception as e:
print(f" ❌ Error: {e}")
# Test non-existent plugin
print("\nTesting non-existent plugin...")
try:
# Get first host
hosts = requests.get(f"{BASE_URL}/api/0/hosts", timeout=5).json()
if hosts:
hostname = hosts[0]['name']
response = requests.get(
f"{BASE_URL}/api/0/hosts/{hostname}/plugins/nonexistentplugin",
timeout=5
)
if response.status_code == 404:
error_data = response.json()
print(f" ✓ Correctly returned 404: {error_data.get('error', 'No error message')}")
else:
print(f" ⚠️ Unexpected status code: {response.status_code}")
except Exception as e:
print(f" ❌ Error: {e}")
def demo_monitoring_loop():
"""Demonstrate continuous monitoring."""
print_section("8. Continuous Monitoring Demo (5 iterations)")
print("Monitoring alerts every 3 seconds (Ctrl+C to stop)...\n")
try:
for i in range(5):
response = requests.get(f"{BASE_URL}/api/0/alerts", timeout=5)
response.raise_for_status()
data = response.json()
summary = data.get('summary', {})
critical = summary.get('critical', 0)
warning = summary.get('warning', 0)
timestamp = datetime.now().strftime('%H:%M:%S')
status = "🔴 CRITICAL" if critical > 0 else "⚠️ WARNING" if warning > 0 else "✅ OK"
print(f"[{timestamp}] {status} - Critical: {critical}, Warning: {warning}")
if i < 4: # Don't sleep after last iteration
sleep(3)
except KeyboardInterrupt:
print("\n\nMonitoring stopped by user")
except Exception as e:
print(f"\n❌ Error: {e}")
def main():
"""Run all API tests."""
print("""
╔══════════════════════════════════════════════════════════════╗
║ Heartbeat Daemon HTTP API Demo & Test Suite ║
╚══════════════════════════════════════════════════════════════╝
""")
print(f"Testing API at: {BASE_URL}")
print(f"Ensure the heartbeat daemon is running!")
# Test basic connectivity
try:
response = requests.get(f"{BASE_URL}/api/0/hosts", timeout=2)
response.raise_for_status()
print("✅ API is reachable\n")
except Exception as e:
print(f"❌ Cannot connect to API: {e}")
print("\nPlease ensure:")
print(" 1. Heartbeat daemon is running")
print(" 2. HTTP server is enabled in configuration")
print(f" 3. Server is listening on port {BASE_URL.split(':')[-1]}")
sys.exit(1)
# Run test suite
hosts = test_hosts_api()
if not hosts:
print("\n⚠️ No hosts found. Ensure clients are sending heartbeats.")
return
# Pick first host for detailed testing
hostname = hosts[0].get('name', '')
if hostname:
plugins = test_host_plugins_api(hostname)
if plugins:
# Test detailed plugin data
test_plugin_detail_api(hostname, plugins[0], limit=3)
# Test alert endpoints
test_host_alerts_api(hostname)
# Test global endpoints
test_all_alerts_api()
test_messages_api()
# Test error handling
test_error_handling()
# Continuous monitoring demo
demo_monitoring_loop()
print_section("Test Suite Complete")
print("""
Next Steps:
• View the web UI at http://localhost:50004/live
• Check plugin metrics at http://localhost:50004/plugins
• Monitor alerts at http://localhost:50004/alerts
• Read API documentation: docs/HTTP_API.md
""")
if __name__ == '__main__':
try:
main()
except KeyboardInterrupt:
print("\n\nDemo interrupted by user")
sys.exit(0)
+99
View File
@@ -0,0 +1,99 @@
#!/usr/bin/env python3
"""
Test all plugins together.
"""
import asyncio
import logging
from pathlib import Path
# Setup path
import sys
sys.path.insert(0, str(Path(__file__).parent))
from hbd.plugin import PluginRegistry, PluginLoader
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(name)s: %(message)s"
)
async def test_all_plugins():
"""Test loading all plugins."""
print("=" * 70)
print("Testing All Plugins")
print("=" * 70)
# Create registry and loader
registry = PluginRegistry()
loader = PluginLoader(registry)
# Configuration for plugins
config = {
"cpu_monitor": {
"interval": 30,
"per_core": False
},
"nagios_runner": {
"interval": 60,
"commands": [
{
"name": "test_ok",
"command": "echo 'OK - test passed | metric=100%;;;0;100'"
},
{
"name": "test_warning",
"command": "echo 'WARNING - test result | value=85%;80;90;0;100' && exit 1"
}
]
}
}
# Load plugins
plugin_dir = Path(__file__).parent / "hbd" / "plugins"
print(f"\n1. Loading plugins from: {plugin_dir}")
count = await loader.load_from_directory(plugin_dir, config)
print(f" ✓ Loaded {count} plugins")
# List loaded plugins
print(f"\n2. Loaded plugins:")
for plugin in registry.get_all():
print(f" - {plugin.name} v{plugin.version}")
print(f" Type: {plugin.__class__.__name__}")
print(f" Interval: {plugin.interval}s")
print(f" Description: {plugin.description}")
# Test collection for each plugin
print(f"\n3. Testing data collection:")
for plugin in registry.get_all():
print(f"\n {plugin.name}:")
try:
data = await plugin.collect()
print(f" ✓ Collected {len(data)} fields")
# Show sample of data
sample_count = min(5, len(data))
for key, value in list(data.items())[:sample_count]:
value_str = str(value)
if len(value_str) > 50:
value_str = value_str[:47] + "..."
print(f" {key}: {value_str}")
if len(data) > sample_count:
print(f" ... and {len(data) - sample_count} more fields")
except Exception as e:
print(f" ✗ Error: {e}")
# Cleanup
print(f"\n4. Cleanup...")
await loader.unload_all()
print(f" ✓ All plugins unloaded")
print(f"\n" + "=" * 70)
print(f"Successfully tested {count} plugins!")
print("=" * 70)
if __name__ == "__main__":
asyncio.run(test_all_plugins())
+160
View File
@@ -0,0 +1,160 @@
#!/usr/bin/env python3
"""
Test script for all monitoring plugins.
Tests all available plugins including the new ones:
- memory_monitor
- disk_monitor
- network_monitor
- filesystem_info
"""
import asyncio
import sys
import os
import logging
# Add parent directory to path so we can import hbd
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from hbd.plugin import PluginLoader
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
def format_bytes(bytes_val):
"""Format bytes into human readable format."""
for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
if bytes_val < 1024.0:
return f"{bytes_val:.2f} {unit}"
bytes_val /= 1024.0
return f"{bytes_val:.2f} PB"
def print_plugin_data(plugin_name, data, indent=2):
"""Pretty print plugin data."""
prefix = " " * indent
if isinstance(data, dict):
for key, value in data.items():
if isinstance(value, dict):
print(f"{prefix}{key}:")
print_plugin_data(plugin_name, value, indent + 2)
elif isinstance(value, list):
print(f"{prefix}{key}: [{len(value)} items]")
if len(value) <= 5: # Only show small lists
for item in value:
if isinstance(item, dict):
print_plugin_data(plugin_name, item, indent + 2)
else:
print(f"{prefix} - {item}")
else:
# Format output based on key name for better readability
if '_bytes' in key or key.endswith('_sent') or key.endswith('_recv') or 'memory_' in key or 'swap_' in key:
if isinstance(value, (int, float)) and value > 1024:
print(f"{prefix}{key}: {format_bytes(value)} ({value:,})")
else:
print(f"{prefix}{key}: {value}")
elif 'percent' in key:
print(f"{prefix}{key}: {value:.1f}%")
elif isinstance(value, float):
print(f"{prefix}{key}: {value:.2f}")
elif isinstance(value, int) and value > 1000:
print(f"{prefix}{key}: {value:,}")
else:
print(f"{prefix}{key}: {value}")
else:
print(f"{prefix}{data}")
async def main():
"""Main test function."""
print("="*60)
print("Plugin System Test Suite")
print("="*60)
# Load all available plugins using the plugin loader
from hbd.plugin import PluginRegistry, PluginLoader
from pathlib import Path
registry = PluginRegistry()
loader = PluginLoader(registry)
plugin_dir = Path(__file__).parent / "hbd" / "plugins"
if not plugin_dir.exists():
print(f"✗ Plugin directory not found: {plugin_dir}")
return 1
# Load plugins from directory
count = await loader.load_from_directory(plugin_dir, {})
print(f"\nLoaded {count} plugins:")
plugins = registry.get_all()
for plugin in plugins:
print(f" - {plugin.name}: {plugin.__class__.__doc__.split('.')[0] if plugin.__class__.__doc__ else 'No description'}")
# Test each plugin
results = {}
for plugin in plugins:
# Skip nagios_runner as it needs specific configuration
if plugin.name == 'nagios_runner':
print(f"\n{'='*60}")
print(f"Skipping: {plugin.name} (requires specific configuration)")
print(f"{'='*60}")
results[plugin.name] = True # Mark as success since it loaded OK
continue
print(f"\n{'='*60}")
print(f"Testing: {plugin.name}")
print(f"{'='*60}")
try:
# Collect data
data = await plugin.collect()
if data:
if 'error' in data:
print(f"✗ Collection error: {data['error']}")
results[plugin.name] = False
else:
print(f"✓ Data collected: {len(data)} top-level fields")
print_plugin_data(plugin.name, data)
results[plugin.name] = True
else:
print(f"⚠ No data collected")
results[plugin.name] = False
except Exception as e:
print(f"✗ Failed to collect data: {e}")
import traceback
traceback.print_exc()
results[plugin.name] = False
# Summary
print(f"\n{'='*60}")
print("Test Summary")
print(f"{'='*60}")
success_count = sum(1 for v in results.values() if v)
total_count = len(results)
print(f"\nResults: {success_count}/{total_count} plugins successful")
for name, success in results.items():
status = "" if success else ""
print(f" {status} {name}")
if success_count == total_count:
print("\n🎉 All plugins passed!")
return 0
else:
print(f"\n{total_count - success_count} plugin(s) failed")
return 1
if __name__ == '__main__':
exit_code = asyncio.run(main())
sys.exit(exit_code)
+331
View File
@@ -0,0 +1,331 @@
#!/usr/bin/env python3
"""
Test script for message journal functionality.
Tests:
- Journal initialization
- Message logging
- File rotation based on size
- Backup management
"""
import asyncio
import sys
import os
import json
import tempfile
import shutil
from pathlib import Path
# Add parent directory to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from hbd.journal import MessageJournal, get_journal
async def test_basic_logging():
"""Test basic message logging."""
print("="*60)
print("Test 1: Basic Message Logging")
print("="*60)
# Create temporary directory for journal
temp_dir = tempfile.mkdtemp(prefix="journal_test_")
print(f"Using temp directory: {temp_dir}")
try:
# Create journal with config
config = {
'journal_enabled': True,
'journal_dir': temp_dir,
'journal_file': 'test.journal',
'journal_max_size': 1024, # 1KB for testing
'journal_max_backups': 3
}
journal = MessageJournal(config)
await journal.initialize()
# Log some test messages
test_messages = [
{
'ID': 'HTB',
'name': 'testhost1',
'interval': 30,
},
{
'ID': 'PLG',
'plugin': 'cpu_monitor',
'cpu_percent': 45.2,
'load_1min': 1.5
},
{
'ID': 'HTB',
'name': 'testhost2',
'interval': 60,
'boot': 1
}
]
for i, msg in enumerate(test_messages):
await journal.log_message(msg, ('192.168.1.100', 50000 + i), 1000.0 + i)
print(f"✓ Logged message {i+1}: {msg['ID']}")
# Check journal file exists
journal_path = Path(temp_dir) / 'test.journal'
if journal_path.exists():
print(f"✓ Journal file created: {journal_path}")
# Read and verify content
with open(journal_path, 'r') as f:
lines = f.readlines()
print(f"✓ Journal has {len(lines)} entries")
# Parse first entry
entry = json.loads(lines[0])
print(f"✓ First entry structure: {list(entry.keys())}")
assert 'timestamp' in entry
assert 'datetime' in entry
assert 'source_ip' in entry
assert 'message' in entry
print("✓ Entry structure validated")
else:
print("✗ Journal file not created")
return False
# Get stats
stats = journal.get_stats()
print(f"\nJournal stats:")
print(f" Enabled: {stats['enabled']}")
print(f" Current size: {stats['current_size']} bytes")
print(f" Max size: {stats['max_size']} bytes")
print(f" Rotation threshold: {stats['rotation_threshold']}")
await journal.close()
print("\n✅ Test 1 PASSED")
return True
except Exception as e:
print(f"\n✗ Test 1 FAILED: {e}")
import traceback
traceback.print_exc()
return False
finally:
# Cleanup
shutil.rmtree(temp_dir, ignore_errors=True)
async def test_rotation():
"""Test log rotation based on size."""
print("\n" + "="*60)
print("Test 2: Log Rotation")
print("="*60)
# Create temporary directory for journal
temp_dir = tempfile.mkdtemp(prefix="journal_test_")
print(f"Using temp directory: {temp_dir}")
try:
# Create journal with small max size
config = {
'journal_enabled': True,
'journal_dir': temp_dir,
'journal_file': 'test.journal',
'journal_max_size': 500, # 500 bytes - very small for testing
'journal_max_backups': 3
}
journal = MessageJournal(config)
await journal.initialize()
# Log many messages to trigger rotation
print("Logging messages to trigger rotation...")
for i in range(20):
msg = {
'ID': 'HTB',
'name': f'testhost{i}',
'interval': 30,
'data': 'x' * 50 # Add some padding
}
await journal.log_message(msg, ('192.168.1.100', 50000 + i), 1000.0 + i)
# Give rotation time to complete
await asyncio.sleep(0.01)
print(f"✓ Logged 20 messages")
# Check for rotated files
journal_dir = Path(temp_dir)
all_files = list(journal_dir.glob('test.journal*'))
print(f"✓ Found {len(all_files)} journal files")
for f in sorted(all_files):
size = f.stat().st_size
print(f" - {f.name}: {size} bytes")
# Should have current file + some backups
if len(all_files) > 1:
print(f"✓ Rotation occurred ({len(all_files) - 1} backup files)")
else:
print("⚠ No rotation occurred (may not have reached threshold)")
# Check max backups limit
backup_files = [f for f in all_files if f.name != 'test.journal']
if len(backup_files) <= config['journal_max_backups']:
print(f"✓ Backup count within limit: {len(backup_files)} <= {config['journal_max_backups']}")
else:
print(f"✗ Too many backups: {len(backup_files)} > {config['journal_max_backups']}")
return False
await journal.close()
print("\n✅ Test 2 PASSED")
return True
except Exception as e:
print(f"\n✗ Test 2 FAILED: {e}")
import traceback
traceback.print_exc()
return False
finally:
# Cleanup
shutil.rmtree(temp_dir, ignore_errors=True)
async def test_disabled_journal():
"""Test that disabled journal doesn't write anything."""
print("\n" + "="*60)
print("Test 3: Disabled Journal")
print("="*60)
temp_dir = tempfile.mkdtemp(prefix="journal_test_")
print(f"Using temp directory: {temp_dir}")
try:
config = {
'journal_enabled': False,
'journal_dir': temp_dir,
'journal_file': 'test.journal'
}
journal = MessageJournal(config)
await journal.initialize()
# Try to log a message
msg = {'ID': 'HTB', 'name': 'testhost'}
await journal.log_message(msg, ('192.168.1.100', 50000), 1000.0)
# Check that no file was created
journal_path = Path(temp_dir) / 'test.journal'
if not journal_path.exists():
print("✓ No journal file created (as expected)")
else:
print("✗ Journal file was created despite being disabled")
return False
await journal.close()
print("\n✅ Test 3 PASSED")
return True
except Exception as e:
print(f"\n✗ Test 3 FAILED: {e}")
import traceback
traceback.print_exc()
return False
finally:
shutil.rmtree(temp_dir, ignore_errors=True)
async def test_global_instance():
"""Test global journal instance."""
print("\n" + "="*60)
print("Test 4: Global Journal Instance")
print("="*60)
temp_dir = tempfile.mkdtemp(prefix="journal_test_")
try:
config = {
'journal_enabled': True,
'journal_dir': temp_dir,
'journal_file': 'global.journal'
}
# Get global instance
journal1 = get_journal(config)
journal2 = get_journal() # Should return same instance
if journal1 is journal2:
print("✓ Global instance returns same object")
else:
print("✗ Global instance returns different objects")
return False
await journal1.initialize()
# Log through convenience function
from hbd.journal import log_message
msg = {'ID': 'HTB', 'name': 'testhost'}
await log_message(msg, ('192.168.1.100', 50000))
journal_path = Path(temp_dir) / 'global.journal'
if journal_path.exists():
print("✓ Global journal logged message")
else:
print("✗ Global journal did not log message")
return False
await journal1.close()
print("\n✅ Test 4 PASSED")
return True
except Exception as e:
print(f"\n✗ Test 4 FAILED: {e}")
import traceback
traceback.print_exc()
return False
finally:
shutil.rmtree(temp_dir, ignore_errors=True)
async def main():
"""Run all tests."""
print("Message Journal Test Suite")
print("="*60)
tests = [
test_basic_logging,
test_rotation,
test_disabled_journal,
test_global_instance
]
results = []
for test in tests:
result = await test()
results.append(result)
# Summary
print("\n" + "="*60)
print("Test Summary")
print("="*60)
passed = sum(results)
total = len(results)
print(f"Passed: {passed}/{total}")
for i, (test, result) in enumerate(zip(tests, results), 1):
status = "✅ PASS" if result else "❌ FAIL"
print(f" {status} - Test {i}: {test.__name__}")
if passed == total:
print("\n🎉 All tests passed!")
return 0
else:
print(f"\n{total - passed} test(s) failed")
return 1
if __name__ == '__main__':
exit_code = asyncio.run(main())
sys.exit(exit_code)
+150
View File
@@ -0,0 +1,150 @@
#!/usr/bin/env python3
"""
Test the Nagios Runner Plugin.
"""
import asyncio
import logging
from pathlib import Path
# Setup path
import sys
sys.path.insert(0, str(Path(__file__).parent))
from hbd.plugins.nagios_runner import NagiosRunnerPlugin, NAGIOS_OK, NAGIOS_WARNING
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(name)s %(levelname)s: %(message)s"
)
async def test_nagios_runner():
"""Test Nagios runner plugin."""
print("=" * 70)
print("Testing Nagios Runner Plugin")
print("=" * 70)
# Create test configuration with simple shell commands
# These mimic Nagios plugin output format
config = {
"interval": 60,
"timeout": 10,
"commands": [
{
"name": "check_uptime",
"command": "echo 'OK - uptime is 5 days | uptime=432000s;;;0'"
},
{
"name": "check_memory",
"command": "echo 'OK - Memory usage 45% | memory=45%;80;90;0;100'"
},
{
"name": "check_cpu",
"command": "echo 'WARNING - CPU load high | load1=5.2;5.0;10.0;0 load5=4.8;4.0;8.0;0 load15=3.2;3.0;6.0;0' && exit 1"
},
{
"name": "check_disk",
"command": "echo 'OK - Disk usage 62% | /=62%;80;90;0;100 /home=45%;80;90;0;100'"
}
]
}
print("\n1. Creating Nagios Runner plugin with test configuration")
print(f" Configured {len(config['commands'])} test commands")
plugin = NagiosRunnerPlugin(config)
print(f"\n2. Initializing plugin...")
initialized = await plugin.initialize()
print(f" Initialized: {initialized}")
if not initialized:
print(" ERROR: Plugin failed to initialize!")
return
print(f"\n3. Collecting metrics from Nagios plugins...")
data = await plugin.collect()
print(f" ✓ Collected {len(data)} data points")
print(f"\n4. Results:")
print(f" Overall Status: {data.get('overall_status')} (code: {data.get('overall_status_code')})")
print(f" Plugins Executed: {data.get('plugin_count')}")
# Show individual plugin results
print(f"\n5. Individual Plugin Results:")
for cmd_config in config["commands"]:
name = cmd_config["name"]
status = data.get(f"{name}_status", "N/A")
status_code = data.get(f"{name}_status_code", "N/A")
output = data.get(f"{name}_output", "N/A")
print(f"\n {name}:")
print(f" Status: {status} (code: {status_code})")
print(f" Output: {output}")
# Show performance data if present
perf_keys = [k for k in data.keys() if k.startswith(f"{name}_") and
k not in [f"{name}_status", f"{name}_status_code", f"{name}_output"]]
if perf_keys:
print(f" Performance Data:")
for key in perf_keys:
metric_name = key.replace(f"{name}_", "")
print(f" {metric_name}: {data[key]}")
print(f"\n6. Testing Nagios plugin detection (if available)...")
# Try to find actual Nagios plugins on the system
common_nagios_paths = [
"/usr/lib/nagios/plugins",
"/usr/local/nagios/libexec",
"/usr/lib64/nagios/plugins"
]
nagios_plugin_dir = None
for path in common_nagios_paths:
if Path(path).exists():
nagios_plugin_dir = Path(path)
print(f" ✓ Found Nagios plugins at: {nagios_plugin_dir}")
break
if nagios_plugin_dir:
# Try check_users if it exists
check_users = nagios_plugin_dir / "check_users"
if check_users.exists():
print(f"\n Testing real Nagios plugin: check_users")
real_config = {
"commands": [{
"name": "users",
"command": f"{check_users} -w 10 -c 20"
}]
}
real_plugin = NagiosRunnerPlugin(real_config)
await real_plugin.initialize()
real_data = await real_plugin.collect()
print(f" Status: {real_data.get('users_status')}")
print(f" Output: {real_data.get('users_output')}")
# Show any performance data
for key in real_data:
if key.startswith("users_") and "status" not in key and "output" not in key:
print(f" {key}: {real_data[key]}")
else:
print(f" check_users not found at {check_users}")
else:
print(f" No Nagios plugins directory found")
print(f" Install nagios-plugins to test with real plugins:")
print(f" sudo apt-get install nagios-plugins # Debian/Ubuntu")
print(f" sudo yum install nagios-plugins-all # RHEL/CentOS")
print(f"\n7. Cleanup...")
await plugin.cleanup()
print(f" ✓ Cleanup complete")
print(f"\n" + "=" * 70)
print("Test complete!")
print("=" * 70)
if __name__ == "__main__":
asyncio.run(test_nagios_runner())
+119
View File
@@ -0,0 +1,119 @@
#!/usr/bin/env python3
"""
Test script for plugin system.
"""
import asyncio
import logging
from pathlib import Path
# Setup path
import sys
sys.path.insert(0, str(Path(__file__).parent))
from hbd.plugin import PluginRegistry, PluginLoader
logging.basicConfig(
level=logging.DEBUG,
format="%(asctime)s %(name)s %(levelname)s: %(message)s"
)
async def test_plugins():
"""Test plugin loading and collection."""
print("=" * 60)
print("Testing Plugin System")
print("=" * 60)
# Create registry and loader
registry = PluginRegistry()
loader = PluginLoader(registry)
# Load plugins
plugin_dir = Path(__file__).parent / "hbd" / "plugins"
print(f"\n1. Loading plugins from: {plugin_dir}")
if not plugin_dir.exists():
print(f" ERROR: Plugin directory does not exist!")
return
count = await loader.load_from_directory(plugin_dir)
print(f" Loaded {count} plugins")
# List loaded plugins
print(f"\n2. Loaded plugins:")
for plugin in registry.get_all():
print(f" - {plugin.name} v{plugin.version} ({plugin.__class__.__name__})")
print(f" Description: {plugin.description}")
print(f" Interval: {plugin.interval}s")
# Test InfoPlugins
print(f"\n3. Testing InfoPlugins (collect once):")
from hbd.plugin import InfoPlugin
for plugin in registry.get_by_type(InfoPlugin):
print(f"\n Collecting {plugin.name}...")
try:
data = await plugin.collect()
print(f" ✓ Success! Got {len(data)} fields")
for key, value in list(data.items())[:5]: # Show first 5 fields
print(f" {key}: {value}")
if len(data) > 5:
print(f" ... and {len(data) - 5} more fields")
except Exception as e:
print(f" ✗ Error: {e}")
# Test MonitorPlugins
print(f"\n4. Testing MonitorPlugins (periodic collection):")
from hbd.plugin import MonitorPlugin
for plugin in registry.get_by_type(MonitorPlugin):
print(f"\n Collecting {plugin.name}...")
try:
data = await plugin.collect()
print(f" ✓ Success! Got {len(data)} fields")
for key, value in list(data.items())[:8]: # Show first 8 fields
print(f" {key}: {value}")
if len(data) > 8:
print(f" ... and {len(data) - 8} more fields")
except Exception as e:
print(f" ✗ Error: {e}")
# Test protocol encoding
print(f"\n5. Testing protocol encoding:")
from hbd.proto import dicttos, stodict
# Create sample plugin data
test_data = {
"plugin": "test_plugin",
"cpu_percent": 42.5,
"memory_mb": 1024,
"processes": 156,
"load_avg": [1.2, 0.8, 0.5],
"disk_info": {"sda": {"used": 50, "total": 100}}
}
print(f" Original data: {test_data}")
# Encode
encoded = dicttos("PLG", test_data)
print(f" Encoded ({len(encoded)} bytes): {encoded[:50]}...")
# Decode
decoded = stodict(encoded)
print(f" Decoded: {decoded}")
# Verify
if decoded.get("ID") == "PLG" and decoded.get("plugin") == "test_plugin":
print(f" ✓ Protocol encoding/decoding works!")
else:
print(f" ✗ Protocol encoding/decoding failed!")
# Cleanup
print(f"\n6. Cleaning up...")
await loader.unload_all()
print(f" ✓ Cleanup complete")
print(f"\n" + "=" * 60)
print("Test complete!")
print("=" * 60)
if __name__ == "__main__":
asyncio.run(test_plugins())
+60
View File
@@ -0,0 +1,60 @@
#!/usr/bin/env python3
"""
Debug plugin loading.
"""
import asyncio
import logging
from pathlib import Path
# Setup path
import sys
sys.path.insert(0, str(Path(__file__).parent))
from hbd.plugin import PluginRegistry, PluginLoader
logging.basicConfig(
level=logging.DEBUG,
format="%(asctime)s %(name)s %(levelname)s: %(message)s"
)
async def test_manual_load():
"""Test manual plugin loading."""
print("Testing manual plugin import...")
# Import plugins directly
from hbd.plugins.os_info import OSInfoPlugin
from hbd.plugins.cpu_monitor import CPUMonitorPlugin
# Create instances
os_plugin = OSInfoPlugin()
cpu_plugin = CPUMonitorPlugin()
print(f"OS Plugin: {os_plugin.name} v{os_plugin.version}")
print(f"CPU Plugin: {cpu_plugin.name} v{cpu_plugin.version}")
# Initialize
print("\nInitializing plugins...")
os_init = await os_plugin.initialize()
cpu_init = await cpu_plugin.initialize()
print(f"OS plugin initialized: {os_init}")
print(f"CPU plugin initialized: {cpu_init}")
# Collect data
if os_init:
print("\nCollecting OS info...")
os_data = await os_plugin.collect()
print(f"Got {len(os_data)} fields:")
for k, v in list(os_data.items())[:10]:
print(f" {k}: {v}")
if cpu_init:
print("\nCollecting CPU info...")
cpu_data = await cpu_plugin.collect()
print(f"Got {len(cpu_data)} fields:")
for k, v in cpu_data.items():
print(f" {k}: {v}")
if __name__ == "__main__":
asyncio.run(test_manual_load())
+495
View File
@@ -0,0 +1,495 @@
#!/usr/bin/env python3
"""
Test suite for the threshold checking and alerting system.
Tests cover:
- Threshold configuration parsing
- Threshold evaluation (all operators)
- Hysteresis functionality
- Alert state tracking
- State change detection
- Notification triggering
- Re-notification logic
"""
import sys
import time
from pathlib import Path
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))
from hbd.threshold import (
ThresholdChecker,
ThresholdConfig,
AlertLevel,
AlertState,
ComparisonOperator,
)
def test_threshold_config_basic():
"""Test basic threshold configuration."""
print("Test 1: Basic threshold configuration...")
config = ThresholdConfig(
metric_path="cpu_monitor.cpu_percent",
warning=80.0,
critical=90.0,
operator=">",
)
# Test below warning
result = config.evaluate(50.0)
assert result == AlertLevel.OK, f"Expected OK, got {result}"
# Test at warning
result = config.evaluate(80.0)
assert result == AlertLevel.OK, f"Expected OK at boundary, got {result}"
# Test above warning but below critical
result = config.evaluate(85.0)
assert result == AlertLevel.WARNING, f"Expected WARNING, got {result}"
# Test above critical
result = config.evaluate(95.0)
assert result == AlertLevel.CRITICAL, f"Expected CRITICAL, got {result}"
print(" ✓ Basic threshold configuration works")
def test_threshold_operators():
"""Test all comparison operators."""
print("\nTest 2: Comparison operators...")
# Greater than operator
config_gt = ThresholdConfig(
metric_path="test.metric",
warning=80.0,
critical=90.0,
operator=">",
)
assert config_gt.evaluate(85.0) == AlertLevel.WARNING
assert config_gt.evaluate(75.0) == AlertLevel.OK
# Less than operator (for inverse thresholds like available memory)
config_lt = ThresholdConfig(
metric_path="memory.available_mb",
warning=1000,
critical=500,
operator="<",
)
assert config_lt.evaluate(800) == AlertLevel.WARNING, "Should warn when below 1000"
assert config_lt.evaluate(400) == AlertLevel.CRITICAL, "Should be critical when below 500"
assert config_lt.evaluate(1500) == AlertLevel.OK, "Should be OK when above 1000"
# Greater than or equal
config_gte = ThresholdConfig(
metric_path="test.metric",
warning=80.0,
operator=">=",
)
assert config_gte.evaluate(80.0) == AlertLevel.WARNING
assert config_gte.evaluate(79.9) == AlertLevel.OK
# Less than or equal
config_lte = ThresholdConfig(
metric_path="test.metric",
warning=20.0,
operator="<=",
)
assert config_lte.evaluate(20.0) == AlertLevel.WARNING
assert config_lte.evaluate(20.1) == AlertLevel.OK
print(" ✓ All comparison operators work correctly")
def test_hysteresis():
"""Test hysteresis to prevent flapping."""
print("\nTest 3: Hysteresis...")
config = ThresholdConfig(
metric_path="cpu_monitor.cpu_percent",
warning=80.0,
critical=90.0,
operator=">",
hysteresis=0.1, # 10% hysteresis
)
# Start at OK, go to WARNING
result = config.evaluate_with_hysteresis(85.0, AlertLevel.OK)
assert result == AlertLevel.WARNING, "Should enter WARNING state"
# Try to recover with insufficient improvement (within hysteresis)
# Warning threshold is 80, hysteresis is 10%, so need to go below 80 - 8 = 72
result = config.evaluate_with_hysteresis(77.0, AlertLevel.WARNING)
assert result == AlertLevel.WARNING, "Should stay in WARNING (hysteresis)"
# Recover with sufficient improvement
result = config.evaluate_with_hysteresis(70.0, AlertLevel.WARNING)
assert result == AlertLevel.OK, "Should recover to OK"
# Test critical hysteresis
result = config.evaluate_with_hysteresis(95.0, AlertLevel.WARNING)
assert result == AlertLevel.CRITICAL, "Should escalate to CRITICAL"
# Try to recover from critical with insufficient improvement
# Critical threshold is 90, hysteresis is 10%, so need to go below 90 - 9 = 81
result = config.evaluate_with_hysteresis(85.0, AlertLevel.CRITICAL)
assert result == AlertLevel.CRITICAL, "Should stay CRITICAL (hysteresis, still above 81)"
# Sufficient improvement to drop from CRITICAL (below 81)
result = config.evaluate_with_hysteresis(75.0, AlertLevel.CRITICAL)
assert result == AlertLevel.OK, "Should drop to OK (below warning threshold)"
# Now test dropping from CRITICAL to WARNING
result = config.evaluate_with_hysteresis(95.0, AlertLevel.OK)
assert result == AlertLevel.CRITICAL, "Should go to CRITICAL"
# Drop to between warning and critical, but still in hysteresis zone
result = config.evaluate_with_hysteresis(82.0, AlertLevel.CRITICAL)
assert result == AlertLevel.CRITICAL, "Should stay CRITICAL (in hysteresis)"
# Drop below critical hysteresis but still above warning threshold
# At 80.1, we're above WARNING (80) so should evaluate to WARNING
result = config.evaluate_with_hysteresis(80.5, AlertLevel.CRITICAL)
assert result == AlertLevel.WARNING, "Should drop to WARNING when below critical hysteresis"
print(" ✓ Hysteresis prevents flapping")
def test_alert_state():
"""Test alert state tracking."""
print("\nTest 4: Alert state tracking...")
alert = AlertState("cpu_monitor.cpu_percent")
# Initial state
assert alert.level == AlertLevel.OK
assert alert.notification_count == 0
# Update to WARNING - should trigger notification
changed = alert.update(AlertLevel.WARNING, 85.0)
assert changed == True, "State change should return True"
assert alert.level == AlertLevel.WARNING
assert alert.last_value == 85.0
# Update with same level - no notification
changed = alert.update(AlertLevel.WARNING, 86.0)
assert changed == False, "No state change should return False"
assert alert.last_value == 86.0
# Escalate to CRITICAL
changed = alert.update(AlertLevel.CRITICAL, 95.0)
assert changed == True, "Escalation should trigger notification"
assert alert.level == AlertLevel.CRITICAL
# Recover to OK
changed = alert.update(AlertLevel.OK, 50.0)
assert changed == True, "Recovery should trigger notification"
assert alert.level == AlertLevel.OK
print(" ✓ Alert state tracking works correctly")
def test_threshold_checker_parsing():
"""Test parsing threshold configuration from YAML structure."""
print("\nTest 5: Configuration parsing...")
config = {
"thresholds": {
"cpu_monitor": {
"cpu_percent": {
"warning": 80.0,
"critical": 90.0,
"operator": ">",
"hysteresis": 0.1,
},
"load_1min": {
"warning": 4.0,
"critical": 8.0,
},
},
"memory_monitor": {
"percent": {
"warning": 85.0,
"critical": 95.0,
},
"available_mb": {
"warning": 1000,
"critical": 500,
"operator": "<",
},
},
"disk_monitor": {
"partitions": {
"/": {
"percent": {
"warning": 80.0,
"critical": 90.0,
},
},
"/home": {
"percent": {
"warning": 85.0,
"critical": 95.0,
},
},
},
},
}
}
checker = ThresholdChecker(config)
# Verify thresholds were parsed
assert "cpu_monitor.cpu_percent" in checker.thresholds
assert "cpu_monitor.load_1min" in checker.thresholds
assert "memory_monitor.percent" in checker.thresholds
assert "memory_monitor.available_mb" in checker.thresholds
assert "disk_monitor./.percent" in checker.thresholds
assert "disk_monitor./home.percent" in checker.thresholds
# Verify operators were parsed correctly
assert checker.thresholds["cpu_monitor.cpu_percent"].operator == ComparisonOperator.GT
assert checker.thresholds["memory_monitor.available_mb"].operator == ComparisonOperator.LT
print(f" ✓ Parsed {len(checker.thresholds)} thresholds correctly")
def test_check_plugin_data():
"""Test checking plugin data against thresholds."""
print("\nTest 6: Plugin data checking...")
config = {
"thresholds": {
"cpu_monitor": {
"cpu_percent": {
"warning": 80.0,
"critical": 90.0,
},
"load_1min": {
"warning": 4.0,
"critical": 8.0,
},
},
}
}
notifications = []
def notification_callback(msg):
notifications.append(msg)
checker = ThresholdChecker(config, notification_callback=notification_callback)
alert_states = {}
# First check - OK
plugin_data = {
"cpu_percent": 50.0,
"load_1min": 2.0,
}
state_changes = checker.check_plugin_data(
host_name="testhost",
plugin_name="cpu_monitor",
data=plugin_data,
alert_states=alert_states,
)
assert len(state_changes) == 0, "No thresholds violated, no state changes"
assert len(notifications) == 0, "No notifications should be sent"
# Second check - WARNING
plugin_data = {
"cpu_percent": 85.0,
"load_1min": 2.0,
}
state_changes = checker.check_plugin_data(
host_name="testhost",
plugin_name="cpu_monitor",
data=plugin_data,
alert_states=alert_states,
)
assert len(state_changes) == 1, "One metric should change state"
assert state_changes[0][0] == "cpu_monitor.cpu_percent"
assert state_changes[0][2] == AlertLevel.WARNING
assert len(notifications) == 1, "One notification should be sent"
assert "WARNING" in notifications[0]
assert "testhost" in notifications[0]
# Third check - CRITICAL
plugin_data = {
"cpu_percent": 95.0,
"load_1min": 9.0,
}
notifications.clear()
state_changes = checker.check_plugin_data(
host_name="testhost",
plugin_name="cpu_monitor",
data=plugin_data,
alert_states=alert_states,
)
assert len(state_changes) == 2, "Two metrics should change state"
assert len(notifications) == 2, "Two notifications should be sent"
# Fourth check - Recovery
plugin_data = {
"cpu_percent": 50.0,
"load_1min": 1.0,
}
notifications.clear()
state_changes = checker.check_plugin_data(
host_name="testhost",
plugin_name="cpu_monitor",
data=plugin_data,
alert_states=alert_states,
)
assert len(state_changes) == 2, "Two metrics should recover"
assert len(notifications) == 2, "Two recovery notifications"
assert any("RECOVERED" in n for n in notifications), "Should have recovery notification"
print(" ✓ Plugin data checking and notifications work")
def test_nested_metrics():
"""Test checking nested metrics like disk partitions."""
print("\nTest 7: Nested metrics (partitions)...")
config = {
"thresholds": {
"disk_monitor": {
"partitions": {
"/": {
"percent": {
"warning": 80.0,
"critical": 90.0,
},
},
"/home": {
"percent": {
"warning": 85.0,
"critical": 95.0,
},
},
},
},
}
}
notifications = []
checker = ThresholdChecker(config, notification_callback=lambda m: notifications.append(m))
alert_states = {}
plugin_data = {
"partitions": {
"/": {
"percent": 75.0,
"free_gb": 50.0,
},
"/home": {
"percent": 88.0, # Should trigger WARNING
"free_gb": 100.0,
},
},
}
state_changes = checker.check_plugin_data(
host_name="testhost",
plugin_name="disk_monitor",
data=plugin_data,
alert_states=alert_states,
)
assert len(state_changes) == 1, "One partition should trigger alert"
assert "/home" in state_changes[0][0], "Should be /home partition"
assert state_changes[0][2] == AlertLevel.WARNING
assert len(notifications) == 1
print(" ✓ Nested metric checking works")
def test_alert_summary():
"""Test getting alert summaries."""
print("\nTest 8: Alert summaries...")
config = {
"thresholds": {
"cpu_monitor": {
"cpu_percent": {"warning": 80.0, "critical": 90.0},
"load_1min": {"warning": 4.0, "critical": 8.0},
},
"memory_monitor": {
"percent": {"warning": 85.0, "critical": 95.0},
},
}
}
checker = ThresholdChecker(config)
alert_states = {}
# Create some alert states
plugin_data = {
"cpu_percent": 85.0, # WARNING
"load_1min": 9.0, # CRITICAL
}
checker.check_plugin_data("testhost", "cpu_monitor", plugin_data, alert_states)
plugin_data = {
"percent": 96.0, # CRITICAL
}
checker.check_plugin_data("testhost", "memory_monitor", plugin_data, alert_states)
# Get summary
summary = checker.get_alert_summary(alert_states)
assert summary["warning"] == 1, "Should have 1 warning"
assert summary["critical"] == 2, "Should have 2 critical"
# Get active alerts
active = checker.get_active_alerts(alert_states)
assert len(active) == 3, "Should have 3 active alerts"
print(" ✓ Alert summaries work correctly")
def run_all_tests():
"""Run all tests."""
print("=" * 70)
print("THRESHOLD SYSTEM TEST SUITE")
print("=" * 70)
try:
test_threshold_config_basic()
test_threshold_operators()
test_hysteresis()
test_alert_state()
test_threshold_checker_parsing()
test_check_plugin_data()
test_nested_metrics()
test_alert_summary()
print("\n" + "=" * 70)
print("✓ ALL TESTS PASSED")
print("=" * 70)
return 0
except AssertionError as e:
print(f"\n✗ TEST FAILED: {e}")
import traceback
traceback.print_exc()
return 1
except Exception as e:
print(f"\n✗ UNEXPECTED ERROR: {e}")
import traceback
traceback.print_exc()
return 1
if __name__ == "__main__":
sys.exit(run_all_tests())
+1 -1
View File
@@ -44,6 +44,6 @@ def test_handle_cmd_sends_command():
handle_datagram(msg2, ("127.0.0.1", 50000), ftr, ctx)
# should have sent ACK and the command; last send should be non-empty
assert len(ftr.sent) >= 1
# the command for cver 0 will be sent as raw cmd string
# the command for ??0 will be sent as raw cmd string
# so at least one send contains b'doit' or similar
assert any(b"doit" in s[0] for s in ftr.sent)
+1 -1
View File
@@ -9,6 +9,6 @@ def test_parse_message_uncompressed():
def test_parse_message_compressed():
raw = dicttos("ACK", {"time": 1}, compress=True)
raw = dicttos("ACK", {"time": 1})
m = parse_message(raw)
assert "ID" in m