Compare commits
20 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| daf5277507 | |||
| ee3b72878f | |||
| 6217f7a124 | |||
| 2468386f24 | |||
| 2015195112 | |||
| 3426185383 | |||
| 9eedbafe97 | |||
| a5f31c5cb5 | |||
| 2f72cf0118 | |||
| c56e77c2c1 | |||
| e9aa7a6f8b | |||
| a75a8a4087 | |||
| ba27d2e300 | |||
| 381e37efce | |||
| 97dfc08f4d | |||
| d281ac5a70 | |||
| 812bbf8555 | |||
| e6b7a1aa27 | |||
| 90f47ad018 | |||
| cc458e8972 |
@@ -11,3 +11,4 @@ dist/
|
||||
*.egg-info/
|
||||
ssl/
|
||||
uv.lock
|
||||
.hb.yaml
|
||||
|
||||
@@ -1,279 +0,0 @@
|
||||
#name: "w02"
|
||||
hb_port: 50003
|
||||
hbd_host: ''
|
||||
#logfile: "/home/andreas/public_html/messages/andreas"
|
||||
logfile: "/home/andreas/logs/heartbeat/heartbeat.log"
|
||||
#logfile: "/Users/andreas/public_html/messages/andreas"
|
||||
logfmt: "msg"
|
||||
grace: 40
|
||||
interval: 10
|
||||
autosave_interval: 300 # Autosave interval in seconds (default: 5 minutes)
|
||||
|
||||
|
||||
users:
|
||||
andreas:
|
||||
full_name: Andreas Wrede
|
||||
password: pbkdf2:sha256:260000:eece9cdaebc22247566f78983bf5b2a3:f8c74cc057c5590943c115a60bac62f9458e9ba0d2e7e7421b6f0fe5d860e18f # hbd passwd andreas
|
||||
avatar: /home/andreas/.avatar/Andreas-avatar3-small.png
|
||||
admin: true
|
||||
ops:
|
||||
full_name: Operations Team
|
||||
password: pbkdf2:sha256:260000:... # hbd passwd ops
|
||||
admin: false
|
||||
readonly:
|
||||
full_name: Read-Only User
|
||||
password: pbkdf2:sha256:260000:... # hbd
|
||||
|
||||
default_owner: andreas
|
||||
|
||||
hosts:
|
||||
weekend:
|
||||
owner: andreas
|
||||
managers: [ops]
|
||||
monitors: [readonly]
|
||||
|
||||
|
||||
# Notification Channels - Define notification providers centrally
|
||||
# Each channel has a type (pushover, email, signal, mattermost) and type-specific configuration
|
||||
notification_channels:
|
||||
|
||||
pushover_standard:
|
||||
type: pushover
|
||||
token: ac7NLX2rPjXFareeDgLpXNoDf4iFmf
|
||||
user: uDhH33UjQQDYtNzJb1ThRiWb9ingGK
|
||||
|
||||
signal_andreas:
|
||||
type: signal
|
||||
cli_path: /usr/local/bin/signal-cli
|
||||
user: +14168226179
|
||||
recipient: +14168226179
|
||||
|
||||
email_andreas:
|
||||
type: email
|
||||
recipients: [aew.hbd.notify@wrede.ca]
|
||||
sender: aew.hbd@wrede.ca
|
||||
smtp_server: smtp.fastmail.com
|
||||
smtp_port: 587
|
||||
smtp_user: andreas@wrede.ca
|
||||
smtp_password: pvtvefyp5gbhnch2
|
||||
|
||||
# Example additional channels (commented out)
|
||||
# pushover_urgent:
|
||||
# type: pushover
|
||||
# token: your-app-token
|
||||
# user: your-user-key
|
||||
#
|
||||
mattermost_devops:
|
||||
type: mattermost
|
||||
host: mattermost.example.com
|
||||
token: webhook-token
|
||||
channel: devops-alerts
|
||||
username: heartbeat-bot
|
||||
icon: https://example.com/heartbeat-icon.png
|
||||
|
||||
# Default notification channels (used if host doesn't specify channels)
|
||||
default_notification_channels: [pushover_standard]
|
||||
|
||||
# Host definitions - combines threshold mapping, watch status, DNS updates, and notifications
|
||||
hosts:
|
||||
wentworth:
|
||||
threshold_config: default
|
||||
watch: true
|
||||
notification_channels: [pushover_standard]
|
||||
dyndns: false
|
||||
|
||||
y:
|
||||
threshold_config: default
|
||||
watch: true
|
||||
notification_channels: [pushover_standard]
|
||||
dyndns: false
|
||||
|
||||
winter:
|
||||
threshold_config: default
|
||||
watch: true
|
||||
notification_channels: [pushover_standard]
|
||||
dyndns: false
|
||||
|
||||
wally:
|
||||
threshold_config: freebsd_server
|
||||
watch: false
|
||||
notification_channels: [pushover_standard]
|
||||
dyndns: false
|
||||
|
||||
eris:
|
||||
threshold_config: truenas_server
|
||||
watch: false
|
||||
notification_channels: [pushover_standard]
|
||||
dyndns: false
|
||||
|
||||
haschloss:
|
||||
threshold_config: default
|
||||
watch: false
|
||||
dyndns: true
|
||||
|
||||
wayback:
|
||||
threshold_config: default
|
||||
watch: false
|
||||
notification_channels: [pushover_standard]
|
||||
dyndns: true
|
||||
|
||||
wertvoll:
|
||||
threshold_config: default
|
||||
watch: false
|
||||
notification_channels: [pushover_standard]
|
||||
dyndns: true
|
||||
|
||||
weekend:
|
||||
threshold_config: freebsd_server
|
||||
watch: false
|
||||
notification_channels: [pushover_standard]
|
||||
dyndns: true
|
||||
|
||||
cotgate:
|
||||
threshold_config: default
|
||||
watch: false
|
||||
dyndns: true
|
||||
|
||||
rvgate:
|
||||
threshold_config: default
|
||||
watch: false
|
||||
dyndns: true
|
||||
|
||||
draper:
|
||||
threshold_config: default
|
||||
watch: false
|
||||
notification_channels: [pushover_standard]
|
||||
dyndns: true
|
||||
|
||||
# Hosts to drop/ignore
|
||||
drophosts: {"unknown", "wookie15", "wort"}
|
||||
|
||||
nsupdate_bin: "/usr/local/bin/nsupdate"
|
||||
|
||||
dyndomains: {"wrede.org"}
|
||||
|
||||
ws_port: 50005
|
||||
# wss_port: 50006 # Commented out - use plain WebSocket instead of secure WSS
|
||||
# cert_path: "/usr/local/etc/letsencrypt/live/hbd.wrede.ca/"
|
||||
# cert_path: "test/"
|
||||
# CERT_PATH = "./test/"
|
||||
# wss_pem: "fullchain.pem"
|
||||
# wss_key: "privkey.pem"
|
||||
|
||||
journal_enabled: true # Enable/disable journaling
|
||||
journal_dir: /home/andreas/logs/heartbeat # Journal directory
|
||||
journal_file: messages.journal # Base filename
|
||||
journal_max_size: 104857600 # Max size (100MB default)
|
||||
journal_max_backups: 10 # Number of backups to keep
|
||||
|
||||
threshold_configs:
|
||||
default:
|
||||
thresholds:
|
||||
cpu_monitor:
|
||||
cpu_percent:
|
||||
warning: 80.0
|
||||
critical: 90.0
|
||||
memory_monitor:
|
||||
percent:
|
||||
warning: 85.0
|
||||
critical: 95.0
|
||||
disk_monitor:
|
||||
partitions:
|
||||
/:
|
||||
percent:
|
||||
warning: 85.0
|
||||
critical: 90.0
|
||||
rtt:
|
||||
warning: 200
|
||||
critical: 250.0
|
||||
|
||||
|
||||
freebsd_server:
|
||||
thresholds:
|
||||
cpu_monitor:
|
||||
cpu_percent:
|
||||
warning: 80.0
|
||||
critical: 90.0
|
||||
memory_monitor:
|
||||
memory_percent:
|
||||
warning: 97.0
|
||||
critical: 100.0
|
||||
disk_monitor:
|
||||
partitions:
|
||||
/:
|
||||
percent:
|
||||
warning: 85.0
|
||||
critical: 90.0
|
||||
nagios_runner:
|
||||
# overall_status_code:
|
||||
# warning: 1
|
||||
# critical: 2
|
||||
# operator: ">="
|
||||
load_status:
|
||||
warning: WARNING
|
||||
critical: CRITICAL
|
||||
operator: "=="
|
||||
ups_load:
|
||||
display: "load to high: {ups_output}"
|
||||
warning: 70
|
||||
critical: 80
|
||||
operator: ">="
|
||||
ups_status_code:
|
||||
display: "{ups_output}"
|
||||
warning: 1
|
||||
critical: 2
|
||||
operator: ">="
|
||||
nextcloud_apps_status_code:
|
||||
display: "{nextcloud_apps_output}"
|
||||
warning: 1
|
||||
critical: 2
|
||||
operator: ">="
|
||||
rtt:
|
||||
warning: 200
|
||||
critical: 250.0
|
||||
|
||||
truenas_server:
|
||||
thresholds:
|
||||
cpu_monitor:
|
||||
cpu_percent:
|
||||
warning: 80.0
|
||||
critical: 90.0
|
||||
memory_monitor:
|
||||
percent:
|
||||
warning: 3.0
|
||||
critical: 95.0
|
||||
disk_monitor:
|
||||
partitions:
|
||||
/:
|
||||
percent:
|
||||
warning: 85.0
|
||||
critical: 90.0
|
||||
nagios_runner:
|
||||
# overall_status_code:
|
||||
# warning: 1
|
||||
# critical: 2
|
||||
# operator: ">="
|
||||
load_status:
|
||||
warning: WARNING
|
||||
critical: CRITICAL
|
||||
operator: "=="
|
||||
ups_load:
|
||||
display: "load to high: {ups_output}"
|
||||
WARNING: 70
|
||||
CRITICAL: 80
|
||||
OPERATOR: ">="
|
||||
ups_status_code:
|
||||
DISPLAY: "{ups_output}"
|
||||
warning: 1
|
||||
critical: 2
|
||||
operator: ">="
|
||||
nextcloud_apps_status_code:
|
||||
display: "{nextcloud_apps_output}"
|
||||
warning: 1
|
||||
critical: 2
|
||||
operator: ">="
|
||||
rtt:
|
||||
warning: 120
|
||||
critical: 250.0
|
||||
|
||||
|
||||
Vendored
+6
-5
@@ -4,12 +4,13 @@
|
||||
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
||||
"version": "0.2.0",
|
||||
"configurations": [
|
||||
|
||||
{
|
||||
"name": "Python: Run hbd (module)",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "hbd.server.cli",
|
||||
"args": ["-c", "/home/andreas/git/heartbeat/.hb.yaml", "-f", "-v", "-x"],
|
||||
"args": ["-c", "~/.hb.yaml", "-f", "-v"],
|
||||
"cwd": "${workspaceFolder}",
|
||||
"env": {
|
||||
"PYTHONPATH": "${workspaceFolder}"
|
||||
@@ -28,14 +29,14 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Python: Run hbd with debugpy (listen)",
|
||||
"name": "Python: Run hbc (module)",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "debugpy",
|
||||
"args": ["--listen", "5678", "--wait-for-client", "-m", "hbd.server.cli", "-c", ".hb.yaml", "-f", "-v"],
|
||||
"module": "hbd.client.main",
|
||||
"args": ["-c", "~/.hbc.yaml", "-v", "winter"],
|
||||
"cwd": "${workspaceFolder}",
|
||||
"env": { "PYTHONPATH": "${workspaceFolder}" },
|
||||
"console": "integratedTerminal",
|
||||
"justMyCode": false
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
@@ -76,7 +76,7 @@ See [docs/NAGIOS_INTEGRATION.md](docs/NAGIOS_INTEGRATION.md) for complete integr
|
||||
### Creating Custom Plugins
|
||||
|
||||
```python
|
||||
from hbd.plugin import MonitorPlugin
|
||||
from hbd.client.plugin import MonitorPlugin
|
||||
|
||||
class DiskMonitorPlugin(MonitorPlugin):
|
||||
name = "disk_monitor"
|
||||
@@ -89,7 +89,7 @@ class DiskMonitorPlugin(MonitorPlugin):
|
||||
}
|
||||
```
|
||||
|
||||
Place plugins in `hbd/plugins/` and they'll be automatically discovered and loaded by the client.
|
||||
Place plugins in `hbd/client/plugins/` and they'll be automatically discovered and loaded by the client.
|
||||
|
||||
---
|
||||
|
||||
@@ -368,7 +368,7 @@ See [docs/HTTP_API.md](docs/HTTP_API.md) for complete API documentation includin
|
||||
|
||||
Prerequisites:
|
||||
|
||||
- Python 3.10+ (project uses language features from recent Python)
|
||||
- Python 3.11+ (project uses language features from recent Python)
|
||||
- `nsupdate` (for DNS updates) if using dynamic DNS
|
||||
|
||||
Install dependencies (recommended into a venv):
|
||||
@@ -389,7 +389,7 @@ hbd -c .hb.yaml -f -v
|
||||
You can also run it directly via the package entrypoint after installation:
|
||||
|
||||
```bash
|
||||
python -m hbd.cli -c /path/to/config.yaml
|
||||
python -m hbd.server.cli -c /path/to/config.yaml
|
||||
```
|
||||
|
||||
### Running the Client
|
||||
@@ -397,14 +397,23 @@ python -m hbd.cli -c /path/to/config.yaml
|
||||
The heartbeat client (`hbc`) sends periodic heartbeats and plugin data to the server:
|
||||
|
||||
```bash
|
||||
# Basic usage pointing to server
|
||||
python -m hbd.hbc --server your-server.example.com
|
||||
# Basic usage pointing to server (host is a positional argument)
|
||||
hbc your-server.example.com
|
||||
|
||||
# With custom configuration
|
||||
python -m hbd.hbc --server 192.168.1.100 --port 50003 --interval 30
|
||||
# Run as daemon with a config file
|
||||
hbc -d -c /etc/hbc.yaml your-server.example.com
|
||||
|
||||
# Run with specific plugins enabled/disabled
|
||||
python -m hbd.hbc --server hbd.local --disable-plugin os_info
|
||||
# Send a one-off boot message
|
||||
hbc --boot your-server.example.com
|
||||
|
||||
# Verbose output
|
||||
hbc -v your-server.example.com
|
||||
```
|
||||
|
||||
You can also run it via the module entrypoint:
|
||||
|
||||
```bash
|
||||
python -m hbd.client.main your-server.example.com
|
||||
```
|
||||
|
||||
Client configuration can also be specified in YAML:
|
||||
@@ -438,30 +447,29 @@ This repository includes a ready-to-use `.vscode/launch.json` with configuration
|
||||
|
||||
- Ensure the **Python** extension is installed and select the project `.venv` as the interpreter (bottom-left of VS Code).
|
||||
- Use **F5** and pick one of these configurations from the Run view:
|
||||
- **Python: Run hbd (module)** — runs `hbd.cli` as a module and sets `PYTHONPATH` to the workspace root (recommended).
|
||||
- **Python: Run hbd (module)** — runs `hbd.server.cli` as a module and sets `PYTHONPATH` to the workspace root (recommended).
|
||||
- **Python: Run hbd with debugpy (listen)** — launches `debugpy` and `hbd` together; useful when you want the process to listen for a debugger.
|
||||
- **Python: Attach (localhost:5678)** — attach the debugger to a running process started with `debugpy`.
|
||||
|
||||
To start `hbd` manually and wait for the debugger to attach, run:
|
||||
|
||||
```bash
|
||||
PYTHONPATH=. python -m debugpy --listen 5678 --wait-for-client -m hbd.cli -c .hb.yaml -f -v
|
||||
PYTHONPATH=. python -m debugpy --listen 5678 --wait-for-client -m hbd.server.cli -c .hb.yaml -f -v
|
||||
```
|
||||
|
||||
Set breakpoints in modules such as `hbd/udp.py`, `hbd/dns.py`, or `hbd/server.py`, and use the **Attach** configuration to connect. Use `justMyCode: false` if you need to step into third-party code.
|
||||
Set breakpoints in modules such as `hbd/server/udp.py`, `hbd/server/dns.py`, or `hbd/server/main.py`, and use the **Attach** configuration to connect. Use `justMyCode: false` if you need to step into third-party code.
|
||||
|
||||
---
|
||||
|
||||
## 🛠 Configuration
|
||||
|
||||
`hbd` reads YAML configuration (optional). If `PyYAML` is not installed, built-in defaults are used. Example configuration keys (see `hbd/config.py`):
|
||||
`hbd` reads YAML configuration (optional). If `PyYAML` is not installed, built-in defaults are used. Example configuration keys (see `hbd/server/config.py`):
|
||||
|
||||
- `hb_port`: UDP port to listen for heartbeats (default: 50003)
|
||||
- `hbd_port`: internal control port (default: 50004)
|
||||
- `hbd_host`: bind address for HTTP/WSS
|
||||
- `pickfile`: path for persisted state
|
||||
- `logfile`: path to log file
|
||||
- `logfmt`: `text` or `msg`
|
||||
- `pushsrv`: push service (`pushover`|`mattermost`|`all`)
|
||||
- `interval` / `grace`: heartbeat timing configuration
|
||||
- `dyndomains`: list of dyndomains to update via `nsupdate`
|
||||
@@ -487,29 +495,39 @@ nsupdate_bin: /usr/bin/nsupdate
|
||||
pushsrv: pushover
|
||||
```
|
||||
|
||||
> Tip: `config.DEFAULTS` in `hbd/config.py` contains the canonical defaults and accepted configuration keys.
|
||||
> Tip: `SERVER_DEFAULTS` in `hbd/server/config.py` contains the canonical defaults and accepted configuration keys.
|
||||
|
||||
---
|
||||
|
||||
## 🔧 Architecture & Modules
|
||||
|
||||
- `hbd.proto` — serialization/deserialization of heartbeat messages (supports compressed payloads and plugin data)
|
||||
- `hbd.udp` — UDP parsing and `handle_datagram` implementation (main state machine)
|
||||
- `hbd.dns` — `create_nsupdate_payload`, `nsupdate`, and an asyncio DNS worker (`start_dns_worker`).
|
||||
The DNS worker now runs as an `asyncio` task and the package exposes a
|
||||
small thread-safe bridge so legacy synchronous code can `put()` updates
|
||||
into the queue; there is no longer a permanently-blocking background
|
||||
`threading.Thread`.
|
||||
- `hbd.notify` — email and push notification helpers
|
||||
- `hbd.ws` — WebSocket server and thread-safe broadcast helpers
|
||||
- `hbd.http` — HTTP handler factory for the status UI/API
|
||||
- `hbd.journal` — message journal with size-based log rotation and backup management
|
||||
- `hbd.plugin` — plugin framework with base classes, registry, and dynamic loader
|
||||
- `hbd.plugins/` — built-in plugins (os_info, cpu_monitor, memory_monitor, disk_monitor, network_monitor, filesystem_info, nagios_runner)
|
||||
- `hbd.hbc` — heartbeat client that sends heartbeats and plugin data to server
|
||||
- `hbd.utils` — small utility helpers (`shortname`, `dur`, `initlog`)
|
||||
- `hbd.cli` — CLI entrypoint and argument parsing
|
||||
- `hbd.server` — async orchestration to run UDP/HTTP/WSS components
|
||||
The package is organized into three subpackages:
|
||||
|
||||
**`hbd.common`** — shared code used by both client and server:
|
||||
- `hbd.common.proto` — serialization/deserialization of heartbeat messages (supports compressed payloads and plugin data)
|
||||
- `hbd.common.utils` — small utility helpers (`shortname`, `dur`, `initlog`)
|
||||
|
||||
**`hbd.server`** — the heartbeat daemon (`hbd`):
|
||||
- `hbd.server.cli` — CLI entrypoint and argument parsing
|
||||
- `hbd.server.main` — async orchestration to run UDP/HTTP/WSS components
|
||||
- `hbd.server.udp` — UDP parsing and `handle_datagram` implementation (main state machine)
|
||||
- `hbd.server.dns` — `create_nsupdate_payload`, `nsupdate`, and an asyncio DNS worker (`start_dns_worker`).
|
||||
The DNS worker runs as an `asyncio` task and the package exposes a small thread-safe bridge
|
||||
so legacy synchronous code can `put()` updates into the queue.
|
||||
- `hbd.server.notify` — email and push notification helpers
|
||||
- `hbd.server.ws` — WebSocket server and thread-safe broadcast helpers
|
||||
- `hbd.server.http` — HTTP handler factory for the status UI/API
|
||||
- `hbd.server.journal` — message journal with size-based log rotation and backup management
|
||||
- `hbd.server.threshold` — threshold alerting engine
|
||||
- `hbd.server.monitor` — host state monitoring
|
||||
- `hbd.server.hbdclass` — `Host` class and shared server state
|
||||
- `hbd.server.config` — configuration loader and defaults
|
||||
|
||||
**`hbd.client`** — the heartbeat client (`hbc`):
|
||||
- `hbd.client.main` — client entrypoint; sends heartbeats and plugin data to the server
|
||||
- `hbd.client.plugin` — plugin framework with base classes, registry, and dynamic loader
|
||||
- `hbd.client.plugins/` — built-in plugins (os_info, cpu_monitor, memory_monitor, disk_monitor, network_monitor, filesystem_info, nagios_runner)
|
||||
- `hbd.client.config` — client configuration loader
|
||||
|
||||
This modular layout makes the code easier to test and maintain.
|
||||
|
||||
@@ -517,12 +535,12 @@ This modular layout makes the code easier to test and maintain.
|
||||
|
||||
- The main runtime is asyncio-based. Services (UDP listener, HTTP server, WebSocket server, monitor, and DNS worker) run as asyncio tasks.
|
||||
- On SIGINT/SIGTERM the server triggers a graceful shutdown: it cancels active tasks, signals the DNS worker via a sentinel, and cleans up resources before exit.
|
||||
- The DNS update worker is implemented as an `asyncio` task; synchronous producers can still enqueue DNS updates via a small thread-safe bridge available at `hbd.hbdclass.Host.dnsQ`.
|
||||
- The DNS update worker is implemented as an `asyncio` task; synchronous producers can still enqueue DNS updates via a small thread-safe bridge available at `hbd.server.hbdclass.Host.dnsQ`.
|
||||
|
||||
**Templates & Static Files**
|
||||
|
||||
- Template files are located under `hbd/templates` by default. The HTTP server resolves templates relative to the `hbd` package but the path can be overridden with the `templates_dir` config key.
|
||||
- Static assets (CSS/JS/images) are served from `hbd/static` via the `/static/<path>` HTTP route. Place your static files in that directory or configure the HTTP server as needed.
|
||||
- Template files are located under `hbd/server/templates`. The HTTP server resolves templates relative to the `hbd.server` package but the path can be overridden with the `templates_dir` config key.
|
||||
- Static assets (CSS/JS/images) are served from `hbd/server/static` via the `/static/<path>` HTTP route.
|
||||
|
||||
---
|
||||
|
||||
|
||||
+1
-1
@@ -59,7 +59,7 @@ Server-specific defaults:
|
||||
- `hb_port`: Port to listen for heartbeats (default: 50003)
|
||||
- `hbd_port`: HTTP API port (default: 50004)
|
||||
- `ws_port`: WebSocket port (default: 50005)
|
||||
- `logfile`, `logfmt`: Logging configuration
|
||||
- `logfile`: Log file path
|
||||
- `pushsrv`, `pushover_token`, etc.: Notification settings
|
||||
- `watchhosts`, `dyndnshosts`: Host monitoring
|
||||
- `smtpserver`, etc.: Email settings
|
||||
|
||||
@@ -81,7 +81,6 @@ The following settings **cannot** be reloaded and require a service restart:
|
||||
|
||||
- **Logging**
|
||||
- `logfile` - Log file path
|
||||
- `logfmt` - Log format
|
||||
|
||||
- **Journal Settings**
|
||||
- `journal_enabled` - Enable/disable journaling
|
||||
|
||||
+1
-1
@@ -14,4 +14,4 @@ Install options:
|
||||
"""
|
||||
|
||||
__all__ = ["__version__"]
|
||||
__version__ = "5.0.12"
|
||||
__version__ = "5.1.0"
|
||||
|
||||
@@ -2,6 +2,9 @@
|
||||
|
||||
import logging
|
||||
import os
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
try:
|
||||
import yaml
|
||||
@@ -30,18 +33,19 @@ def load_config(path=None):
|
||||
If YAML is not available or the file does not exist, defaults are returned.
|
||||
|
||||
Args:
|
||||
path: Path to YAML config file (default: ~/.hb.yaml)
|
||||
path: Path to YAML config file (default: ~/.hbc.yaml)
|
||||
|
||||
Returns:
|
||||
Dictionary with configuration
|
||||
"""
|
||||
cfg = CLIENT_DEFAULTS.copy()
|
||||
if not path:
|
||||
# default path (~/.hb.yaml)
|
||||
path = os.path.join(os.path.expanduser("~"), ".hb.yaml")
|
||||
# default path (~/.hbc.yaml)
|
||||
path = os.path.join(os.path.expanduser("~"), ".hbc.yaml")
|
||||
|
||||
if os.path.exists(path):
|
||||
if yaml:
|
||||
logger.info("Loading configuration from %s", path)
|
||||
with open(path) as fh:
|
||||
data = yaml.safe_load(fh)
|
||||
# Merge YAML data with defaults
|
||||
@@ -50,5 +54,5 @@ def load_config(path=None):
|
||||
cfg[k] = v
|
||||
else:
|
||||
# yaml not installed: do not attempt to parse; user must ensure defaults
|
||||
pass
|
||||
logger.warning("PyYAML not available - cannot load config from %s, using defaults", path)
|
||||
return cfg
|
||||
|
||||
+5
-5
@@ -644,13 +644,10 @@ def main(argv=None):
|
||||
parser = build_parser()
|
||||
args = parser.parse_args(argv)
|
||||
|
||||
# Load config
|
||||
config = load_config(args.configfile)
|
||||
|
||||
# Setup logging
|
||||
log_level = logging.INFO
|
||||
log_level = logging.WARNING
|
||||
if args.verbose:
|
||||
log_level = logging.DEBUG
|
||||
log_level = logging.INFO
|
||||
if args.debug:
|
||||
log_level = logging.DEBUG
|
||||
|
||||
@@ -660,6 +657,9 @@ def main(argv=None):
|
||||
datefmt="%Y-%m-%d %H:%M:%S"
|
||||
)
|
||||
|
||||
# Load config
|
||||
config = load_config(args.configfile)
|
||||
|
||||
# Daemonize if requested
|
||||
if args.daemon:
|
||||
print("Daemonizing...")
|
||||
|
||||
@@ -311,7 +311,10 @@ class PluginLoader:
|
||||
return 0
|
||||
|
||||
loaded_count = 0
|
||||
plugin_config = config or {}
|
||||
raw_config = config or {}
|
||||
# Per-plugin config lives under the 'plugins' key; fall back to top-level
|
||||
# for backwards compatibility.
|
||||
plugin_config = raw_config.get("plugins", raw_config)
|
||||
|
||||
# Scan for Python files
|
||||
for plugin_file in directory.glob("*.py"):
|
||||
|
||||
@@ -81,7 +81,7 @@ class NagiosRunnerPlugin(MonitorPlugin):
|
||||
|
||||
# Validate commands
|
||||
if not self.commands:
|
||||
self.logger.warning(
|
||||
self.logger.info(
|
||||
"No Nagios commands configured. Add 'nagios_runner.commands' to config."
|
||||
)
|
||||
|
||||
@@ -94,7 +94,7 @@ class NagiosRunnerPlugin(MonitorPlugin):
|
||||
self.logger.info(f"Initializing {self.name} plugin")
|
||||
|
||||
if not self.commands:
|
||||
self.logger.error("No Nagios commands configured")
|
||||
self.logger.info("No Nagios commands configured")
|
||||
return False
|
||||
|
||||
self.logger.info(f"Configured to run {len(self.commands)} Nagios plugin(s)")
|
||||
|
||||
@@ -0,0 +1,151 @@
|
||||
"""Ping Monitor Plugin for Heartbeat.
|
||||
|
||||
Pings one or more hosts and reports round-trip time. Results are sent as
|
||||
plugin metrics so the server-side threshold system can raise WARNING/CRITICAL
|
||||
alerts using the same RTT threshold configuration format used for heartbeat RTT.
|
||||
|
||||
Example configuration in ~/.hbc.yaml (or the plugins section of ~/.hb.yaml):
|
||||
|
||||
```yaml
|
||||
plugins:
|
||||
ping_monitor:
|
||||
interval: 60 # ping every 60 seconds (default)
|
||||
count: 3 # ICMP packets per ping run (default 3)
|
||||
timeout: 5 # seconds before a host is considered unreachable (default 5)
|
||||
hosts:
|
||||
8.8.8.8:
|
||||
warning: 20.0 # ms
|
||||
critical: 100.0 # ms
|
||||
192.168.1.1:
|
||||
warning: 5.0
|
||||
critical: 20.0
|
||||
```
|
||||
|
||||
Reported metrics per host (metric key uses the hostname with dots/colons replaced
|
||||
by underscores so it is a valid identifier):
|
||||
|
||||
ping.<hostname>.rtt_avg – average RTT in ms (float, or inf if unreachable)
|
||||
ping.<hostname>.rtt_min – minimum RTT in ms
|
||||
ping.<hostname>.rtt_max – maximum RTT in ms
|
||||
ping.<hostname>.loss – packet loss percentage (0–100)
|
||||
|
||||
Server-side threshold config example:
|
||||
|
||||
```yaml
|
||||
threshold_configs:
|
||||
default:
|
||||
thresholds:
|
||||
ping_monitor:
|
||||
8_8_8_8_rtt_avg:
|
||||
warning: 20.0
|
||||
critical: 100.0
|
||||
```
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import re
|
||||
import sys
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from hbd.client.plugin import MonitorPlugin
|
||||
|
||||
|
||||
def _host_key(host: str) -> str:
|
||||
"""Convert a hostname/IP to a safe metric key (replace . and : with _)."""
|
||||
return re.sub(r"[^a-zA-Z0-9_]", "_", host)
|
||||
|
||||
|
||||
class PingMonitorPlugin(MonitorPlugin):
|
||||
"""Ping one or more configured hosts and report RTT metrics."""
|
||||
|
||||
name = "ping_monitor"
|
||||
version = "1.0.0"
|
||||
description = "ICMP ping latency monitoring"
|
||||
interval = 60
|
||||
|
||||
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
||||
super().__init__(config)
|
||||
cfg = config or {}
|
||||
self.interval = cfg.get("interval", 60)
|
||||
self.count = int(cfg.get("count", 3))
|
||||
self.timeout = int(cfg.get("timeout", 5))
|
||||
# hosts: dict of {hostname: {warning: x, critical: y}} or list of hostnames
|
||||
raw_hosts = cfg.get("hosts", {})
|
||||
if isinstance(raw_hosts, list):
|
||||
self.hosts = {h: {} for h in raw_hosts}
|
||||
else:
|
||||
self.hosts = dict(raw_hosts)
|
||||
|
||||
async def initialize(self) -> bool:
|
||||
if not self.hosts:
|
||||
self.logger.warning("ping_monitor: no hosts configured, plugin disabled")
|
||||
return False
|
||||
self.logger.info(
|
||||
"ping_monitor initialized: %d host(s), interval=%ds, count=%d, timeout=%ds",
|
||||
len(self.hosts), self.interval, self.count, self.timeout,
|
||||
)
|
||||
return True
|
||||
|
||||
async def _ping(self, host: str) -> Dict[str, float]:
|
||||
"""Run a system ping command and return rtt_min/avg/max/loss."""
|
||||
if sys.platform == "win32":
|
||||
cmd = ["ping", "-n", str(self.count), "-w", str(self.timeout * 1000), host]
|
||||
else:
|
||||
cmd = ["ping", "-c", str(self.count), "-W", str(self.timeout), host]
|
||||
|
||||
try:
|
||||
proc = await asyncio.create_subprocess_exec(
|
||||
*cmd,
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.PIPE,
|
||||
)
|
||||
stdout, _ = await asyncio.wait_for(
|
||||
proc.communicate(),
|
||||
timeout=self.timeout * self.count + 2,
|
||||
)
|
||||
output = stdout.decode(errors="replace")
|
||||
except (asyncio.TimeoutError, FileNotFoundError, OSError) as e:
|
||||
self.logger.warning("ping_monitor: ping failed for %s: %s", host, e)
|
||||
return {"rtt_min": float("inf"), "rtt_avg": float("inf"),
|
||||
"rtt_max": float("inf"), "loss": 100.0}
|
||||
|
||||
# Parse packet loss
|
||||
loss = 100.0
|
||||
loss_match = re.search(r"(\d+(?:\.\d+)?)\s*%\s*packet\s*loss", output)
|
||||
if loss_match:
|
||||
loss = float(loss_match.group(1))
|
||||
|
||||
# Parse rtt min/avg/max — Linux: "rtt min/avg/max/mdev = x/x/x/x ms"
|
||||
# macOS: "round-trip min/avg/max/stddev = x/x/x/x ms"
|
||||
rtt_match = re.search(
|
||||
r"(?:rtt|round-trip)\s+min/avg/max/\S+\s*=\s*([\d.]+)/([\d.]+)/([\d.]+)",
|
||||
output,
|
||||
)
|
||||
if rtt_match:
|
||||
return {
|
||||
"rtt_min": float(rtt_match.group(1)),
|
||||
"rtt_avg": float(rtt_match.group(2)),
|
||||
"rtt_max": float(rtt_match.group(3)),
|
||||
"loss": loss,
|
||||
}
|
||||
|
||||
# Host unreachable or all packets lost
|
||||
return {"rtt_min": float("inf"), "rtt_avg": float("inf"),
|
||||
"rtt_max": float("inf"), "loss": loss}
|
||||
|
||||
async def _collect_metrics(self) -> Dict[str, Any]:
|
||||
data: Dict[str, Any] = {}
|
||||
tasks = {host: asyncio.create_task(self._ping(host)) for host in self.hosts}
|
||||
for host, task in tasks.items():
|
||||
try:
|
||||
result = await task
|
||||
except Exception as e:
|
||||
self.logger.error("ping_monitor: error pinging %s: %s", host, e)
|
||||
result = {"rtt_min": float("inf"), "rtt_avg": float("inf"),
|
||||
"rtt_max": float("inf"), "loss": 100.0}
|
||||
key = _host_key(host)
|
||||
for metric, value in result.items():
|
||||
data[f"{key}_{metric}"] = value
|
||||
status = "unreachable" if result["loss"] == 100.0 else f"{result['rtt_avg']:.1f}ms"
|
||||
self.logger.debug("ping_monitor: %s -> %s", host, status)
|
||||
return data
|
||||
+34
-4
@@ -16,12 +16,10 @@ SERVER_DEFAULTS = {
|
||||
"hbd_host": "", # Bind address (empty = all interfaces)
|
||||
|
||||
# Persistence
|
||||
"pickfile": "/tmp/hb.pick",
|
||||
"pickfile": os.path.join(os.path.expanduser("~"), ".hb.pick"), # File to store host state between restarts
|
||||
|
||||
# Logging
|
||||
"logfile": "/var/log/heartbeat.log",
|
||||
"logfmt": "text", # text or msg or json
|
||||
|
||||
"logfile": os.path.join(os.path.expanduser("~"), ".hb.log"),
|
||||
# Notification channels
|
||||
"notification_channels": {}, # Named channels with type and credentials
|
||||
"default_notification_channels": [], # Default channels if host doesn't specify
|
||||
@@ -69,6 +67,38 @@ SERVER_DEFAULTS = {
|
||||
"thresholds": {},
|
||||
}
|
||||
|
||||
THRESHOLD_DEFAULTS = {
|
||||
'thresholds': {
|
||||
'cpu_monitor': {
|
||||
'cpu_percent': {
|
||||
'warning': 80.0,
|
||||
'critical': 90.0
|
||||
}
|
||||
},
|
||||
'memory_monitor': {
|
||||
'percent': {
|
||||
'warning': 85.0,
|
||||
'critical': 95.0
|
||||
}
|
||||
},
|
||||
'disk_monitor': {
|
||||
'partitions': {
|
||||
'/': {
|
||||
'percent': {
|
||||
'warning': 85.0,
|
||||
'critical': 90.0
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
'rtt': {
|
||||
'warning': 200,
|
||||
'critical': 250.0,
|
||||
'count': 3 # Optional: number of consecutive breaches before alerting
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
def load_config(path=None):
|
||||
"""Load configuration from a YAML file and merge with server defaults.
|
||||
|
||||
+1
-2
@@ -851,8 +851,7 @@ async def start(
|
||||
site = web.TCPSite(runner, host, port)
|
||||
await site.start()
|
||||
|
||||
if verbose:
|
||||
print(f"HTTP server started on {host}:{port}")
|
||||
logger.info(f"HTTP server started on {host}:{port}")
|
||||
|
||||
try:
|
||||
await asyncio.Future()
|
||||
|
||||
+32
-7
@@ -27,6 +27,7 @@ def save_state(config, hbdclass):
|
||||
"""Save current state to pickle file. Safe to call at any time."""
|
||||
import pickle
|
||||
import os
|
||||
from . import users as users_mod
|
||||
|
||||
# Clear timer references before pickling (they can't be serialized)
|
||||
for hostname, host in list(hbdclass.Host.hosts.items()):
|
||||
@@ -48,6 +49,7 @@ def save_state(config, hbdclass):
|
||||
pick = pickle.Pickler(pickf)
|
||||
pick.dump(hbdclass.Host.hosts)
|
||||
pick.dump(data.msgs)
|
||||
pick.dump(users_mod.save_sessions())
|
||||
os.replace(tmpfile, pickfile)
|
||||
except Exception as e:
|
||||
logger.error("Failed to save state: %s", e)
|
||||
@@ -89,9 +91,13 @@ async def reload_configuration(config_obj, config_path, components):
|
||||
# Reload users
|
||||
users_mod.load_users(new_config)
|
||||
|
||||
# Re-apply host access from updated config to all known hosts
|
||||
# Re-apply host attributes from updated config to all known hosts
|
||||
from . import config as config_mod
|
||||
dyndnshosts = config_mod.get_dyndnshosts(new_config)
|
||||
watchhosts = config_mod.get_watchhosts(new_config)
|
||||
for hostname, host in hbdclass.Host.hosts.items():
|
||||
host.dyn = hostname in dyndnshosts
|
||||
host.watched = hostname in watchhosts
|
||||
access = config_mod.get_host_access(new_config, hostname)
|
||||
host.apply_access(access["owner"], access["managers"], access["monitors"])
|
||||
|
||||
@@ -126,6 +132,10 @@ async def reload_configuration(config_obj, config_path, components):
|
||||
|
||||
|
||||
async def _run_async(config, config_path=None):
|
||||
from .config import ReloadableConfig
|
||||
if not isinstance(config, ReloadableConfig):
|
||||
config = ReloadableConfig(config, config_path)
|
||||
|
||||
loop = asyncio.get_running_loop()
|
||||
shutdown_event = asyncio.Event()
|
||||
reload_event = asyncio.Event()
|
||||
@@ -187,14 +197,14 @@ async def _run_async(config, config_path=None):
|
||||
sock.bind(bind_addr)
|
||||
logger.info("Starting UDP server on %s:%s", *bind_addr)
|
||||
|
||||
# Try to enable kernel receive timestamps (Linux SO_TIMESTAMPNS).
|
||||
# Try to enable kernel receive timestamps (Linux SO_TIMESTAMP).
|
||||
# If supported, read datagrams via recvmsg() so RTT uses the kernel
|
||||
# timestamp rather than the time.time() call after asyncio scheduling.
|
||||
use_kernel_ts = udp.enable_kernel_timestamps(sock)
|
||||
if use_kernel_ts:
|
||||
logger.info("SO_TIMESTAMPNS enabled: using kernel receive timestamps for RTT")
|
||||
logger.info("SO_TIMESTAMP enabled: using kernel receive timestamps for RTT")
|
||||
else:
|
||||
logger.info("SO_TIMESTAMPNS not available: using time.time() for RTT")
|
||||
logger.info("SO_TIMESTAMP not available: using time.time() for RTT")
|
||||
|
||||
def udp_handler(msg, addr, transport, recv_ts=None):
|
||||
ctx = dict(
|
||||
@@ -416,6 +426,13 @@ async def _run_async(config, config_path=None):
|
||||
except Exception as e:
|
||||
logger.warning("Error stopping DNS worker: %s", e)
|
||||
|
||||
# Save state (hosts + sessions) on clean shutdown
|
||||
try:
|
||||
save_state(config, hbdclass)
|
||||
logger.info("State saved on shutdown")
|
||||
except Exception as e:
|
||||
logger.warning("Error saving state on shutdown: %s", e)
|
||||
|
||||
logger.info("All tasks cancelled")
|
||||
|
||||
|
||||
@@ -424,6 +441,7 @@ def load_pickled_hosts(config, hbdclass):
|
||||
import os
|
||||
import pickle
|
||||
from . import config as config_mod
|
||||
from . import users as users_mod
|
||||
|
||||
pickfile = config.get("pickfile", "hbd.pickle")
|
||||
dyndnshosts = config_mod.get_dyndnshosts(config)
|
||||
@@ -437,6 +455,10 @@ def load_pickled_hosts(config, hbdclass):
|
||||
try:
|
||||
hbdclass.Host.hosts = pick.load()
|
||||
data.msgs = pick.load()
|
||||
try:
|
||||
users_mod.load_sessions(pick.load())
|
||||
except Exception:
|
||||
pass # older pickle without sessions — fine
|
||||
pickf.close()
|
||||
except Exception as e:
|
||||
logger.exception("load pickled failed: %s", e)
|
||||
@@ -471,9 +493,12 @@ def run(config, config_path=None):
|
||||
"""
|
||||
import os
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.DEBUG if config.get("debug", 0) > 0 else logging.INFO
|
||||
)
|
||||
log_level = logging.WARNING
|
||||
if config.get("verbose", False):
|
||||
log_level = logging.INFO
|
||||
if config.get("debug", 0) > 0:
|
||||
log_level = logging.DEBUG
|
||||
logging.basicConfig(level=log_level)
|
||||
load_pickled_hosts(config, hbdclass)
|
||||
|
||||
notify_mod.initlog(logfile=config.get("logfile", "messages.log"))
|
||||
|
||||
@@ -252,8 +252,6 @@ def get_settings_sections(config: dict) -> list:
|
||||
"Path to the pickle file used to persist host state across restarts."),
|
||||
field("logfile", "Event log", "path",
|
||||
"Path to the event log file."),
|
||||
field("logfmt", "Log format", "select",
|
||||
"Format for event log entries: text, msg, or json."),
|
||||
],
|
||||
},
|
||||
{
|
||||
|
||||
@@ -4,17 +4,26 @@
|
||||
|
||||
<style>
|
||||
body {
|
||||
margin: 10px;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
height: 100vh;
|
||||
box-sizing: border-box;
|
||||
padding: 10px;
|
||||
margin: 0;
|
||||
background: #f5f5f5;
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
.container {
|
||||
flex: 1;
|
||||
min-height: 0;
|
||||
max-width: 1600px;
|
||||
width: 100%;
|
||||
margin: 0 auto;
|
||||
max-height: calc(100vh - 120px);
|
||||
overflow-y: auto;
|
||||
padding-right: 10px;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 15px;
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
h1 {
|
||||
@@ -53,11 +62,12 @@
|
||||
}
|
||||
|
||||
.log-section {
|
||||
flex: 1;
|
||||
min-height: 0;
|
||||
background: white;
|
||||
border-radius: 6px;
|
||||
padding: 15px;
|
||||
box-shadow: 0 1px 4px rgba(0,0,0,0.1);
|
||||
max-height: 400px;
|
||||
overflow-y: auto;
|
||||
}
|
||||
|
||||
@@ -112,24 +122,20 @@
|
||||
}
|
||||
|
||||
/* Scrollbar styling */
|
||||
.container::-webkit-scrollbar,
|
||||
.log-section::-webkit-scrollbar {
|
||||
width: 8px;
|
||||
}
|
||||
|
||||
.container::-webkit-scrollbar-track,
|
||||
.log-section::-webkit-scrollbar-track {
|
||||
background: #f1f1f1;
|
||||
border-radius: 4px;
|
||||
}
|
||||
|
||||
.container::-webkit-scrollbar-thumb,
|
||||
.log-section::-webkit-scrollbar-thumb {
|
||||
background: #888;
|
||||
border-radius: 4px;
|
||||
}
|
||||
|
||||
.container::-webkit-scrollbar-thumb:hover,
|
||||
.log-section::-webkit-scrollbar-thumb:hover {
|
||||
background: #555;
|
||||
}
|
||||
|
||||
@@ -3,6 +3,10 @@
|
||||
{% include 'head.html' %}
|
||||
|
||||
<style>
|
||||
html, body {
|
||||
overflow: visible;
|
||||
}
|
||||
|
||||
body {
|
||||
margin: 20px;
|
||||
background: #f5f5f5;
|
||||
|
||||
+78
-19
@@ -14,6 +14,7 @@ import time
|
||||
from enum import Enum
|
||||
from typing import Dict, Any, Optional, Tuple, Callable
|
||||
from . import notify as notify_mod
|
||||
from .config import THRESHOLD_DEFAULTS
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
eventlog = notify_mod.eventlog
|
||||
@@ -58,6 +59,7 @@ class AlertState:
|
||||
self.formatted_message = None # Formatted display message for UI
|
||||
self.acknowledged = False # Whether alert has been acknowledged
|
||||
self.acknowledged_at = None # Timestamp when acknowledged
|
||||
self.consecutive_count = 0 # Consecutive exceedances while still OK (for count gating)
|
||||
|
||||
def update(
|
||||
self,
|
||||
@@ -118,8 +120,11 @@ class AlertState:
|
||||
|
||||
# Helper to sanitize numeric values for JSON (handle inf/nan)
|
||||
def sanitize_value(val):
|
||||
if isinstance(val, float) and (math.isinf(val) or math.isnan(val)):
|
||||
return None
|
||||
if isinstance(val, float):
|
||||
if math.isinf(val):
|
||||
return "overdue"
|
||||
if math.isnan(val):
|
||||
return None
|
||||
return val
|
||||
|
||||
result = {
|
||||
@@ -146,6 +151,12 @@ class AlertState:
|
||||
|
||||
return result
|
||||
|
||||
def __setstate__(self, state):
|
||||
"""Restore from pickle, backfilling fields added after the pickle was written."""
|
||||
self.__dict__.update(state)
|
||||
if not hasattr(self, 'consecutive_count'):
|
||||
self.consecutive_count = 0
|
||||
|
||||
def acknowledge(self):
|
||||
"""Acknowledge this alert to stop reminder notifications."""
|
||||
self.acknowledged = True
|
||||
@@ -167,6 +178,7 @@ class ThresholdConfig:
|
||||
operator: str = ">",
|
||||
hysteresis: float = 0.0,
|
||||
enabled: bool = True,
|
||||
count: int = 1,
|
||||
):
|
||||
"""
|
||||
Initialize threshold configuration.
|
||||
@@ -178,6 +190,7 @@ class ThresholdConfig:
|
||||
operator: Comparison operator (>, >=, <, <=, ==, !=)
|
||||
hysteresis: Hysteresis percentage to prevent flapping (0.0-1.0)
|
||||
enabled: Whether this threshold is enabled
|
||||
count: Number of consecutive exceedances required before alerting (default 1)
|
||||
"""
|
||||
self.metric_path = metric_path
|
||||
self.warning = warning
|
||||
@@ -185,6 +198,7 @@ class ThresholdConfig:
|
||||
self.enabled = enabled
|
||||
self.hysteresis = hysteresis
|
||||
self.display = display
|
||||
self.count = max(1, int(count))
|
||||
|
||||
# Parse operator
|
||||
try:
|
||||
@@ -391,8 +405,28 @@ class ThresholdChecker:
|
||||
logger.info("No threshold configurations defined")
|
||||
return
|
||||
|
||||
# Parse each named configuration
|
||||
# Build effective_defaults: THRESHOLD_DEFAULTS merged with the 'default' config (if present).
|
||||
# All other configs inherit any metric not explicitly defined from effective_defaults.
|
||||
effective_defaults: Dict[str, ThresholdConfig] = {}
|
||||
for plugin_name, plugin_thresholds in THRESHOLD_DEFAULTS.get("thresholds", {}).items():
|
||||
if isinstance(plugin_thresholds, dict):
|
||||
self._parse_plugin_thresholds(plugin_name, plugin_thresholds, target_dict=effective_defaults)
|
||||
|
||||
if "default" in threshold_configs:
|
||||
default_data = threshold_configs["default"]
|
||||
if isinstance(default_data, dict) and "thresholds" in default_data:
|
||||
for plugin_name, plugin_thresholds in default_data["thresholds"].items():
|
||||
if isinstance(plugin_thresholds, dict):
|
||||
self._parse_plugin_thresholds(plugin_name, plugin_thresholds, target_dict=effective_defaults)
|
||||
|
||||
self.threshold_configs["default"] = dict(effective_defaults)
|
||||
logger.info("Registered 'default' threshold config with %d metrics", len(effective_defaults))
|
||||
|
||||
# Parse each named configuration, seeding it with effective_defaults first
|
||||
for config_name, config_data in threshold_configs.items():
|
||||
if config_name == "default":
|
||||
continue # already handled above
|
||||
|
||||
if not isinstance(config_data, dict):
|
||||
logger.warning("Invalid threshold config '%s', skipping", config_name)
|
||||
continue
|
||||
@@ -402,7 +436,7 @@ class ThresholdChecker:
|
||||
continue
|
||||
|
||||
logger.info("Parsing threshold configuration: %s", config_name)
|
||||
self.threshold_configs[config_name] = {}
|
||||
self.threshold_configs[config_name] = dict(effective_defaults)
|
||||
|
||||
thresholds_config = config_data["thresholds"]
|
||||
for plugin_name, plugin_thresholds in thresholds_config.items():
|
||||
@@ -600,6 +634,7 @@ class ThresholdChecker:
|
||||
hysteresis = rtt_thresholds.get("hysteresis", 0.1) # 10% default
|
||||
enabled = rtt_thresholds.get("enabled", True)
|
||||
display = rtt_thresholds.get("display")
|
||||
count = rtt_thresholds.get("count", 1)
|
||||
|
||||
if warning is None and critical is None:
|
||||
logger.warning("No RTT thresholds defined, skipping")
|
||||
@@ -612,14 +647,16 @@ class ThresholdChecker:
|
||||
operator=operator,
|
||||
hysteresis=hysteresis,
|
||||
enabled=enabled,
|
||||
display=display
|
||||
display=display,
|
||||
count=count,
|
||||
)
|
||||
|
||||
target_dict[metric_path] = threshold
|
||||
logger.debug(
|
||||
"Registered RTT threshold: warn=%s ms, crit=%s ms",
|
||||
"Registered RTT threshold: warn=%s ms, crit=%s ms, count=%d",
|
||||
warning,
|
||||
critical
|
||||
critical,
|
||||
count,
|
||||
)
|
||||
|
||||
def get_thresholds_for_host(self, host_name: str) -> Dict[str, ThresholdConfig]:
|
||||
@@ -692,6 +729,26 @@ class ThresholdChecker:
|
||||
alert_state.level
|
||||
)
|
||||
|
||||
# Apply consecutive-count gating: when currently OK, require threshold.count
|
||||
# consecutive exceedances before escalating to WARNING/CRITICAL.
|
||||
if new_level == AlertLevel.OK:
|
||||
# Value is fine (or recovered) — reset the pending counter immediately.
|
||||
alert_state.consecutive_count = 0
|
||||
elif alert_state.level == AlertLevel.OK and new_level != AlertLevel.OK:
|
||||
# First time we exceed while still OK: count up.
|
||||
alert_state.consecutive_count += 1
|
||||
if alert_state.consecutive_count < threshold.count:
|
||||
logger.debug(
|
||||
"RTT threshold exceeded %d/%d consecutive times for %s on %s",
|
||||
alert_state.consecutive_count,
|
||||
threshold.count,
|
||||
metric_path,
|
||||
host_name,
|
||||
)
|
||||
return None
|
||||
# Count reached — fire the alert and reset the counter.
|
||||
alert_state.consecutive_count = 0
|
||||
|
||||
# Determine which threshold was exceeded
|
||||
threshold_value = None
|
||||
if new_level == AlertLevel.CRITICAL and threshold.critical is not None:
|
||||
@@ -884,48 +941,50 @@ class ThresholdChecker:
|
||||
# Format operator symbol
|
||||
op_symbol = threshold.operator.value
|
||||
|
||||
# Use a display-friendly value (inf is the sentinel for "overdue")
|
||||
import math
|
||||
display_value = "overdue" if isinstance(value, float) and math.isinf(value) else value
|
||||
|
||||
# Format message
|
||||
if new_level == AlertLevel.OK:
|
||||
lvl = "RECOVERED"
|
||||
message = f"{metric_path} = {value} ({old_level.name} -> OK)"
|
||||
message = f"{metric_path} = {display_value} ({old_level.name} -> OK)"
|
||||
elif new_level == AlertLevel.WARNING:
|
||||
lvl = "WARNING"
|
||||
if threshold_value is not None:
|
||||
# Use display format string
|
||||
threshold_info = self._format_display(
|
||||
threshold.display,
|
||||
value=value,
|
||||
value=display_value,
|
||||
threshold_value=threshold_value,
|
||||
op_symbol=op_symbol,
|
||||
plugin_data=plugin_data
|
||||
)
|
||||
message = f"{metric_path} = {value} {threshold_info}"
|
||||
message = f"{metric_path} = {display_value} {threshold_info}"
|
||||
else:
|
||||
message = f"{metric_path} = {value}"
|
||||
message = f"{metric_path} = {display_value}"
|
||||
elif new_level == AlertLevel.CRITICAL:
|
||||
lvl = "CRITICAL"
|
||||
if threshold_value is not None:
|
||||
# Use display format string
|
||||
threshold_info = self._format_display(
|
||||
threshold.display,
|
||||
value=value,
|
||||
value=display_value,
|
||||
threshold_value=threshold_value,
|
||||
op_symbol=op_symbol,
|
||||
plugin_data=plugin_data
|
||||
)
|
||||
message = f"{metric_path} = {value} {threshold_info}"
|
||||
message = f"{metric_path} = {display_value} {threshold_info}"
|
||||
else:
|
||||
message = f"{metric_path} = {value}"
|
||||
message = f"{metric_path} = {display_value}"
|
||||
else:
|
||||
lvl = "UNKNOWN"
|
||||
message = f"{metric_path} = {value}"
|
||||
message = f"{metric_path} = {display_value}"
|
||||
|
||||
# Return the formatted threshold info for storing in AlertState
|
||||
formatted_threshold_msg = None
|
||||
if threshold_value is not None and new_level != AlertLevel.OK:
|
||||
formatted_threshold_msg = self._format_display(
|
||||
threshold.display,
|
||||
value=value,
|
||||
value=display_value,
|
||||
threshold_value=threshold_value,
|
||||
op_symbol=op_symbol,
|
||||
plugin_data=plugin_data
|
||||
@@ -1037,7 +1096,7 @@ class ThresholdChecker:
|
||||
threshold: Threshold configuration
|
||||
plugin_data: Optional dictionary of all plugin data fields
|
||||
"""
|
||||
if alert_state.level == AlertLevel.OK:
|
||||
if alert_state.level != AlertLevel.CRITICAL:
|
||||
return
|
||||
|
||||
# Skip reminders if alert has been acknowledged
|
||||
|
||||
+31
-13
@@ -7,6 +7,8 @@ import time
|
||||
import zlib
|
||||
import logging
|
||||
|
||||
from platform import system as platform_system
|
||||
|
||||
from ..common.proto import stodict, oldmtodict
|
||||
from ..common.utils import dur
|
||||
from . import notify as notify_mod
|
||||
@@ -16,9 +18,18 @@ eventlog = notify_mod.eventlog
|
||||
|
||||
# SO_TIMESTAMP: kernel attaches a struct timeval to each received datagram.
|
||||
# Supported on Linux, FreeBSD, and macOS. The constant is not exposed by
|
||||
# Python's socket module on all platforms, so fall back to the Linux value (29)
|
||||
# when absent.
|
||||
_SO_TIMESTAMP = getattr(socket, 'SO_TIMESTAMP', 29)
|
||||
# Python's socket module on all platforms
|
||||
platform = platform_system()
|
||||
if platform == "Darwin":
|
||||
_SO_TIMESTAMP = 1024 # SO_TIMESTAMP on macOS (not in Python's socket module)
|
||||
elif platform == "Linux":
|
||||
_SO_TIMESTAMP = 29 # Linux value (not in older Python versions)
|
||||
elif platform == "FreeBSD":
|
||||
_SO_TIMESTAMP = 32 # FreeBSD value (not in older Python versions)
|
||||
else:
|
||||
logger.warning("SO_TIMESTAMP may not be supported on this platform (%s)", platform)
|
||||
_SO_TIMESTAMP = None
|
||||
|
||||
# struct timeval uses two native C longs: tv_sec and tv_usec
|
||||
_TIMEVAL = struct.Struct('@ll')
|
||||
|
||||
@@ -222,11 +233,15 @@ def restore_connection_timers(hbdclass, ctx):
|
||||
|
||||
if state == hbdclass.Connection.UP and interval > 0:
|
||||
elapsed = now - conn.lastbeat
|
||||
remaining = max(1.0, (interval + grace) - elapsed)
|
||||
# Give hosts one full (interval + grace) of extra time on startup
|
||||
# so hosts that were silent while hbd was down are not immediately
|
||||
# flagged as overdue before they have a chance to check in.
|
||||
startup_grace = interval + grace
|
||||
remaining = max(startup_grace, 2 * startup_grace - elapsed)
|
||||
conn.reset_overdue_timer(remaining, on_overdue)
|
||||
logger.debug(
|
||||
"Restored UP timer %s/%s: %.0fs remaining (elapsed %.0fs)",
|
||||
uname, afam, remaining, elapsed,
|
||||
"Restored UP timer %s/%s: %.0fs remaining (elapsed %.0fs, startup grace %.0fs)",
|
||||
uname, afam, remaining, elapsed, startup_grace,
|
||||
)
|
||||
restored += 1
|
||||
|
||||
@@ -397,13 +412,16 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
if conn.getstate() != hbdcls.Connection.UP:
|
||||
lasts = conn.state
|
||||
d = conn.newstate(hbdcls.Connection.UP, now)
|
||||
if d == 0 or lasts == "unknown":
|
||||
m = "%s is up" % (conn.afam)
|
||||
else:
|
||||
m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d))
|
||||
eventlog(uname, "RECOVER", m)
|
||||
if uname in watchhosts:
|
||||
notify_mod.pushmsg_for_host(uname, "%s %s is back" % (uname, conn.afam))
|
||||
# Don't log/notify RECOVER for a brand-new host seen for the first time —
|
||||
# it was never down, it just hasn't been seen before.
|
||||
if not newh:
|
||||
if d == 0 or lasts == "unknown":
|
||||
m = "%s is up" % (conn.afam)
|
||||
else:
|
||||
m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d))
|
||||
eventlog(uname, "RECOVER", m)
|
||||
if uname in watchhosts:
|
||||
notify_mod.pushmsg_for_host(uname, "%s %s is back" % (uname, conn.afam))
|
||||
|
||||
if boot or newh:
|
||||
host.upcount = host.doesack
|
||||
|
||||
@@ -226,3 +226,17 @@ def _purge_expired_sessions() -> None:
|
||||
expired = [t for t, s in list(_sessions.items()) if s["expires"] < now]
|
||||
for t in expired:
|
||||
del _sessions[t]
|
||||
|
||||
|
||||
def save_sessions() -> dict:
|
||||
"""Return a snapshot of non-expired sessions suitable for pickling."""
|
||||
_purge_expired_sessions()
|
||||
return dict(_sessions)
|
||||
|
||||
|
||||
def load_sessions(snapshot: dict) -> None:
|
||||
"""Restore sessions from a pickled snapshot, dropping any that have expired."""
|
||||
global _sessions
|
||||
now = time.time()
|
||||
_sessions = {t: s for t, s in snapshot.items() if s.get("expires", 0) > now}
|
||||
logger.debug("Restored %d session(s) from pickle", len(_sessions))
|
||||
|
||||
+1
-1
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "hbd"
|
||||
version = "5.0.12"
|
||||
version = "5.1.0"
|
||||
description = "Heartbeat monitoring system — client (hbc) and server (hbd)"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.11"
|
||||
|
||||
Reference in New Issue
Block a user