Compare commits

...

20 Commits

Author SHA1 Message Date
Andreas Wrede daf5277507 version 5.1.0
Release / release (push) Successful in 5s
2026-04-11 15:26:37 -04:00
Andreas Wrede ee3b72878f Add a ping monitor 2026-04-11 15:25:23 -04:00
Andreas Wrede 6217f7a124 fix bogus notification on new clients 2026-04-10 13:39:18 -04:00
Andreas Wrede 2468386f24 adjust default log, pick and config locations. renotify on critical only, make user sessions persistem 2026-04-10 13:24:57 -04:00
Andreas Wrede 2015195112 Grace interval on restart of hbd, fix SIGHUP processing 2026-04-10 12:58:38 -04:00
Andreas Wrede 3426185383 Set SO_TIMESTAMP correctly for the various platforms 2026-04-10 11:19:47 -04:00
Andreas Wrede 9eedbafe97 Show overdue in alerts instead of null 2026-04-10 09:20:28 -04:00
Andreas Wrede a5f31c5cb5 update picked data strucures 2026-04-10 09:18:38 -04:00
Andreas Wrede 2f72cf0118 typo 2026-04-10 09:17:57 -04:00
Andreas Wrede c56e77c2c1 Merge branch 'master' of git.wrede.ca:andreas/heartbeat 2026-04-10 08:20:40 -04:00
Andreas Wrede e9aa7a6f8b info only if no nagios command is defined 2026-04-10 08:19:59 -04:00
Andreas Wrede a75a8a4087 warn only if no nagios command is defined 2026-04-10 08:14:31 -04:00
Andreas Wrede ba27d2e300 Add count to rtt threshold 2026-04-10 08:07:50 -04:00
Andreas Wrede 381e37efce fix log-section height 2026-04-10 08:01:22 -04:00
Andreas Wrede 97dfc08f4d fix log level settiung 2026-04-10 08:00:51 -04:00
Andreas Wrede d281ac5a70 provide defaults for threshold_configs 2026-04-10 07:47:39 -04:00
Andreas Wrede 812bbf8555 Merge branch 'master' of git.wrede.ca:andreas/heartbeat 2026-04-09 13:02:17 -04:00
Andreas Wrede e6b7a1aa27 drop config file 2026-04-09 13:02:10 -04:00
Andreas Wrede 90f47ad018 drop config file 2026-04-09 13:00:07 -04:00
Andreas Wrede cc458e8972 update README 2026-04-09 08:33:25 -04:00
22 changed files with 461 additions and 410 deletions
+1
View File
@@ -11,3 +11,4 @@ dist/
*.egg-info/ *.egg-info/
ssl/ ssl/
uv.lock uv.lock
.hb.yaml
-279
View File
@@ -1,279 +0,0 @@
#name: "w02"
hb_port: 50003
hbd_host: ''
#logfile: "/home/andreas/public_html/messages/andreas"
logfile: "/home/andreas/logs/heartbeat/heartbeat.log"
#logfile: "/Users/andreas/public_html/messages/andreas"
logfmt: "msg"
grace: 40
interval: 10
autosave_interval: 300 # Autosave interval in seconds (default: 5 minutes)
users:
andreas:
full_name: Andreas Wrede
password: pbkdf2:sha256:260000:eece9cdaebc22247566f78983bf5b2a3:f8c74cc057c5590943c115a60bac62f9458e9ba0d2e7e7421b6f0fe5d860e18f # hbd passwd andreas
avatar: /home/andreas/.avatar/Andreas-avatar3-small.png
admin: true
ops:
full_name: Operations Team
password: pbkdf2:sha256:260000:... # hbd passwd ops
admin: false
readonly:
full_name: Read-Only User
password: pbkdf2:sha256:260000:... # hbd
default_owner: andreas
hosts:
weekend:
owner: andreas
managers: [ops]
monitors: [readonly]
# Notification Channels - Define notification providers centrally
# Each channel has a type (pushover, email, signal, mattermost) and type-specific configuration
notification_channels:
pushover_standard:
type: pushover
token: ac7NLX2rPjXFareeDgLpXNoDf4iFmf
user: uDhH33UjQQDYtNzJb1ThRiWb9ingGK
signal_andreas:
type: signal
cli_path: /usr/local/bin/signal-cli
user: +14168226179
recipient: +14168226179
email_andreas:
type: email
recipients: [aew.hbd.notify@wrede.ca]
sender: aew.hbd@wrede.ca
smtp_server: smtp.fastmail.com
smtp_port: 587
smtp_user: andreas@wrede.ca
smtp_password: pvtvefyp5gbhnch2
# Example additional channels (commented out)
# pushover_urgent:
# type: pushover
# token: your-app-token
# user: your-user-key
#
mattermost_devops:
type: mattermost
host: mattermost.example.com
token: webhook-token
channel: devops-alerts
username: heartbeat-bot
icon: https://example.com/heartbeat-icon.png
# Default notification channels (used if host doesn't specify channels)
default_notification_channels: [pushover_standard]
# Host definitions - combines threshold mapping, watch status, DNS updates, and notifications
hosts:
wentworth:
threshold_config: default
watch: true
notification_channels: [pushover_standard]
dyndns: false
y:
threshold_config: default
watch: true
notification_channels: [pushover_standard]
dyndns: false
winter:
threshold_config: default
watch: true
notification_channels: [pushover_standard]
dyndns: false
wally:
threshold_config: freebsd_server
watch: false
notification_channels: [pushover_standard]
dyndns: false
eris:
threshold_config: truenas_server
watch: false
notification_channels: [pushover_standard]
dyndns: false
haschloss:
threshold_config: default
watch: false
dyndns: true
wayback:
threshold_config: default
watch: false
notification_channels: [pushover_standard]
dyndns: true
wertvoll:
threshold_config: default
watch: false
notification_channels: [pushover_standard]
dyndns: true
weekend:
threshold_config: freebsd_server
watch: false
notification_channels: [pushover_standard]
dyndns: true
cotgate:
threshold_config: default
watch: false
dyndns: true
rvgate:
threshold_config: default
watch: false
dyndns: true
draper:
threshold_config: default
watch: false
notification_channels: [pushover_standard]
dyndns: true
# Hosts to drop/ignore
drophosts: {"unknown", "wookie15", "wort"}
nsupdate_bin: "/usr/local/bin/nsupdate"
dyndomains: {"wrede.org"}
ws_port: 50005
# wss_port: 50006 # Commented out - use plain WebSocket instead of secure WSS
# cert_path: "/usr/local/etc/letsencrypt/live/hbd.wrede.ca/"
# cert_path: "test/"
# CERT_PATH = "./test/"
# wss_pem: "fullchain.pem"
# wss_key: "privkey.pem"
journal_enabled: true # Enable/disable journaling
journal_dir: /home/andreas/logs/heartbeat # Journal directory
journal_file: messages.journal # Base filename
journal_max_size: 104857600 # Max size (100MB default)
journal_max_backups: 10 # Number of backups to keep
threshold_configs:
default:
thresholds:
cpu_monitor:
cpu_percent:
warning: 80.0
critical: 90.0
memory_monitor:
percent:
warning: 85.0
critical: 95.0
disk_monitor:
partitions:
/:
percent:
warning: 85.0
critical: 90.0
rtt:
warning: 200
critical: 250.0
freebsd_server:
thresholds:
cpu_monitor:
cpu_percent:
warning: 80.0
critical: 90.0
memory_monitor:
memory_percent:
warning: 97.0
critical: 100.0
disk_monitor:
partitions:
/:
percent:
warning: 85.0
critical: 90.0
nagios_runner:
# overall_status_code:
# warning: 1
# critical: 2
# operator: ">="
load_status:
warning: WARNING
critical: CRITICAL
operator: "=="
ups_load:
display: "load to high: {ups_output}"
warning: 70
critical: 80
operator: ">="
ups_status_code:
display: "{ups_output}"
warning: 1
critical: 2
operator: ">="
nextcloud_apps_status_code:
display: "{nextcloud_apps_output}"
warning: 1
critical: 2
operator: ">="
rtt:
warning: 200
critical: 250.0
truenas_server:
thresholds:
cpu_monitor:
cpu_percent:
warning: 80.0
critical: 90.0
memory_monitor:
percent:
warning: 3.0
critical: 95.0
disk_monitor:
partitions:
/:
percent:
warning: 85.0
critical: 90.0
nagios_runner:
# overall_status_code:
# warning: 1
# critical: 2
# operator: ">="
load_status:
warning: WARNING
critical: CRITICAL
operator: "=="
ups_load:
display: "load to high: {ups_output}"
WARNING: 70
CRITICAL: 80
OPERATOR: ">="
ups_status_code:
DISPLAY: "{ups_output}"
warning: 1
critical: 2
operator: ">="
nextcloud_apps_status_code:
display: "{nextcloud_apps_output}"
warning: 1
critical: 2
operator: ">="
rtt:
warning: 120
critical: 250.0
+6 -5
View File
@@ -4,12 +4,13 @@
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0", "version": "0.2.0",
"configurations": [ "configurations": [
{ {
"name": "Python: Run hbd (module)", "name": "Python: Run hbd (module)",
"type": "debugpy", "type": "debugpy",
"request": "launch", "request": "launch",
"module": "hbd.server.cli", "module": "hbd.server.cli",
"args": ["-c", "/home/andreas/git/heartbeat/.hb.yaml", "-f", "-v", "-x"], "args": ["-c", "~/.hb.yaml", "-f", "-v"],
"cwd": "${workspaceFolder}", "cwd": "${workspaceFolder}",
"env": { "env": {
"PYTHONPATH": "${workspaceFolder}" "PYTHONPATH": "${workspaceFolder}"
@@ -28,14 +29,14 @@
] ]
}, },
{ {
"name": "Python: Run hbd with debugpy (listen)", "name": "Python: Run hbc (module)",
"type": "debugpy", "type": "debugpy",
"request": "launch", "request": "launch",
"module": "debugpy", "module": "hbd.client.main",
"args": ["--listen", "5678", "--wait-for-client", "-m", "hbd.server.cli", "-c", ".hb.yaml", "-f", "-v"], "args": ["-c", "~/.hbc.yaml", "-v", "winter"],
"cwd": "${workspaceFolder}",
"env": { "PYTHONPATH": "${workspaceFolder}" }, "env": { "PYTHONPATH": "${workspaceFolder}" },
"console": "integratedTerminal", "console": "integratedTerminal",
"justMyCode": false
} }
] ]
} }
+54 -36
View File
@@ -76,7 +76,7 @@ See [docs/NAGIOS_INTEGRATION.md](docs/NAGIOS_INTEGRATION.md) for complete integr
### Creating Custom Plugins ### Creating Custom Plugins
```python ```python
from hbd.plugin import MonitorPlugin from hbd.client.plugin import MonitorPlugin
class DiskMonitorPlugin(MonitorPlugin): class DiskMonitorPlugin(MonitorPlugin):
name = "disk_monitor" name = "disk_monitor"
@@ -89,7 +89,7 @@ class DiskMonitorPlugin(MonitorPlugin):
} }
``` ```
Place plugins in `hbd/plugins/` and they'll be automatically discovered and loaded by the client. Place plugins in `hbd/client/plugins/` and they'll be automatically discovered and loaded by the client.
--- ---
@@ -368,7 +368,7 @@ See [docs/HTTP_API.md](docs/HTTP_API.md) for complete API documentation includin
Prerequisites: Prerequisites:
- Python 3.10+ (project uses language features from recent Python) - Python 3.11+ (project uses language features from recent Python)
- `nsupdate` (for DNS updates) if using dynamic DNS - `nsupdate` (for DNS updates) if using dynamic DNS
Install dependencies (recommended into a venv): Install dependencies (recommended into a venv):
@@ -389,7 +389,7 @@ hbd -c .hb.yaml -f -v
You can also run it directly via the package entrypoint after installation: You can also run it directly via the package entrypoint after installation:
```bash ```bash
python -m hbd.cli -c /path/to/config.yaml python -m hbd.server.cli -c /path/to/config.yaml
``` ```
### Running the Client ### Running the Client
@@ -397,14 +397,23 @@ python -m hbd.cli -c /path/to/config.yaml
The heartbeat client (`hbc`) sends periodic heartbeats and plugin data to the server: The heartbeat client (`hbc`) sends periodic heartbeats and plugin data to the server:
```bash ```bash
# Basic usage pointing to server # Basic usage pointing to server (host is a positional argument)
python -m hbd.hbc --server your-server.example.com hbc your-server.example.com
# With custom configuration # Run as daemon with a config file
python -m hbd.hbc --server 192.168.1.100 --port 50003 --interval 30 hbc -d -c /etc/hbc.yaml your-server.example.com
# Run with specific plugins enabled/disabled # Send a one-off boot message
python -m hbd.hbc --server hbd.local --disable-plugin os_info hbc --boot your-server.example.com
# Verbose output
hbc -v your-server.example.com
```
You can also run it via the module entrypoint:
```bash
python -m hbd.client.main your-server.example.com
``` ```
Client configuration can also be specified in YAML: Client configuration can also be specified in YAML:
@@ -438,30 +447,29 @@ This repository includes a ready-to-use `.vscode/launch.json` with configuration
- Ensure the **Python** extension is installed and select the project `.venv` as the interpreter (bottom-left of VS Code). - Ensure the **Python** extension is installed and select the project `.venv` as the interpreter (bottom-left of VS Code).
- Use **F5** and pick one of these configurations from the Run view: - Use **F5** and pick one of these configurations from the Run view:
- **Python: Run hbd (module)** — runs `hbd.cli` as a module and sets `PYTHONPATH` to the workspace root (recommended). - **Python: Run hbd (module)** — runs `hbd.server.cli` as a module and sets `PYTHONPATH` to the workspace root (recommended).
- **Python: Run hbd with debugpy (listen)** — launches `debugpy` and `hbd` together; useful when you want the process to listen for a debugger. - **Python: Run hbd with debugpy (listen)** — launches `debugpy` and `hbd` together; useful when you want the process to listen for a debugger.
- **Python: Attach (localhost:5678)** — attach the debugger to a running process started with `debugpy`. - **Python: Attach (localhost:5678)** — attach the debugger to a running process started with `debugpy`.
To start `hbd` manually and wait for the debugger to attach, run: To start `hbd` manually and wait for the debugger to attach, run:
```bash ```bash
PYTHONPATH=. python -m debugpy --listen 5678 --wait-for-client -m hbd.cli -c .hb.yaml -f -v PYTHONPATH=. python -m debugpy --listen 5678 --wait-for-client -m hbd.server.cli -c .hb.yaml -f -v
``` ```
Set breakpoints in modules such as `hbd/udp.py`, `hbd/dns.py`, or `hbd/server.py`, and use the **Attach** configuration to connect. Use `justMyCode: false` if you need to step into third-party code. Set breakpoints in modules such as `hbd/server/udp.py`, `hbd/server/dns.py`, or `hbd/server/main.py`, and use the **Attach** configuration to connect. Use `justMyCode: false` if you need to step into third-party code.
--- ---
## 🛠 Configuration ## 🛠 Configuration
`hbd` reads YAML configuration (optional). If `PyYAML` is not installed, built-in defaults are used. Example configuration keys (see `hbd/config.py`): `hbd` reads YAML configuration (optional). If `PyYAML` is not installed, built-in defaults are used. Example configuration keys (see `hbd/server/config.py`):
- `hb_port`: UDP port to listen for heartbeats (default: 50003) - `hb_port`: UDP port to listen for heartbeats (default: 50003)
- `hbd_port`: internal control port (default: 50004) - `hbd_port`: internal control port (default: 50004)
- `hbd_host`: bind address for HTTP/WSS - `hbd_host`: bind address for HTTP/WSS
- `pickfile`: path for persisted state - `pickfile`: path for persisted state
- `logfile`: path to log file - `logfile`: path to log file
- `logfmt`: `text` or `msg`
- `pushsrv`: push service (`pushover`|`mattermost`|`all`) - `pushsrv`: push service (`pushover`|`mattermost`|`all`)
- `interval` / `grace`: heartbeat timing configuration - `interval` / `grace`: heartbeat timing configuration
- `dyndomains`: list of dyndomains to update via `nsupdate` - `dyndomains`: list of dyndomains to update via `nsupdate`
@@ -487,29 +495,39 @@ nsupdate_bin: /usr/bin/nsupdate
pushsrv: pushover pushsrv: pushover
``` ```
> Tip: `config.DEFAULTS` in `hbd/config.py` contains the canonical defaults and accepted configuration keys. > Tip: `SERVER_DEFAULTS` in `hbd/server/config.py` contains the canonical defaults and accepted configuration keys.
--- ---
## 🔧 Architecture & Modules ## 🔧 Architecture & Modules
- `hbd.proto` — serialization/deserialization of heartbeat messages (supports compressed payloads and plugin data) The package is organized into three subpackages:
- `hbd.udp` — UDP parsing and `handle_datagram` implementation (main state machine)
- `hbd.dns` — `create_nsupdate_payload`, `nsupdate`, and an asyncio DNS worker (`start_dns_worker`). **`hbd.common`** — shared code used by both client and server:
The DNS worker now runs as an `asyncio` task and the package exposes a - `hbd.common.proto` — serialization/deserialization of heartbeat messages (supports compressed payloads and plugin data)
small thread-safe bridge so legacy synchronous code can `put()` updates - `hbd.common.utils` — small utility helpers (`shortname`, `dur`, `initlog`)
into the queue; there is no longer a permanently-blocking background
`threading.Thread`. **`hbd.server`** — the heartbeat daemon (`hbd`):
- `hbd.notify` — email and push notification helpers - `hbd.server.cli` — CLI entrypoint and argument parsing
- `hbd.ws` — WebSocket server and thread-safe broadcast helpers - `hbd.server.main` — async orchestration to run UDP/HTTP/WSS components
- `hbd.http` — HTTP handler factory for the status UI/API - `hbd.server.udp` — UDP parsing and `handle_datagram` implementation (main state machine)
- `hbd.journal` — message journal with size-based log rotation and backup management - `hbd.server.dns` — `create_nsupdate_payload`, `nsupdate`, and an asyncio DNS worker (`start_dns_worker`).
- `hbd.plugin` — plugin framework with base classes, registry, and dynamic loader The DNS worker runs as an `asyncio` task and the package exposes a small thread-safe bridge
- `hbd.plugins/` — built-in plugins (os_info, cpu_monitor, memory_monitor, disk_monitor, network_monitor, filesystem_info, nagios_runner) so legacy synchronous code can `put()` updates into the queue.
- `hbd.hbc` — heartbeat client that sends heartbeats and plugin data to server - `hbd.server.notify` — email and push notification helpers
- `hbd.utils` — small utility helpers (`shortname`, `dur`, `initlog`) - `hbd.server.ws` — WebSocket server and thread-safe broadcast helpers
- `hbd.cli` — CLI entrypoint and argument parsing - `hbd.server.http` — HTTP handler factory for the status UI/API
- `hbd.server` — async orchestration to run UDP/HTTP/WSS components - `hbd.server.journal` — message journal with size-based log rotation and backup management
- `hbd.server.threshold` — threshold alerting engine
- `hbd.server.monitor` — host state monitoring
- `hbd.server.hbdclass` — `Host` class and shared server state
- `hbd.server.config` — configuration loader and defaults
**`hbd.client`** — the heartbeat client (`hbc`):
- `hbd.client.main` — client entrypoint; sends heartbeats and plugin data to the server
- `hbd.client.plugin` — plugin framework with base classes, registry, and dynamic loader
- `hbd.client.plugins/` — built-in plugins (os_info, cpu_monitor, memory_monitor, disk_monitor, network_monitor, filesystem_info, nagios_runner)
- `hbd.client.config` — client configuration loader
This modular layout makes the code easier to test and maintain. This modular layout makes the code easier to test and maintain.
@@ -517,12 +535,12 @@ This modular layout makes the code easier to test and maintain.
- The main runtime is asyncio-based. Services (UDP listener, HTTP server, WebSocket server, monitor, and DNS worker) run as asyncio tasks. - The main runtime is asyncio-based. Services (UDP listener, HTTP server, WebSocket server, monitor, and DNS worker) run as asyncio tasks.
- On SIGINT/SIGTERM the server triggers a graceful shutdown: it cancels active tasks, signals the DNS worker via a sentinel, and cleans up resources before exit. - On SIGINT/SIGTERM the server triggers a graceful shutdown: it cancels active tasks, signals the DNS worker via a sentinel, and cleans up resources before exit.
- The DNS update worker is implemented as an `asyncio` task; synchronous producers can still enqueue DNS updates via a small thread-safe bridge available at `hbd.hbdclass.Host.dnsQ`. - The DNS update worker is implemented as an `asyncio` task; synchronous producers can still enqueue DNS updates via a small thread-safe bridge available at `hbd.server.hbdclass.Host.dnsQ`.
**Templates & Static Files** **Templates & Static Files**
- Template files are located under `hbd/templates` by default. The HTTP server resolves templates relative to the `hbd` package but the path can be overridden with the `templates_dir` config key. - Template files are located under `hbd/server/templates`. The HTTP server resolves templates relative to the `hbd.server` package but the path can be overridden with the `templates_dir` config key.
- Static assets (CSS/JS/images) are served from `hbd/static` via the `/static/<path>` HTTP route. Place your static files in that directory or configure the HTTP server as needed. - Static assets (CSS/JS/images) are served from `hbd/server/static` via the `/static/<path>` HTTP route.
--- ---
+1 -1
View File
@@ -59,7 +59,7 @@ Server-specific defaults:
- `hb_port`: Port to listen for heartbeats (default: 50003) - `hb_port`: Port to listen for heartbeats (default: 50003)
- `hbd_port`: HTTP API port (default: 50004) - `hbd_port`: HTTP API port (default: 50004)
- `ws_port`: WebSocket port (default: 50005) - `ws_port`: WebSocket port (default: 50005)
- `logfile`, `logfmt`: Logging configuration - `logfile`: Log file path
- `pushsrv`, `pushover_token`, etc.: Notification settings - `pushsrv`, `pushover_token`, etc.: Notification settings
- `watchhosts`, `dyndnshosts`: Host monitoring - `watchhosts`, `dyndnshosts`: Host monitoring
- `smtpserver`, etc.: Email settings - `smtpserver`, etc.: Email settings
-1
View File
@@ -81,7 +81,6 @@ The following settings **cannot** be reloaded and require a service restart:
- **Logging** - **Logging**
- `logfile` - Log file path - `logfile` - Log file path
- `logfmt` - Log format
- **Journal Settings** - **Journal Settings**
- `journal_enabled` - Enable/disable journaling - `journal_enabled` - Enable/disable journaling
+1 -1
View File
@@ -14,4 +14,4 @@ Install options:
""" """
__all__ = ["__version__"] __all__ = ["__version__"]
__version__ = "5.0.12" __version__ = "5.1.0"
+8 -4
View File
@@ -2,6 +2,9 @@
import logging import logging
import os import os
import logging
logger = logging.getLogger(__name__)
try: try:
import yaml import yaml
@@ -30,18 +33,19 @@ def load_config(path=None):
If YAML is not available or the file does not exist, defaults are returned. If YAML is not available or the file does not exist, defaults are returned.
Args: Args:
path: Path to YAML config file (default: ~/.hb.yaml) path: Path to YAML config file (default: ~/.hbc.yaml)
Returns: Returns:
Dictionary with configuration Dictionary with configuration
""" """
cfg = CLIENT_DEFAULTS.copy() cfg = CLIENT_DEFAULTS.copy()
if not path: if not path:
# default path (~/.hb.yaml) # default path (~/.hbc.yaml)
path = os.path.join(os.path.expanduser("~"), ".hb.yaml") path = os.path.join(os.path.expanduser("~"), ".hbc.yaml")
if os.path.exists(path): if os.path.exists(path):
if yaml: if yaml:
logger.info("Loading configuration from %s", path)
with open(path) as fh: with open(path) as fh:
data = yaml.safe_load(fh) data = yaml.safe_load(fh)
# Merge YAML data with defaults # Merge YAML data with defaults
@@ -50,5 +54,5 @@ def load_config(path=None):
cfg[k] = v cfg[k] = v
else: else:
# yaml not installed: do not attempt to parse; user must ensure defaults # yaml not installed: do not attempt to parse; user must ensure defaults
pass logger.warning("PyYAML not available - cannot load config from %s, using defaults", path)
return cfg return cfg
+5 -5
View File
@@ -644,13 +644,10 @@ def main(argv=None):
parser = build_parser() parser = build_parser()
args = parser.parse_args(argv) args = parser.parse_args(argv)
# Load config
config = load_config(args.configfile)
# Setup logging # Setup logging
log_level = logging.INFO log_level = logging.WARNING
if args.verbose: if args.verbose:
log_level = logging.DEBUG log_level = logging.INFO
if args.debug: if args.debug:
log_level = logging.DEBUG log_level = logging.DEBUG
@@ -659,6 +656,9 @@ def main(argv=None):
format="%(asctime)s %(name)s %(levelname)s: %(message)s", format="%(asctime)s %(name)s %(levelname)s: %(message)s",
datefmt="%Y-%m-%d %H:%M:%S" datefmt="%Y-%m-%d %H:%M:%S"
) )
# Load config
config = load_config(args.configfile)
# Daemonize if requested # Daemonize if requested
if args.daemon: if args.daemon:
+4 -1
View File
@@ -311,7 +311,10 @@ class PluginLoader:
return 0 return 0
loaded_count = 0 loaded_count = 0
plugin_config = config or {} raw_config = config or {}
# Per-plugin config lives under the 'plugins' key; fall back to top-level
# for backwards compatibility.
plugin_config = raw_config.get("plugins", raw_config)
# Scan for Python files # Scan for Python files
for plugin_file in directory.glob("*.py"): for plugin_file in directory.glob("*.py"):
+2 -2
View File
@@ -81,7 +81,7 @@ class NagiosRunnerPlugin(MonitorPlugin):
# Validate commands # Validate commands
if not self.commands: if not self.commands:
self.logger.warning( self.logger.info(
"No Nagios commands configured. Add 'nagios_runner.commands' to config." "No Nagios commands configured. Add 'nagios_runner.commands' to config."
) )
@@ -94,7 +94,7 @@ class NagiosRunnerPlugin(MonitorPlugin):
self.logger.info(f"Initializing {self.name} plugin") self.logger.info(f"Initializing {self.name} plugin")
if not self.commands: if not self.commands:
self.logger.error("No Nagios commands configured") self.logger.info("No Nagios commands configured")
return False return False
self.logger.info(f"Configured to run {len(self.commands)} Nagios plugin(s)") self.logger.info(f"Configured to run {len(self.commands)} Nagios plugin(s)")
+151
View File
@@ -0,0 +1,151 @@
"""Ping Monitor Plugin for Heartbeat.
Pings one or more hosts and reports round-trip time. Results are sent as
plugin metrics so the server-side threshold system can raise WARNING/CRITICAL
alerts using the same RTT threshold configuration format used for heartbeat RTT.
Example configuration in ~/.hbc.yaml (or the plugins section of ~/.hb.yaml):
```yaml
plugins:
ping_monitor:
interval: 60 # ping every 60 seconds (default)
count: 3 # ICMP packets per ping run (default 3)
timeout: 5 # seconds before a host is considered unreachable (default 5)
hosts:
8.8.8.8:
warning: 20.0 # ms
critical: 100.0 # ms
192.168.1.1:
warning: 5.0
critical: 20.0
```
Reported metrics per host (metric key uses the hostname with dots/colons replaced
by underscores so it is a valid identifier):
ping.<hostname>.rtt_avg average RTT in ms (float, or inf if unreachable)
ping.<hostname>.rtt_min minimum RTT in ms
ping.<hostname>.rtt_max maximum RTT in ms
ping.<hostname>.loss packet loss percentage (0100)
Server-side threshold config example:
```yaml
threshold_configs:
default:
thresholds:
ping_monitor:
8_8_8_8_rtt_avg:
warning: 20.0
critical: 100.0
```
"""
import asyncio
import re
import sys
from typing import Any, Dict, Optional
from hbd.client.plugin import MonitorPlugin
def _host_key(host: str) -> str:
"""Convert a hostname/IP to a safe metric key (replace . and : with _)."""
return re.sub(r"[^a-zA-Z0-9_]", "_", host)
class PingMonitorPlugin(MonitorPlugin):
"""Ping one or more configured hosts and report RTT metrics."""
name = "ping_monitor"
version = "1.0.0"
description = "ICMP ping latency monitoring"
interval = 60
def __init__(self, config: Optional[Dict[str, Any]] = None):
super().__init__(config)
cfg = config or {}
self.interval = cfg.get("interval", 60)
self.count = int(cfg.get("count", 3))
self.timeout = int(cfg.get("timeout", 5))
# hosts: dict of {hostname: {warning: x, critical: y}} or list of hostnames
raw_hosts = cfg.get("hosts", {})
if isinstance(raw_hosts, list):
self.hosts = {h: {} for h in raw_hosts}
else:
self.hosts = dict(raw_hosts)
async def initialize(self) -> bool:
if not self.hosts:
self.logger.warning("ping_monitor: no hosts configured, plugin disabled")
return False
self.logger.info(
"ping_monitor initialized: %d host(s), interval=%ds, count=%d, timeout=%ds",
len(self.hosts), self.interval, self.count, self.timeout,
)
return True
async def _ping(self, host: str) -> Dict[str, float]:
"""Run a system ping command and return rtt_min/avg/max/loss."""
if sys.platform == "win32":
cmd = ["ping", "-n", str(self.count), "-w", str(self.timeout * 1000), host]
else:
cmd = ["ping", "-c", str(self.count), "-W", str(self.timeout), host]
try:
proc = await asyncio.create_subprocess_exec(
*cmd,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
stdout, _ = await asyncio.wait_for(
proc.communicate(),
timeout=self.timeout * self.count + 2,
)
output = stdout.decode(errors="replace")
except (asyncio.TimeoutError, FileNotFoundError, OSError) as e:
self.logger.warning("ping_monitor: ping failed for %s: %s", host, e)
return {"rtt_min": float("inf"), "rtt_avg": float("inf"),
"rtt_max": float("inf"), "loss": 100.0}
# Parse packet loss
loss = 100.0
loss_match = re.search(r"(\d+(?:\.\d+)?)\s*%\s*packet\s*loss", output)
if loss_match:
loss = float(loss_match.group(1))
# Parse rtt min/avg/max — Linux: "rtt min/avg/max/mdev = x/x/x/x ms"
# macOS: "round-trip min/avg/max/stddev = x/x/x/x ms"
rtt_match = re.search(
r"(?:rtt|round-trip)\s+min/avg/max/\S+\s*=\s*([\d.]+)/([\d.]+)/([\d.]+)",
output,
)
if rtt_match:
return {
"rtt_min": float(rtt_match.group(1)),
"rtt_avg": float(rtt_match.group(2)),
"rtt_max": float(rtt_match.group(3)),
"loss": loss,
}
# Host unreachable or all packets lost
return {"rtt_min": float("inf"), "rtt_avg": float("inf"),
"rtt_max": float("inf"), "loss": loss}
async def _collect_metrics(self) -> Dict[str, Any]:
data: Dict[str, Any] = {}
tasks = {host: asyncio.create_task(self._ping(host)) for host in self.hosts}
for host, task in tasks.items():
try:
result = await task
except Exception as e:
self.logger.error("ping_monitor: error pinging %s: %s", host, e)
result = {"rtt_min": float("inf"), "rtt_avg": float("inf"),
"rtt_max": float("inf"), "loss": 100.0}
key = _host_key(host)
for metric, value in result.items():
data[f"{key}_{metric}"] = value
status = "unreachable" if result["loss"] == 100.0 else f"{result['rtt_avg']:.1f}ms"
self.logger.debug("ping_monitor: %s -> %s", host, status)
return data
+34 -4
View File
@@ -16,12 +16,10 @@ SERVER_DEFAULTS = {
"hbd_host": "", # Bind address (empty = all interfaces) "hbd_host": "", # Bind address (empty = all interfaces)
# Persistence # Persistence
"pickfile": "/tmp/hb.pick", "pickfile": os.path.join(os.path.expanduser("~"), ".hb.pick"), # File to store host state between restarts
# Logging # Logging
"logfile": "/var/log/heartbeat.log", "logfile": os.path.join(os.path.expanduser("~"), ".hb.log"),
"logfmt": "text", # text or msg or json
# Notification channels # Notification channels
"notification_channels": {}, # Named channels with type and credentials "notification_channels": {}, # Named channels with type and credentials
"default_notification_channels": [], # Default channels if host doesn't specify "default_notification_channels": [], # Default channels if host doesn't specify
@@ -69,6 +67,38 @@ SERVER_DEFAULTS = {
"thresholds": {}, "thresholds": {},
} }
THRESHOLD_DEFAULTS = {
'thresholds': {
'cpu_monitor': {
'cpu_percent': {
'warning': 80.0,
'critical': 90.0
}
},
'memory_monitor': {
'percent': {
'warning': 85.0,
'critical': 95.0
}
},
'disk_monitor': {
'partitions': {
'/': {
'percent': {
'warning': 85.0,
'critical': 90.0
}
}
}
},
'rtt': {
'warning': 200,
'critical': 250.0,
'count': 3 # Optional: number of consecutive breaches before alerting
}
}
}
def load_config(path=None): def load_config(path=None):
"""Load configuration from a YAML file and merge with server defaults. """Load configuration from a YAML file and merge with server defaults.
+1 -2
View File
@@ -851,8 +851,7 @@ async def start(
site = web.TCPSite(runner, host, port) site = web.TCPSite(runner, host, port)
await site.start() await site.start()
if verbose: logger.info(f"HTTP server started on {host}:{port}")
print(f"HTTP server started on {host}:{port}")
try: try:
await asyncio.Future() await asyncio.Future()
+32 -7
View File
@@ -27,6 +27,7 @@ def save_state(config, hbdclass):
"""Save current state to pickle file. Safe to call at any time.""" """Save current state to pickle file. Safe to call at any time."""
import pickle import pickle
import os import os
from . import users as users_mod
# Clear timer references before pickling (they can't be serialized) # Clear timer references before pickling (they can't be serialized)
for hostname, host in list(hbdclass.Host.hosts.items()): for hostname, host in list(hbdclass.Host.hosts.items()):
@@ -48,6 +49,7 @@ def save_state(config, hbdclass):
pick = pickle.Pickler(pickf) pick = pickle.Pickler(pickf)
pick.dump(hbdclass.Host.hosts) pick.dump(hbdclass.Host.hosts)
pick.dump(data.msgs) pick.dump(data.msgs)
pick.dump(users_mod.save_sessions())
os.replace(tmpfile, pickfile) os.replace(tmpfile, pickfile)
except Exception as e: except Exception as e:
logger.error("Failed to save state: %s", e) logger.error("Failed to save state: %s", e)
@@ -89,9 +91,13 @@ async def reload_configuration(config_obj, config_path, components):
# Reload users # Reload users
users_mod.load_users(new_config) users_mod.load_users(new_config)
# Re-apply host access from updated config to all known hosts # Re-apply host attributes from updated config to all known hosts
from . import config as config_mod from . import config as config_mod
dyndnshosts = config_mod.get_dyndnshosts(new_config)
watchhosts = config_mod.get_watchhosts(new_config)
for hostname, host in hbdclass.Host.hosts.items(): for hostname, host in hbdclass.Host.hosts.items():
host.dyn = hostname in dyndnshosts
host.watched = hostname in watchhosts
access = config_mod.get_host_access(new_config, hostname) access = config_mod.get_host_access(new_config, hostname)
host.apply_access(access["owner"], access["managers"], access["monitors"]) host.apply_access(access["owner"], access["managers"], access["monitors"])
@@ -126,6 +132,10 @@ async def reload_configuration(config_obj, config_path, components):
async def _run_async(config, config_path=None): async def _run_async(config, config_path=None):
from .config import ReloadableConfig
if not isinstance(config, ReloadableConfig):
config = ReloadableConfig(config, config_path)
loop = asyncio.get_running_loop() loop = asyncio.get_running_loop()
shutdown_event = asyncio.Event() shutdown_event = asyncio.Event()
reload_event = asyncio.Event() reload_event = asyncio.Event()
@@ -187,14 +197,14 @@ async def _run_async(config, config_path=None):
sock.bind(bind_addr) sock.bind(bind_addr)
logger.info("Starting UDP server on %s:%s", *bind_addr) logger.info("Starting UDP server on %s:%s", *bind_addr)
# Try to enable kernel receive timestamps (Linux SO_TIMESTAMPNS). # Try to enable kernel receive timestamps (Linux SO_TIMESTAMP).
# If supported, read datagrams via recvmsg() so RTT uses the kernel # If supported, read datagrams via recvmsg() so RTT uses the kernel
# timestamp rather than the time.time() call after asyncio scheduling. # timestamp rather than the time.time() call after asyncio scheduling.
use_kernel_ts = udp.enable_kernel_timestamps(sock) use_kernel_ts = udp.enable_kernel_timestamps(sock)
if use_kernel_ts: if use_kernel_ts:
logger.info("SO_TIMESTAMPNS enabled: using kernel receive timestamps for RTT") logger.info("SO_TIMESTAMP enabled: using kernel receive timestamps for RTT")
else: else:
logger.info("SO_TIMESTAMPNS not available: using time.time() for RTT") logger.info("SO_TIMESTAMP not available: using time.time() for RTT")
def udp_handler(msg, addr, transport, recv_ts=None): def udp_handler(msg, addr, transport, recv_ts=None):
ctx = dict( ctx = dict(
@@ -416,6 +426,13 @@ async def _run_async(config, config_path=None):
except Exception as e: except Exception as e:
logger.warning("Error stopping DNS worker: %s", e) logger.warning("Error stopping DNS worker: %s", e)
# Save state (hosts + sessions) on clean shutdown
try:
save_state(config, hbdclass)
logger.info("State saved on shutdown")
except Exception as e:
logger.warning("Error saving state on shutdown: %s", e)
logger.info("All tasks cancelled") logger.info("All tasks cancelled")
@@ -424,6 +441,7 @@ def load_pickled_hosts(config, hbdclass):
import os import os
import pickle import pickle
from . import config as config_mod from . import config as config_mod
from . import users as users_mod
pickfile = config.get("pickfile", "hbd.pickle") pickfile = config.get("pickfile", "hbd.pickle")
dyndnshosts = config_mod.get_dyndnshosts(config) dyndnshosts = config_mod.get_dyndnshosts(config)
@@ -437,6 +455,10 @@ def load_pickled_hosts(config, hbdclass):
try: try:
hbdclass.Host.hosts = pick.load() hbdclass.Host.hosts = pick.load()
data.msgs = pick.load() data.msgs = pick.load()
try:
users_mod.load_sessions(pick.load())
except Exception:
pass # older pickle without sessions — fine
pickf.close() pickf.close()
except Exception as e: except Exception as e:
logger.exception("load pickled failed: %s", e) logger.exception("load pickled failed: %s", e)
@@ -471,9 +493,12 @@ def run(config, config_path=None):
""" """
import os import os
logging.basicConfig( log_level = logging.WARNING
level=logging.DEBUG if config.get("debug", 0) > 0 else logging.INFO if config.get("verbose", False):
) log_level = logging.INFO
if config.get("debug", 0) > 0:
log_level = logging.DEBUG
logging.basicConfig(level=log_level)
load_pickled_hosts(config, hbdclass) load_pickled_hosts(config, hbdclass)
notify_mod.initlog(logfile=config.get("logfile", "messages.log")) notify_mod.initlog(logfile=config.get("logfile", "messages.log"))
-2
View File
@@ -252,8 +252,6 @@ def get_settings_sections(config: dict) -> list:
"Path to the pickle file used to persist host state across restarts."), "Path to the pickle file used to persist host state across restarts."),
field("logfile", "Event log", "path", field("logfile", "Event log", "path",
"Path to the event log file."), "Path to the event log file."),
field("logfmt", "Log format", "select",
"Format for event log entries: text, msg, or json."),
], ],
}, },
{ {
+15 -9
View File
@@ -4,17 +4,26 @@
<style> <style>
body { body {
margin: 10px; display: flex;
flex-direction: column;
height: 100vh;
box-sizing: border-box;
padding: 10px;
margin: 0;
background: #f5f5f5; background: #f5f5f5;
overflow: hidden; overflow: hidden;
} }
.container { .container {
flex: 1;
min-height: 0;
max-width: 1600px; max-width: 1600px;
width: 100%;
margin: 0 auto; margin: 0 auto;
max-height: calc(100vh - 120px); display: flex;
overflow-y: auto; flex-direction: column;
padding-right: 10px; gap: 15px;
overflow: hidden;
} }
h1 { h1 {
@@ -53,11 +62,12 @@
} }
.log-section { .log-section {
flex: 1;
min-height: 0;
background: white; background: white;
border-radius: 6px; border-radius: 6px;
padding: 15px; padding: 15px;
box-shadow: 0 1px 4px rgba(0,0,0,0.1); box-shadow: 0 1px 4px rgba(0,0,0,0.1);
max-height: 400px;
overflow-y: auto; overflow-y: auto;
} }
@@ -112,24 +122,20 @@
} }
/* Scrollbar styling */ /* Scrollbar styling */
.container::-webkit-scrollbar,
.log-section::-webkit-scrollbar { .log-section::-webkit-scrollbar {
width: 8px; width: 8px;
} }
.container::-webkit-scrollbar-track,
.log-section::-webkit-scrollbar-track { .log-section::-webkit-scrollbar-track {
background: #f1f1f1; background: #f1f1f1;
border-radius: 4px; border-radius: 4px;
} }
.container::-webkit-scrollbar-thumb,
.log-section::-webkit-scrollbar-thumb { .log-section::-webkit-scrollbar-thumb {
background: #888; background: #888;
border-radius: 4px; border-radius: 4px;
} }
.container::-webkit-scrollbar-thumb:hover,
.log-section::-webkit-scrollbar-thumb:hover { .log-section::-webkit-scrollbar-thumb:hover {
background: #555; background: #555;
} }
+4
View File
@@ -3,6 +3,10 @@
{% include 'head.html' %} {% include 'head.html' %}
<style> <style>
html, body {
overflow: visible;
}
body { body {
margin: 20px; margin: 20px;
background: #f5f5f5; background: #f5f5f5;
+96 -37
View File
@@ -14,6 +14,7 @@ import time
from enum import Enum from enum import Enum
from typing import Dict, Any, Optional, Tuple, Callable from typing import Dict, Any, Optional, Tuple, Callable
from . import notify as notify_mod from . import notify as notify_mod
from .config import THRESHOLD_DEFAULTS
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
eventlog = notify_mod.eventlog eventlog = notify_mod.eventlog
@@ -38,11 +39,11 @@ class ComparisonOperator(Enum):
class AlertState: class AlertState:
"""Represents the current alert state for a specific metric.""" """Represents the current alert state for a specific metric."""
def __init__(self, metric_path: str): def __init__(self, metric_path: str):
""" """
Initialize alert state. Initialize alert state.
Args: Args:
metric_path: Full path to metric (e.g., "cpu_monitor.cpu_percent") metric_path: Full path to metric (e.g., "cpu_monitor.cpu_percent")
""" """
@@ -58,6 +59,7 @@ class AlertState:
self.formatted_message = None # Formatted display message for UI self.formatted_message = None # Formatted display message for UI
self.acknowledged = False # Whether alert has been acknowledged self.acknowledged = False # Whether alert has been acknowledged
self.acknowledged_at = None # Timestamp when acknowledged self.acknowledged_at = None # Timestamp when acknowledged
self.consecutive_count = 0 # Consecutive exceedances while still OK (for count gating)
def update( def update(
self, self,
@@ -118,8 +120,11 @@ class AlertState:
# Helper to sanitize numeric values for JSON (handle inf/nan) # Helper to sanitize numeric values for JSON (handle inf/nan)
def sanitize_value(val): def sanitize_value(val):
if isinstance(val, float) and (math.isinf(val) or math.isnan(val)): if isinstance(val, float):
return None if math.isinf(val):
return "overdue"
if math.isnan(val):
return None
return val return val
result = { result = {
@@ -146,6 +151,12 @@ class AlertState:
return result return result
def __setstate__(self, state):
"""Restore from pickle, backfilling fields added after the pickle was written."""
self.__dict__.update(state)
if not hasattr(self, 'consecutive_count'):
self.consecutive_count = 0
def acknowledge(self): def acknowledge(self):
"""Acknowledge this alert to stop reminder notifications.""" """Acknowledge this alert to stop reminder notifications."""
self.acknowledged = True self.acknowledged = True
@@ -157,7 +168,7 @@ class AlertState:
class ThresholdConfig: class ThresholdConfig:
"""Configuration for a single threshold check.""" """Configuration for a single threshold check."""
def __init__( def __init__(
self, self,
metric_path: str, metric_path: str,
@@ -167,10 +178,11 @@ class ThresholdConfig:
operator: str = ">", operator: str = ">",
hysteresis: float = 0.0, hysteresis: float = 0.0,
enabled: bool = True, enabled: bool = True,
count: int = 1,
): ):
""" """
Initialize threshold configuration. Initialize threshold configuration.
Args: Args:
metric_path: Full path to metric (e.g., "cpu_monitor.cpu_percent") metric_path: Full path to metric (e.g., "cpu_monitor.cpu_percent")
warning: Warning threshold value warning: Warning threshold value
@@ -178,6 +190,7 @@ class ThresholdConfig:
operator: Comparison operator (>, >=, <, <=, ==, !=) operator: Comparison operator (>, >=, <, <=, ==, !=)
hysteresis: Hysteresis percentage to prevent flapping (0.0-1.0) hysteresis: Hysteresis percentage to prevent flapping (0.0-1.0)
enabled: Whether this threshold is enabled enabled: Whether this threshold is enabled
count: Number of consecutive exceedances required before alerting (default 1)
""" """
self.metric_path = metric_path self.metric_path = metric_path
self.warning = warning self.warning = warning
@@ -185,6 +198,7 @@ class ThresholdConfig:
self.enabled = enabled self.enabled = enabled
self.hysteresis = hysteresis self.hysteresis = hysteresis
self.display = display self.display = display
self.count = max(1, int(count))
# Parse operator # Parse operator
try: try:
@@ -386,29 +400,49 @@ class ThresholdChecker:
def _parse_multi_config(self, config: Dict[str, Any]): def _parse_multi_config(self, config: Dict[str, Any]):
"""Parse multiple named threshold configurations.""" """Parse multiple named threshold configurations."""
threshold_configs = config.get("threshold_configs", {}) threshold_configs = config.get("threshold_configs", {})
if not threshold_configs: if not threshold_configs:
logger.info("No threshold configurations defined") logger.info("No threshold configurations defined")
return return
# Parse each named configuration # Build effective_defaults: THRESHOLD_DEFAULTS merged with the 'default' config (if present).
# All other configs inherit any metric not explicitly defined from effective_defaults.
effective_defaults: Dict[str, ThresholdConfig] = {}
for plugin_name, plugin_thresholds in THRESHOLD_DEFAULTS.get("thresholds", {}).items():
if isinstance(plugin_thresholds, dict):
self._parse_plugin_thresholds(plugin_name, plugin_thresholds, target_dict=effective_defaults)
if "default" in threshold_configs:
default_data = threshold_configs["default"]
if isinstance(default_data, dict) and "thresholds" in default_data:
for plugin_name, plugin_thresholds in default_data["thresholds"].items():
if isinstance(plugin_thresholds, dict):
self._parse_plugin_thresholds(plugin_name, plugin_thresholds, target_dict=effective_defaults)
self.threshold_configs["default"] = dict(effective_defaults)
logger.info("Registered 'default' threshold config with %d metrics", len(effective_defaults))
# Parse each named configuration, seeding it with effective_defaults first
for config_name, config_data in threshold_configs.items(): for config_name, config_data in threshold_configs.items():
if config_name == "default":
continue # already handled above
if not isinstance(config_data, dict): if not isinstance(config_data, dict):
logger.warning("Invalid threshold config '%s', skipping", config_name) logger.warning("Invalid threshold config '%s', skipping", config_name)
continue continue
if "thresholds" not in config_data: if "thresholds" not in config_data:
logger.warning("No thresholds in config '%s', skipping", config_name) logger.warning("No thresholds in config '%s', skipping", config_name)
continue continue
logger.info("Parsing threshold configuration: %s", config_name) logger.info("Parsing threshold configuration: %s", config_name)
self.threshold_configs[config_name] = {} self.threshold_configs[config_name] = dict(effective_defaults)
thresholds_config = config_data["thresholds"] thresholds_config = config_data["thresholds"]
for plugin_name, plugin_thresholds in thresholds_config.items(): for plugin_name, plugin_thresholds in thresholds_config.items():
if not isinstance(plugin_thresholds, dict): if not isinstance(plugin_thresholds, dict):
continue continue
self._parse_plugin_thresholds( self._parse_plugin_thresholds(
plugin_name, plugin_name,
plugin_thresholds, plugin_thresholds,
@@ -600,11 +634,12 @@ class ThresholdChecker:
hysteresis = rtt_thresholds.get("hysteresis", 0.1) # 10% default hysteresis = rtt_thresholds.get("hysteresis", 0.1) # 10% default
enabled = rtt_thresholds.get("enabled", True) enabled = rtt_thresholds.get("enabled", True)
display = rtt_thresholds.get("display") display = rtt_thresholds.get("display")
count = rtt_thresholds.get("count", 1)
if warning is None and critical is None: if warning is None and critical is None:
logger.warning("No RTT thresholds defined, skipping") logger.warning("No RTT thresholds defined, skipping")
return return
threshold = ThresholdConfig( threshold = ThresholdConfig(
metric_path=metric_path, metric_path=metric_path,
warning=warning, warning=warning,
@@ -612,14 +647,16 @@ class ThresholdChecker:
operator=operator, operator=operator,
hysteresis=hysteresis, hysteresis=hysteresis,
enabled=enabled, enabled=enabled,
display=display display=display,
count=count,
) )
target_dict[metric_path] = threshold target_dict[metric_path] = threshold
logger.debug( logger.debug(
"Registered RTT threshold: warn=%s ms, crit=%s ms", "Registered RTT threshold: warn=%s ms, crit=%s ms, count=%d",
warning, warning,
critical critical,
count,
) )
def get_thresholds_for_host(self, host_name: str) -> Dict[str, ThresholdConfig]: def get_thresholds_for_host(self, host_name: str) -> Dict[str, ThresholdConfig]:
@@ -691,14 +728,34 @@ class ThresholdChecker:
value, value,
alert_state.level alert_state.level
) )
# Apply consecutive-count gating: when currently OK, require threshold.count
# consecutive exceedances before escalating to WARNING/CRITICAL.
if new_level == AlertLevel.OK:
# Value is fine (or recovered) — reset the pending counter immediately.
alert_state.consecutive_count = 0
elif alert_state.level == AlertLevel.OK and new_level != AlertLevel.OK:
# First time we exceed while still OK: count up.
alert_state.consecutive_count += 1
if alert_state.consecutive_count < threshold.count:
logger.debug(
"RTT threshold exceeded %d/%d consecutive times for %s on %s",
alert_state.consecutive_count,
threshold.count,
metric_path,
host_name,
)
return None
# Count reached — fire the alert and reset the counter.
alert_state.consecutive_count = 0
# Determine which threshold was exceeded # Determine which threshold was exceeded
threshold_value = None threshold_value = None
if new_level == AlertLevel.CRITICAL and threshold.critical is not None: if new_level == AlertLevel.CRITICAL and threshold.critical is not None:
threshold_value = threshold.critical threshold_value = threshold.critical
elif new_level == AlertLevel.WARNING and threshold.warning is not None: elif new_level == AlertLevel.WARNING and threshold.warning is not None:
threshold_value = threshold.warning threshold_value = threshold.warning
# Update state and check for changes # Update state and check for changes
old_level = alert_state.level old_level = alert_state.level
if alert_state.update(new_level, value, threshold_value, threshold.operator.value): if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
@@ -711,7 +768,7 @@ class ThresholdChecker:
elif new_level != AlertLevel.OK: elif new_level != AlertLevel.OK:
# Check if we should re-notify # Check if we should re-notify
self._check_renotify(host_name, alert_state, metric_path, value, threshold, None) self._check_renotify(host_name, alert_state, metric_path, value, threshold, None)
return None return None
def check_plugin_data( def check_plugin_data(
self, self,
@@ -884,48 +941,50 @@ class ThresholdChecker:
# Format operator symbol # Format operator symbol
op_symbol = threshold.operator.value op_symbol = threshold.operator.value
# Use a display-friendly value (inf is the sentinel for "overdue")
import math
display_value = "overdue" if isinstance(value, float) and math.isinf(value) else value
# Format message # Format message
if new_level == AlertLevel.OK: if new_level == AlertLevel.OK:
lvl = "RECOVERED" lvl = "RECOVERED"
message = f"{metric_path} = {value} ({old_level.name} -> OK)" message = f"{metric_path} = {display_value} ({old_level.name} -> OK)"
elif new_level == AlertLevel.WARNING: elif new_level == AlertLevel.WARNING:
lvl = "WARNING" lvl = "WARNING"
if threshold_value is not None: if threshold_value is not None:
# Use display format string
threshold_info = self._format_display( threshold_info = self._format_display(
threshold.display, threshold.display,
value=value, value=display_value,
threshold_value=threshold_value, threshold_value=threshold_value,
op_symbol=op_symbol, op_symbol=op_symbol,
plugin_data=plugin_data plugin_data=plugin_data
) )
message = f"{metric_path} = {value} {threshold_info}" message = f"{metric_path} = {display_value} {threshold_info}"
else: else:
message = f"{metric_path} = {value}" message = f"{metric_path} = {display_value}"
elif new_level == AlertLevel.CRITICAL: elif new_level == AlertLevel.CRITICAL:
lvl = "CRITICAL" lvl = "CRITICAL"
if threshold_value is not None: if threshold_value is not None:
# Use display format string
threshold_info = self._format_display( threshold_info = self._format_display(
threshold.display, threshold.display,
value=value, value=display_value,
threshold_value=threshold_value, threshold_value=threshold_value,
op_symbol=op_symbol, op_symbol=op_symbol,
plugin_data=plugin_data plugin_data=plugin_data
) )
message = f"{metric_path} = {value} {threshold_info}" message = f"{metric_path} = {display_value} {threshold_info}"
else: else:
message = f"{metric_path} = {value}" message = f"{metric_path} = {display_value}"
else: else:
lvl = "UNKNOWN" lvl = "UNKNOWN"
message = f"{metric_path} = {value}" message = f"{metric_path} = {display_value}"
# Return the formatted threshold info for storing in AlertState # Return the formatted threshold info for storing in AlertState
formatted_threshold_msg = None formatted_threshold_msg = None
if threshold_value is not None and new_level != AlertLevel.OK: if threshold_value is not None and new_level != AlertLevel.OK:
formatted_threshold_msg = self._format_display( formatted_threshold_msg = self._format_display(
threshold.display, threshold.display,
value=value, value=display_value,
threshold_value=threshold_value, threshold_value=threshold_value,
op_symbol=op_symbol, op_symbol=op_symbol,
plugin_data=plugin_data plugin_data=plugin_data
@@ -1037,9 +1096,9 @@ class ThresholdChecker:
threshold: Threshold configuration threshold: Threshold configuration
plugin_data: Optional dictionary of all plugin data fields plugin_data: Optional dictionary of all plugin data fields
""" """
if alert_state.level == AlertLevel.OK: if alert_state.level != AlertLevel.CRITICAL:
return return
# Skip reminders if alert has been acknowledged # Skip reminders if alert has been acknowledged
if alert_state.acknowledged: if alert_state.acknowledged:
return return
+31 -13
View File
@@ -7,6 +7,8 @@ import time
import zlib import zlib
import logging import logging
from platform import system as platform_system
from ..common.proto import stodict, oldmtodict from ..common.proto import stodict, oldmtodict
from ..common.utils import dur from ..common.utils import dur
from . import notify as notify_mod from . import notify as notify_mod
@@ -16,9 +18,18 @@ eventlog = notify_mod.eventlog
# SO_TIMESTAMP: kernel attaches a struct timeval to each received datagram. # SO_TIMESTAMP: kernel attaches a struct timeval to each received datagram.
# Supported on Linux, FreeBSD, and macOS. The constant is not exposed by # Supported on Linux, FreeBSD, and macOS. The constant is not exposed by
# Python's socket module on all platforms, so fall back to the Linux value (29) # Python's socket module on all platforms
# when absent. platform = platform_system()
_SO_TIMESTAMP = getattr(socket, 'SO_TIMESTAMP', 29) if platform == "Darwin":
_SO_TIMESTAMP = 1024 # SO_TIMESTAMP on macOS (not in Python's socket module)
elif platform == "Linux":
_SO_TIMESTAMP = 29 # Linux value (not in older Python versions)
elif platform == "FreeBSD":
_SO_TIMESTAMP = 32 # FreeBSD value (not in older Python versions)
else:
logger.warning("SO_TIMESTAMP may not be supported on this platform (%s)", platform)
_SO_TIMESTAMP = None
# struct timeval uses two native C longs: tv_sec and tv_usec # struct timeval uses two native C longs: tv_sec and tv_usec
_TIMEVAL = struct.Struct('@ll') _TIMEVAL = struct.Struct('@ll')
@@ -222,11 +233,15 @@ def restore_connection_timers(hbdclass, ctx):
if state == hbdclass.Connection.UP and interval > 0: if state == hbdclass.Connection.UP and interval > 0:
elapsed = now - conn.lastbeat elapsed = now - conn.lastbeat
remaining = max(1.0, (interval + grace) - elapsed) # Give hosts one full (interval + grace) of extra time on startup
# so hosts that were silent while hbd was down are not immediately
# flagged as overdue before they have a chance to check in.
startup_grace = interval + grace
remaining = max(startup_grace, 2 * startup_grace - elapsed)
conn.reset_overdue_timer(remaining, on_overdue) conn.reset_overdue_timer(remaining, on_overdue)
logger.debug( logger.debug(
"Restored UP timer %s/%s: %.0fs remaining (elapsed %.0fs)", "Restored UP timer %s/%s: %.0fs remaining (elapsed %.0fs, startup grace %.0fs)",
uname, afam, remaining, elapsed, uname, afam, remaining, elapsed, startup_grace,
) )
restored += 1 restored += 1
@@ -397,13 +412,16 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
if conn.getstate() != hbdcls.Connection.UP: if conn.getstate() != hbdcls.Connection.UP:
lasts = conn.state lasts = conn.state
d = conn.newstate(hbdcls.Connection.UP, now) d = conn.newstate(hbdcls.Connection.UP, now)
if d == 0 or lasts == "unknown": # Don't log/notify RECOVER for a brand-new host seen for the first time —
m = "%s is up" % (conn.afam) # it was never down, it just hasn't been seen before.
else: if not newh:
m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d)) if d == 0 or lasts == "unknown":
eventlog(uname, "RECOVER", m) m = "%s is up" % (conn.afam)
if uname in watchhosts: else:
notify_mod.pushmsg_for_host(uname, "%s %s is back" % (uname, conn.afam)) m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d))
eventlog(uname, "RECOVER", m)
if uname in watchhosts:
notify_mod.pushmsg_for_host(uname, "%s %s is back" % (uname, conn.afam))
if boot or newh: if boot or newh:
host.upcount = host.doesack host.upcount = host.doesack
+14
View File
@@ -226,3 +226,17 @@ def _purge_expired_sessions() -> None:
expired = [t for t, s in list(_sessions.items()) if s["expires"] < now] expired = [t for t, s in list(_sessions.items()) if s["expires"] < now]
for t in expired: for t in expired:
del _sessions[t] del _sessions[t]
def save_sessions() -> dict:
"""Return a snapshot of non-expired sessions suitable for pickling."""
_purge_expired_sessions()
return dict(_sessions)
def load_sessions(snapshot: dict) -> None:
"""Restore sessions from a pickled snapshot, dropping any that have expired."""
global _sessions
now = time.time()
_sessions = {t: s for t, s in snapshot.items() if s.get("expires", 0) > now}
logger.debug("Restored %d session(s) from pickle", len(_sessions))
+1 -1
View File
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project] [project]
name = "hbd" name = "hbd"
version = "5.0.12" version = "5.1.0"
description = "Heartbeat monitoring system — client (hbc) and server (hbd)" description = "Heartbeat monitoring system — client (hbc) and server (hbd)"
readme = "README.md" readme = "README.md"
requires-python = ">=3.11" requires-python = ">=3.11"