Compare commits
51 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| f3d08d1c9e | |||
| 1e4263b793 | |||
| e931acb9f5 | |||
| 018409e71d | |||
| 1824f637b4 | |||
| a534c06b26 | |||
| d7b5c97a4e | |||
| ae447ac4a6 | |||
| d44ce3d124 | |||
| b1985d0eb2 | |||
| de778f680f | |||
| d7b368c7c6 | |||
| e790663f9f | |||
| 475319e248 | |||
| ca5ef384a8 | |||
| c93dbdc0f4 | |||
| 3a546a1e5c | |||
| 74c89d098c | |||
| 3301dbfe34 | |||
| d00d903e7d | |||
| babb5d61aa | |||
| 11d1c718b3 | |||
| a99b6b54c7 | |||
| 8da3d550eb | |||
| a76d0fc840 | |||
| 94cbb31c48 | |||
| ae60844a8a | |||
| 49fa310361 | |||
| 28e2180f7b | |||
| ce0590f015 | |||
| f50acca509 | |||
| 72fc82b91f | |||
| 46f8c32c0b | |||
| 691f62aa69 | |||
| cffc9805f9 | |||
| 917d6a401b | |||
| 2bd3a9beb6 | |||
| 5523c60866 | |||
| ab37ac7194 | |||
| f811a19d80 | |||
| 6239825f43 | |||
| b56245bb23 | |||
| 331c4e804d | |||
| 9fd945a481 | |||
| 26df08eeff | |||
| 5819dd6b25 | |||
| 6fb67f8615 | |||
| e70ae6f176 | |||
| a77f6d380c | |||
| 6aae2a1dab | |||
| 85ee0e1040 |
@@ -27,6 +27,7 @@ A lightweight daemon that listens for UDP heartbeat messages and acts on them: k
|
||||
- Configurable retention and backup management
|
||||
- **Plugin system for extensible monitoring** ✅
|
||||
- Collect system metrics (CPU, memory, disk, network)
|
||||
- Monitor ZFS pool health, capacity, and I/O via `zpool(8)`
|
||||
- Execute existing Nagios monitoring plugins
|
||||
- Create custom plugins with simple Python classes
|
||||
- **Threshold alerting system** ✅
|
||||
@@ -34,6 +35,8 @@ A lightweight daemon that listens for UDP heartbeat messages and acts on them: k
|
||||
- Hysteresis to prevent alert flapping
|
||||
- Automatic notifications on state changes
|
||||
- Re-notification for ongoing alerts
|
||||
- **Per-host watch flag** — set `watch: false` on any host to silence all notifications for that host without removing its configuration ✅
|
||||
- **Role-filtered dashboards** — Live Dashboard and Host Overview show only hosts where the logged-in user is owner or manager (admins see all) ✅
|
||||
- Modular codebase suitable for unit testing and CI ✅
|
||||
|
||||
---
|
||||
@@ -55,21 +58,26 @@ Heartbeat includes a comprehensive plugin architecture that extends monitoring b
|
||||
### Built-in Plugins
|
||||
|
||||
- `os_info`: Collects OS, kernel, distribution, and architecture information
|
||||
- `cpu_monitor`: Monitors CPU usage, load average, frequency, and process counts
|
||||
- `memory_monitor`: Monitors RAM and swap usage, available memory
|
||||
- `cpu_monitor`: Monitors CPU usage, load average, frequency, process counts, and uptime
|
||||
- `memory_monitor`: Monitors RAM and swap usage, available memory (ZFS ARC-aware)
|
||||
- `disk_monitor`: Monitors disk usage, I/O statistics, and filesystem metrics
|
||||
- `network_monitor`: Monitors network interface statistics, bandwidth, and connections
|
||||
- `ping_monitor`: Measures round-trip latency to configured hosts
|
||||
- `filesystem_info`: Collects mounted filesystem information (physical filesystems only by default)
|
||||
- `nagios_runner`: Executes Nagios monitoring plugins (check_disk, check_load, check_http, etc.)
|
||||
- `zfs_monitor`: Monitors ZFS pool health, capacity, fragmentation, dedup ratio, and cumulative I/O via `zpool(8)`
|
||||
|
||||
### Nagios Integration
|
||||
|
||||
The `nagios_runner` plugin provides seamless integration with the vast Nagios plugin ecosystem. You can run any Nagios-compatible plugin and have the results automatically parsed and stored:
|
||||
|
||||
- Executes plugins via subprocess with timeout protection
|
||||
- Executes plugins asynchronously (non-blocking) with timeout protection
|
||||
- Captures both stdout and stderr; if stdout is empty, stderr is used as the status message
|
||||
- Handles signal-killed processes (negative exit code → UNKNOWN status)
|
||||
- Validates absolute command paths at startup and warns on missing or non-executable files
|
||||
- Parses exit codes (OK/WARNING/CRITICAL/UNKNOWN)
|
||||
- Extracts performance data with thresholds
|
||||
- Reports aggregated status across all configured checks
|
||||
- Reports per-check status, exit code, and output; no aggregate rollup field
|
||||
|
||||
See [docs/NAGIOS_INTEGRATION.md](docs/NAGIOS_INTEGRATION.md) for complete integration guide including configuration examples and custom plugin development.
|
||||
|
||||
@@ -147,9 +155,11 @@ Heartbeat includes a sophisticated threshold alerting system that monitors plugi
|
||||
- **Multi-level alerts**: WARNING and CRITICAL severity levels
|
||||
- **Flexible operators**: Support for >, >=, <, <=, ==, != comparisons
|
||||
- **Hysteresis**: Prevents alert flapping with configurable recovery thresholds
|
||||
- **Smart notifications**: Alerts only on state changes, not every check
|
||||
- **Smart notifications**: Alerts only on state changes, not every check; de-escalations (e.g. CRITICAL → WARNING) do not generate a notification
|
||||
- **Re-notifications**: Periodic reminders for ongoing alerts
|
||||
- **Short-duration suppression**: Recovery notifications are suppressed for down events under 4 seconds (avoids noise from transient blips)
|
||||
- **Journal integration**: All threshold events logged for audit trail
|
||||
- **`ping_monitor` thresholds**: Latency and packet-loss thresholds use the same format as all other plugin metrics
|
||||
|
||||
### Configuration
|
||||
|
||||
@@ -172,7 +182,8 @@ thresholds:
|
||||
warning: 80.0 # Warn when CPU > 80%
|
||||
critical: 90.0 # Critical when CPU > 90%
|
||||
operator: ">"
|
||||
hysteresis: 0.1 # 10% hysteresis to prevent flapping
|
||||
hysteresis: 0.02 # 2% hysteresis to prevent flapping
|
||||
display: "(threshold: {op_symbol} {threshold_value}%)" # optional
|
||||
|
||||
memory_monitor:
|
||||
percent:
|
||||
@@ -214,7 +225,7 @@ thresholds:
|
||||
<hostname>:
|
||||
warning: <milliseconds> # Warn when RTT > this value
|
||||
critical: <milliseconds> # Critical when RTT > this value
|
||||
hysteresis: 0.1 # Optional: 10% hysteresis (default)
|
||||
hysteresis: 0.02 # Optional: 2% hysteresis (default)
|
||||
```
|
||||
|
||||
**Example alerts:**
|
||||
@@ -265,7 +276,94 @@ All plugin metrics can be thresholded:
|
||||
- **Memory**: percent, available_mb, swap_percent
|
||||
- **Disk**: Per-partition percent, free_gb, free_mb
|
||||
- **Network**: errors_total, dropped packets, connection counts
|
||||
- **Nagios**: exit_code mapping (0=OK, 1=WARNING, 2=CRITICAL)
|
||||
- **Nagios**: Any field emitted by `nagios_runner` (`<name>_status_code`, `<name>_status`, `<name>_output`, performance data fields)
|
||||
|
||||
### Display Format Templates
|
||||
|
||||
Each threshold entry accepts an optional `display` field — a Python format string shown in notifications and on the Alerts dashboard:
|
||||
|
||||
```yaml
|
||||
nagios_runner:
|
||||
status_code:
|
||||
warning: 1
|
||||
critical: 2
|
||||
operator: ">="
|
||||
display: "{check_name}: exit {value} (expected < {threshold_value})"
|
||||
```
|
||||
|
||||
Available variables:
|
||||
|
||||
| Variable | Description |
|
||||
|---|---|
|
||||
| `{value}` | Current metric value |
|
||||
| `{threshold_value}` | Threshold that was crossed |
|
||||
| `{op_symbol}` | Comparison operator (`>`, `<`, `>=`, …); `"nagios"` for the nagios operator |
|
||||
| `{check_name}` | Prefix stripped by generic matching (see below) |
|
||||
| `{metric_name}` | Full field name within the plugin data |
|
||||
| `{output}` | For `nagios_runner` generic matches: the matched check's status text (alias for `{check_name}_output`) |
|
||||
| `{status}` | For `nagios_runner` generic matches: the matched check's status name — OK/WARNING/CRITICAL/UNKNOWN (alias for `{check_name}_status`) |
|
||||
| any plugin field | Any other field present in the plugin's data |
|
||||
|
||||
### Generic Threshold Matching
|
||||
|
||||
When a metric name has no exact threshold entry, the server progressively strips leading underscore-separated segments and re-tries the lookup. This lets a single generic entry cover an entire family of metrics.
|
||||
|
||||
The classic use case is `nagios_runner`, which names each metric after the command that produced it:
|
||||
|
||||
```
|
||||
nagios_runner.check_disk_root_status_code → no exact match
|
||||
nagios_runner.disk_root_status_code → no match
|
||||
nagios_runner.root_status_code → no match
|
||||
nagios_runner.status_code → matched ✓
|
||||
```
|
||||
|
||||
Configure the generic threshold once using the `nagios` operator, which maps exit codes directly to alert severity without requiring numeric warning/critical values:
|
||||
|
||||
```yaml
|
||||
nagios_runner:
|
||||
status_code:
|
||||
operator: "nagios" # 0=OK 1=WARNING 2=CRITICAL 3=UNKNOWN
|
||||
display: "{check_name}: {output}"
|
||||
```
|
||||
|
||||
The stripped prefix (`check_disk_root` in the example above) is available as `{check_name}` in the display template, so you can identify which check triggered the alert without writing a separate threshold entry per command.
|
||||
|
||||
Exact matches always take priority. A generic entry only applies when no specific one is defined.
|
||||
|
||||
### Per-Host Threshold Profiles
|
||||
|
||||
Named threshold configurations let different hosts use different limits. A host's `threshold_config` can be a single name or a **list** — lists are applied left-to-right so profiles compose without duplication:
|
||||
|
||||
```yaml
|
||||
threshold_configs:
|
||||
default:
|
||||
thresholds:
|
||||
cpu_monitor:
|
||||
cpu_percent: {warning: 80, critical: 90}
|
||||
memory_monitor:
|
||||
memory_percent: {warning: 85, critical: 95}
|
||||
|
||||
tight_cpu: # override CPU limits only
|
||||
thresholds:
|
||||
cpu_monitor:
|
||||
cpu_percent: {warning: 60, critical: 75}
|
||||
|
||||
db_disk: # add a database partition check
|
||||
thresholds:
|
||||
disk_monitor:
|
||||
partitions:
|
||||
/var/lib/postgresql:
|
||||
percent: {warning: 75, critical: 88}
|
||||
|
||||
hosts:
|
||||
web-01:
|
||||
threshold_config: default # single profile
|
||||
|
||||
db-01:
|
||||
threshold_config: [tight_cpu, db_disk] # layered: CPU override + extra disk check
|
||||
```
|
||||
|
||||
Each named config's overrides are applied in order on top of the defaults. Metrics not mentioned in a profile are inherited unchanged.
|
||||
|
||||
See [docs/THRESHOLD_ALERTING.md](docs/THRESHOLD_ALERTING.md) for comprehensive documentation including best practices, troubleshooting, and advanced configuration.
|
||||
|
||||
@@ -328,9 +426,10 @@ Heartbeat includes a built-in HTTP/WebSocket server that provides both a REST AP
|
||||
### Web Dashboards
|
||||
|
||||
- **Login** (`/login`): Browser login form (shown automatically when auth is configured)
|
||||
- **Live View** (`/live`): Real-time host connectivity, latency, and messages
|
||||
- **Plugin Metrics** (`/plugins`): Browse and visualize metrics from all plugins
|
||||
- **Alerts Dashboard** (`/alerts`): Monitor active alerts with severity filtering
|
||||
- **Live View** (`/live`): Real-time host connectivity, latency, and messages; hostnames link directly to the Host Overview page
|
||||
- **Host Overview** (`/plugins/<host>`): Per-host plugin metrics with ZFS pool visualization; filtered to hosts where the logged-in user is owner or manager (admins see all)
|
||||
- **Alerts Dashboard** (`/alerts`): Monitor active alerts with severity filtering; alert count pie chart shown in the navigation bar
|
||||
- **Settings** (`/settings`): Server configuration, user management, and threshold configuration viewer
|
||||
|
||||
### API Endpoints
|
||||
|
||||
@@ -416,12 +515,11 @@ You can also run it via the module entrypoint:
|
||||
python -m hbd.client.main your-server.example.com
|
||||
```
|
||||
|
||||
Client configuration can also be specified in YAML:
|
||||
Client configuration can also be specified in YAML (`~/.hbc.yaml`):
|
||||
|
||||
```yaml
|
||||
server: hbd.example.com
|
||||
port: 50003
|
||||
interval: 30
|
||||
hb_port: 50003 # Server port (default: 50003)
|
||||
interval: 30 # Heartbeat interval in seconds
|
||||
plugins:
|
||||
cpu_monitor:
|
||||
interval: 300 # Check every 5 minutes (default)
|
||||
@@ -435,12 +533,20 @@ plugins:
|
||||
nagios_runner:
|
||||
interval: 300 # Check every 5 minutes (default)
|
||||
commands:
|
||||
- /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
|
||||
- /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
|
||||
- name: check_load
|
||||
command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
|
||||
- name: check_disk
|
||||
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
|
||||
```
|
||||
|
||||
The server hostname is always passed as a positional command-line argument; there is no `server:` config key.
|
||||
|
||||
All monitoring plugins default to 5-minute (300 second) intervals, but can be customized as needed.
|
||||
|
||||
**Connection retry:** If a server is temporarily unreachable, `hbc` retries `open()` indefinitely on every heartbeat interval. IPv6 connections that never succeeded during early startup are dropped after 3 consecutive failures (to handle hosts without IPv6 routing), while IPv4 connections always retry.
|
||||
|
||||
**Daemon logging:** When running with `-d`, `hbc` routes all log output to syslog (`LOG_DAEMON` facility) after daemonizing. Without `-d`, logs go to stderr as usual.
|
||||
|
||||
### hbc_mini — single-file client (no external dependencies)
|
||||
|
||||
`scripts/hbc_mini.py` is a self-contained version of the heartbeat client that requires only Python 3.8+ and no external packages. Copy it to any host and run it directly — no virtualenv, no `pip install`.
|
||||
@@ -496,8 +602,10 @@ python3 hbc_mini.py -m "maintenance starting" your-server.example.com
|
||||
|
||||
- No YAML config (use JSON instead)
|
||||
- No `filesystem_info` plugin
|
||||
- No `zfs_monitor` plugin (requires `zpool(8)` and the full plugin loader)
|
||||
- `cpu_monitor` does not report per-core usage or CPU frequency (no psutil)
|
||||
- Plugins cannot be loaded from external `.py` files — all plugins are compiled in
|
||||
- No IPv6 early-fail protection — connections that fail to open at startup are silently skipped rather than retried
|
||||
|
||||
Everything else — heartbeat protocol, ACK/CMD/UPD handling, `hb_install.sh`-based self-update, daemonize, syslog — is identical to the full client.
|
||||
|
||||
|
||||
@@ -104,11 +104,6 @@ The `nagios_runner` plugin collects:
|
||||
- `{name}_{metric}_min` - Minimum value (if present)
|
||||
- `{name}_{metric}_max` - Maximum value (if present)
|
||||
|
||||
**Overall:**
|
||||
- `overall_status` - Worst status from all commands
|
||||
- `overall_status_code` - Worst status code
|
||||
- `plugin_count` - Number of Nagios plugins executed
|
||||
|
||||
## Configuration Options
|
||||
|
||||
```yaml
|
||||
|
||||
+174
-65
@@ -814,34 +814,32 @@ Planned features:
|
||||
|
||||
## Multi-Threshold Configuration
|
||||
|
||||
**New in version 2.0**: Support for multiple named threshold configurations with per-host mapping.
|
||||
Support for multiple named threshold configurations with per-host mapping and composable layering.
|
||||
|
||||
### Overview
|
||||
|
||||
The multi-threshold feature allows you to:
|
||||
- Define multiple sets of threshold configurations
|
||||
- Map different hosts to different threshold sets
|
||||
- Define multiple named threshold configurations
|
||||
- Assign one or more configurations to each host
|
||||
- Compose configurations by layering — each named config's overrides are applied in order on top of the defaults
|
||||
- Use different sensitivity levels for different environments
|
||||
- Maintain a default configuration for unmapped hosts
|
||||
|
||||
### Configuration Structure
|
||||
|
||||
Named configurations are defined under `threshold_configs`. Each host selects which ones to use via `threshold_config` in the `hosts` section (a string for a single config, or a list to layer multiple):
|
||||
|
||||
```yaml
|
||||
# Optional: Set the default configuration name (defaults to "default")
|
||||
# Optional: set the default configuration name (defaults to "default")
|
||||
default_threshold_config: "default"
|
||||
|
||||
# Define multiple named threshold configurations
|
||||
threshold_configs:
|
||||
# Configuration name 1
|
||||
default:
|
||||
thresholds:
|
||||
# Standard threshold definitions
|
||||
cpu_monitor:
|
||||
cpu_percent:
|
||||
warning: 80.0
|
||||
critical: 90.0
|
||||
|
||||
# Configuration name 2
|
||||
high_sensitivity:
|
||||
thresholds:
|
||||
cpu_monitor:
|
||||
@@ -849,7 +847,6 @@ threshold_configs:
|
||||
warning: 60.0
|
||||
critical: 75.0
|
||||
|
||||
# Configuration name 3
|
||||
low_sensitivity:
|
||||
thresholds:
|
||||
cpu_monitor:
|
||||
@@ -857,14 +854,77 @@ threshold_configs:
|
||||
warning: 90.0
|
||||
critical: 95.0
|
||||
|
||||
# Map specific hosts to specific configurations
|
||||
host_threshold_mapping:
|
||||
prod-web-01: high_sensitivity
|
||||
prod-web-02: high_sensitivity
|
||||
dev-server-01: low_sensitivity
|
||||
# Unmapped hosts use default_threshold_config
|
||||
hosts:
|
||||
prod-web-01:
|
||||
threshold_config: high_sensitivity # single config
|
||||
|
||||
dev-server-01:
|
||||
threshold_config: low_sensitivity
|
||||
|
||||
# Hosts with no threshold_config use default_threshold_config
|
||||
```
|
||||
|
||||
### Composable Configurations (list form)
|
||||
|
||||
`threshold_config` can be a list. Configs are applied **left to right**: the defaults are the base, then each named config's overrides are layered on top. Later entries in the list win on any metric they define.
|
||||
|
||||
```yaml
|
||||
threshold_configs:
|
||||
default:
|
||||
thresholds:
|
||||
cpu_monitor:
|
||||
cpu_percent: {warning: 80, critical: 90}
|
||||
memory_monitor:
|
||||
memory_percent: {warning: 85, critical: 95}
|
||||
disk_monitor:
|
||||
partitions:
|
||||
/:
|
||||
percent: {warning: 80, critical: 90}
|
||||
|
||||
# Tighter CPU limits for busy servers
|
||||
high_cpu_load:
|
||||
thresholds:
|
||||
cpu_monitor:
|
||||
cpu_percent: {warning: 60, critical: 75}
|
||||
|
||||
# Tighter disk limits for data-heavy servers
|
||||
busy_disk:
|
||||
thresholds:
|
||||
disk_monitor:
|
||||
partitions:
|
||||
/:
|
||||
percent: {warning: 70, critical: 85}
|
||||
|
||||
hosts:
|
||||
# Gets default thresholds only
|
||||
web-01:
|
||||
threshold_config: default
|
||||
|
||||
# Gets tighter CPU limits, default memory and disk
|
||||
build-server:
|
||||
threshold_config: high_cpu_load
|
||||
|
||||
# Layers both: tighter CPU AND tighter disk, default memory
|
||||
db-01:
|
||||
threshold_config: [high_cpu_load, busy_disk]
|
||||
|
||||
# Three layers: busy_disk overrides high_cpu_load if they conflict
|
||||
storage-01:
|
||||
threshold_config: [default, high_cpu_load, busy_disk]
|
||||
```
|
||||
|
||||
**How layering works:**
|
||||
|
||||
Starting from the `default` thresholds:
|
||||
|
||||
| Layer | Applied config | Effect |
|
||||
|-------|---------------|--------|
|
||||
| Base | `default` | all default thresholds |
|
||||
| +1 | `high_cpu_load` | cpu_percent overridden to 60/75 |
|
||||
| +2 | `busy_disk` | disk percent overridden to 70/85; cpu_percent stays at 60/75 |
|
||||
|
||||
Each named config only overrides the metrics it explicitly defines. Metrics not mentioned in a config inherit from the layers beneath.
|
||||
|
||||
### Use Cases
|
||||
|
||||
#### 1. Environment-Based Thresholds
|
||||
@@ -887,11 +947,15 @@ threshold_configs:
|
||||
warning: 90.0 # More relaxed for dev
|
||||
critical: 98.0
|
||||
|
||||
host_threshold_mapping:
|
||||
prod-web-01: production
|
||||
prod-web-02: production
|
||||
dev-web-01: development
|
||||
dev-web-02: development
|
||||
hosts:
|
||||
prod-web-01:
|
||||
threshold_config: production
|
||||
prod-web-02:
|
||||
threshold_config: production
|
||||
dev-web-01:
|
||||
threshold_config: development
|
||||
dev-web-02:
|
||||
threshold_config: development
|
||||
```
|
||||
|
||||
#### 2. Server Role-Based Thresholds
|
||||
@@ -914,7 +978,7 @@ threshold_configs:
|
||||
warning: 70.0
|
||||
critical: 85.0
|
||||
memory_monitor:
|
||||
percent:
|
||||
memory_percent:
|
||||
warning: 90.0 # Databases can use high memory
|
||||
critical: 97.0
|
||||
disk_monitor:
|
||||
@@ -927,17 +991,23 @@ threshold_configs:
|
||||
cache:
|
||||
thresholds:
|
||||
memory_monitor:
|
||||
percent:
|
||||
memory_percent:
|
||||
warning: 95.0 # Redis/Memcached can use very high memory
|
||||
critical: 99.0
|
||||
|
||||
host_threshold_mapping:
|
||||
web-01: webserver
|
||||
web-02: webserver
|
||||
db-01: database
|
||||
db-02: database
|
||||
redis-01: cache
|
||||
memcached-01: cache
|
||||
hosts:
|
||||
web-01:
|
||||
threshold_config: webserver
|
||||
web-02:
|
||||
threshold_config: webserver
|
||||
db-01:
|
||||
threshold_config: database
|
||||
db-02:
|
||||
threshold_config: database
|
||||
redis-01:
|
||||
threshold_config: cache
|
||||
memcached-01:
|
||||
threshold_config: cache
|
||||
```
|
||||
|
||||
#### 3. Sensitivity Levels
|
||||
@@ -952,7 +1022,7 @@ threshold_configs:
|
||||
partitions:
|
||||
/:
|
||||
percent:
|
||||
warning: 70.0 # Very sensitive
|
||||
warning: 70.0
|
||||
critical: 80.0
|
||||
hysteresis: 0.15
|
||||
|
||||
@@ -976,52 +1046,91 @@ threshold_configs:
|
||||
critical: 98.0
|
||||
hysteresis: 0.05
|
||||
|
||||
host_threshold_mapping:
|
||||
payment-gateway: critical
|
||||
auth-server: critical
|
||||
web-01: standard
|
||||
web-02: standard
|
||||
test-server: relaxed
|
||||
hosts:
|
||||
payment-gateway:
|
||||
threshold_config: critical
|
||||
auth-server:
|
||||
threshold_config: critical
|
||||
web-01:
|
||||
threshold_config: standard
|
||||
web-02:
|
||||
threshold_config: standard
|
||||
test-server:
|
||||
threshold_config: relaxed
|
||||
```
|
||||
|
||||
### Backward Compatibility
|
||||
#### 4. Composable Profiles
|
||||
|
||||
The legacy single threshold configuration is fully supported:
|
||||
Build host-specific thresholds by combining small, focused configs:
|
||||
|
||||
```yaml
|
||||
# Old format - still works
|
||||
thresholds:
|
||||
cpu_monitor:
|
||||
cpu_percent:
|
||||
warning: 80.0
|
||||
critical: 90.0
|
||||
```
|
||||
|
||||
This is equivalent to:
|
||||
|
||||
```yaml
|
||||
# New format
|
||||
threshold_configs:
|
||||
# Baseline — everything at default levels
|
||||
default:
|
||||
thresholds:
|
||||
cpu_monitor:
|
||||
cpu_percent:
|
||||
warning: 80.0
|
||||
critical: 90.0
|
||||
```
|
||||
cpu_percent: {warning: 80, critical: 90}
|
||||
memory_monitor:
|
||||
memory_percent: {warning: 85, critical: 95}
|
||||
|
||||
# Overlay: tighter CPU only
|
||||
tight_cpu:
|
||||
thresholds:
|
||||
cpu_monitor:
|
||||
cpu_percent: {warning: 60, critical: 75}
|
||||
|
||||
# Overlay: tighter memory only
|
||||
tight_memory:
|
||||
thresholds:
|
||||
memory_monitor:
|
||||
memory_percent: {warning: 70, critical: 85}
|
||||
|
||||
# Overlay: extra disk partition for database servers
|
||||
db_disk:
|
||||
thresholds:
|
||||
disk_monitor:
|
||||
partitions:
|
||||
/var/lib/postgresql:
|
||||
percent: {warning: 75, critical: 88}
|
||||
|
||||
hosts:
|
||||
# Plain web server
|
||||
web-01:
|
||||
threshold_config: default
|
||||
|
||||
# Build server: tight CPU, default memory and disk
|
||||
build-01:
|
||||
threshold_config: tight_cpu
|
||||
|
||||
# Database: tight CPU + tight memory + extra disk partition
|
||||
db-01:
|
||||
threshold_config: [tight_cpu, tight_memory, db_disk]
|
||||
|
||||
# Replica database: tight memory + extra disk, normal CPU
|
||||
db-02:
|
||||
threshold_config: [tight_memory, db_disk]
|
||||
```
|
||||
### Configuration Priority
|
||||
|
||||
1. **Host-specific mapping**: If host is in `host_threshold_mapping`, use that config
|
||||
2. **Default config**: Use `default_threshold_config`
|
||||
3. **First alphabetically**: If default not found, use first config alphabetically
|
||||
4. **Legacy fallback**: If `threshold_configs` not present, use `thresholds`
|
||||
1. **Host `threshold_config` (list)**: Layer each named config's overrides left-to-right on top of the defaults
|
||||
2. **Host `threshold_config` (string)**: Use that single named config directly
|
||||
3. **`host_threshold_mapping`** (legacy): Same as above, string only
|
||||
4. **`default_threshold_config`**: Used for hosts with no mapping
|
||||
5. **First alphabetically**: If the default config is not found, use the first config alphabetically
|
||||
6. **Legacy `thresholds` section**: Used when `threshold_configs` is absent entirely
|
||||
|
||||
### Example: Complete Multi-Threshold Setup
|
||||
### Backward Compatibility
|
||||
|
||||
See `hbd/config_multi_threshold_example.yaml` for a complete example with:
|
||||
- 4 named configurations (default, high_sensitivity, low_sensitivity, database)
|
||||
- Host-to-config mappings for production, development, and test systems
|
||||
- Specialized database server thresholds
|
||||
- Custom display messages with plugin data
|
||||
The legacy `host_threshold_mapping` top-level key and the flat `thresholds` section are still fully supported:
|
||||
|
||||
```yaml
|
||||
# Still works — equivalent to hosts: {prod-web-01: {threshold_config: high_sensitivity}}
|
||||
host_threshold_mapping:
|
||||
prod-web-01: high_sensitivity
|
||||
|
||||
# Still works — equivalent to threshold_configs: {default: {thresholds: ...}}
|
||||
thresholds:
|
||||
cpu_monitor:
|
||||
cpu_percent: {warning: 80, critical: 90}
|
||||
```
|
||||
|
||||
|
||||
+1
-1
@@ -14,4 +14,4 @@ Install options:
|
||||
"""
|
||||
|
||||
__all__ = ["__version__"]
|
||||
__version__ = "5.1.8"
|
||||
__version__ = "5.2.1"
|
||||
|
||||
+61
-20
@@ -56,6 +56,8 @@ class AsyncConnection:
|
||||
self.transport: Optional[asyncio.DatagramTransport] = None
|
||||
self.protocol: Optional[asyncio.DatagramProtocol] = None
|
||||
self._dead = False
|
||||
self._ever_opened = False
|
||||
self._open_fail_count = 0 # consecutive failures before first success
|
||||
|
||||
self.logger = logging.getLogger(f"hbc.conn.{addr}")
|
||||
|
||||
@@ -73,6 +75,7 @@ class AsyncConnection:
|
||||
lambda: HeartbeatProtocol(self),
|
||||
family=self.af
|
||||
)
|
||||
self._ever_opened = True
|
||||
self.logger.debug(f"Opened connection to {self.addr}:{self.port}")
|
||||
return True
|
||||
except Exception as e:
|
||||
@@ -262,15 +265,51 @@ async def handle_update(conn: AsyncConnection, _msg: dict): # pyright: ignore[r
|
||||
|
||||
|
||||
async def heartbeat_sender(conn: AsyncConnection, interval: int):
|
||||
"""Send periodic heartbeats.
|
||||
"""Send periodic heartbeats, retrying the connection if it is not open.
|
||||
|
||||
IPv6 connections that fail to open before their first successful send are
|
||||
dropped after IPV6_EARLY_FAIL_LIMIT attempts so that a network without IPv6
|
||||
does not keep a dead sender alive. IPv4 connections are retried indefinitely.
|
||||
|
||||
Args:
|
||||
conn: Connection to send on
|
||||
interval: Heartbeat interval in seconds
|
||||
"""
|
||||
logger = logging.getLogger("hbc.heartbeat")
|
||||
IPV6_EARLY_FAIL_LIMIT = 3
|
||||
|
||||
while running and not conn._dead:
|
||||
# Ensure transport is open before attempting to send.
|
||||
if not conn.transport:
|
||||
opened = await conn.open()
|
||||
if opened:
|
||||
conn._open_fail_count = 0
|
||||
else:
|
||||
conn._open_fail_count += 1
|
||||
# Drop an IPv6 connection that has never come up within the
|
||||
# first few attempts — it is likely unavailable on this network.
|
||||
if (not conn._ever_opened
|
||||
and conn.af == socket.AF_INET6
|
||||
and conn._open_fail_count >= IPV6_EARLY_FAIL_LIMIT):
|
||||
logger.warning(
|
||||
f"IPv6 connection to {conn.addr} unreachable after "
|
||||
f"{conn._open_fail_count} attempts, disabling"
|
||||
)
|
||||
conn._dead = True
|
||||
break
|
||||
# Retry after the normal interval; IPv4 retries forever.
|
||||
try:
|
||||
if shutdown_event:
|
||||
await asyncio.wait_for(shutdown_event.wait(), timeout=interval)
|
||||
break
|
||||
else:
|
||||
await asyncio.sleep(interval)
|
||||
except asyncio.TimeoutError:
|
||||
pass
|
||||
except asyncio.CancelledError:
|
||||
raise
|
||||
continue
|
||||
|
||||
while running:
|
||||
try:
|
||||
msg = {
|
||||
"acks": conn.ackcount,
|
||||
@@ -279,19 +318,16 @@ async def heartbeat_sender(conn: AsyncConnection, interval: int):
|
||||
}
|
||||
await conn.sendto(msg, "HTB")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error sending heartbeat: {e}", exc_info=True)
|
||||
except asyncio.CancelledError:
|
||||
logger.debug("Heartbeat sender cancelled")
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"Error sending heartbeat: {e}", exc_info=True)
|
||||
|
||||
# Wait for next interval or shutdown event
|
||||
try:
|
||||
if shutdown_event:
|
||||
await asyncio.wait_for(
|
||||
shutdown_event.wait(),
|
||||
timeout=interval
|
||||
)
|
||||
await asyncio.wait_for(shutdown_event.wait(), timeout=interval)
|
||||
break
|
||||
else:
|
||||
await asyncio.sleep(interval)
|
||||
@@ -427,16 +463,13 @@ async def cleanup(connections: List[AsyncConnection]):
|
||||
logger = logging.getLogger("hbc.cleanup")
|
||||
logger.info("Cleaning up connections")
|
||||
|
||||
for conn in connections:
|
||||
target = next((c for c in connections if c.transport), connections[0] if connections else None)
|
||||
if target:
|
||||
try:
|
||||
msg = {
|
||||
"shutdown": 1,
|
||||
"acks": conn.ackcount
|
||||
}
|
||||
await conn.sendto(msg)
|
||||
await target.sendto({"shutdown": 1, "acks": target.ackcount})
|
||||
except Exception as e:
|
||||
logger.error(f"Error sending shutdown: {e}")
|
||||
|
||||
for conn in connections:
|
||||
conn.close()
|
||||
|
||||
# Give messages time to send
|
||||
@@ -481,12 +514,13 @@ async def async_main(args, config):
|
||||
addr = addr_info[4][0]
|
||||
|
||||
conn = AsyncConnection(conn_id, addr, hb_port, af, iam)
|
||||
if await conn.open():
|
||||
if not await conn.open():
|
||||
logger.warning(f"Initial open to {addr} failed, heartbeat sender will retry")
|
||||
connections.append(conn)
|
||||
conn_id += 1
|
||||
|
||||
if not connections:
|
||||
logger.error("No connections established")
|
||||
logger.error("No connections established (DNS resolution failed for all hosts)")
|
||||
return 1
|
||||
|
||||
logger.info(f"Created {len(connections)} connections")
|
||||
@@ -501,8 +535,8 @@ async def async_main(args, config):
|
||||
boot_msg["msg"] = args.message
|
||||
|
||||
boot_msg["acks"] = 0
|
||||
for conn in connections:
|
||||
await conn.sendto(boot_msg)
|
||||
target = next((c for c in connections if c.transport), connections[0])
|
||||
await target.sendto(boot_msg)
|
||||
|
||||
if args.message and not args.daemon:
|
||||
# Message-only mode
|
||||
@@ -525,6 +559,13 @@ async def async_main(args, config):
|
||||
for sig in (signal.SIGTERM, signal.SIGINT):
|
||||
loop.add_signal_handler(sig, stop)
|
||||
|
||||
def _sighup():
|
||||
global dorestart
|
||||
dorestart = True
|
||||
stop()
|
||||
|
||||
loop.add_signal_handler(signal.SIGHUP, _sighup)
|
||||
|
||||
# Start async tasks
|
||||
# Heartbeat senders (one per connection)
|
||||
for conn in connections:
|
||||
@@ -695,7 +736,7 @@ def main(argv=None):
|
||||
|
||||
# Daemonize if requested
|
||||
if args.daemon:
|
||||
print("Daemonizing...")
|
||||
logging.info("Daemonizing...")
|
||||
daemonize()
|
||||
_reconfigure_logging_for_daemon(log_level)
|
||||
logging.info(f"hbc starting, sending heartbeat to {', '.join(args.hosts)}")
|
||||
|
||||
@@ -119,6 +119,13 @@ class CPUMonitorPlugin(MonitorPlugin):
|
||||
except Exception as e:
|
||||
self.logger.debug(f"Could not get CPU times: {e}")
|
||||
|
||||
# Uptime in seconds
|
||||
try:
|
||||
import time
|
||||
data["uptime_seconds"] = int(time.time() - self.psutil.boot_time())
|
||||
except Exception as e:
|
||||
self.logger.debug(f"Could not get uptime: {e}")
|
||||
|
||||
self.logger.debug(
|
||||
f"Collected CPU metrics: {data.get('cpu_percent', 'N/A')}% usage"
|
||||
)
|
||||
|
||||
@@ -14,6 +14,24 @@ except ImportError:
|
||||
|
||||
from hbd.client.plugin import MonitorPlugin
|
||||
|
||||
|
||||
def _zfs_arc_bytes() -> int:
|
||||
"""Return current ZFS ARC size in bytes, or 0 if ZFS is not present.
|
||||
|
||||
ZFS ARC is reclaimable but is not included in MemAvailable by the Linux
|
||||
kernel (it is not in SReclaimable), so it would otherwise be counted as
|
||||
used memory.
|
||||
"""
|
||||
try:
|
||||
with open("/proc/spl/kstat/zfs/arcstats") as fh:
|
||||
for line in fh:
|
||||
parts = line.split()
|
||||
if len(parts) >= 3 and parts[0] == "size":
|
||||
return int(parts[2])
|
||||
except (OSError, ValueError):
|
||||
pass
|
||||
return 0
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -101,11 +119,21 @@ class MemoryMonitorPlugin(MonitorPlugin):
|
||||
|
||||
# Virtual (physical) memory statistics
|
||||
vmem = psutil.virtual_memory()
|
||||
|
||||
# psutil's available already excludes page cache / file buffers
|
||||
# (uses MemAvailable on Linux). Add ZFS ARC on top because the kernel
|
||||
# does not include it in SReclaimable / MemAvailable even though it is
|
||||
# reclaimable.
|
||||
arc_bytes = _zfs_arc_bytes()
|
||||
available = min(vmem.available + arc_bytes, vmem.total)
|
||||
used = vmem.total - available
|
||||
percent = round(used / vmem.total * 100, 1) if vmem.total else 0.0
|
||||
|
||||
metrics['memory_total'] = vmem.total
|
||||
metrics['memory_available'] = vmem.available
|
||||
metrics['memory_used'] = vmem.used
|
||||
metrics['memory_available'] = available
|
||||
metrics['memory_used'] = used
|
||||
metrics['memory_free'] = vmem.free
|
||||
metrics['memory_percent'] = vmem.percent
|
||||
metrics['memory_percent'] = percent
|
||||
|
||||
# Platform-specific memory details
|
||||
if hasattr(vmem, 'active'):
|
||||
|
||||
@@ -31,16 +31,13 @@ from hbd.client.plugin import MonitorPlugin
|
||||
|
||||
|
||||
# Nagios exit codes
|
||||
NAGIOS_OK = 0
|
||||
NAGIOS_WARNING = 1
|
||||
NAGIOS_CRITICAL = 2
|
||||
NAGIOS_UNKNOWN = 3
|
||||
|
||||
STATUS_NAMES = {
|
||||
NAGIOS_OK: "OK",
|
||||
NAGIOS_WARNING: "WARNING",
|
||||
NAGIOS_CRITICAL: "CRITICAL",
|
||||
NAGIOS_UNKNOWN: "UNKNOWN"
|
||||
0: "OK",
|
||||
1: "WARNING",
|
||||
2: "CRITICAL",
|
||||
3: "UNKNOWN",
|
||||
}
|
||||
|
||||
|
||||
@@ -129,9 +126,6 @@ class NagiosRunnerPlugin(MonitorPlugin):
|
||||
"""
|
||||
results = {}
|
||||
|
||||
# Track overall status (worst status wins)
|
||||
worst_status = NAGIOS_OK
|
||||
|
||||
for cmd_config in self.commands:
|
||||
name = cmd_config.get("name")
|
||||
command = cmd_config.get("command")
|
||||
@@ -149,10 +143,6 @@ class NagiosRunnerPlugin(MonitorPlugin):
|
||||
results[f"{name}_status_code"] = status_code
|
||||
results[f"{name}_output"] = output
|
||||
|
||||
# Track worst status
|
||||
if status_code > worst_status:
|
||||
worst_status = status_code
|
||||
|
||||
# Parse and add performance data
|
||||
if perfdata:
|
||||
for metric_name, metric_value in perfdata.items():
|
||||
@@ -167,12 +157,6 @@ class NagiosRunnerPlugin(MonitorPlugin):
|
||||
results[f"{name}_status"] = "ERROR"
|
||||
results[f"{name}_status_code"] = NAGIOS_UNKNOWN
|
||||
results[f"{name}_output"] = str(e)
|
||||
worst_status = NAGIOS_UNKNOWN
|
||||
|
||||
# Add overall status
|
||||
results["overall_status"] = STATUS_NAMES.get(worst_status, "UNKNOWN")
|
||||
results["overall_status_code"] = worst_status
|
||||
results["plugin_count"] = len(self.commands)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
@@ -0,0 +1,130 @@
|
||||
"""
|
||||
ZFS pool monitoring plugin for Heartbeat.
|
||||
|
||||
Collects per-pool health, capacity, and cumulative I/O statistics via zpool(8).
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import shutil
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from hbd.client.plugin import MonitorPlugin
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _int(s: str) -> Optional[int]:
|
||||
try:
|
||||
return int(s.strip().rstrip("KMGTkBkmgt%x"))
|
||||
except (ValueError, AttributeError):
|
||||
return None
|
||||
|
||||
|
||||
def _float(s: str) -> Optional[float]:
|
||||
try:
|
||||
return float(s.strip().rstrip("%x"))
|
||||
except (ValueError, AttributeError):
|
||||
return None
|
||||
|
||||
|
||||
class ZFSMonitorPlugin(MonitorPlugin):
|
||||
"""Monitor ZFS pool health, capacity, and I/O statistics.
|
||||
|
||||
Collects per pool:
|
||||
- health: ONLINE, DEGRADED, FAULTED, etc.
|
||||
- size / alloc / free: total, allocated and free bytes
|
||||
- capacity: percentage used (0-100)
|
||||
- frag: fragmentation percentage
|
||||
- dedup: deduplication ratio
|
||||
- read_ops / write_ops: cumulative I/O operations since last boot/clear
|
||||
- read_bw / write_bw: cumulative bytes transferred since last boot/clear
|
||||
|
||||
Configuration:
|
||||
interval: collection interval in seconds (default: 300)
|
||||
pools: list of pool names to monitor (default: all)
|
||||
"""
|
||||
|
||||
name = "zfs_monitor"
|
||||
description = "ZFS pool health, capacity, and I/O statistics"
|
||||
interval = 300
|
||||
|
||||
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
||||
super().__init__(config)
|
||||
self.interval = self.config.get("interval", 300)
|
||||
self._pools_filter: Optional[List[str]] = self.config.get("pools", None)
|
||||
|
||||
async def initialize(self) -> bool:
|
||||
if not shutil.which("zpool"):
|
||||
self.skip_reason = "zpool not found"
|
||||
return False
|
||||
logger.info("ZFS monitor initialized (interval: %ds)", self.interval)
|
||||
return True
|
||||
|
||||
async def _run(self, *args: str) -> List[str]:
|
||||
"""Run a command and return its stdout lines, or [] on error."""
|
||||
try:
|
||||
proc = await asyncio.create_subprocess_exec(
|
||||
*args,
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.DEVNULL,
|
||||
)
|
||||
stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=15)
|
||||
return stdout.decode(errors="replace").splitlines()
|
||||
except (FileNotFoundError, asyncio.TimeoutError) as exc:
|
||||
logger.warning("zfs_monitor: %s: %s", args[0], exc)
|
||||
return []
|
||||
|
||||
async def _zpool_list(self) -> Dict[str, Dict]:
|
||||
"""Return per-pool health and capacity from `zpool list`."""
|
||||
lines = await self._run(
|
||||
"zpool", "list", "-H", "-p",
|
||||
"-o", "name,health,size,alloc,free,cap,frag,dedup",
|
||||
)
|
||||
pools: Dict[str, Dict] = {}
|
||||
for line in lines:
|
||||
parts = line.split("\t")
|
||||
if len(parts) < 8:
|
||||
continue
|
||||
name = parts[0].strip()
|
||||
if self._pools_filter and name not in self._pools_filter:
|
||||
continue
|
||||
pools[name] = {
|
||||
"health": parts[1].strip(),
|
||||
"size": _int(parts[2]),
|
||||
"alloc": _int(parts[3]),
|
||||
"free": _int(parts[4]),
|
||||
"capacity": _float(parts[5]),
|
||||
"frag": _float(parts[6]),
|
||||
"dedup": _float(parts[7]),
|
||||
}
|
||||
return pools
|
||||
|
||||
async def _zpool_iostat(self) -> Dict[str, Dict]:
|
||||
"""Return per-pool cumulative I/O counters from `zpool iostat`."""
|
||||
lines = await self._run("zpool", "iostat", "-H", "-p")
|
||||
io: Dict[str, Dict] = {}
|
||||
for line in lines:
|
||||
parts = line.split("\t")
|
||||
if len(parts) < 7:
|
||||
continue
|
||||
name = parts[0].strip()
|
||||
if not name or name.startswith(" "):
|
||||
continue
|
||||
io[name] = {
|
||||
"read_ops": _int(parts[3]),
|
||||
"write_ops": _int(parts[4]),
|
||||
"read_bw": _int(parts[5]),
|
||||
"write_bw": _int(parts[6]),
|
||||
}
|
||||
return io
|
||||
|
||||
async def _collect_metrics(self) -> Dict[str, Any]:
|
||||
pools, io = await asyncio.gather(self._zpool_list(), self._zpool_iostat())
|
||||
for name, stats in io.items():
|
||||
if name in pools:
|
||||
pools[name].update(stats)
|
||||
return {"pools": pools}
|
||||
|
||||
|
||||
plugin = ZFSMonitorPlugin
|
||||
@@ -95,6 +95,12 @@ THRESHOLD_DEFAULTS = {
|
||||
'warning': 200,
|
||||
'critical': 250.0,
|
||||
'count': 3 # Optional: number of consecutive breaches before alerting
|
||||
},
|
||||
'nagios_runner': {
|
||||
'status_code': {
|
||||
'display': '{check_name} {output}',
|
||||
'operator': "nagios"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -225,7 +231,7 @@ def get_watchhosts(config):
|
||||
hosts_config = config.get("hosts", {})
|
||||
if isinstance(hosts_config, dict):
|
||||
for host_name, host_attrs in hosts_config.items():
|
||||
if isinstance(host_attrs, dict) and host_attrs.get("watch", False):
|
||||
if isinstance(host_attrs, dict) and host_attrs.get("watch", True):
|
||||
watchhosts.append(host_name)
|
||||
return watchhosts
|
||||
|
||||
|
||||
@@ -95,7 +95,7 @@ class Connection:
|
||||
if not Null:
|
||||
d["addr"] = self.addr
|
||||
if self.rtts[-1]:
|
||||
d["rtt"] = "%0.1f" % self.rtts[-1]
|
||||
d["rtt"] = "%d" % round(self.rtts[-1])
|
||||
elif self.state == Connection.UNKNOWN:
|
||||
d["rtt"] = ""
|
||||
else:
|
||||
@@ -286,7 +286,7 @@ class Host:
|
||||
Host.hosts[name] = self
|
||||
self.num = num
|
||||
self.dyn = False
|
||||
self.watched = False
|
||||
self.watched = True
|
||||
self.upcount = 0
|
||||
self.interval = 0
|
||||
self.doesack = -1
|
||||
@@ -304,6 +304,7 @@ class Host:
|
||||
|
||||
def statedict(self):
|
||||
d = {}
|
||||
d["raw_name"] = self.name
|
||||
d["name"] = self.name
|
||||
if self.dyn:
|
||||
d["name"] += "*"
|
||||
|
||||
+26
-3
@@ -154,6 +154,25 @@ async def start(
|
||||
lst = [h.jsons() for h in hosts]
|
||||
return web.json_response(json.loads("[" + ",".join(lst) + "]"))
|
||||
|
||||
async def api_alert_summary(request):
|
||||
"""GET /api/0/alert_summary — counts of ok/warning/critical hosts visible to caller."""
|
||||
user, err = _require_auth(request)
|
||||
if err:
|
||||
return err
|
||||
from .threshold import AlertLevel
|
||||
critical = warning = ok = 0
|
||||
for host in hbdclass.Host.hosts.values():
|
||||
if not _can_operate_host(user, host):
|
||||
continue
|
||||
levels = {s.level for s in host.alert_states.values()}
|
||||
if AlertLevel.CRITICAL in levels:
|
||||
critical += 1
|
||||
elif AlertLevel.WARNING in levels:
|
||||
warning += 1
|
||||
else:
|
||||
ok += 1
|
||||
return web.json_response({"critical": critical, "warning": warning, "ok": ok})
|
||||
|
||||
async def api_messages(request):
|
||||
lst = data.msgs[-30:]
|
||||
return web.json_response(lst)
|
||||
@@ -258,7 +277,9 @@ async def start(
|
||||
extra_scripts=extra_scripts,
|
||||
hbd_version=hbd_version,
|
||||
hosts=[
|
||||
hbdclass.Host.hosts[h].stateinfo() for h in sorted(hbdclass.Host.hosts)
|
||||
hbdclass.Host.hosts[h].stateinfo()
|
||||
for h in sorted(hbdclass.Host.hosts)
|
||||
if _can_operate_host(current_user, hbdclass.Host.hosts[h])
|
||||
],
|
||||
messages=data.msgs[-30:],
|
||||
current_user=current_user.to_dict() if current_user else None,
|
||||
@@ -510,12 +531,13 @@ async def start(
|
||||
hosts_with_plugins = []
|
||||
for hostname in sorted(hbdclass.Host.hosts.keys()):
|
||||
host = hbdclass.Host.hosts[hostname]
|
||||
if not _can_view_host(current_user, host):
|
||||
if not _can_operate_host(current_user, host):
|
||||
continue
|
||||
if host.plugin_data:
|
||||
hosts_with_plugins.append({
|
||||
"name": hostname,
|
||||
"plugins": list(host.plugin_data.keys()),
|
||||
"is_owner": _can_own_host(current_user, host),
|
||||
})
|
||||
|
||||
tmpl = env.get_template("plugins.html")
|
||||
@@ -868,7 +890,7 @@ async def start(
|
||||
tmpl = env.get_template("settings.html")
|
||||
body = tmpl.render(
|
||||
title="Settings - Heartbeat",
|
||||
sections=settings_mod.get_settings_sections(config),
|
||||
sections=settings_mod.get_settings_sections(config, threshold_checker=threshold_checker),
|
||||
current_user=current_user.to_dict() if current_user else None,
|
||||
active_page="settings",
|
||||
)
|
||||
@@ -891,6 +913,7 @@ async def start(
|
||||
web.get("/api/0/users/{username}/avatar", api_user_avatar),
|
||||
# Hosts
|
||||
web.get("/api/0/hosts", api_hosts),
|
||||
web.get("/api/0/alert_summary", api_alert_summary),
|
||||
web.get("/api/0/messages", api_messages),
|
||||
web.get("/api/0/hosts/{hostname}/plugins", api_host_plugins),
|
||||
web.get("/api/0/hosts/{hostname}/plugins/{plugin_name}", api_host_plugin_detail),
|
||||
|
||||
+8
-1
@@ -101,9 +101,10 @@ async def reload_configuration(config_obj, config_path, components):
|
||||
access = config_mod.get_host_access(new_config, hostname)
|
||||
host.apply_access(access["owner"], access["managers"], access["monitors"])
|
||||
|
||||
# Reload threshold checker
|
||||
# Reload threshold checker and prune alerts orphaned by the new config
|
||||
if 'threshold_checker' in components:
|
||||
components['threshold_checker'].reload(new_config)
|
||||
components['threshold_checker'].purge_stale_alerts(hbdclass)
|
||||
|
||||
# Note: Changes to the following require restart:
|
||||
# - hb_port, hbd_port, ws_port (already bound)
|
||||
@@ -241,6 +242,10 @@ async def _run_async(config, config_path=None):
|
||||
)
|
||||
udp.restore_connection_timers(hbdclass, restore_ctx)
|
||||
|
||||
# Drop alert states that no longer have a matching threshold (stale after
|
||||
# upgrade or config change between runs).
|
||||
threshold_checker.purge_stale_alerts(hbdclass)
|
||||
|
||||
# HTTP server (asyncio-based via aiohttp)
|
||||
try:
|
||||
http_task = asyncio.create_task(
|
||||
@@ -250,6 +255,7 @@ async def _run_async(config, config_path=None):
|
||||
config=config,
|
||||
hbdclass=hbdclass,
|
||||
tcss=None,
|
||||
threshold_checker=threshold_checker,
|
||||
verbose=config.get("verbose", False),
|
||||
get_now=lambda: time.time(),
|
||||
VER="",
|
||||
@@ -469,6 +475,7 @@ def run(config, config_path=None):
|
||||
if config.get("debug", 0) > 0:
|
||||
log_level = logging.DEBUG
|
||||
logging.basicConfig(level=log_level)
|
||||
logging.getLogger("aiohttp.access").setLevel(logging.DEBUG)
|
||||
load_pickled_hosts(config, hbdclass)
|
||||
|
||||
notify_mod.initlog(logfile=config.get("logfile", "messages.log"))
|
||||
|
||||
+48
-3
@@ -24,7 +24,7 @@ sensitive bool True when the raw value must never be shown
|
||||
# Credential field names that should always be masked.
|
||||
_SECRET_KEYS = frozenset({
|
||||
"password", "token", "user_key", "api_key", "secret",
|
||||
"smtp_password", "smtp_user",
|
||||
"smtp_password", "smtp_user", "api_password", "access_token",
|
||||
})
|
||||
|
||||
_CHANNEL_TYPE_LABELS = {
|
||||
@@ -88,7 +88,7 @@ def _sanitize_channel(name, cfg):
|
||||
# Public API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def get_settings_sections(config: dict) -> list:
|
||||
def get_settings_sections(config: dict, threshold_checker=None) -> list:
|
||||
"""Return ordered list of setting sections for the settings page.
|
||||
|
||||
Each section:
|
||||
@@ -181,6 +181,41 @@ def get_settings_sections(config: dict) -> list:
|
||||
"notification_channels": attrs.get("notification_channels", []),
|
||||
})
|
||||
|
||||
# ---- Threshold configurations -----------------------------------------
|
||||
def _tc_to_row(tc):
|
||||
return {
|
||||
"metric": tc.metric_path,
|
||||
"operator": tc.operator.value,
|
||||
"warning": tc.warning,
|
||||
"critical": tc.critical,
|
||||
"hysteresis": tc.hysteresis,
|
||||
"count": tc.count,
|
||||
"enabled": tc.enabled,
|
||||
}
|
||||
|
||||
threshold_config_list = []
|
||||
if threshold_checker is not None:
|
||||
if threshold_checker.threshold_configs:
|
||||
for cfg_name, cfg_metrics in sorted(threshold_checker.threshold_configs.items()):
|
||||
# For the default config use the merged effective set;
|
||||
# for named overrides use only the explicitly defined metrics
|
||||
# (threshold_raw_configs) so inherited defaults are not repeated.
|
||||
if cfg_name == "default":
|
||||
display_metrics = cfg_metrics
|
||||
else:
|
||||
display_metrics = threshold_checker.threshold_raw_configs.get(cfg_name, cfg_metrics)
|
||||
metrics = sorted(
|
||||
[_tc_to_row(tc) for tc in display_metrics.values()],
|
||||
key=lambda m: m["metric"],
|
||||
)
|
||||
threshold_config_list.append({"name": cfg_name, "metrics": metrics})
|
||||
elif threshold_checker.thresholds:
|
||||
metrics = sorted(
|
||||
[_tc_to_row(tc) for tc in threshold_checker.thresholds.values()],
|
||||
key=lambda m: m["metric"],
|
||||
)
|
||||
threshold_config_list.append({"name": "default", "metrics": metrics})
|
||||
|
||||
# ---- Hosts summary ----------------------------------------------------
|
||||
hosts_list = []
|
||||
for hname, hcfg in (config.get("hosts") or {}).items():
|
||||
@@ -188,7 +223,7 @@ def get_settings_sections(config: dict) -> list:
|
||||
continue
|
||||
hosts_list.append({
|
||||
"name": hname,
|
||||
"watch": bool(hcfg.get("watch", False)),
|
||||
"watch": bool(hcfg.get("watch", True)),
|
||||
"dyndns": bool(hcfg.get("dyndns", False)),
|
||||
"owner": hcfg.get("owner", ""),
|
||||
"managers": hcfg.get("managers", []),
|
||||
@@ -312,6 +347,16 @@ def get_settings_sections(config: dict) -> list:
|
||||
"hosts": hosts_list,
|
||||
"fields": [],
|
||||
},
|
||||
{
|
||||
"id": "thresholds",
|
||||
"title": "Threshold Configurations",
|
||||
"description": "Named alert threshold sets. Each defines warning/critical levels per metric.",
|
||||
"threshold_configs": threshold_config_list,
|
||||
"fields": [
|
||||
field("default_threshold_config", "Default config", "text",
|
||||
"Threshold config used for hosts with no explicit mapping."),
|
||||
],
|
||||
},
|
||||
{
|
||||
"id": "runtime",
|
||||
"title": "Runtime",
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
|
||||
<style>
|
||||
|
||||
html, body {
|
||||
height: auto;
|
||||
overflow-y: auto;
|
||||
}
|
||||
|
||||
.container {
|
||||
max-width: 1400px;
|
||||
margin: 0 auto;
|
||||
@@ -170,8 +175,12 @@
|
||||
|
||||
.alert-hostname {
|
||||
font-weight: bold;
|
||||
color: #333;
|
||||
color: #0066cc;
|
||||
font-size: 1.1em;
|
||||
text-decoration: none;
|
||||
}
|
||||
.alert-hostname:hover {
|
||||
text-decoration: underline;
|
||||
}
|
||||
|
||||
.alert-metric {
|
||||
@@ -400,6 +409,10 @@
|
||||
} else if (alert.threshold_value !== undefined && alert.threshold_value !== null && alert.operator) {
|
||||
valueText += ` <span class="threshold-info">(threshold: ${alert.operator} ${formatValue(alert.threshold_value)})</span>`;
|
||||
}
|
||||
if (alert.recovery_threshold !== undefined && alert.recovery_threshold !== null) {
|
||||
const recOp = (alert.operator === '>' || alert.operator === '>=') ? '<' : '>';
|
||||
valueText += ` <span class="threshold-info" style="color:#888">(recovers ${recOp} ${formatValue(alert.recovery_threshold)})</span>`;
|
||||
}
|
||||
|
||||
// Build actions section
|
||||
let actionsHtml = '';
|
||||
@@ -424,7 +437,7 @@
|
||||
<div class="alert-main">
|
||||
<div class="alert-header">
|
||||
<span class="alert-level ${level}">${alert.level}</span>
|
||||
<span class="alert-hostname">${alert.hostname}</span>
|
||||
<a class="alert-hostname" href="/plugins#${alert.hostname}">${alert.hostname}</a>
|
||||
</div>
|
||||
<div class="alert-metric">${alert.metric_path}</div>
|
||||
<div class="alert-details">
|
||||
|
||||
@@ -126,11 +126,17 @@
|
||||
}
|
||||
|
||||
/* Swiss railway clock — nav */
|
||||
.nav-clock {
|
||||
.nav-pie {
|
||||
flex-shrink: 0;
|
||||
line-height: 0;
|
||||
margin-left: auto;
|
||||
padding: 4px 4px 4px 0;
|
||||
}
|
||||
#alert-pie { display: block; cursor: default; }
|
||||
.nav-clock {
|
||||
flex-shrink: 0;
|
||||
line-height: 0;
|
||||
padding: 4px 4px 4px 0;
|
||||
cursor: pointer;
|
||||
}
|
||||
#swiss-clock { display: block; }
|
||||
|
||||
@@ -236,6 +236,8 @@
|
||||
color: #ff9800;
|
||||
font-weight: 700;
|
||||
}
|
||||
#ntable a.host-link { color: inherit; text-decoration: none; }
|
||||
#ntable a.host-link:hover { text-decoration: underline; }
|
||||
</style>
|
||||
<script type="text/javascript">
|
||||
var cnt = 0;
|
||||
@@ -245,11 +247,13 @@
|
||||
var HBD_VERSION = "{{ hbd_version }}";
|
||||
|
||||
function hostNameHtml(data) {
|
||||
var rawName = data.raw_name || data.name.replace(/<[^>]+>/g, '').replace('*', '').trim();
|
||||
var nameHtml = data.name;
|
||||
if (!data.hbc_version || data.hbc_version !== HBD_VERSION) {
|
||||
nameHtml += ' 🥀';
|
||||
}
|
||||
return data.dyn ? '<b>' + nameHtml + '</b>' : nameHtml;
|
||||
var display = data.dyn ? '<b>' + nameHtml + '</b>' : nameHtml;
|
||||
return '<a class="host-link" href="/plugins#' + encodeURIComponent(rawName) + '">' + display + '</a>';
|
||||
}
|
||||
|
||||
function setup() {
|
||||
@@ -404,7 +408,7 @@
|
||||
);
|
||||
if (data.connections[i].state == "up") {
|
||||
state = '<span class="state-up">up</span>';
|
||||
latency = Number.parseFloat(data.connections[i].rtts[0]).toFixed(2);
|
||||
latency = String(Math.round(Number.parseFloat(data.connections[i].rtts[0])));
|
||||
} else {
|
||||
if (data.connections[i].state == "unknown") {
|
||||
state = "";
|
||||
@@ -511,7 +515,7 @@
|
||||
<tbody id="ntablebody">
|
||||
{% for host in hosts %}
|
||||
<tr class="{% if host.alert_critical_unacked > 0 or host.alert_critical_acked > 0 %}row-critical{% elif host.alert_warning_unacked > 0 or host.alert_warning_acked > 0 %}row-warning{% endif %}">
|
||||
<td data-name="{{ host.name }}">{{ host.name }}{% if not host.hbc_version or host.hbc_version != hbd_version %} 🥀{% endif %}</td>
|
||||
<td data-name="{{ host.name }}"><a class="host-link" href="/plugins#{{ host.raw_name | urlencode }}">{{ host.name }}{% if not host.hbc_version or host.hbc_version != hbd_version %} 🥀{% endif %}</a></td>
|
||||
<td style="text-align: center; color: #ff9800; font-weight: bold;">
|
||||
{%- set warning_unacked = host.alert_warning_unacked -%}
|
||||
{%- set warning_acked = host.alert_warning_acked -%}
|
||||
|
||||
@@ -11,6 +11,9 @@
|
||||
{% endif %}
|
||||
<a href="/about"{% if active_page == "about" %} class="active"{% endif %}>About</a>
|
||||
</div>
|
||||
<div class="nav-pie" title="Host alert status">
|
||||
<canvas id="alert-pie" width="44" height="44"></canvas>
|
||||
</div>
|
||||
<div class="nav-clock" title="Click for full-screen clock">
|
||||
<canvas id="swiss-clock" width="44" height="44"></canvas>
|
||||
</div>
|
||||
@@ -42,4 +45,52 @@
|
||||
});
|
||||
}
|
||||
})();
|
||||
|
||||
function drawAlertPie(critical, warning, ok) {
|
||||
var canvas = document.getElementById('alert-pie');
|
||||
if (!canvas) return;
|
||||
var ctx = canvas.getContext('2d');
|
||||
var SIZE = canvas.width;
|
||||
var R = SIZE / 2;
|
||||
ctx.clearRect(0, 0, SIZE, SIZE);
|
||||
var total = critical + warning + ok;
|
||||
if (total === 0) {
|
||||
ctx.beginPath();
|
||||
ctx.arc(R, R, R - 1, 0, Math.PI * 2);
|
||||
ctx.fillStyle = '#ccc';
|
||||
ctx.fill();
|
||||
return;
|
||||
}
|
||||
var slices = [
|
||||
{ value: critical, color: '#e53935' },
|
||||
{ value: warning, color: '#ffb300' },
|
||||
{ value: ok, color: '#43a047' }
|
||||
];
|
||||
var start = -Math.PI / 2;
|
||||
slices.forEach(function(s) {
|
||||
if (s.value === 0) return;
|
||||
var sweep = (s.value / total) * Math.PI * 2;
|
||||
ctx.beginPath();
|
||||
ctx.moveTo(R, R);
|
||||
ctx.arc(R, R, R - 1, start, start + sweep);
|
||||
ctx.closePath();
|
||||
ctx.fillStyle = s.color;
|
||||
ctx.fill();
|
||||
start += sweep;
|
||||
});
|
||||
}
|
||||
|
||||
function updateAlertPie() {
|
||||
fetch('/api/0/alert_summary').then(function(r) {
|
||||
if (!r.ok) return;
|
||||
return r.json();
|
||||
}).then(function(d) {
|
||||
if (d) drawAlertPie(d.critical || 0, d.warning || 0, d.ok || 0);
|
||||
}).catch(function() {});
|
||||
}
|
||||
|
||||
document.addEventListener('DOMContentLoaded', function() {
|
||||
updateAlertPie();
|
||||
setInterval(updateAlertPie, 30000);
|
||||
});
|
||||
</script>
|
||||
|
||||
@@ -131,6 +131,52 @@
|
||||
text-overflow: ellipsis;
|
||||
}
|
||||
|
||||
.host-action-btn {
|
||||
font-size: 0.75em;
|
||||
font-weight: bold;
|
||||
padding: 3px 10px;
|
||||
border-radius: 4px;
|
||||
border: none;
|
||||
cursor: pointer;
|
||||
text-decoration: none;
|
||||
white-space: nowrap;
|
||||
}
|
||||
.host-action-btn.update-btn {
|
||||
background: #e3f2fd;
|
||||
color: #1565c0;
|
||||
}
|
||||
.host-action-btn.update-btn:hover { background: #bbdefb; }
|
||||
.host-action-btn.delete-btn {
|
||||
background: #ffebee;
|
||||
color: #c62828;
|
||||
}
|
||||
.host-action-btn.delete-btn:hover { background: #ffcdd2; }
|
||||
|
||||
/* ── Action result toast ───────────────────────────────────── */
|
||||
#action-toast {
|
||||
position: fixed;
|
||||
bottom: 24px;
|
||||
left: 50%;
|
||||
transform: translateX(-50%) translateY(20px);
|
||||
background: #323232;
|
||||
color: #fff;
|
||||
padding: 12px 22px;
|
||||
border-radius: 6px;
|
||||
font-size: 0.9em;
|
||||
max-width: 480px;
|
||||
text-align: center;
|
||||
opacity: 0;
|
||||
pointer-events: none;
|
||||
transition: opacity 0.25s, transform 0.25s;
|
||||
z-index: 9000;
|
||||
white-space: pre-wrap;
|
||||
}
|
||||
#action-toast.show {
|
||||
opacity: 1;
|
||||
transform: translateX(-50%) translateY(0);
|
||||
}
|
||||
#action-toast.error { background: #c62828; }
|
||||
|
||||
/* ── Host body ──────────────────────────────────────────────── */
|
||||
|
||||
.host-body {
|
||||
@@ -379,11 +425,17 @@
|
||||
<span class="nagios-badge" id="nagios-badge-{{ host.name }}">—</span>
|
||||
{% endif %}
|
||||
<span class="os-label" id="os-label-{{ host.name }}"></span>
|
||||
{% if host.is_owner %}
|
||||
<button class="host-action-btn update-btn"
|
||||
onclick="event.stopPropagation(); hostAction(this, '/u?h={{ host.name }}')">Update</button>
|
||||
<button class="host-action-btn delete-btn"
|
||||
onclick="event.stopPropagation(); hostDelete(this, '{{ host.name }}')">Delete</button>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="host-body">
|
||||
{% set plugin_order = ['os_info','cpu_monitor','memory_monitor','disk_monitor','network_monitor','nagios_runner','filesystem_info'] %}
|
||||
{% set plugin_order = ['os_info','cpu_monitor','memory_monitor','disk_monitor','network_monitor','zfs_monitor','nagios_runner','filesystem_info'] %}
|
||||
{% for plugin in plugin_order if plugin in host.plugins %}
|
||||
<div class="plugin-accordion collapsed"
|
||||
data-hostname="{{ host.name }}"
|
||||
@@ -447,6 +499,17 @@
|
||||
return pluginCache[hostname]?.[pluginName] ?? null;
|
||||
}
|
||||
|
||||
// Return worst nagios exit code (0-3) found in a nagios_runner data object.
|
||||
function nagiosWorstStatus(data) {
|
||||
let worst = 0;
|
||||
for (const [k, v] of Object.entries(data || {})) {
|
||||
if (k.endsWith('_status_code') && typeof v === 'number' && v > worst) {
|
||||
worst = v;
|
||||
}
|
||||
}
|
||||
return worst;
|
||||
}
|
||||
|
||||
// ── Fetch helpers ───────────────────────────────────────────────────────
|
||||
|
||||
async function fetchPlugin(hostname, pluginName) {
|
||||
@@ -548,13 +611,13 @@
|
||||
? chips.join('')
|
||||
: '<span class="glance-loading">—</span>';
|
||||
|
||||
// Nagios badge
|
||||
// Nagios badge — derive worst status from individual check codes
|
||||
const nagios = getCache(hostname, 'nagios_runner');
|
||||
if (nagosBadge && nagios) {
|
||||
const status = (nagios.data.overall_status || '—').toUpperCase();
|
||||
const cls = status === 'OK' ? 'ok'
|
||||
: status === 'WARNING' ? 'warning'
|
||||
: status === 'CRITICAL' ? 'critical' : '';
|
||||
const worst = nagiosWorstStatus(nagios.data);
|
||||
const names = {0:'OK', 1:'WARNING', 2:'CRITICAL', 3:'UNKNOWN'};
|
||||
const status = names[worst] || '—';
|
||||
const cls = worst === 0 ? 'ok' : worst === 1 ? 'warning' : worst >= 2 ? 'critical' : '';
|
||||
nagosBadge.className = `nagios-badge ${cls}`;
|
||||
nagosBadge.textContent = status;
|
||||
}
|
||||
@@ -663,9 +726,10 @@
|
||||
break;
|
||||
}
|
||||
case 'nagios_runner': {
|
||||
const status = (d.overall_status || '?').toUpperCase();
|
||||
const count = d.plugin_count;
|
||||
text = status + (count != null ? ` — ${count} checks` : '');
|
||||
const worst = nagiosWorstStatus(d);
|
||||
const names = {0:'OK', 1:'WARNING', 2:'CRITICAL', 3:'UNKNOWN'};
|
||||
const codes = Object.keys(d).filter(k => k.endsWith('_status_code'));
|
||||
text = (names[worst] || '?') + (codes.length ? ` — ${codes.length} checks` : '');
|
||||
break;
|
||||
}
|
||||
case 'filesystem_info': {
|
||||
@@ -673,6 +737,19 @@
|
||||
text = `${count} filesystem${count !== 1 ? 's' : ''}`;
|
||||
break;
|
||||
}
|
||||
case 'zfs_monitor': {
|
||||
const pools = d.pools || {};
|
||||
const names = Object.keys(pools);
|
||||
if (names.length === 0) { text = 'No pools'; break; }
|
||||
const degraded = names.filter(n => pools[n].health && pools[n].health !== 'ONLINE');
|
||||
text = names.map(n => {
|
||||
const p = pools[n];
|
||||
const cap = p.capacity != null ? ` ${p.capacity.toFixed(0)}%` : '';
|
||||
return `${n}${cap}`;
|
||||
}).join(' · ');
|
||||
if (degraded.length) text += ` ⚠ ${degraded.map(n => pools[n].health).join(',')}`;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
text = 'Loaded';
|
||||
}
|
||||
@@ -694,6 +771,7 @@
|
||||
case 'memory_monitor': html = renderMemoryTable(cached.data); break;
|
||||
case 'disk_monitor': html = renderDiskTables(cached.data); break;
|
||||
case 'network_monitor':html = renderNetworkTables(cached.data); break;
|
||||
case 'zfs_monitor': html = renderZfsTables(cached.data); break;
|
||||
case 'nagios_runner': html = renderNagiosTable(cached.data); break;
|
||||
case 'filesystem_info':html = renderFilesystemTable(cached.data); break;
|
||||
default: html = renderGenericTable(cached.data); break;
|
||||
@@ -1024,6 +1102,66 @@
|
||||
return html;
|
||||
}
|
||||
|
||||
function renderZfsTables(d) {
|
||||
const pools = d.pools || {};
|
||||
const names = Object.keys(pools);
|
||||
if (names.length === 0) return '<div class="no-data">No ZFS pools found</div>';
|
||||
|
||||
const healthCls = h => {
|
||||
if (!h || h === 'ONLINE') return 'pct-ok';
|
||||
if (h === 'DEGRADED') return 'pct-warn';
|
||||
return 'pct-crit';
|
||||
};
|
||||
|
||||
let pt = '<table class="data-table"><thead><tr>'
|
||||
+ '<th>Pool</th><th>Health</th>'
|
||||
+ '<th class="num">Size</th><th class="num">Used</th>'
|
||||
+ '<th class="num">Free</th><th class="num">Cap %</th>'
|
||||
+ '<th class="num">Frag %</th><th class="num">Dedup</th>'
|
||||
+ '</tr></thead><tbody>';
|
||||
for (const name of names) {
|
||||
const p = pools[name];
|
||||
const cap = p.capacity != null ? p.capacity : 0;
|
||||
const capCls = cap > 90 ? 'pct-crit' : cap > 75 ? 'pct-warn' : 'pct-ok';
|
||||
pt += `<tr>
|
||||
<td class="iface-name">${escHtml(name)}</td>
|
||||
<td class="${healthCls(p.health)}">${escHtml(p.health || '—')}</td>
|
||||
<td class="num">${formatBytes(p.size || 0)}</td>
|
||||
<td class="num">${formatBytes(p.alloc || 0)}</td>
|
||||
<td class="num">${formatBytes(p.free || 0)}</td>
|
||||
<td class="num ${capCls}">${cap.toFixed(1)}%</td>
|
||||
<td class="num">${p.frag != null ? p.frag.toFixed(1) + '%' : '—'}</td>
|
||||
<td class="num">${p.dedup != null ? p.dedup.toFixed(2) + 'x' : '—'}</td>
|
||||
</tr>`;
|
||||
}
|
||||
pt += '</tbody></table>';
|
||||
|
||||
const hasIo = names.some(n => pools[n].read_ops != null);
|
||||
if (!hasIo) return pt;
|
||||
|
||||
let iot = '<table class="data-table"><thead><tr>'
|
||||
+ '<th>Pool</th>'
|
||||
+ '<th class="num">Read ops</th><th class="num">Write ops</th>'
|
||||
+ '<th class="num">Read BW</th><th class="num">Write BW</th>'
|
||||
+ '</tr></thead><tbody>';
|
||||
for (const name of names) {
|
||||
const p = pools[name];
|
||||
iot += `<tr>
|
||||
<td class="iface-name">${escHtml(name)}</td>
|
||||
<td class="num">${p.read_ops != null ? p.read_ops.toLocaleString() : '—'}</td>
|
||||
<td class="num">${p.write_ops != null ? p.write_ops.toLocaleString() : '—'}</td>
|
||||
<td class="num">${p.read_bw != null ? formatBytes(p.read_bw) : '—'}</td>
|
||||
<td class="num">${p.write_bw != null ? formatBytes(p.write_bw) : '—'}</td>
|
||||
</tr>`;
|
||||
}
|
||||
iot += '</tbody></table>';
|
||||
|
||||
return `<div class="flex-tables">
|
||||
<div><div class="table-section-label">Pools</div>${pt}</div>
|
||||
<div><div class="table-section-label">I/O (cumulative)</div>${iot}</div>
|
||||
</div>`;
|
||||
}
|
||||
|
||||
function renderGenericTable(d) {
|
||||
let html = '<table class="data-table"><thead><tr><th>Field</th><th>Value</th></tr></thead><tbody>';
|
||||
for (const [k, v] of Object.entries(d)) {
|
||||
@@ -1082,12 +1220,68 @@
|
||||
// ── Init ────────────────────────────────────────────────────────────────
|
||||
|
||||
document.addEventListener('DOMContentLoaded', () => {
|
||||
// If a host fragment is in the URL, expand and scroll to that host;
|
||||
// otherwise expand the first host as before.
|
||||
const hash = window.location.hash;
|
||||
if (hash) {
|
||||
const hostname = decodeURIComponent(hash.slice(1));
|
||||
const card = document.querySelector(`.host-card[data-hostname="${hostname}"]`);
|
||||
if (card) {
|
||||
card.classList.remove('collapsed');
|
||||
fetchHostGlance(hostname);
|
||||
setTimeout(() => card.scrollIntoView({ behavior: 'smooth', block: 'start' }), 150);
|
||||
return;
|
||||
}
|
||||
}
|
||||
const first = document.querySelector('.host-card');
|
||||
if (first) {
|
||||
first.classList.remove('collapsed');
|
||||
fetchHostGlance(first.dataset.hostname);
|
||||
}
|
||||
});
|
||||
// ── Host action helpers ──────────────────────────────────────
|
||||
|
||||
let _toastTimer = null;
|
||||
function showToast(msg, isError) {
|
||||
const t = document.getElementById('action-toast');
|
||||
t.textContent = msg;
|
||||
t.classList.toggle('error', !!isError);
|
||||
t.classList.add('show');
|
||||
clearTimeout(_toastTimer);
|
||||
_toastTimer = setTimeout(() => t.classList.remove('show'), 4000);
|
||||
}
|
||||
|
||||
async function hostAction(btn, url) {
|
||||
btn.disabled = true;
|
||||
try {
|
||||
const res = await fetch(url);
|
||||
const text = await res.text();
|
||||
showToast(text, !res.ok);
|
||||
} catch (e) {
|
||||
showToast('Request failed: ' + e.message, true);
|
||||
} finally {
|
||||
btn.disabled = false;
|
||||
}
|
||||
}
|
||||
|
||||
async function hostDelete(btn, hostname) {
|
||||
if (!confirm('Delete host ' + hostname + '?')) return;
|
||||
btn.disabled = true;
|
||||
try {
|
||||
const res = await fetch('/d?h=' + encodeURIComponent(hostname));
|
||||
const text = await res.text();
|
||||
showToast(text, !res.ok);
|
||||
if (res.ok) {
|
||||
const card = document.querySelector(`.host-card[data-hostname="${hostname}"]`);
|
||||
if (card) card.remove();
|
||||
}
|
||||
} catch (e) {
|
||||
showToast('Request failed: ' + e.message, true);
|
||||
btn.disabled = false;
|
||||
}
|
||||
}
|
||||
</script>
|
||||
|
||||
<div id="action-toast"></div>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
@@ -254,6 +254,17 @@
|
||||
.host-bool { text-align: center; }
|
||||
.dot-yes { color: #2e7d32; font-size: 1.1em; }
|
||||
.dot-no { color: #ddd; font-size: 1.1em; }
|
||||
|
||||
/* ---- Threshold configurations ---- */
|
||||
.thresh-config { margin: 12px 20px 20px; }
|
||||
.thresh-config-name {
|
||||
font-weight: 600; font-size: 0.9em; color: #1a237e;
|
||||
margin-bottom: 6px;
|
||||
}
|
||||
.mini-table .warn { color: #e65100; font-weight: 600; }
|
||||
.mini-table .crit { color: #b71c1c; font-weight: 600; }
|
||||
.mini-table .dim { color: #aaa; }
|
||||
.mini-table .metric-path { font-family: monospace; font-size: 0.88em; }
|
||||
</style>
|
||||
|
||||
<body>
|
||||
@@ -394,6 +405,49 @@
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
|
||||
{# ---- Threshold configurations section ---- #}
|
||||
{% if section.id == "thresholds" %}
|
||||
{% if section.threshold_configs %}
|
||||
{% for tc in section.threshold_configs %}
|
||||
<div class="thresh-config">
|
||||
<div class="thresh-config-name">{{ tc.name }}</div>
|
||||
{% if tc.metrics %}
|
||||
<div style="overflow-x: auto;">
|
||||
<table class="mini-table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Metric</th>
|
||||
<th>Op</th>
|
||||
<th>Warning</th>
|
||||
<th>Critical</th>
|
||||
<th>Hysteresis</th>
|
||||
<th>Count</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for m in tc.metrics %}
|
||||
<tr {% if not m.enabled %} style="opacity:0.45"{% endif %}>
|
||||
<td class="metric-path">{{ m.metric }}</td>
|
||||
<td>{{ m.operator or '>' }}</td>
|
||||
<td class="warn">{{ m.warning if m.warning is not none else '—' }}</td>
|
||||
<td class="crit">{{ m.critical if m.critical is not none else '—' }}</td>
|
||||
<td class="dim">{{ '%.0f%%' % (m.hysteresis * 100) if m.hysteresis else '—' }}</td>
|
||||
<td class="dim">{{ m.count }}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
{% else %}
|
||||
<span class="val-empty">No thresholds defined.</span>
|
||||
{% endif %}
|
||||
</div>
|
||||
{% endfor %}
|
||||
{% else %}
|
||||
<div class="field-row"><span class="val-empty">No threshold configurations defined.</span></div>
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
|
||||
{# ---- Hosts section ---- #}
|
||||
{% if section.id == "hosts" %}
|
||||
{% if section.hosts %}
|
||||
|
||||
+285
-91
@@ -9,10 +9,11 @@ This module provides a flexible threshold checking system that:
|
||||
- Supports multiple comparison operators
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
from enum import Enum
|
||||
from typing import Dict, Any, Optional, Tuple, Callable
|
||||
from typing import Dict, List, Any, Optional, Tuple, Callable
|
||||
from . import notify as notify_mod
|
||||
from .config import THRESHOLD_DEFAULTS
|
||||
|
||||
@@ -35,6 +36,7 @@ class ComparisonOperator(Enum):
|
||||
LTE = "<=" # Less than or equal
|
||||
EQ = "==" # Equal to
|
||||
NEQ = "!=" # Not equal to
|
||||
NAGIOS = "nagios" # Nagios exit-code semantics: 0=OK 1=WARNING 2=CRITICAL 3=UNKNOWN
|
||||
|
||||
|
||||
class AlertState:
|
||||
@@ -56,6 +58,7 @@ class AlertState:
|
||||
self.last_notification = None
|
||||
self.threshold_value = None # The threshold value that triggered alert
|
||||
self.operator = None # The comparison operator (>, <, >=, etc.)
|
||||
self.hysteresis: Optional[float] = None # Hysteresis fraction used for recovery
|
||||
self.formatted_message = None # Formatted display message for UI
|
||||
self.acknowledged = False # Whether alert has been acknowledged
|
||||
self.acknowledged_at = None # Timestamp when acknowledged
|
||||
@@ -151,6 +154,15 @@ class AlertState:
|
||||
if self.formatted_message is not None:
|
||||
result["formatted_message"] = self.formatted_message
|
||||
|
||||
# Compute and expose the recovery threshold so the UI can display it
|
||||
if (self.hysteresis and self.threshold_value is not None
|
||||
and self.operator is not None):
|
||||
ha = abs(self.threshold_value * self.hysteresis)
|
||||
if self.operator in ('>', '>='):
|
||||
result["recovery_threshold"] = round(self.threshold_value - ha, 4)
|
||||
elif self.operator in ('<', '<='):
|
||||
result["recovery_threshold"] = round(self.threshold_value + ha, 4)
|
||||
|
||||
return result
|
||||
|
||||
def __setstate__(self, state):
|
||||
@@ -158,6 +170,8 @@ class AlertState:
|
||||
self.__dict__.update(state)
|
||||
if not hasattr(self, 'consecutive_count'):
|
||||
self.consecutive_count = 0
|
||||
if not hasattr(self, 'hysteresis'):
|
||||
self.hysteresis = None
|
||||
|
||||
def acknowledge(self):
|
||||
"""Acknowledge this alert to stop reminder notifications."""
|
||||
@@ -226,6 +240,16 @@ class ThresholdConfig:
|
||||
if not self.enabled:
|
||||
return AlertLevel.OK
|
||||
|
||||
# Nagios exit-code semantics: value IS the severity
|
||||
if self.operator == ComparisonOperator.NAGIOS:
|
||||
try:
|
||||
code = int(value)
|
||||
except (TypeError, ValueError):
|
||||
return AlertLevel.UNKNOWN
|
||||
return {0: AlertLevel.OK, 1: AlertLevel.WARNING, 2: AlertLevel.CRITICAL}.get(
|
||||
code, AlertLevel.UNKNOWN
|
||||
)
|
||||
|
||||
try:
|
||||
# Convert value to float for comparison
|
||||
value = float(value)
|
||||
@@ -262,6 +286,10 @@ class ThresholdConfig:
|
||||
"""
|
||||
new_level = self.evaluate(value)
|
||||
|
||||
# Nagios exit codes are discrete integers — hysteresis doesn't apply
|
||||
if self.operator == ComparisonOperator.NAGIOS:
|
||||
return new_level
|
||||
|
||||
# If no hysteresis, return new level
|
||||
if self.hysteresis == 0.0:
|
||||
return new_level
|
||||
@@ -328,14 +356,17 @@ class ThresholdChecker:
|
||||
renotify_interval: Seconds between repeat notifications (default: 1 hour)
|
||||
journal: Optional MessageJournal instance for logging threshold events
|
||||
"""
|
||||
# Named threshold configurations: {config_name: {metric_path: ThresholdConfig}}
|
||||
# Named threshold configurations (pre-merged: defaults + overrides): {config_name: {metric_path: ThresholdConfig}}
|
||||
self.threshold_configs = {}
|
||||
|
||||
# Raw overrides only for each named config (no defaults baked in): {config_name: {metric_path: ThresholdConfig}}
|
||||
self.threshold_raw_configs: Dict[str, Dict[str, ThresholdConfig]] = {}
|
||||
|
||||
# Single threshold set for backward compatibility: {metric_path: ThresholdConfig}
|
||||
self.thresholds = {}
|
||||
|
||||
# Host to config name mapping: {host_name: config_name}
|
||||
self.host_config_mapping = {}
|
||||
# Host to ordered list of config names: {host_name: [config_name, ...]}
|
||||
self.host_config_mapping: Dict[str, List[str]] = {}
|
||||
|
||||
# Default config name to use when no mapping exists
|
||||
self.default_config = "default"
|
||||
@@ -372,6 +403,7 @@ class ThresholdChecker:
|
||||
|
||||
# Clear old configuration
|
||||
self.threshold_configs.clear()
|
||||
self.threshold_raw_configs.clear()
|
||||
self.thresholds.clear()
|
||||
self.host_config_mapping.clear()
|
||||
self.grace_seconds = float(config.get("grace", 2))
|
||||
@@ -391,10 +423,24 @@ class ThresholdChecker:
|
||||
Supports two formats:
|
||||
1. Legacy format with direct 'thresholds' section
|
||||
2. New format with 'threshold_configs' and 'host_threshold_mapping'
|
||||
|
||||
In all cases, THRESHOLD_DEFAULTS are seeded into threshold_configs["default"]
|
||||
so the Settings page always shows the built-in defaults.
|
||||
_parse_multi_config() overwrites this with the fully-merged effective defaults.
|
||||
"""
|
||||
# Always expose built-in defaults through threshold_configs["default"] so
|
||||
# the Settings page has something to display even in legacy/no-config mode.
|
||||
seed: Dict[str, ThresholdConfig] = {}
|
||||
for plugin_name, plugin_thresholds in THRESHOLD_DEFAULTS.get("thresholds", {}).items():
|
||||
if isinstance(plugin_thresholds, dict):
|
||||
self._parse_plugin_thresholds(plugin_name, plugin_thresholds, target_dict=seed)
|
||||
if seed:
|
||||
self.threshold_configs["default"] = seed
|
||||
self.threshold_raw_configs["default"] = {}
|
||||
|
||||
# Check for new multi-config format
|
||||
if "threshold_configs" in config:
|
||||
self._parse_multi_config(config)
|
||||
self._parse_multi_config(config) # overwrites threshold_configs["default"]
|
||||
elif "thresholds" in config:
|
||||
# Legacy single threshold configuration
|
||||
self._parse_legacy_config(config)
|
||||
@@ -424,9 +470,10 @@ class ThresholdChecker:
|
||||
self._parse_plugin_thresholds(plugin_name, plugin_thresholds, target_dict=effective_defaults)
|
||||
|
||||
self.threshold_configs["default"] = dict(effective_defaults)
|
||||
self.threshold_raw_configs["default"] = {}
|
||||
logger.info("Registered 'default' threshold config with %d metrics", len(effective_defaults))
|
||||
|
||||
# Parse each named configuration, seeding it with effective_defaults first
|
||||
# Parse each named configuration
|
||||
for config_name, config_data in threshold_configs.items():
|
||||
if config_name == "default":
|
||||
continue # already handled above
|
||||
@@ -440,33 +487,41 @@ class ThresholdChecker:
|
||||
continue
|
||||
|
||||
logger.info("Parsing threshold configuration: %s", config_name)
|
||||
self.threshold_configs[config_name] = dict(effective_defaults)
|
||||
|
||||
# Raw overrides only (used for multi-config layering)
|
||||
raw_overrides: Dict[str, ThresholdConfig] = {}
|
||||
thresholds_config = config_data["thresholds"]
|
||||
for plugin_name, plugin_thresholds in thresholds_config.items():
|
||||
if not isinstance(plugin_thresholds, dict):
|
||||
continue
|
||||
if isinstance(plugin_thresholds, dict):
|
||||
self._parse_plugin_thresholds(plugin_name, plugin_thresholds, target_dict=raw_overrides)
|
||||
self.threshold_raw_configs[config_name] = raw_overrides
|
||||
|
||||
self._parse_plugin_thresholds(
|
||||
plugin_name,
|
||||
plugin_thresholds,
|
||||
target_dict=self.threshold_configs[config_name]
|
||||
)
|
||||
# Pre-merged version (defaults + overrides) for single-config fast path
|
||||
self.threshold_configs[config_name] = dict(effective_defaults)
|
||||
self.threshold_configs[config_name].update(raw_overrides)
|
||||
|
||||
# Parse host to config mapping from two possible sources
|
||||
# 1. New format: hosts section with threshold_config attribute
|
||||
# Parse host → config list mapping from two possible sources
|
||||
|
||||
def _normalise(value) -> List[str]:
|
||||
"""Accept a string or list; always return a list."""
|
||||
if isinstance(value, list):
|
||||
return [str(v) for v in value]
|
||||
return [str(value)]
|
||||
|
||||
# 1. hosts section with threshold_config attribute (string or list)
|
||||
if "hosts" in config:
|
||||
hosts_config = config["hosts"]
|
||||
if isinstance(hosts_config, dict):
|
||||
for host_name, host_attrs in hosts_config.items():
|
||||
if isinstance(host_attrs, dict) and "threshold_config" in host_attrs:
|
||||
self.host_config_mapping[host_name] = host_attrs["threshold_config"]
|
||||
self.host_config_mapping[host_name] = _normalise(host_attrs["threshold_config"])
|
||||
|
||||
# 2. Legacy format: host_threshold_mapping section (for backward compatibility)
|
||||
# 2. Legacy host_threshold_mapping section (string values only)
|
||||
if "host_threshold_mapping" in config:
|
||||
legacy_mapping = config.get("host_threshold_mapping", {})
|
||||
if isinstance(legacy_mapping, dict):
|
||||
self.host_config_mapping.update(legacy_mapping)
|
||||
for host_name, value in legacy_mapping.items():
|
||||
self.host_config_mapping[host_name] = _normalise(value)
|
||||
|
||||
# Set default config (first one alphabetically or explicitly set)
|
||||
self.default_config = config.get("default_threshold_config", "default")
|
||||
@@ -531,11 +586,14 @@ class ThresholdChecker:
|
||||
warning = threshold_config.get("warning")
|
||||
critical = threshold_config.get("critical")
|
||||
operator = threshold_config.get("operator", ">")
|
||||
display = threshold_config.get("display", "(threshold: {op_symbol} {threshold_value})")
|
||||
hysteresis = threshold_config.get("hysteresis", 0.1) # 10% default
|
||||
# Nagios operator maps exit codes directly; no numeric thresholds needed
|
||||
is_nagios_op = (operator == "nagios")
|
||||
default_display = "{check_name}: {output}" if is_nagios_op else "(threshold: {op_symbol} {threshold_value})"
|
||||
display = threshold_config.get("display", default_display)
|
||||
hysteresis = threshold_config.get("hysteresis", 0.0 if is_nagios_op else 0.02)
|
||||
enabled = threshold_config.get("enabled", True)
|
||||
|
||||
if warning is None and critical is None:
|
||||
if warning is None and critical is None and not is_nagios_op:
|
||||
logger.warning("No thresholds defined for %s, skipping", metric_path)
|
||||
continue
|
||||
|
||||
@@ -635,7 +693,7 @@ class ThresholdChecker:
|
||||
warning = rtt_thresholds.get("warning")
|
||||
critical = rtt_thresholds.get("critical")
|
||||
operator = rtt_thresholds.get("operator", ">")
|
||||
hysteresis = rtt_thresholds.get("hysteresis", 0.1) # 10% default
|
||||
hysteresis = rtt_thresholds.get("hysteresis", 0.02) # 2% default
|
||||
enabled = rtt_thresholds.get("enabled", True)
|
||||
display = rtt_thresholds.get("display")
|
||||
count = rtt_thresholds.get("count", 1)
|
||||
@@ -664,7 +722,10 @@ class ThresholdChecker:
|
||||
)
|
||||
|
||||
def get_thresholds_for_host(self, host_name: str) -> Dict[str, ThresholdConfig]:
|
||||
"""Get the appropriate threshold configuration for a host.
|
||||
"""Get the effective threshold configuration for a host.
|
||||
|
||||
When threshold_config is a list, configs are applied left-to-right on top
|
||||
of the default thresholds so earlier entries can be overridden by later ones.
|
||||
|
||||
Args:
|
||||
host_name: Name of the host
|
||||
@@ -676,23 +737,40 @@ class ThresholdChecker:
|
||||
if self.thresholds and not self.threshold_configs:
|
||||
return self.thresholds
|
||||
|
||||
# Multi-config mode: look up host-specific configuration
|
||||
if self.threshold_configs:
|
||||
config_name = self.host_config_mapping.get(host_name, self.default_config)
|
||||
if not self.threshold_configs:
|
||||
return {}
|
||||
|
||||
if config_name in self.threshold_configs:
|
||||
return self.threshold_configs[config_name]
|
||||
else:
|
||||
config_names = self.host_config_mapping.get(host_name)
|
||||
|
||||
# No host-specific mapping → return pre-merged default
|
||||
if not config_names:
|
||||
return self.threshold_configs.get(self.default_config, {})
|
||||
|
||||
# Single config → fast path using pre-merged copy
|
||||
if len(config_names) == 1:
|
||||
name = config_names[0]
|
||||
if name in self.threshold_configs:
|
||||
return self.threshold_configs[name]
|
||||
logger.warning(
|
||||
"Threshold config '%s' not found for host '%s', using default '%s'",
|
||||
config_name,
|
||||
host_name,
|
||||
self.default_config
|
||||
name, host_name, self.default_config,
|
||||
)
|
||||
return self.threshold_configs.get(self.default_config, {})
|
||||
|
||||
# No thresholds configured
|
||||
return {}
|
||||
# Multiple configs → start from defaults, layer raw overrides in order
|
||||
result = dict(self.threshold_configs.get(self.default_config, {}))
|
||||
for name in config_names:
|
||||
if name == self.default_config:
|
||||
continue # defaults already the base
|
||||
raw = self.threshold_raw_configs.get(name)
|
||||
if raw is None:
|
||||
logger.warning(
|
||||
"Threshold config '%s' not found for host '%s', skipping",
|
||||
name, host_name,
|
||||
)
|
||||
else:
|
||||
result.update(raw)
|
||||
return result
|
||||
|
||||
def check_value(
|
||||
self,
|
||||
@@ -760,6 +838,12 @@ class ThresholdChecker:
|
||||
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
|
||||
threshold_value = threshold.warning
|
||||
|
||||
# Keep hysteresis on the state so the UI can show the recovery threshold
|
||||
if new_level != AlertLevel.OK:
|
||||
alert_state.hysteresis = threshold.hysteresis
|
||||
else:
|
||||
alert_state.hysteresis = None
|
||||
|
||||
# Update state and check for changes
|
||||
old_level = alert_state.level
|
||||
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
||||
@@ -769,6 +853,36 @@ class ThresholdChecker:
|
||||
self._check_pending_or_renotify(host_name, alert_state, metric_path, value, threshold, None)
|
||||
|
||||
return None
|
||||
def _find_threshold(
|
||||
self, thresholds: Dict[str, "ThresholdConfig"], metric_path: str
|
||||
) -> Tuple[Optional["ThresholdConfig"], Optional[str]]:
|
||||
"""Return (threshold, check_name) for *metric_path*, falling back to suffix matches.
|
||||
|
||||
Allows generic thresholds like ``nagios_runner.status_code`` to match
|
||||
fully-qualified paths like ``nagios_runner.check_disk_root_status_code``.
|
||||
The exact match is always tried first; then successive leading
|
||||
underscore-delimited segments are stripped from the field name until
|
||||
a match is found or no segments remain.
|
||||
|
||||
Returns:
|
||||
(ThresholdConfig, None) for an exact match.
|
||||
(ThresholdConfig, "check_disk_root") for a suffix match — the second
|
||||
element is the stripped prefix, available as ``{check_name}`` in
|
||||
display format templates.
|
||||
(None, None) when no threshold is found.
|
||||
"""
|
||||
if metric_path in thresholds:
|
||||
return thresholds[metric_path], None
|
||||
plugin, sep, field = metric_path.partition(".")
|
||||
if not sep:
|
||||
return None, None
|
||||
parts = field.split("_")
|
||||
for i in range(1, len(parts)):
|
||||
candidate = plugin + "." + "_".join(parts[i:])
|
||||
if candidate in thresholds:
|
||||
return thresholds[candidate], "_".join(parts[:i])
|
||||
return None, None
|
||||
|
||||
def check_plugin_data(
|
||||
self,
|
||||
host_name: str,
|
||||
@@ -797,11 +911,10 @@ class ThresholdChecker:
|
||||
for metric_name, value in data.items():
|
||||
metric_path = f"{plugin_name}.{metric_name}"
|
||||
|
||||
if metric_path not in thresholds:
|
||||
threshold, check_name = self._find_threshold(thresholds, metric_path)
|
||||
if threshold is None:
|
||||
continue
|
||||
|
||||
threshold = thresholds[metric_path]
|
||||
|
||||
# Get or create alert state
|
||||
if metric_path not in alert_states:
|
||||
alert_states[metric_path] = AlertState(metric_path)
|
||||
@@ -821,13 +934,15 @@ class ThresholdChecker:
|
||||
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
|
||||
threshold_value = threshold.warning
|
||||
|
||||
alert_state.hysteresis = threshold.hysteresis if new_level != AlertLevel.OK else None
|
||||
|
||||
# Update state and check for changes
|
||||
old_level = alert_state.level
|
||||
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
||||
state_changes.append((metric_path, old_level, new_level, value))
|
||||
self._apply_grace(host_name, alert_state, metric_path, old_level, new_level, value, threshold, data)
|
||||
self._apply_grace(host_name, alert_state, metric_path, old_level, new_level, value, threshold, data, check_name=check_name, metric_name=metric_name)
|
||||
elif new_level != AlertLevel.OK:
|
||||
self._check_pending_or_renotify(host_name, alert_state, metric_path, value, threshold, data)
|
||||
self._check_pending_or_renotify(host_name, alert_state, metric_path, value, threshold, data, check_name=check_name, metric_name=metric_name)
|
||||
|
||||
# Check nested metrics (e.g., partition data in disk_monitor)
|
||||
self._check_nested_metrics(
|
||||
@@ -887,6 +1002,8 @@ class ThresholdChecker:
|
||||
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
|
||||
threshold_value = threshold.warning
|
||||
|
||||
alert_state.hysteresis = threshold.hysteresis if new_level != AlertLevel.OK else None
|
||||
|
||||
old_level = alert_state.level
|
||||
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
||||
state_changes.append((metric_path, old_level, new_level, value))
|
||||
@@ -903,6 +1020,8 @@ class ThresholdChecker:
|
||||
value: Any,
|
||||
threshold: ThresholdConfig,
|
||||
plugin_data: Optional[Dict[str, Any]] = None,
|
||||
check_name: Optional[str] = None,
|
||||
metric_name: Optional[str] = None,
|
||||
):
|
||||
"""Trigger a notification for an alert state change.
|
||||
|
||||
@@ -925,54 +1044,52 @@ class ThresholdChecker:
|
||||
# Format operator symbol
|
||||
op_symbol = threshold.operator.value
|
||||
|
||||
# Short metric label: strip the plugin-name prefix for readability
|
||||
short_path = metric_path.partition(".")[2] or metric_path
|
||||
|
||||
# Use a display-friendly value (inf is the sentinel for "overdue")
|
||||
import math
|
||||
display_value = "overdue" if isinstance(value, float) and math.isinf(value) else value
|
||||
|
||||
# Format message
|
||||
# Format message — for the nagios operator there is no numeric threshold_value;
|
||||
# render the display template whenever one is available.
|
||||
has_display = threshold_value is not None or threshold.operator == ComparisonOperator.NAGIOS
|
||||
|
||||
def _fmt():
|
||||
return self._format_display(
|
||||
threshold.display,
|
||||
value=display_value,
|
||||
threshold_value=threshold_value,
|
||||
op_symbol=op_symbol,
|
||||
plugin_data=plugin_data,
|
||||
check_name=check_name,
|
||||
metric_name=metric_name,
|
||||
)
|
||||
|
||||
if new_level == AlertLevel.OK:
|
||||
lvl = "RECOVER"
|
||||
message = f"{metric_path} = {display_value} ({old_level.name} -> OK)"
|
||||
message = f"{short_path} = {display_value} ({old_level.name} -> OK)"
|
||||
elif new_level == AlertLevel.WARNING:
|
||||
lvl = "WARNING"
|
||||
if threshold_value is not None:
|
||||
threshold_info = self._format_display(
|
||||
threshold.display,
|
||||
value=display_value,
|
||||
threshold_value=threshold_value,
|
||||
op_symbol=op_symbol,
|
||||
plugin_data=plugin_data
|
||||
)
|
||||
message = f"{metric_path} = {display_value} {threshold_info}"
|
||||
if has_display:
|
||||
message = f"{short_path} = {display_value} {_fmt()}"
|
||||
else:
|
||||
message = f"{metric_path} = {display_value}"
|
||||
message = f"{short_path} = {display_value}"
|
||||
elif new_level == AlertLevel.CRITICAL:
|
||||
lvl = "CRITICAL"
|
||||
if threshold_value is not None:
|
||||
threshold_info = self._format_display(
|
||||
threshold.display,
|
||||
value=display_value,
|
||||
threshold_value=threshold_value,
|
||||
op_symbol=op_symbol,
|
||||
plugin_data=plugin_data
|
||||
)
|
||||
message = f"{metric_path} = {display_value} {threshold_info}"
|
||||
if has_display:
|
||||
message = f"{short_path} = {display_value} {_fmt()}"
|
||||
else:
|
||||
message = f"{metric_path} = {display_value}"
|
||||
message = f"{short_path} = {display_value}"
|
||||
else:
|
||||
lvl = "UNKNOWN"
|
||||
message = f"{metric_path} = {display_value}"
|
||||
if has_display:
|
||||
message = f"{short_path} = {display_value} {_fmt()}"
|
||||
else:
|
||||
message = f"{short_path} = {display_value}"
|
||||
|
||||
# Return the formatted threshold info for storing in AlertState
|
||||
formatted_threshold_msg = None
|
||||
if threshold_value is not None and new_level != AlertLevel.OK:
|
||||
formatted_threshold_msg = self._format_display(
|
||||
threshold.display,
|
||||
value=display_value,
|
||||
threshold_value=threshold_value,
|
||||
op_symbol=op_symbol,
|
||||
plugin_data=plugin_data
|
||||
)
|
||||
# Formatted threshold info stored on AlertState for the UI
|
||||
formatted_threshold_msg = _fmt() if has_display and new_level != AlertLevel.OK else None
|
||||
|
||||
return lvl, message, formatted_threshold_msg
|
||||
|
||||
@@ -987,6 +1104,11 @@ class ThresholdChecker:
|
||||
value: Any,
|
||||
):
|
||||
"""Send notification and log to journal/eventlog."""
|
||||
from . import hbdclass
|
||||
host = hbdclass.Host.hosts.get(host_name)
|
||||
if host is not None and not host.watched:
|
||||
eventlog(host_name, lvl, message, service="threshold")
|
||||
return
|
||||
asyncio.get_event_loop().create_task(notify_mod.send_notification(
|
||||
host_name,
|
||||
notify_mod.Notification(
|
||||
@@ -999,7 +1121,6 @@ class ThresholdChecker:
|
||||
# Log to journal
|
||||
if self.journal is not None:
|
||||
try:
|
||||
import asyncio
|
||||
loop = asyncio.get_event_loop()
|
||||
loop.create_task(self.journal.log_threshold_event(
|
||||
host_name=host_name,
|
||||
@@ -1017,33 +1138,62 @@ class ThresholdChecker:
|
||||
self,
|
||||
display_format: str,
|
||||
value: Any,
|
||||
threshold_value: float,
|
||||
threshold_value: Optional[float],
|
||||
op_symbol: str,
|
||||
plugin_data: Optional[Dict[str, Any]] = None,
|
||||
check_name: Optional[str] = None,
|
||||
metric_name: Optional[str] = None,
|
||||
) -> str:
|
||||
"""Format the display string using available data.
|
||||
|
||||
Args:
|
||||
display_format: Format string from threshold config
|
||||
value: Current metric value
|
||||
threshold_value: Threshold value that was exceeded
|
||||
op_symbol: Comparison operator symbol
|
||||
plugin_data: Optional dictionary of plugin data fields
|
||||
Available template variables:
|
||||
{value} - current metric value
|
||||
{threshold_value} - threshold that was exceeded
|
||||
{op_symbol} - comparison operator (>, <, >=, <=, ==, !=)
|
||||
{check_name} - prefix stripped for generic threshold match
|
||||
(e.g. "check_disk_root" when metric
|
||||
"check_disk_root_status_code" matched generic
|
||||
threshold "status_code")
|
||||
{metric_name} - field name within the plugin data dict
|
||||
Any key from plugin_data is also available.
|
||||
|
||||
Returns:
|
||||
Formatted display string
|
||||
"""
|
||||
if not display_format:
|
||||
display_format = "(threshold: {op_symbol} {threshold_value})" if threshold_value is not None else ""
|
||||
|
||||
# Build format context with standard variables
|
||||
format_context = {
|
||||
'value': value,
|
||||
'threshold_value': threshold_value,
|
||||
'op_symbol': op_symbol,
|
||||
}
|
||||
if threshold_value is not None:
|
||||
format_context['threshold_value'] = threshold_value
|
||||
|
||||
# Add generic-match context variables when available
|
||||
if check_name is not None:
|
||||
format_context['check_name'] = check_name
|
||||
if metric_name is not None:
|
||||
format_context['metric_name'] = metric_name
|
||||
|
||||
# Add all plugin data fields if available
|
||||
if plugin_data:
|
||||
format_context.update(plugin_data)
|
||||
|
||||
# For nagios_runner generic matches, expose the matched check's output
|
||||
# and status as short aliases {output} and {status} so display templates
|
||||
# don't need to use the full {check_disk_root_output} form.
|
||||
if check_name and plugin_data:
|
||||
if 'output' not in format_context:
|
||||
output = plugin_data.get(f"{check_name}_output")
|
||||
if output is not None:
|
||||
format_context['output'] = output
|
||||
if 'status' not in format_context:
|
||||
status = plugin_data.get(f"{check_name}_status")
|
||||
if status is not None:
|
||||
format_context['status'] = status
|
||||
|
||||
try:
|
||||
# Format the display string
|
||||
return display_format.format(**format_context)
|
||||
@@ -1073,17 +1223,22 @@ class ThresholdChecker:
|
||||
value: Any,
|
||||
threshold: ThresholdConfig,
|
||||
plugin_data: Optional[Dict[str, Any]],
|
||||
check_name: Optional[str] = None,
|
||||
metric_name: Optional[str] = None,
|
||||
) -> None:
|
||||
"""Handle a state-change transition with grace-period logic.
|
||||
|
||||
Transitioning INTO alert: defers the notification for grace_seconds.
|
||||
Transitioning INTO alert (worsening): defers the notification for grace_seconds.
|
||||
De-escalation within alert states (e.g. CRITICAL→WARNING): no new notification;
|
||||
the metric is still alerting so no RECOVER was sent.
|
||||
Transitioning TO OK:
|
||||
- Still in grace window (pending_since set): suppresses both the alert
|
||||
and the recovery — the spike never warranted a page.
|
||||
- Past grace: fires the RECOVER notification normally.
|
||||
"""
|
||||
lvl, message, formatted_msg = self._trigger_notification(
|
||||
host_name, metric_path, old_level, new_level, value, threshold, plugin_data
|
||||
host_name, metric_path, old_level, new_level, value, threshold, plugin_data,
|
||||
check_name=check_name, metric_name=metric_name,
|
||||
)
|
||||
alert_state.formatted_message = formatted_msg
|
||||
|
||||
@@ -1096,12 +1251,20 @@ class ThresholdChecker:
|
||||
alert_state.pending_since = None
|
||||
else:
|
||||
self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value)
|
||||
else:
|
||||
elif new_level.value > old_level.value:
|
||||
# Worsening (OK→WARNING, OK→CRITICAL, WARNING→CRITICAL): schedule notification.
|
||||
alert_state.pending_since = time.time()
|
||||
logger.debug(
|
||||
"Alert deferred (%.0fs grace): %s on %s = %s",
|
||||
self.grace_seconds, metric_path, host_name, value,
|
||||
)
|
||||
else:
|
||||
# De-escalation within alert states (e.g. CRITICAL→WARNING): metric is still
|
||||
# alerting but did not recover, so no new notification.
|
||||
logger.debug(
|
||||
"De-escalation %s→%s for %s on %s, no notification",
|
||||
old_level.name, new_level.name, metric_path, host_name,
|
||||
)
|
||||
|
||||
def _check_pending_or_renotify(
|
||||
self,
|
||||
@@ -1111,6 +1274,8 @@ class ThresholdChecker:
|
||||
value: Any,
|
||||
threshold: ThresholdConfig,
|
||||
plugin_data: Optional[Dict[str, Any]],
|
||||
check_name: Optional[str] = None,
|
||||
metric_name: Optional[str] = None,
|
||||
) -> None:
|
||||
"""Called when alert level is unchanged and non-OK.
|
||||
|
||||
@@ -1120,7 +1285,8 @@ class ThresholdChecker:
|
||||
if alert_state.pending_since is not None:
|
||||
if time.time() - alert_state.pending_since >= self.grace_seconds:
|
||||
lvl, message, formatted_msg = self._trigger_notification(
|
||||
host_name, metric_path, AlertLevel.OK, alert_state.level, value, threshold, plugin_data
|
||||
host_name, metric_path, AlertLevel.OK, alert_state.level, value, threshold, plugin_data,
|
||||
check_name=check_name, metric_name=metric_name,
|
||||
)
|
||||
alert_state.formatted_message = formatted_msg
|
||||
self._send_notification(
|
||||
@@ -1129,7 +1295,7 @@ class ThresholdChecker:
|
||||
alert_state.pending_since = None
|
||||
# else: still within grace window, do nothing
|
||||
else:
|
||||
self._check_renotify(host_name, alert_state, metric_path, value, threshold, plugin_data)
|
||||
self._check_renotify(host_name, alert_state, metric_path, value, threshold, plugin_data, check_name=check_name, metric_name=metric_name)
|
||||
|
||||
def _check_renotify(
|
||||
self,
|
||||
@@ -1139,6 +1305,8 @@ class ThresholdChecker:
|
||||
value: Any,
|
||||
threshold: ThresholdConfig,
|
||||
plugin_data: Optional[Dict[str, Any]] = None,
|
||||
check_name: Optional[str] = None,
|
||||
metric_name: Optional[str] = None,
|
||||
):
|
||||
"""Check if we should send a repeat notification.
|
||||
|
||||
@@ -1176,6 +1344,7 @@ class ThresholdChecker:
|
||||
|
||||
# Format operator symbol
|
||||
op_symbol = threshold.operator.value
|
||||
short_path = metric_path.partition(".")[2] or metric_path
|
||||
|
||||
# Time to re-notify
|
||||
if threshold_value is not None:
|
||||
@@ -1185,12 +1354,17 @@ class ThresholdChecker:
|
||||
value=value,
|
||||
threshold_value=threshold_value,
|
||||
op_symbol=op_symbol,
|
||||
plugin_data=plugin_data
|
||||
plugin_data=plugin_data,
|
||||
check_name=check_name,
|
||||
metric_name=metric_name,
|
||||
)
|
||||
message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} {threshold_info}, ongoing for {int(now - alert_state.since)}s"
|
||||
message = f"REMINDER ({alert_state.level.name}): {host_name} - {short_path} = {value} {threshold_info}, ongoing for {int(now - alert_state.since)}s"
|
||||
else:
|
||||
message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (ongoing for {int(now - alert_state.since)}s)"
|
||||
message = f"REMINDER ({alert_state.level.name}): {host_name} - {short_path} = {value} (ongoing for {int(now - alert_state.since)}s)"
|
||||
|
||||
from . import hbdclass
|
||||
host = hbdclass.Host.hosts.get(host_name)
|
||||
if host is None or host.watched:
|
||||
asyncio.get_event_loop().create_task(notify_mod.send_notification(
|
||||
host_name,
|
||||
notify_mod.Notification(
|
||||
@@ -1199,9 +1373,29 @@ class ThresholdChecker:
|
||||
level=alert_state.level.name,
|
||||
),
|
||||
))
|
||||
logger.info("Re-notification sent: %s", message)
|
||||
alert_state.last_notification = now
|
||||
alert_state.notification_count += 1
|
||||
logger.info("Re-notification sent: %s", message)
|
||||
|
||||
def purge_stale_alerts(self, hbdclass) -> None:
|
||||
"""Remove alert states that have no matching threshold configuration.
|
||||
|
||||
Called after startup (pickle restore) and after each config reload so
|
||||
that alerts orphaned by configuration changes do not linger forever.
|
||||
Alerts whose metric_path is not present in the current threshold config
|
||||
for that host are silently dropped.
|
||||
"""
|
||||
for hostname, host in hbdclass.Host.hosts.items():
|
||||
if not host.alert_states:
|
||||
continue
|
||||
configured = self.get_thresholds_for_host(hostname)
|
||||
stale = [mp for mp in host.alert_states if self._find_threshold(configured, mp)[0] is None]
|
||||
for mp in stale:
|
||||
logger.info(
|
||||
"Purging stale alert state for %s / %s (no threshold configured)",
|
||||
hostname, mp,
|
||||
)
|
||||
del host.alert_states[mp]
|
||||
|
||||
def get_active_alerts(self, alert_states: Dict[str, AlertState]) -> list:
|
||||
"""
|
||||
|
||||
+10
-2
@@ -211,6 +211,7 @@ def _make_timer_callbacks(uname, host, ctx):
|
||||
connection.newstate(connection.__class__.OVERDUE, now, cfg.get("grace", 2))
|
||||
msg = f"{connection.afam} overdue"
|
||||
eventlog(uname, "CRITICAL", msg)
|
||||
if host.watched:
|
||||
asyncio.create_task(notify_mod.send_notification(
|
||||
uname,
|
||||
notify_mod.Notification(title=f"[CRITICAL] {uname}", body=msg, level="CRITICAL"),
|
||||
@@ -335,8 +336,7 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
# Apply user-access settings from config
|
||||
access = config_mod.get_host_access(cfg, uname)
|
||||
host.apply_access(access["owner"], access["managers"], access["monitors"])
|
||||
if verbose:
|
||||
print(("XX: New host, num now %s" % (len(hbdcls.Host.hosts))))
|
||||
logger.info("New host signed on: %s (dyn=%s, access=%s)", uname, host.dyn, access)
|
||||
newh = True
|
||||
else:
|
||||
host = hbdcls.Host.hosts[uname]
|
||||
@@ -407,6 +407,7 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
|
||||
if res:
|
||||
eventlog(uname, "WARNING", res)
|
||||
if host.watched:
|
||||
asyncio.create_task(notify_mod.send_notification(
|
||||
uname,
|
||||
notify_mod.Notification(title=f"[WARNING] {uname}", body=res, level="WARNING"),
|
||||
@@ -420,6 +421,7 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
|
||||
if boot:
|
||||
eventlog(uname, "INFO", "booted")
|
||||
if host.watched:
|
||||
asyncio.create_task(notify_mod.send_notification(
|
||||
uname,
|
||||
notify_mod.Notification(title=f"[INFO] {uname}", body=f"{host.name} booted", level="INFO"),
|
||||
@@ -437,9 +439,14 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
if not newh:
|
||||
if d == 0 or lasts == "unknown":
|
||||
m = "%s is up" % (conn.afam)
|
||||
elif d < 4:
|
||||
# Transient blip (likely client restart) — skip log and notification
|
||||
m = None
|
||||
else:
|
||||
m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d))
|
||||
if m:
|
||||
eventlog(uname, "RECOVER", m)
|
||||
if host.watched:
|
||||
asyncio.create_task(notify_mod.send_notification(
|
||||
uname,
|
||||
notify_mod.Notification(title=f"[RECOVER] {uname}", body=m, level="RECOVER"),
|
||||
@@ -453,6 +460,7 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
if shutdown:
|
||||
m = "%s shutdown" % conn.afam
|
||||
eventlog(uname, "INFO", m)
|
||||
if host.watched:
|
||||
asyncio.create_task(notify_mod.send_notification(
|
||||
uname,
|
||||
notify_mod.Notification(title=f"[INFO] {uname}", body=m, level="INFO"),
|
||||
|
||||
+52
-9
@@ -13,7 +13,8 @@ from . import data
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_connections: set = set()
|
||||
# Map of WebSocket → User object (or None when auth is disabled)
|
||||
_connections: dict = {}
|
||||
_loop: Optional[asyncio.AbstractEventLoop] = None
|
||||
_get_hosts: Optional[Callable[[], Iterable]] = None
|
||||
_verbose: bool = False
|
||||
@@ -34,22 +35,52 @@ def setup(
|
||||
_verbose = verbose
|
||||
|
||||
|
||||
def _user_can_see_host(user, host_name: str) -> bool:
|
||||
"""Return True if *user* may see updates for *host_name* (manager or higher)."""
|
||||
from . import hbdclass, users as users_mod
|
||||
if user is None or not users_mod.users_enabled():
|
||||
return True
|
||||
if user.admin:
|
||||
return True
|
||||
host = hbdclass.Host.hosts.get(host_name)
|
||||
if host is None:
|
||||
return False
|
||||
return host.is_manager(user.username)
|
||||
|
||||
|
||||
def _get_token(request) -> str:
|
||||
"""Extract session token from request (mirrors logic in http.py)."""
|
||||
auth = request.headers.get("Authorization", "")
|
||||
if auth.startswith("Bearer "):
|
||||
return auth[7:].strip()
|
||||
token = request.headers.get("X-Auth-Token", "")
|
||||
if token:
|
||||
return token
|
||||
return request.cookies.get("hbd_session", "")
|
||||
|
||||
|
||||
async def handler(request):
|
||||
"""aiohttp WebSocket upgrade handler — register as GET /ws."""
|
||||
from aiohttp import web
|
||||
from . import users as users_mod
|
||||
|
||||
ws = web.WebSocketResponse()
|
||||
await ws.prepare(request)
|
||||
|
||||
_connections.add(ws)
|
||||
token = _get_token(request)
|
||||
user = users_mod.get_session_user(token) if token else None
|
||||
|
||||
_connections[ws] = user
|
||||
remote = request.remote
|
||||
logger.info("WebSocket connected from %s", remote)
|
||||
|
||||
try:
|
||||
# Send current host state to the new client
|
||||
# Send current host state, filtered to hosts this user may see
|
||||
if _get_hosts:
|
||||
try:
|
||||
for h in list(_get_hosts()):
|
||||
host_name = h.get("raw_name") or h.get("name", "")
|
||||
if _user_can_see_host(user, host_name):
|
||||
await ws.send_str(json.dumps({"type": "host", "data": h}))
|
||||
except Exception as e:
|
||||
logger.error("Error sending initial hosts: %s", e)
|
||||
@@ -74,7 +105,7 @@ async def handler(request):
|
||||
except Exception as e:
|
||||
logger.exception("WebSocket handler error from %s: %s", remote, e)
|
||||
finally:
|
||||
_connections.discard(ws)
|
||||
_connections.pop(ws, None)
|
||||
logger.info("WebSocket disconnected from %s", remote)
|
||||
|
||||
return ws
|
||||
@@ -83,25 +114,37 @@ async def handler(request):
|
||||
def broadcast(typ: str, payload) -> bool:
|
||||
"""Thread-safe broadcast to all connected WebSocket clients.
|
||||
|
||||
For host and plugin updates, only sends to clients whose user has
|
||||
manager-or-higher access to that host. Other message types are
|
||||
broadcast to all clients.
|
||||
|
||||
Can be called from any thread; schedules sends on the event loop.
|
||||
Returns False if the loop is not running yet.
|
||||
"""
|
||||
if not _loop:
|
||||
return False
|
||||
|
||||
# Determine the host name for access-filtered message types
|
||||
host_name: Optional[str] = None
|
||||
if typ in ("host", "plugin"):
|
||||
host_name = payload.get("raw_name") or payload.get("host") or payload.get("name")
|
||||
|
||||
jmsg = json.dumps({"type": typ, "data": payload})
|
||||
|
||||
async def _send_all():
|
||||
dead = set()
|
||||
for ws in list(_connections):
|
||||
for ws, user in list(_connections.items()):
|
||||
try:
|
||||
if not ws.closed:
|
||||
await ws.send_str(jmsg)
|
||||
else:
|
||||
if ws.closed:
|
||||
dead.add(ws)
|
||||
continue
|
||||
if host_name is not None and not _user_can_see_host(user, host_name):
|
||||
continue
|
||||
await ws.send_str(jmsg)
|
||||
except Exception:
|
||||
dead.add(ws)
|
||||
for ws in dead:
|
||||
_connections.discard(ws)
|
||||
_connections.pop(ws, None)
|
||||
|
||||
asyncio.run_coroutine_threadsafe(_send_all(), _loop)
|
||||
return True
|
||||
|
||||
+4
-1
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "hbd"
|
||||
version = "5.1.8"
|
||||
version = "5.2.1"
|
||||
description = "Heartbeat monitoring system — client (hbc) and server (hbd)"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.11"
|
||||
@@ -34,6 +34,9 @@ server = [
|
||||
"matrix-nio>=0.24",
|
||||
]
|
||||
|
||||
# Minimal client — hbc_mini only, no external dependencies
|
||||
mini = []
|
||||
|
||||
# Install both client and server
|
||||
all = [
|
||||
"hbd[client,server]",
|
||||
|
||||
@@ -4,12 +4,14 @@ set -e
|
||||
uv version --bump patch
|
||||
VER=$(uv version --short)
|
||||
sed -i".bak" "s/__version__ = \"[0-9.]*\"\(.*\)$/__version__ = \"$VER\"\1/" hbd/__init__.py
|
||||
sed -i".bak" "s/__version__ = \"[0-9.]*\"\(.*\)$/__version__ = \"$VER\"\1/" scripts/hbc_mini.py
|
||||
|
||||
# commit pyproject.toml
|
||||
git commit -m "version $VER" pyproject.toml hbd/__init__.py
|
||||
git commit -m "version $VER" pyproject.toml hbd/__init__.py scripts/hbc_mini.py
|
||||
git push
|
||||
# tag version
|
||||
git tag -a v$VER -m "Version $VER"
|
||||
git push --tags
|
||||
|
||||
rm hbd/__init__.py.bak
|
||||
rm scripts/hbc_mini.py.bak
|
||||
|
||||
+45
-27
@@ -12,11 +12,14 @@
|
||||
set -e
|
||||
what=$1
|
||||
on_ha=0
|
||||
where=""
|
||||
venv=""
|
||||
[ "$2" = "HA" ] && on_ha=1
|
||||
[ -z "$what" ] && what="client"
|
||||
|
||||
if [ -d /homeassistant ]; then
|
||||
echo "cannot install in HA, running \"docker exec homeassistant $0 $@\""
|
||||
docker exec homeassistant $0 $@
|
||||
if [ -d /homeassistant ]; then # if running from HA command line
|
||||
echo "HA, running \"docker exec homeassistant /config/bin/hb_install.sh $@\""
|
||||
docker exec homeassistant /config/bin/hb_install.sh $@ HA
|
||||
rc=$?
|
||||
if [ $rc -ne 0 ]; then
|
||||
echo "Failed to install heartbeat in HA, please check the logs for more details"
|
||||
@@ -24,11 +27,12 @@ if [ -d /homeassistant ]; then
|
||||
fi
|
||||
exit 0
|
||||
fi
|
||||
if [ -d /config ]; then
|
||||
echo "Installing on HA"
|
||||
|
||||
if [ $on_ha -eq 1 ] || [ -r /.dockerenv ] && [ -d /config/bin ]; then
|
||||
# Installing under docker on Home Assistant OS, using /config/bin for executables and /config/venvs for virtual environments
|
||||
echo "Home Assistant OS detected, installing under docker"
|
||||
where="/config/bin"
|
||||
venv="/config/venvs"
|
||||
on_ha=1
|
||||
else
|
||||
if [ ! -d $HOME/.local/bin ] && [ ! -d $HOME/bin ]; then
|
||||
echo "No suitable bin directory found in PATH, please add either $HOME/.local/bin or $HOME/bin to your PATH"
|
||||
@@ -43,24 +47,32 @@ else
|
||||
echo "No suitable bin directory found in PATH, please add either $HOME/.local/bin or $HOME/bin to your PATH"
|
||||
exit 1
|
||||
fi
|
||||
if [ "$what" = "mini" ]; then
|
||||
venv=""
|
||||
else
|
||||
venv="$HOME/venvs"
|
||||
fi
|
||||
fi
|
||||
echo "Installing $what to $where"
|
||||
if [ ! -z "$venv" ]; then
|
||||
echo "Using virtual environment at $venv/hbd"
|
||||
fi
|
||||
|
||||
echo "Installing heartbeat $what"
|
||||
|
||||
if [ ! -d $venv/hbd ]; then
|
||||
set +e
|
||||
python3 -m pip --version > /dev/null 2>&1
|
||||
rc=$?
|
||||
set -e
|
||||
if [ $rc -ne 0 ]; then
|
||||
# truenas does not have pip installed by default, so we need to fetch get-pip.py and install pip
|
||||
if [ "$venv" != "" ] && [ ! -d $venv/hbd ]; then
|
||||
arg=""
|
||||
have_pip=$(python3 -c "import pip" 2>/dev/null &> /dev/null && echo "Installed" || echo "Not Installed")
|
||||
if [ "$have_pip" = "Not Installed" ]; then
|
||||
# some systems do not have pip installed by default, so we need to fetch get-pip.py and install pip
|
||||
echo "pip is not installed, fetching get-pip.py and installing pip"
|
||||
arg="--without-pip"
|
||||
fi
|
||||
mkdir -p $venv
|
||||
have_venv=$(python3 -c "import venv" &> /dev/null && echo "Installed" || echo "Not Installed")
|
||||
have_venv=$(python3 -c "import venv" 2>/dev/null &> /dev/null && echo "Installed" || echo "Not Installed")
|
||||
if [ "$have_venv" = "Not Installed" ]; then
|
||||
if [ "$have_pip" = "Not Installed" ]; then
|
||||
echo "python has no venv, and no pip to install virtualenv, cannot continue"
|
||||
exit 1
|
||||
fi
|
||||
echo "python venv module not found, installing virtualenv"
|
||||
python3 -m pip install --user virtualenv
|
||||
python3 -m virtualenv $venv/hbd --system-site-packages $arg
|
||||
@@ -74,24 +86,30 @@ if [ ! -d $venv/hbd ]; then
|
||||
deactivate
|
||||
fi
|
||||
|
||||
if [ ! -z "$venv" ]; then
|
||||
. $venv/hbd/bin/activate
|
||||
fi
|
||||
if [ "$what" = "mini" ]; then
|
||||
curl -s -o $where/hbc_mini https://git.wrede.ca/andreas/heartbeat/raw/branch/master/scripts/hbc_mini.py
|
||||
chmod +x $where/hbc_mini
|
||||
else
|
||||
python3 -mpip install --upgrade --index-url https://git.wrede.ca/api/packages/andreas/pypi/simple/ --extra-index-url https://pypi.org/simple hbd[$what]
|
||||
fi
|
||||
|
||||
if [ ! -z "$venv" ]; then
|
||||
echo "linking executables to $where"
|
||||
if [ "$what" = "server" ]; then
|
||||
rm -f $where/hbd
|
||||
ln -sf $(which hbd) $where/hbd
|
||||
echo "hbd installed, you can run it with \"$where/hbd\" or \"hbd\" if $where is in your PATH"
|
||||
else
|
||||
elif [ "$what" = "client" ]; then
|
||||
rm -f $where/hbc
|
||||
ln -sf $(which hbc) $where/hbc
|
||||
# rm -f $where/hb_install.sh
|
||||
cp "$0" $where/hb_install.sh
|
||||
chmod +x $where/hb_install.sh
|
||||
if [ $on_ha -eq 1 ]; then
|
||||
echo "restarting hbc "
|
||||
job=$(grep run_hbc configuration.yaml | sed 's/run_hbc://')
|
||||
$job
|
||||
else
|
||||
echo "hbc installed, you can run it with \"$where/hbc\" or \"hbc\" if $where is in your PATH"
|
||||
fi
|
||||
rm -f $where/hb_install.sh
|
||||
ln -sf $(which hb_install.sh) $where/hb_install.sh
|
||||
fi
|
||||
echo "Installation complete. To upgrade, run the following:"
|
||||
echo " $where/hb_install.sh $what"
|
||||
echo "To install on another machine, run the following obtain the install script and run it:"
|
||||
echo "from https://git.wrede.ca/andreas/heartbeat/raw/branch/master/scripts/hb_install.sh"
|
||||
echo "and then run sh hb_install.sh [mini|client]"
|
||||
+38
-11
@@ -40,6 +40,9 @@ from logging.handlers import SysLogHandler
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
# updated by scripts/bumpminor.sh
|
||||
__version__ = "5.2.1"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Protocol (mirrors hbd/common/proto.py)
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -233,7 +236,7 @@ class OSInfoPlugin(InfoPlugin):
|
||||
"machine": platform.machine(),
|
||||
"architecture": platform.architecture()[0],
|
||||
"python_version": platform.python_version(),
|
||||
"hbc_version": "5.1.8",
|
||||
"hbc_version": __version__,
|
||||
"hbc_type": "mini",
|
||||
}
|
||||
if platform.system() == "Linux":
|
||||
@@ -385,7 +388,6 @@ class NagiosRunnerPlugin(MonitorPlugin):
|
||||
|
||||
async def _collect_metrics(self) -> Dict[str, Any]:
|
||||
results: Dict[str, Any] = {}
|
||||
worst = 0
|
||||
for cmd_cfg in self.commands:
|
||||
name = cmd_cfg.get("name")
|
||||
command = cmd_cfg.get("command")
|
||||
@@ -396,10 +398,6 @@ class NagiosRunnerPlugin(MonitorPlugin):
|
||||
results[f"{name}_status_code"] = rc
|
||||
results[f"{name}_output"] = msg
|
||||
results.update({f"{name}_{k}": v for k, v in perf.items()})
|
||||
worst = max(worst, rc)
|
||||
results["overall_status"] = _NAGIOS_STATUS.get(worst, "UNKNOWN")
|
||||
results["overall_status_code"] = worst
|
||||
results["plugin_count"] = len(self.commands)
|
||||
return results
|
||||
|
||||
|
||||
@@ -484,6 +482,12 @@ class CPUMonitorPlugin(MonitorPlugin):
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
try:
|
||||
with open("/proc/uptime") as fh:
|
||||
data["uptime_seconds"] = int(float(fh.read().split()[0]))
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return data
|
||||
|
||||
|
||||
@@ -532,6 +536,20 @@ class MemoryMonitorPlugin(MonitorPlugin):
|
||||
total = mi.get("MemTotal", 0)
|
||||
avail = mi.get("MemAvailable", mi.get("MemFree", 0))
|
||||
free = mi.get("MemFree", 0)
|
||||
|
||||
# ZFS ARC is reclaimable but not included in MemAvailable; add it.
|
||||
arc_kb = 0
|
||||
try:
|
||||
with open("/proc/spl/kstat/zfs/arcstats") as _f:
|
||||
for _line in _f:
|
||||
_p = _line.split()
|
||||
if len(_p) >= 3 and _p[0] == "size":
|
||||
arc_kb = int(_p[2]) // 1024
|
||||
break
|
||||
except (OSError, ValueError):
|
||||
pass
|
||||
|
||||
avail = min(avail + arc_kb, total)
|
||||
used = total - avail
|
||||
data: Dict[str, Any] = {
|
||||
"memory_total": total * 1024,
|
||||
@@ -875,7 +893,7 @@ async def _handle_update(conn: AsyncConnection):
|
||||
log.info("running installer: %s", installer)
|
||||
try:
|
||||
proc = await asyncio.create_subprocess_exec(
|
||||
installer, "client",
|
||||
installer, "mini",
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.STDOUT,
|
||||
)
|
||||
@@ -1049,8 +1067,8 @@ async def _async_main(args, cfg: Dict[str, Any]) -> int:
|
||||
if args.message:
|
||||
bmsg["service"] = "service"
|
||||
bmsg["msg"] = args.message
|
||||
for c in connections:
|
||||
await c.sendto(bmsg)
|
||||
target = next((c for c in connections if c._transport), connections[0])
|
||||
await target.sendto(bmsg)
|
||||
if args.message and not args.daemon:
|
||||
await asyncio.sleep(0.3)
|
||||
for c in connections:
|
||||
@@ -1063,6 +1081,13 @@ async def _async_main(args, cfg: Dict[str, Any]) -> int:
|
||||
for sig in (signal.SIGTERM, signal.SIGINT):
|
||||
loop.add_signal_handler(sig, _stop)
|
||||
|
||||
def _sighup():
|
||||
global _dorestart
|
||||
_dorestart = True
|
||||
_stop()
|
||||
|
||||
loop.add_signal_handler(signal.SIGHUP, _sighup)
|
||||
|
||||
for conn in connections:
|
||||
_active_tasks.append(asyncio.create_task(_heartbeat_sender(conn, interval)))
|
||||
|
||||
@@ -1075,11 +1100,13 @@ async def _async_main(args, cfg: Dict[str, Any]) -> int:
|
||||
pass
|
||||
|
||||
log.info("shutting down")
|
||||
for conn in connections:
|
||||
target = next((c for c in connections if c._transport), connections[0] if connections else None)
|
||||
if target:
|
||||
try:
|
||||
await conn.sendto({"shutdown": 1, "acks": conn.ackcount})
|
||||
await target.sendto({"shutdown": 1, "acks": target.ackcount})
|
||||
except Exception:
|
||||
pass
|
||||
for conn in connections:
|
||||
conn.close()
|
||||
await asyncio.sleep(0.3)
|
||||
for plugin in plugins:
|
||||
|
||||
+1
-2
@@ -68,8 +68,7 @@ async def test_nagios_runner():
|
||||
print(f" ✓ Collected {len(data)} data points")
|
||||
|
||||
print(f"\n4. Results:")
|
||||
print(f" Overall Status: {data.get('overall_status')} (code: {data.get('overall_status_code')})")
|
||||
print(f" Plugins Executed: {data.get('plugin_count')}")
|
||||
print(f" Data points collected: {len(data)}")
|
||||
|
||||
# Show individual plugin results
|
||||
print(f"\n5. Individual Plugin Results:")
|
||||
|
||||
Reference in New Issue
Block a user