Compare commits
21 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| | e931acb9f5 | | |
| | 018409e71d | | |
| | 1824f637b4 | | |
| | a534c06b26 | | |
| | d7b5c97a4e | | |
| | ae447ac4a6 | | |
| | d44ce3d124 | | |
| | b1985d0eb2 | | |
| | de778f680f | | |
| | d7b368c7c6 | | |
| | e790663f9f | | |
| | 475319e248 | | |
| | ca5ef384a8 | | |
| | c93dbdc0f4 | | |
| | 3a546a1e5c | | |
| | 74c89d098c | | |
| | 3301dbfe34 | | |
| | d00d903e7d | | |
| | babb5d61aa | | |
| | 11d1c718b3 | | |
| | a99b6b54c7 | | |
@@ -27,6 +27,7 @@ A lightweight daemon that listens for UDP heartbeat messages and acts on them: k
|
|||||||
- Configurable retention and backup management
|
- Configurable retention and backup management
|
||||||
- **Plugin system for extensible monitoring** ✅
|
- **Plugin system for extensible monitoring** ✅
|
||||||
- Collect system metrics (CPU, memory, disk, network)
|
- Collect system metrics (CPU, memory, disk, network)
|
||||||
|
- Monitor ZFS pool health, capacity, and I/O via `zpool(8)`
|
||||||
- Execute existing Nagios monitoring plugins
|
- Execute existing Nagios monitoring plugins
|
||||||
- Create custom plugins with simple Python classes
|
- Create custom plugins with simple Python classes
|
||||||
- **Threshold alerting system** ✅
|
- **Threshold alerting system** ✅
|
||||||
@@ -34,6 +35,8 @@ A lightweight daemon that listens for UDP heartbeat messages and acts on them: k
|
|||||||
- Hysteresis to prevent alert flapping
|
- Hysteresis to prevent alert flapping
|
||||||
- Automatic notifications on state changes
|
- Automatic notifications on state changes
|
||||||
- Re-notification for ongoing alerts
|
- Re-notification for ongoing alerts
|
||||||
|
- **Per-host watch flag** — set `watch: false` on any host to silence all notifications for that host without removing its configuration ✅
|
||||||
|
- **Role-filtered dashboards** — Live Dashboard and Host Overview show only hosts where the logged-in user is owner or manager (admins see all) ✅
|
||||||
- Modular codebase suitable for unit testing and CI ✅
|
- Modular codebase suitable for unit testing and CI ✅
|
||||||
|
|
||||||
---
|
---
|
||||||
@@ -55,21 +58,26 @@ Heartbeat includes a comprehensive plugin architecture that extends monitoring b
|
|||||||
### Built-in Plugins
|
### Built-in Plugins
|
||||||
|
|
||||||
- `os_info`: Collects OS, kernel, distribution, and architecture information
|
- `os_info`: Collects OS, kernel, distribution, and architecture information
|
||||||
- `cpu_monitor`: Monitors CPU usage, load average, frequency, and process counts
|
- `cpu_monitor`: Monitors CPU usage, load average, frequency, process counts, and uptime
|
||||||
- `memory_monitor`: Monitors RAM and swap usage, available memory
|
- `memory_monitor`: Monitors RAM and swap usage, available memory (ZFS ARC-aware)
|
||||||
- `disk_monitor`: Monitors disk usage, I/O statistics, and filesystem metrics
|
- `disk_monitor`: Monitors disk usage, I/O statistics, and filesystem metrics
|
||||||
- `network_monitor`: Monitors network interface statistics, bandwidth, and connections
|
- `network_monitor`: Monitors network interface statistics, bandwidth, and connections
|
||||||
|
- `ping_monitor`: Measures round-trip latency to configured hosts
|
||||||
- `filesystem_info`: Collects mounted filesystem information (physical filesystems only by default)
|
- `filesystem_info`: Collects mounted filesystem information (physical filesystems only by default)
|
||||||
- `nagios_runner`: Executes Nagios monitoring plugins (check_disk, check_load, check_http, etc.)
|
- `nagios_runner`: Executes Nagios monitoring plugins (check_disk, check_load, check_http, etc.)
|
||||||
|
- `zfs_monitor`: Monitors ZFS pool health, capacity, fragmentation, dedup ratio, and cumulative I/O via `zpool(8)`
|
||||||
|
|
||||||
### Nagios Integration
|
### Nagios Integration
|
||||||
|
|
||||||
The `nagios_runner` plugin provides seamless integration with the vast Nagios plugin ecosystem. You can run any Nagios-compatible plugin and have the results automatically parsed and stored:
|
The `nagios_runner` plugin provides seamless integration with the vast Nagios plugin ecosystem. You can run any Nagios-compatible plugin and have the results automatically parsed and stored:
|
||||||
|
|
||||||
- Executes plugins via subprocess with timeout protection
|
- Executes plugins asynchronously (non-blocking) with timeout protection
|
||||||
|
- Captures both stdout and stderr; if stdout is empty, stderr is used as the status message
|
||||||
|
- Handles signal-killed processes (negative exit code → UNKNOWN status)
|
||||||
|
- Validates absolute command paths at startup and warns on missing or non-executable files
|
||||||
- Parses exit codes (OK/WARNING/CRITICAL/UNKNOWN)
|
- Parses exit codes (OK/WARNING/CRITICAL/UNKNOWN)
|
||||||
- Extracts performance data with thresholds
|
- Extracts performance data with thresholds
|
||||||
- Reports aggregated status across all configured checks
|
- Reports per-check status, exit code, and output; there is no aggregate rollup field (`overall_status`, `overall_status_code`, and `plugin_count` are no longer emitted)
|
||||||
|
|
||||||
See [docs/NAGIOS_INTEGRATION.md](docs/NAGIOS_INTEGRATION.md) for complete integration guide including configuration examples and custom plugin development.
|
See [docs/NAGIOS_INTEGRATION.md](docs/NAGIOS_INTEGRATION.md) for complete integration guide including configuration examples and custom plugin development.
|
||||||
|
|
||||||
@@ -147,9 +155,11 @@ Heartbeat includes a sophisticated threshold alerting system that monitors plugi
|
|||||||
- **Multi-level alerts**: WARNING and CRITICAL severity levels
|
- **Multi-level alerts**: WARNING and CRITICAL severity levels
|
||||||
- **Flexible operators**: Support for >, >=, <, <=, ==, != comparisons
|
- **Flexible operators**: Support for >, >=, <, <=, ==, != comparisons
|
||||||
- **Hysteresis**: Prevents alert flapping with configurable recovery thresholds
|
- **Hysteresis**: Prevents alert flapping with configurable recovery thresholds
|
||||||
- **Smart notifications**: Alerts only on state changes, not every check
|
- **Smart notifications**: Alerts only on state changes, not every check; de-escalations (e.g. CRITICAL → WARNING) do not generate a notification
|
||||||
- **Re-notifications**: Periodic reminders for ongoing alerts
|
- **Re-notifications**: Periodic reminders for ongoing alerts
|
||||||
|
- **Short-duration suppression**: Recovery notifications are suppressed when the down event lasted less than 4 seconds (avoids noise from transient blips)
|
||||||
- **Journal integration**: All threshold events logged for audit trail
|
- **Journal integration**: All threshold events logged for audit trail
|
||||||
|
- **`ping_monitor` thresholds**: Latency and packet-loss thresholds use the same format as all other plugin metrics
|
||||||
|
|
||||||
### Configuration
|
### Configuration
|
||||||
|
|
||||||
@@ -172,7 +182,8 @@ thresholds:
|
|||||||
warning: 80.0 # Warn when CPU > 80%
|
warning: 80.0 # Warn when CPU > 80%
|
||||||
critical: 90.0 # Critical when CPU > 90%
|
critical: 90.0 # Critical when CPU > 90%
|
||||||
operator: ">"
|
operator: ">"
|
||||||
hysteresis: 0.1 # 10% hysteresis to prevent flapping
|
hysteresis: 0.02 # 2% hysteresis to prevent flapping
|
||||||
|
display: "(threshold: {op_symbol} {threshold_value}%)" # optional
|
||||||
|
|
||||||
memory_monitor:
|
memory_monitor:
|
||||||
percent:
|
percent:
|
||||||
@@ -214,7 +225,7 @@ thresholds:
|
|||||||
<hostname>:
|
<hostname>:
|
||||||
warning: <milliseconds> # Warn when RTT > this value
|
warning: <milliseconds> # Warn when RTT > this value
|
||||||
critical: <milliseconds> # Critical when RTT > this value
|
critical: <milliseconds> # Critical when RTT > this value
|
||||||
hysteresis: 0.1 # Optional: 10% hysteresis (default)
|
hysteresis: 0.02 # Optional: 2% hysteresis (default)
|
||||||
```
|
```
|
||||||
|
|
||||||
**Example alerts:**
|
**Example alerts:**
|
||||||
@@ -265,7 +276,59 @@ All plugin metrics can be thresholded:
|
|||||||
- **Memory**: percent, available_mb, swap_percent
|
- **Memory**: percent, available_mb, swap_percent
|
||||||
- **Disk**: Per-partition percent, free_gb, free_mb
|
- **Disk**: Per-partition percent, free_gb, free_mb
|
||||||
- **Network**: errors_total, dropped packets, connection counts
|
- **Network**: errors_total, dropped packets, connection counts
|
||||||
- **Nagios**: exit_code mapping (0=OK, 1=WARNING, 2=CRITICAL)
|
- **Nagios**: Any field emitted by `nagios_runner` (`<name>_status_code`, `<name>_status`, `<name>_output`, performance data fields)
|
||||||
|
|
||||||
|
### Display Format Templates
|
||||||
|
|
||||||
|
Each threshold entry accepts an optional `display` field — a Python format string shown in notifications and on the Alerts dashboard:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
nagios_runner:
|
||||||
|
status_code:
|
||||||
|
warning: 1
|
||||||
|
critical: 2
|
||||||
|
operator: ">="
|
||||||
|
display: "{check_name}: exit {value} (expected < {threshold_value})"
|
||||||
|
```
|
||||||
|
|
||||||
|
Available variables:
|
||||||
|
|
||||||
|
| Variable | Description |
|
||||||
|
|---|---|
|
||||||
|
| `{value}` | Current metric value |
|
||||||
|
| `{threshold_value}` | Threshold that was crossed |
|
||||||
|
| `{op_symbol}` | Comparison operator (`>`, `<`, `>=`, …); `"nagios"` for the nagios operator |
|
||||||
|
| `{check_name}` | Prefix stripped by generic matching (see below) |
|
||||||
|
| `{metric_name}` | Full field name within the plugin data |
|
||||||
|
| `{output}` | For `nagios_runner` generic matches: the matched check's status text (alias for `{check_name}_output`) |
|
||||||
|
| `{status}` | For `nagios_runner` generic matches: the matched check's status name — OK/WARNING/CRITICAL/UNKNOWN (alias for `{check_name}_status`) |
|
||||||
|
| any plugin field | Any other field present in the plugin's data |
|
||||||
|
|
||||||
|
### Generic Threshold Matching
|
||||||
|
|
||||||
|
When a metric name has no exact threshold entry, the server progressively strips leading underscore-separated segments and retries the lookup after each strip. This lets a single generic entry cover an entire family of metrics.
|
||||||
|
|
||||||
|
The classic use case is `nagios_runner`, which names each metric after the command that produced it:
|
||||||
|
|
||||||
|
```
|
||||||
|
nagios_runner.check_disk_root_status_code → no exact match
|
||||||
|
nagios_runner.disk_root_status_code → no match
|
||||||
|
nagios_runner.root_status_code → no match
|
||||||
|
nagios_runner.status_code → matched ✓
|
||||||
|
```
|
||||||
|
|
||||||
|
Configure the generic threshold once using the `nagios` operator, which maps exit codes directly to alert severity without requiring numeric warning/critical values:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
nagios_runner:
|
||||||
|
status_code:
|
||||||
|
operator: "nagios" # 0=OK 1=WARNING 2=CRITICAL 3=UNKNOWN
|
||||||
|
display: "{check_name}: {output}"
|
||||||
|
```
|
||||||
|
|
||||||
|
The stripped prefix (`check_disk_root` in the example above) is available as `{check_name}` in the display template, so you can identify which check triggered the alert without writing a separate threshold entry per command.
|
||||||
|
|
||||||
|
Exact matches always take priority. A generic entry only applies when no specific one is defined.
|
||||||
|
|
||||||
### Per-Host Threshold Profiles
|
### Per-Host Threshold Profiles
|
||||||
|
|
||||||
@@ -363,9 +426,10 @@ Heartbeat includes a built-in HTTP/WebSocket server that provides both a REST AP
|
|||||||
### Web Dashboards
|
### Web Dashboards
|
||||||
|
|
||||||
- **Login** (`/login`): Browser login form (shown automatically when auth is configured)
|
- **Login** (`/login`): Browser login form (shown automatically when auth is configured)
|
||||||
- **Live View** (`/live`): Real-time host connectivity, latency, and messages
|
- **Live View** (`/live`): Real-time host connectivity, latency, and messages; hostnames link directly to the Host Overview page
|
||||||
- **Plugin Metrics** (`/plugins`): Browse and visualize metrics from all plugins
|
- **Host Overview** (`/plugins/<host>`): Per-host plugin metrics with ZFS pool visualization; filtered to hosts where the logged-in user is owner or manager (admins see all)
|
||||||
- **Alerts Dashboard** (`/alerts`): Monitor active alerts with severity filtering
|
- **Alerts Dashboard** (`/alerts`): Monitor active alerts with severity filtering; alert count pie chart shown in the navigation bar
|
||||||
|
- **Settings** (`/settings`): Server configuration, user management, and threshold configuration viewer
|
||||||
|
|
||||||
### API Endpoints
|
### API Endpoints
|
||||||
|
|
||||||
@@ -451,12 +515,11 @@ You can also run it via the module entrypoint:
|
|||||||
python -m hbd.client.main your-server.example.com
|
python -m hbd.client.main your-server.example.com
|
||||||
```
|
```
|
||||||
|
|
||||||
Client configuration can also be specified in YAML:
|
Client configuration can also be specified in YAML (`~/.hbc.yaml`):
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
server: hbd.example.com
|
hb_port: 50003 # Server port (default: 50003)
|
||||||
port: 50003
|
interval: 30 # Heartbeat interval in seconds
|
||||||
interval: 30
|
|
||||||
plugins:
|
plugins:
|
||||||
cpu_monitor:
|
cpu_monitor:
|
||||||
interval: 300 # Check every 5 minutes (default)
|
interval: 300 # Check every 5 minutes (default)
|
||||||
@@ -470,12 +533,20 @@ plugins:
|
|||||||
nagios_runner:
|
nagios_runner:
|
||||||
interval: 300 # Check every 5 minutes (default)
|
interval: 300 # Check every 5 minutes (default)
|
||||||
commands:
|
commands:
|
||||||
- /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
|
- name: check_load
|
||||||
- /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
|
command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
|
||||||
|
- name: check_disk
|
||||||
|
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
|
||||||
```
|
```
|
||||||
|
|
||||||
|
The server hostname is always passed as a positional command-line argument; there is no `server:` config key.
|
||||||
|
|
||||||
All monitoring plugins default to 5-minute (300 second) intervals, but can be customized as needed.
|
All monitoring plugins default to 5-minute (300 second) intervals, but can be customized as needed.
|
||||||
|
|
||||||
|
**Connection retry:** If a server is temporarily unreachable, `hbc` retries `open()` indefinitely on every heartbeat interval. IPv6 connections that never succeeded during early startup are dropped after 3 consecutive failures (to handle hosts without IPv6 routing), while IPv4 connections always retry.
|
||||||
|
|
||||||
|
**Daemon logging:** When running with `-d`, `hbc` routes all log output to syslog (`LOG_DAEMON` facility) after daemonizing. Without `-d`, logs go to stderr as usual.
|
||||||
|
|
||||||
### hbc_mini — single-file client (no external dependencies)
|
### hbc_mini — single-file client (no external dependencies)
|
||||||
|
|
||||||
`scripts/hbc_mini.py` is a self-contained version of the heartbeat client that requires only Python 3.8+ and no external packages. Copy it to any host and run it directly — no virtualenv, no `pip install`.
|
`scripts/hbc_mini.py` is a self-contained version of the heartbeat client that requires only Python 3.8+ and no external packages. Copy it to any host and run it directly — no virtualenv, no `pip install`.
|
||||||
@@ -531,8 +602,10 @@ python3 hbc_mini.py -m "maintenance starting" your-server.example.com
|
|||||||
|
|
||||||
- No YAML config (use JSON instead)
|
- No YAML config (use JSON instead)
|
||||||
- No `filesystem_info` plugin
|
- No `filesystem_info` plugin
|
||||||
|
- No `zfs_monitor` plugin (requires `zpool(8)` and the full plugin loader)
|
||||||
- `cpu_monitor` does not report per-core usage or CPU frequency (no psutil)
|
- `cpu_monitor` does not report per-core usage or CPU frequency (no psutil)
|
||||||
- Plugins cannot be loaded from external `.py` files — all plugins are compiled in
|
- Plugins cannot be loaded from external `.py` files — all plugins are compiled in
|
||||||
|
- No IPv6 early-fail protection — connections that fail to open at startup are silently skipped rather than retried
|
||||||
|
|
||||||
Everything else — heartbeat protocol, ACK/CMD/UPD handling, `hb_install.sh`-based self-update, daemonize, syslog — is identical to the full client.
|
Everything else — heartbeat protocol, ACK/CMD/UPD handling, `hb_install.sh`-based self-update, daemonize, syslog — is identical to the full client.
|
||||||
|
|
||||||
|
|||||||
@@ -104,11 +104,6 @@ The `nagios_runner` plugin collects:
|
|||||||
- `{name}_{metric}_min` - Minimum value (if present)
|
- `{name}_{metric}_min` - Minimum value (if present)
|
||||||
- `{name}_{metric}_max` - Maximum value (if present)
|
- `{name}_{metric}_max` - Maximum value (if present)
|
||||||
|
|
||||||
**Overall:**
|
|
||||||
- `overall_status` - Worst status from all commands
|
|
||||||
- `overall_status_code` - Worst status code
|
|
||||||
- `plugin_count` - Number of Nagios plugins executed
|
|
||||||
|
|
||||||
## Configuration Options
|
## Configuration Options
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
|
|||||||
@@ -1110,33 +1110,6 @@ hosts:
|
|||||||
db-02:
|
db-02:
|
||||||
threshold_config: [tight_memory, db_disk]
|
threshold_config: [tight_memory, db_disk]
|
||||||
```
|
```
|
||||||
|
|
||||||
### Backward Compatibility
|
|
||||||
|
|
||||||
The legacy single threshold configuration is fully supported:
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
# Old format - still works
|
|
||||||
thresholds:
|
|
||||||
cpu_monitor:
|
|
||||||
cpu_percent:
|
|
||||||
warning: 80.0
|
|
||||||
critical: 90.0
|
|
||||||
```
|
|
||||||
|
|
||||||
This is equivalent to:
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
# New format
|
|
||||||
threshold_configs:
|
|
||||||
default:
|
|
||||||
thresholds:
|
|
||||||
cpu_monitor:
|
|
||||||
cpu_percent:
|
|
||||||
warning: 80.0
|
|
||||||
critical: 90.0
|
|
||||||
```
|
|
||||||
|
|
||||||
### Configuration Priority
|
### Configuration Priority
|
||||||
|
|
||||||
1. **Host `threshold_config` (list)**: Layer each named config's overrides left-to-right on top of the defaults
|
1. **Host `threshold_config` (list)**: Layer each named config's overrides left-to-right on top of the defaults
|
||||||
|
|||||||
+1
-1
@@ -14,4 +14,4 @@ Install options:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
__all__ = ["__version__"]
|
__all__ = ["__version__"]
|
||||||
__version__ = "5.1.16"
|
__version__ = "5.2.0"
|
||||||
|
|||||||
+56
-22
@@ -56,6 +56,8 @@ class AsyncConnection:
|
|||||||
self.transport: Optional[asyncio.DatagramTransport] = None
|
self.transport: Optional[asyncio.DatagramTransport] = None
|
||||||
self.protocol: Optional[asyncio.DatagramProtocol] = None
|
self.protocol: Optional[asyncio.DatagramProtocol] = None
|
||||||
self._dead = False
|
self._dead = False
|
||||||
|
self._ever_opened = False
|
||||||
|
self._open_fail_count = 0 # consecutive failures before first success
|
||||||
|
|
||||||
self.logger = logging.getLogger(f"hbc.conn.{addr}")
|
self.logger = logging.getLogger(f"hbc.conn.{addr}")
|
||||||
|
|
||||||
@@ -73,6 +75,7 @@ class AsyncConnection:
|
|||||||
lambda: HeartbeatProtocol(self),
|
lambda: HeartbeatProtocol(self),
|
||||||
family=self.af
|
family=self.af
|
||||||
)
|
)
|
||||||
|
self._ever_opened = True
|
||||||
self.logger.debug(f"Opened connection to {self.addr}:{self.port}")
|
self.logger.debug(f"Opened connection to {self.addr}:{self.port}")
|
||||||
return True
|
return True
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -262,15 +265,51 @@ async def handle_update(conn: AsyncConnection, _msg: dict): # pyright: ignore[r
|
|||||||
|
|
||||||
|
|
||||||
async def heartbeat_sender(conn: AsyncConnection, interval: int):
|
async def heartbeat_sender(conn: AsyncConnection, interval: int):
|
||||||
"""Send periodic heartbeats.
|
"""Send periodic heartbeats, retrying the connection if it is not open.
|
||||||
|
|
||||||
|
IPv6 connections that fail to open before their first successful send are
|
||||||
|
dropped after IPV6_EARLY_FAIL_LIMIT attempts so that a network without IPv6
|
||||||
|
does not keep a dead sender alive. IPv4 connections are retried indefinitely.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
conn: Connection to send on
|
conn: Connection to send on
|
||||||
interval: Heartbeat interval in seconds
|
interval: Heartbeat interval in seconds
|
||||||
"""
|
"""
|
||||||
logger = logging.getLogger("hbc.heartbeat")
|
logger = logging.getLogger("hbc.heartbeat")
|
||||||
|
IPV6_EARLY_FAIL_LIMIT = 3
|
||||||
|
|
||||||
|
while running and not conn._dead:
|
||||||
|
# Ensure transport is open before attempting to send.
|
||||||
|
if not conn.transport:
|
||||||
|
opened = await conn.open()
|
||||||
|
if opened:
|
||||||
|
conn._open_fail_count = 0
|
||||||
|
else:
|
||||||
|
conn._open_fail_count += 1
|
||||||
|
# Drop an IPv6 connection that has never come up within the
|
||||||
|
# first few attempts — it is likely unavailable on this network.
|
||||||
|
if (not conn._ever_opened
|
||||||
|
and conn.af == socket.AF_INET6
|
||||||
|
and conn._open_fail_count >= IPV6_EARLY_FAIL_LIMIT):
|
||||||
|
logger.warning(
|
||||||
|
f"IPv6 connection to {conn.addr} unreachable after "
|
||||||
|
f"{conn._open_fail_count} attempts, disabling"
|
||||||
|
)
|
||||||
|
conn._dead = True
|
||||||
|
break
|
||||||
|
# Retry after the normal interval; IPv4 retries forever.
|
||||||
|
try:
|
||||||
|
if shutdown_event:
|
||||||
|
await asyncio.wait_for(shutdown_event.wait(), timeout=interval)
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
await asyncio.sleep(interval)
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
pass
|
||||||
|
except asyncio.CancelledError:
|
||||||
|
raise
|
||||||
|
continue
|
||||||
|
|
||||||
while running:
|
|
||||||
try:
|
try:
|
||||||
msg = {
|
msg = {
|
||||||
"acks": conn.ackcount,
|
"acks": conn.ackcount,
|
||||||
@@ -279,19 +318,16 @@ async def heartbeat_sender(conn: AsyncConnection, interval: int):
|
|||||||
}
|
}
|
||||||
await conn.sendto(msg, "HTB")
|
await conn.sendto(msg, "HTB")
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error sending heartbeat: {e}", exc_info=True)
|
|
||||||
except asyncio.CancelledError:
|
except asyncio.CancelledError:
|
||||||
logger.debug("Heartbeat sender cancelled")
|
logger.debug("Heartbeat sender cancelled")
|
||||||
raise
|
raise
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error sending heartbeat: {e}", exc_info=True)
|
||||||
|
|
||||||
# Wait for next interval or shutdown event
|
# Wait for next interval or shutdown event
|
||||||
try:
|
try:
|
||||||
if shutdown_event:
|
if shutdown_event:
|
||||||
await asyncio.wait_for(
|
await asyncio.wait_for(shutdown_event.wait(), timeout=interval)
|
||||||
shutdown_event.wait(),
|
|
||||||
timeout=interval
|
|
||||||
)
|
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
await asyncio.sleep(interval)
|
await asyncio.sleep(interval)
|
||||||
@@ -427,16 +463,13 @@ async def cleanup(connections: List[AsyncConnection]):
|
|||||||
logger = logging.getLogger("hbc.cleanup")
|
logger = logging.getLogger("hbc.cleanup")
|
||||||
logger.info("Cleaning up connections")
|
logger.info("Cleaning up connections")
|
||||||
|
|
||||||
for conn in connections:
|
target = next((c for c in connections if c.transport), connections[0] if connections else None)
|
||||||
|
if target:
|
||||||
try:
|
try:
|
||||||
msg = {
|
await target.sendto({"shutdown": 1, "acks": target.ackcount})
|
||||||
"shutdown": 1,
|
|
||||||
"acks": conn.ackcount
|
|
||||||
}
|
|
||||||
await conn.sendto(msg)
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error sending shutdown: {e}")
|
logger.error(f"Error sending shutdown: {e}")
|
||||||
|
for conn in connections:
|
||||||
conn.close()
|
conn.close()
|
||||||
|
|
||||||
# Give messages time to send
|
# Give messages time to send
|
||||||
@@ -481,12 +514,13 @@ async def async_main(args, config):
|
|||||||
addr = addr_info[4][0]
|
addr = addr_info[4][0]
|
||||||
|
|
||||||
conn = AsyncConnection(conn_id, addr, hb_port, af, iam)
|
conn = AsyncConnection(conn_id, addr, hb_port, af, iam)
|
||||||
if await conn.open():
|
if not await conn.open():
|
||||||
connections.append(conn)
|
logger.warning(f"Initial open to {addr} failed, heartbeat sender will retry")
|
||||||
conn_id += 1
|
connections.append(conn)
|
||||||
|
conn_id += 1
|
||||||
|
|
||||||
if not connections:
|
if not connections:
|
||||||
logger.error("No connections established")
|
logger.error("No connections established (DNS resolution failed for all hosts)")
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
logger.info(f"Created {len(connections)} connections")
|
logger.info(f"Created {len(connections)} connections")
|
||||||
@@ -501,8 +535,8 @@ async def async_main(args, config):
|
|||||||
boot_msg["msg"] = args.message
|
boot_msg["msg"] = args.message
|
||||||
|
|
||||||
boot_msg["acks"] = 0
|
boot_msg["acks"] = 0
|
||||||
for conn in connections:
|
target = next((c for c in connections if c.transport), connections[0])
|
||||||
await conn.sendto(boot_msg)
|
await target.sendto(boot_msg)
|
||||||
|
|
||||||
if args.message and not args.daemon:
|
if args.message and not args.daemon:
|
||||||
# Message-only mode
|
# Message-only mode
|
||||||
@@ -702,7 +736,7 @@ def main(argv=None):
|
|||||||
|
|
||||||
# Daemonize if requested
|
# Daemonize if requested
|
||||||
if args.daemon:
|
if args.daemon:
|
||||||
print("Daemonizing...")
|
logging.info("Daemonizing...")
|
||||||
daemonize()
|
daemonize()
|
||||||
_reconfigure_logging_for_daemon(log_level)
|
_reconfigure_logging_for_daemon(log_level)
|
||||||
logging.info(f"hbc starting, sending heartbeat to {', '.join(args.hosts)}")
|
logging.info(f"hbc starting, sending heartbeat to {', '.join(args.hosts)}")
|
||||||
|
|||||||
@@ -119,6 +119,13 @@ class CPUMonitorPlugin(MonitorPlugin):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.logger.debug(f"Could not get CPU times: {e}")
|
self.logger.debug(f"Could not get CPU times: {e}")
|
||||||
|
|
||||||
|
# Uptime in seconds
|
||||||
|
try:
|
||||||
|
import time
|
||||||
|
data["uptime_seconds"] = int(time.time() - self.psutil.boot_time())
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.debug(f"Could not get uptime: {e}")
|
||||||
|
|
||||||
self.logger.debug(
|
self.logger.debug(
|
||||||
f"Collected CPU metrics: {data.get('cpu_percent', 'N/A')}% usage"
|
f"Collected CPU metrics: {data.get('cpu_percent', 'N/A')}% usage"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -14,6 +14,24 @@ except ImportError:
|
|||||||
|
|
||||||
from hbd.client.plugin import MonitorPlugin
|
from hbd.client.plugin import MonitorPlugin
|
||||||
|
|
||||||
|
|
||||||
|
def _zfs_arc_bytes() -> int:
|
||||||
|
"""Return current ZFS ARC size in bytes, or 0 if ZFS is not present.
|
||||||
|
|
||||||
|
ZFS ARC is reclaimable but is not included in MemAvailable by the Linux
|
||||||
|
kernel (it is not in SReclaimable), so it would otherwise be counted as
|
||||||
|
used memory.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
with open("/proc/spl/kstat/zfs/arcstats") as fh:
|
||||||
|
for line in fh:
|
||||||
|
parts = line.split()
|
||||||
|
if len(parts) >= 3 and parts[0] == "size":
|
||||||
|
return int(parts[2])
|
||||||
|
except (OSError, ValueError):
|
||||||
|
pass
|
||||||
|
return 0
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
@@ -101,11 +119,21 @@ class MemoryMonitorPlugin(MonitorPlugin):
|
|||||||
|
|
||||||
# Virtual (physical) memory statistics
|
# Virtual (physical) memory statistics
|
||||||
vmem = psutil.virtual_memory()
|
vmem = psutil.virtual_memory()
|
||||||
|
|
||||||
|
# psutil's available already counts reclaimable page cache / file buffers
|
||||||
|
# as available (it uses MemAvailable on Linux). Add ZFS ARC on top because
|
||||||
|
# the kernel does not include it in SReclaimable / MemAvailable even though
|
||||||
|
# it is reclaimable.
|
||||||
|
arc_bytes = _zfs_arc_bytes()
|
||||||
|
available = min(vmem.available + arc_bytes, vmem.total)
|
||||||
|
used = vmem.total - available
|
||||||
|
percent = round(used / vmem.total * 100, 1) if vmem.total else 0.0
|
||||||
|
|
||||||
metrics['memory_total'] = vmem.total
|
metrics['memory_total'] = vmem.total
|
||||||
metrics['memory_available'] = vmem.available
|
metrics['memory_available'] = available
|
||||||
metrics['memory_used'] = vmem.used
|
metrics['memory_used'] = used
|
||||||
metrics['memory_free'] = vmem.free
|
metrics['memory_free'] = vmem.free
|
||||||
metrics['memory_percent'] = vmem.percent
|
metrics['memory_percent'] = percent
|
||||||
|
|
||||||
# Platform-specific memory details
|
# Platform-specific memory details
|
||||||
if hasattr(vmem, 'active'):
|
if hasattr(vmem, 'active'):
|
||||||
|
|||||||
@@ -31,16 +31,13 @@ from hbd.client.plugin import MonitorPlugin
|
|||||||
|
|
||||||
|
|
||||||
# Nagios exit codes
|
# Nagios exit codes
|
||||||
NAGIOS_OK = 0
|
|
||||||
NAGIOS_WARNING = 1
|
|
||||||
NAGIOS_CRITICAL = 2
|
|
||||||
NAGIOS_UNKNOWN = 3
|
NAGIOS_UNKNOWN = 3
|
||||||
|
|
||||||
STATUS_NAMES = {
|
STATUS_NAMES = {
|
||||||
NAGIOS_OK: "OK",
|
0: "OK",
|
||||||
NAGIOS_WARNING: "WARNING",
|
1: "WARNING",
|
||||||
NAGIOS_CRITICAL: "CRITICAL",
|
2: "CRITICAL",
|
||||||
NAGIOS_UNKNOWN: "UNKNOWN"
|
3: "UNKNOWN",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -129,9 +126,6 @@ class NagiosRunnerPlugin(MonitorPlugin):
|
|||||||
"""
|
"""
|
||||||
results = {}
|
results = {}
|
||||||
|
|
||||||
# Track overall status (worst status wins)
|
|
||||||
worst_status = NAGIOS_OK
|
|
||||||
|
|
||||||
for cmd_config in self.commands:
|
for cmd_config in self.commands:
|
||||||
name = cmd_config.get("name")
|
name = cmd_config.get("name")
|
||||||
command = cmd_config.get("command")
|
command = cmd_config.get("command")
|
||||||
@@ -149,10 +143,6 @@ class NagiosRunnerPlugin(MonitorPlugin):
|
|||||||
results[f"{name}_status_code"] = status_code
|
results[f"{name}_status_code"] = status_code
|
||||||
results[f"{name}_output"] = output
|
results[f"{name}_output"] = output
|
||||||
|
|
||||||
# Track worst status
|
|
||||||
if status_code > worst_status:
|
|
||||||
worst_status = status_code
|
|
||||||
|
|
||||||
# Parse and add performance data
|
# Parse and add performance data
|
||||||
if perfdata:
|
if perfdata:
|
||||||
for metric_name, metric_value in perfdata.items():
|
for metric_name, metric_value in perfdata.items():
|
||||||
@@ -167,12 +157,6 @@ class NagiosRunnerPlugin(MonitorPlugin):
|
|||||||
results[f"{name}_status"] = "ERROR"
|
results[f"{name}_status"] = "ERROR"
|
||||||
results[f"{name}_status_code"] = NAGIOS_UNKNOWN
|
results[f"{name}_status_code"] = NAGIOS_UNKNOWN
|
||||||
results[f"{name}_output"] = str(e)
|
results[f"{name}_output"] = str(e)
|
||||||
worst_status = NAGIOS_UNKNOWN
|
|
||||||
|
|
||||||
# Add overall status
|
|
||||||
results["overall_status"] = STATUS_NAMES.get(worst_status, "UNKNOWN")
|
|
||||||
results["overall_status_code"] = worst_status
|
|
||||||
results["plugin_count"] = len(self.commands)
|
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|||||||
@@ -95,6 +95,12 @@ THRESHOLD_DEFAULTS = {
|
|||||||
'warning': 200,
|
'warning': 200,
|
||||||
'critical': 250.0,
|
'critical': 250.0,
|
||||||
'count': 3 # Optional: number of consecutive breaches before alerting
|
'count': 3 # Optional: number of consecutive breaches before alerting
|
||||||
|
},
|
||||||
|
'nagios_runner': {
|
||||||
|
'status_code': {
|
||||||
|
'display': '{check_name} {output}',
|
||||||
|
'operator': "nagios"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
+22
-1
@@ -154,6 +154,25 @@ async def start(
|
|||||||
lst = [h.jsons() for h in hosts]
|
lst = [h.jsons() for h in hosts]
|
||||||
return web.json_response(json.loads("[" + ",".join(lst) + "]"))
|
return web.json_response(json.loads("[" + ",".join(lst) + "]"))
|
||||||
|
|
||||||
|
async def api_alert_summary(request):
|
||||||
|
"""GET /api/0/alert_summary — counts of ok/warning/critical hosts visible to caller."""
|
||||||
|
user, err = _require_auth(request)
|
||||||
|
if err:
|
||||||
|
return err
|
||||||
|
from .threshold import AlertLevel
|
||||||
|
critical = warning = ok = 0
|
||||||
|
for host in hbdclass.Host.hosts.values():
|
||||||
|
if not _can_operate_host(user, host):
|
||||||
|
continue
|
||||||
|
levels = {s.level for s in host.alert_states.values()}
|
||||||
|
if AlertLevel.CRITICAL in levels:
|
||||||
|
critical += 1
|
||||||
|
elif AlertLevel.WARNING in levels:
|
||||||
|
warning += 1
|
||||||
|
else:
|
||||||
|
ok += 1
|
||||||
|
return web.json_response({"critical": critical, "warning": warning, "ok": ok})
|
||||||
|
|
||||||
async def api_messages(request):
|
async def api_messages(request):
|
||||||
lst = data.msgs[-30:]
|
lst = data.msgs[-30:]
|
||||||
return web.json_response(lst)
|
return web.json_response(lst)
|
||||||
@@ -518,6 +537,7 @@ async def start(
|
|||||||
hosts_with_plugins.append({
|
hosts_with_plugins.append({
|
||||||
"name": hostname,
|
"name": hostname,
|
||||||
"plugins": list(host.plugin_data.keys()),
|
"plugins": list(host.plugin_data.keys()),
|
||||||
|
"is_owner": _can_own_host(current_user, host),
|
||||||
})
|
})
|
||||||
|
|
||||||
tmpl = env.get_template("plugins.html")
|
tmpl = env.get_template("plugins.html")
|
||||||
@@ -870,7 +890,7 @@ async def start(
|
|||||||
tmpl = env.get_template("settings.html")
|
tmpl = env.get_template("settings.html")
|
||||||
body = tmpl.render(
|
body = tmpl.render(
|
||||||
title="Settings - Heartbeat",
|
title="Settings - Heartbeat",
|
||||||
sections=settings_mod.get_settings_sections(config),
|
sections=settings_mod.get_settings_sections(config, threshold_checker=threshold_checker),
|
||||||
current_user=current_user.to_dict() if current_user else None,
|
current_user=current_user.to_dict() if current_user else None,
|
||||||
active_page="settings",
|
active_page="settings",
|
||||||
)
|
)
|
||||||
@@ -893,6 +913,7 @@ async def start(
|
|||||||
web.get("/api/0/users/{username}/avatar", api_user_avatar),
|
web.get("/api/0/users/{username}/avatar", api_user_avatar),
|
||||||
# Hosts
|
# Hosts
|
||||||
web.get("/api/0/hosts", api_hosts),
|
web.get("/api/0/hosts", api_hosts),
|
||||||
|
web.get("/api/0/alert_summary", api_alert_summary),
|
||||||
web.get("/api/0/messages", api_messages),
|
web.get("/api/0/messages", api_messages),
|
||||||
web.get("/api/0/hosts/{hostname}/plugins", api_host_plugins),
|
web.get("/api/0/hosts/{hostname}/plugins", api_host_plugins),
|
||||||
web.get("/api/0/hosts/{hostname}/plugins/{plugin_name}", api_host_plugin_detail),
|
web.get("/api/0/hosts/{hostname}/plugins/{plugin_name}", api_host_plugin_detail),
|
||||||
|
|||||||
+7
-1
@@ -101,9 +101,10 @@ async def reload_configuration(config_obj, config_path, components):
|
|||||||
access = config_mod.get_host_access(new_config, hostname)
|
access = config_mod.get_host_access(new_config, hostname)
|
||||||
host.apply_access(access["owner"], access["managers"], access["monitors"])
|
host.apply_access(access["owner"], access["managers"], access["monitors"])
|
||||||
|
|
||||||
# Reload threshold checker
|
# Reload threshold checker and prune alerts orphaned by the new config
|
||||||
if 'threshold_checker' in components:
|
if 'threshold_checker' in components:
|
||||||
components['threshold_checker'].reload(new_config)
|
components['threshold_checker'].reload(new_config)
|
||||||
|
components['threshold_checker'].purge_stale_alerts(hbdclass)
|
||||||
|
|
||||||
# Note: Changes to the following require restart:
|
# Note: Changes to the following require restart:
|
||||||
# - hb_port, hbd_port, ws_port (already bound)
|
# - hb_port, hbd_port, ws_port (already bound)
|
||||||
@@ -241,6 +242,10 @@ async def _run_async(config, config_path=None):
|
|||||||
)
|
)
|
||||||
udp.restore_connection_timers(hbdclass, restore_ctx)
|
udp.restore_connection_timers(hbdclass, restore_ctx)
|
||||||
|
|
||||||
|
# Drop alert states that no longer have a matching threshold (stale after
|
||||||
|
# upgrade or config change between runs).
|
||||||
|
threshold_checker.purge_stale_alerts(hbdclass)
|
||||||
|
|
||||||
# HTTP server (asyncio-based via aiohttp)
|
# HTTP server (asyncio-based via aiohttp)
|
||||||
try:
|
try:
|
||||||
http_task = asyncio.create_task(
|
http_task = asyncio.create_task(
|
||||||
@@ -250,6 +255,7 @@ async def _run_async(config, config_path=None):
|
|||||||
config=config,
|
config=config,
|
||||||
hbdclass=hbdclass,
|
hbdclass=hbdclass,
|
||||||
tcss=None,
|
tcss=None,
|
||||||
|
threshold_checker=threshold_checker,
|
||||||
verbose=config.get("verbose", False),
|
verbose=config.get("verbose", False),
|
||||||
get_now=lambda: time.time(),
|
get_now=lambda: time.time(),
|
||||||
VER="",
|
VER="",
|
||||||
|
|||||||
+30
-37
@@ -88,7 +88,7 @@ def _sanitize_channel(name, cfg):
|
|||||||
# Public API
|
# Public API
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
def get_settings_sections(config: dict) -> list:
|
def get_settings_sections(config: dict, threshold_checker=None) -> list:
|
||||||
"""Return ordered list of setting sections for the settings page.
|
"""Return ordered list of setting sections for the settings page.
|
||||||
|
|
||||||
Each section:
|
Each section:
|
||||||
@@ -182,46 +182,39 @@ def get_settings_sections(config: dict) -> list:
|
|||||||
})
|
})
|
||||||
|
|
||||||
# ---- Threshold configurations -----------------------------------------
|
# ---- Threshold configurations -----------------------------------------
|
||||||
def _parse_metric_row(metric_path, metric_cfg):
|
def _tc_to_row(tc):
|
||||||
if not isinstance(metric_cfg, dict):
|
|
||||||
return None
|
|
||||||
return {
|
return {
|
||||||
"metric": metric_path,
|
"metric": tc.metric_path,
|
||||||
"operator": metric_cfg.get("operator", ">"),
|
"operator": tc.operator.value,
|
||||||
"warning": metric_cfg.get("warning"),
|
"warning": tc.warning,
|
||||||
"critical": metric_cfg.get("critical"),
|
"critical": tc.critical,
|
||||||
"hysteresis": metric_cfg.get("hysteresis"),
|
"hysteresis": tc.hysteresis,
|
||||||
"count": metric_cfg.get("count", 1),
|
"count": tc.count,
|
||||||
"enabled": metric_cfg.get("enabled", True),
|
"enabled": tc.enabled,
|
||||||
}
|
}
|
||||||
|
|
||||||
threshold_config_list = []
|
threshold_config_list = []
|
||||||
raw_tconfigs = config.get("threshold_configs") or {}
|
if threshold_checker is not None:
|
||||||
if raw_tconfigs:
|
if threshold_checker.threshold_configs:
|
||||||
for cfg_name, cfg_data in sorted(raw_tconfigs.items()):
|
for cfg_name, cfg_metrics in sorted(threshold_checker.threshold_configs.items()):
|
||||||
if not isinstance(cfg_data, dict):
|
# For the default config use the merged effective set;
|
||||||
continue
|
# for named overrides use only the explicitly defined metrics
|
||||||
metrics = [
|
# (threshold_raw_configs) so inherited defaults are not repeated.
|
||||||
r for r in (
|
if cfg_name == "default":
|
||||||
_parse_metric_row(mp, mc)
|
display_metrics = cfg_metrics
|
||||||
for mp, mc in (cfg_data.get("thresholds") or {}).items()
|
else:
|
||||||
) if r
|
display_metrics = threshold_checker.threshold_raw_configs.get(cfg_name, cfg_metrics)
|
||||||
]
|
metrics = sorted(
|
||||||
threshold_config_list.append({
|
[_tc_to_row(tc) for tc in display_metrics.values()],
|
||||||
"name": cfg_name,
|
key=lambda m: m["metric"],
|
||||||
"metrics": sorted(metrics, key=lambda m: m["metric"]),
|
)
|
||||||
})
|
threshold_config_list.append({"name": cfg_name, "metrics": metrics})
|
||||||
elif config.get("thresholds"):
|
elif threshold_checker.thresholds:
|
||||||
metrics = [
|
metrics = sorted(
|
||||||
r for r in (
|
[_tc_to_row(tc) for tc in threshold_checker.thresholds.values()],
|
||||||
_parse_metric_row(mp, mc)
|
key=lambda m: m["metric"],
|
||||||
for mp, mc in config["thresholds"].items()
|
)
|
||||||
) if r
|
threshold_config_list.append({"name": "default", "metrics": metrics})
|
||||||
]
|
|
||||||
threshold_config_list.append({
|
|
||||||
"name": "default",
|
|
||||||
"metrics": sorted(metrics, key=lambda m: m["metric"]),
|
|
||||||
})
|
|
||||||
|
|
||||||
# ---- Hosts summary ----------------------------------------------------
|
# ---- Hosts summary ----------------------------------------------------
|
||||||
hosts_list = []
|
hosts_list = []
|
||||||
|
|||||||
@@ -4,6 +4,11 @@
|
|||||||
|
|
||||||
<style>
|
<style>
|
||||||
|
|
||||||
|
html, body {
|
||||||
|
height: auto;
|
||||||
|
overflow-y: auto;
|
||||||
|
}
|
||||||
|
|
||||||
.container {
|
.container {
|
||||||
max-width: 1400px;
|
max-width: 1400px;
|
||||||
margin: 0 auto;
|
margin: 0 auto;
|
||||||
@@ -170,8 +175,12 @@
|
|||||||
|
|
||||||
.alert-hostname {
|
.alert-hostname {
|
||||||
font-weight: bold;
|
font-weight: bold;
|
||||||
color: #333;
|
color: #0066cc;
|
||||||
font-size: 1.1em;
|
font-size: 1.1em;
|
||||||
|
text-decoration: none;
|
||||||
|
}
|
||||||
|
.alert-hostname:hover {
|
||||||
|
text-decoration: underline;
|
||||||
}
|
}
|
||||||
|
|
||||||
.alert-metric {
|
.alert-metric {
|
||||||
@@ -400,6 +409,10 @@
|
|||||||
} else if (alert.threshold_value !== undefined && alert.threshold_value !== null && alert.operator) {
|
} else if (alert.threshold_value !== undefined && alert.threshold_value !== null && alert.operator) {
|
||||||
valueText += ` <span class="threshold-info">(threshold: ${alert.operator} ${formatValue(alert.threshold_value)})</span>`;
|
valueText += ` <span class="threshold-info">(threshold: ${alert.operator} ${formatValue(alert.threshold_value)})</span>`;
|
||||||
}
|
}
|
||||||
|
if (alert.recovery_threshold !== undefined && alert.recovery_threshold !== null) {
|
||||||
|
const recOp = (alert.operator === '>' || alert.operator === '>=') ? '<' : '>';
|
||||||
|
valueText += ` <span class="threshold-info" style="color:#888">(recovers ${recOp} ${formatValue(alert.recovery_threshold)})</span>`;
|
||||||
|
}
|
||||||
|
|
||||||
// Build actions section
|
// Build actions section
|
||||||
let actionsHtml = '';
|
let actionsHtml = '';
|
||||||
@@ -424,7 +437,7 @@
|
|||||||
<div class="alert-main">
|
<div class="alert-main">
|
||||||
<div class="alert-header">
|
<div class="alert-header">
|
||||||
<span class="alert-level ${level}">${alert.level}</span>
|
<span class="alert-level ${level}">${alert.level}</span>
|
||||||
<span class="alert-hostname">${alert.hostname}</span>
|
<a class="alert-hostname" href="/plugins#${alert.hostname}">${alert.hostname}</a>
|
||||||
</div>
|
</div>
|
||||||
<div class="alert-metric">${alert.metric_path}</div>
|
<div class="alert-metric">${alert.metric_path}</div>
|
||||||
<div class="alert-details">
|
<div class="alert-details">
|
||||||
|
|||||||
@@ -126,11 +126,17 @@
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Swiss railway clock — nav */
|
/* Swiss railway clock — nav */
|
||||||
.nav-clock {
|
.nav-pie {
|
||||||
flex-shrink: 0;
|
flex-shrink: 0;
|
||||||
line-height: 0;
|
line-height: 0;
|
||||||
margin-left: auto;
|
margin-left: auto;
|
||||||
padding: 4px 4px 4px 0;
|
padding: 4px 4px 4px 0;
|
||||||
|
}
|
||||||
|
#alert-pie { display: block; cursor: default; }
|
||||||
|
.nav-clock {
|
||||||
|
flex-shrink: 0;
|
||||||
|
line-height: 0;
|
||||||
|
padding: 4px 4px 4px 0;
|
||||||
cursor: pointer;
|
cursor: pointer;
|
||||||
}
|
}
|
||||||
#swiss-clock { display: block; }
|
#swiss-clock { display: block; }
|
||||||
|
|||||||
@@ -11,6 +11,9 @@
|
|||||||
{% endif %}
|
{% endif %}
|
||||||
<a href="/about"{% if active_page == "about" %} class="active"{% endif %}>About</a>
|
<a href="/about"{% if active_page == "about" %} class="active"{% endif %}>About</a>
|
||||||
</div>
|
</div>
|
||||||
|
<div class="nav-pie" title="Host alert status">
|
||||||
|
<canvas id="alert-pie" width="44" height="44"></canvas>
|
||||||
|
</div>
|
||||||
<div class="nav-clock" title="Click for full-screen clock">
|
<div class="nav-clock" title="Click for full-screen clock">
|
||||||
<canvas id="swiss-clock" width="44" height="44"></canvas>
|
<canvas id="swiss-clock" width="44" height="44"></canvas>
|
||||||
</div>
|
</div>
|
||||||
@@ -42,4 +45,52 @@
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
})();
|
})();
|
||||||
|
|
||||||
|
function drawAlertPie(critical, warning, ok) {
|
||||||
|
var canvas = document.getElementById('alert-pie');
|
||||||
|
if (!canvas) return;
|
||||||
|
var ctx = canvas.getContext('2d');
|
||||||
|
var SIZE = canvas.width;
|
||||||
|
var R = SIZE / 2;
|
||||||
|
ctx.clearRect(0, 0, SIZE, SIZE);
|
||||||
|
var total = critical + warning + ok;
|
||||||
|
if (total === 0) {
|
||||||
|
ctx.beginPath();
|
||||||
|
ctx.arc(R, R, R - 1, 0, Math.PI * 2);
|
||||||
|
ctx.fillStyle = '#ccc';
|
||||||
|
ctx.fill();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
var slices = [
|
||||||
|
{ value: critical, color: '#e53935' },
|
||||||
|
{ value: warning, color: '#ffb300' },
|
||||||
|
{ value: ok, color: '#43a047' }
|
||||||
|
];
|
||||||
|
var start = -Math.PI / 2;
|
||||||
|
slices.forEach(function(s) {
|
||||||
|
if (s.value === 0) return;
|
||||||
|
var sweep = (s.value / total) * Math.PI * 2;
|
||||||
|
ctx.beginPath();
|
||||||
|
ctx.moveTo(R, R);
|
||||||
|
ctx.arc(R, R, R - 1, start, start + sweep);
|
||||||
|
ctx.closePath();
|
||||||
|
ctx.fillStyle = s.color;
|
||||||
|
ctx.fill();
|
||||||
|
start += sweep;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
function updateAlertPie() {
|
||||||
|
fetch('/api/0/alert_summary').then(function(r) {
|
||||||
|
if (!r.ok) return;
|
||||||
|
return r.json();
|
||||||
|
}).then(function(d) {
|
||||||
|
if (d) drawAlertPie(d.critical || 0, d.warning || 0, d.ok || 0);
|
||||||
|
}).catch(function() {});
|
||||||
|
}
|
||||||
|
|
||||||
|
document.addEventListener('DOMContentLoaded', function() {
|
||||||
|
updateAlertPie();
|
||||||
|
setInterval(updateAlertPie, 30000);
|
||||||
|
});
|
||||||
</script>
|
</script>
|
||||||
|
|||||||
@@ -131,6 +131,52 @@
|
|||||||
text-overflow: ellipsis;
|
text-overflow: ellipsis;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.host-action-btn {
|
||||||
|
font-size: 0.75em;
|
||||||
|
font-weight: bold;
|
||||||
|
padding: 3px 10px;
|
||||||
|
border-radius: 4px;
|
||||||
|
border: none;
|
||||||
|
cursor: pointer;
|
||||||
|
text-decoration: none;
|
||||||
|
white-space: nowrap;
|
||||||
|
}
|
||||||
|
.host-action-btn.update-btn {
|
||||||
|
background: #e3f2fd;
|
||||||
|
color: #1565c0;
|
||||||
|
}
|
||||||
|
.host-action-btn.update-btn:hover { background: #bbdefb; }
|
||||||
|
.host-action-btn.delete-btn {
|
||||||
|
background: #ffebee;
|
||||||
|
color: #c62828;
|
||||||
|
}
|
||||||
|
.host-action-btn.delete-btn:hover { background: #ffcdd2; }
|
||||||
|
|
||||||
|
/* ── Action result toast ───────────────────────────────────── */
|
||||||
|
#action-toast {
|
||||||
|
position: fixed;
|
||||||
|
bottom: 24px;
|
||||||
|
left: 50%;
|
||||||
|
transform: translateX(-50%) translateY(20px);
|
||||||
|
background: #323232;
|
||||||
|
color: #fff;
|
||||||
|
padding: 12px 22px;
|
||||||
|
border-radius: 6px;
|
||||||
|
font-size: 0.9em;
|
||||||
|
max-width: 480px;
|
||||||
|
text-align: center;
|
||||||
|
opacity: 0;
|
||||||
|
pointer-events: none;
|
||||||
|
transition: opacity 0.25s, transform 0.25s;
|
||||||
|
z-index: 9000;
|
||||||
|
white-space: pre-wrap;
|
||||||
|
}
|
||||||
|
#action-toast.show {
|
||||||
|
opacity: 1;
|
||||||
|
transform: translateX(-50%) translateY(0);
|
||||||
|
}
|
||||||
|
#action-toast.error { background: #c62828; }
|
||||||
|
|
||||||
/* ── Host body ──────────────────────────────────────────────── */
|
/* ── Host body ──────────────────────────────────────────────── */
|
||||||
|
|
||||||
.host-body {
|
.host-body {
|
||||||
@@ -379,6 +425,12 @@
|
|||||||
<span class="nagios-badge" id="nagios-badge-{{ host.name }}">—</span>
|
<span class="nagios-badge" id="nagios-badge-{{ host.name }}">—</span>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
<span class="os-label" id="os-label-{{ host.name }}"></span>
|
<span class="os-label" id="os-label-{{ host.name }}"></span>
|
||||||
|
{% if host.is_owner %}
|
||||||
|
<button class="host-action-btn update-btn"
|
||||||
|
onclick="event.stopPropagation(); hostAction(this, '/u?h={{ host.name }}')">Update</button>
|
||||||
|
<button class="host-action-btn delete-btn"
|
||||||
|
onclick="event.stopPropagation(); hostDelete(this, '{{ host.name }}')">Delete</button>
|
||||||
|
{% endif %}
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
@@ -447,6 +499,17 @@
|
|||||||
return pluginCache[hostname]?.[pluginName] ?? null;
|
return pluginCache[hostname]?.[pluginName] ?? null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Return worst nagios exit code (0-3) found in a nagios_runner data object.
|
||||||
|
function nagiosWorstStatus(data) {
|
||||||
|
let worst = 0;
|
||||||
|
for (const [k, v] of Object.entries(data || {})) {
|
||||||
|
if (k.endsWith('_status_code') && typeof v === 'number' && v > worst) {
|
||||||
|
worst = v;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return worst;
|
||||||
|
}
|
||||||
|
|
||||||
// ── Fetch helpers ───────────────────────────────────────────────────────
|
// ── Fetch helpers ───────────────────────────────────────────────────────
|
||||||
|
|
||||||
async function fetchPlugin(hostname, pluginName) {
|
async function fetchPlugin(hostname, pluginName) {
|
||||||
@@ -548,13 +611,13 @@
|
|||||||
? chips.join('')
|
? chips.join('')
|
||||||
: '<span class="glance-loading">—</span>';
|
: '<span class="glance-loading">—</span>';
|
||||||
|
|
||||||
// Nagios badge
|
// Nagios badge — derive worst status from individual check codes
|
||||||
const nagios = getCache(hostname, 'nagios_runner');
|
const nagios = getCache(hostname, 'nagios_runner');
|
||||||
if (nagosBadge && nagios) {
|
if (nagosBadge && nagios) {
|
||||||
const status = (nagios.data.overall_status || '—').toUpperCase();
|
const worst = nagiosWorstStatus(nagios.data);
|
||||||
const cls = status === 'OK' ? 'ok'
|
const names = {0:'OK', 1:'WARNING', 2:'CRITICAL', 3:'UNKNOWN'};
|
||||||
: status === 'WARNING' ? 'warning'
|
const status = names[worst] || '—';
|
||||||
: status === 'CRITICAL' ? 'critical' : '';
|
const cls = worst === 0 ? 'ok' : worst === 1 ? 'warning' : worst >= 2 ? 'critical' : '';
|
||||||
nagosBadge.className = `nagios-badge ${cls}`;
|
nagosBadge.className = `nagios-badge ${cls}`;
|
||||||
nagosBadge.textContent = status;
|
nagosBadge.textContent = status;
|
||||||
}
|
}
|
||||||
@@ -663,9 +726,10 @@
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case 'nagios_runner': {
|
case 'nagios_runner': {
|
||||||
const status = (d.overall_status || '?').toUpperCase();
|
const worst = nagiosWorstStatus(d);
|
||||||
const count = d.plugin_count;
|
const names = {0:'OK', 1:'WARNING', 2:'CRITICAL', 3:'UNKNOWN'};
|
||||||
text = status + (count != null ? ` — ${count} checks` : '');
|
const codes = Object.keys(d).filter(k => k.endsWith('_status_code'));
|
||||||
|
text = (names[worst] || '?') + (codes.length ? ` — ${codes.length} checks` : '');
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case 'filesystem_info': {
|
case 'filesystem_info': {
|
||||||
@@ -1175,6 +1239,49 @@
|
|||||||
fetchHostGlance(first.dataset.hostname);
|
fetchHostGlance(first.dataset.hostname);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
// ── Host action helpers ──────────────────────────────────────
|
||||||
|
|
||||||
|
let _toastTimer = null;
|
||||||
|
function showToast(msg, isError) {
|
||||||
|
const t = document.getElementById('action-toast');
|
||||||
|
t.textContent = msg;
|
||||||
|
t.classList.toggle('error', !!isError);
|
||||||
|
t.classList.add('show');
|
||||||
|
clearTimeout(_toastTimer);
|
||||||
|
_toastTimer = setTimeout(() => t.classList.remove('show'), 4000);
|
||||||
|
}
|
||||||
|
|
||||||
|
async function hostAction(btn, url) {
|
||||||
|
btn.disabled = true;
|
||||||
|
try {
|
||||||
|
const res = await fetch(url);
|
||||||
|
const text = await res.text();
|
||||||
|
showToast(text, !res.ok);
|
||||||
|
} catch (e) {
|
||||||
|
showToast('Request failed: ' + e.message, true);
|
||||||
|
} finally {
|
||||||
|
btn.disabled = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async function hostDelete(btn, hostname) {
|
||||||
|
if (!confirm('Delete host ' + hostname + '?')) return;
|
||||||
|
btn.disabled = true;
|
||||||
|
try {
|
||||||
|
const res = await fetch('/d?h=' + encodeURIComponent(hostname));
|
||||||
|
const text = await res.text();
|
||||||
|
showToast(text, !res.ok);
|
||||||
|
if (res.ok) {
|
||||||
|
const card = document.querySelector(`.host-card[data-hostname="${hostname}"]`);
|
||||||
|
if (card) card.remove();
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
showToast('Request failed: ' + e.message, true);
|
||||||
|
btn.disabled = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
|
<div id="action-toast"></div>
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
|
|||||||
+178
-64
@@ -30,12 +30,13 @@ class AlertLevel(Enum):
|
|||||||
|
|
||||||
class ComparisonOperator(Enum):
|
class ComparisonOperator(Enum):
|
||||||
"""Supported comparison operators for threshold checks."""
|
"""Supported comparison operators for threshold checks."""
|
||||||
GT = ">" # Greater than
|
GT = ">" # Greater than
|
||||||
GTE = ">=" # Greater than or equal
|
GTE = ">=" # Greater than or equal
|
||||||
LT = "<" # Less than
|
LT = "<" # Less than
|
||||||
LTE = "<=" # Less than or equal
|
LTE = "<=" # Less than or equal
|
||||||
EQ = "==" # Equal to
|
EQ = "==" # Equal to
|
||||||
NEQ = "!=" # Not equal to
|
NEQ = "!=" # Not equal to
|
||||||
|
NAGIOS = "nagios" # Nagios exit-code semantics: 0=OK 1=WARNING 2=CRITICAL 3=UNKNOWN
|
||||||
|
|
||||||
|
|
||||||
class AlertState:
|
class AlertState:
|
||||||
@@ -57,6 +58,7 @@ class AlertState:
|
|||||||
self.last_notification = None
|
self.last_notification = None
|
||||||
self.threshold_value = None # The threshold value that triggered alert
|
self.threshold_value = None # The threshold value that triggered alert
|
||||||
self.operator = None # The comparison operator (>, <, >=, etc.)
|
self.operator = None # The comparison operator (>, <, >=, etc.)
|
||||||
|
self.hysteresis: Optional[float] = None # Hysteresis fraction used for recovery
|
||||||
self.formatted_message = None # Formatted display message for UI
|
self.formatted_message = None # Formatted display message for UI
|
||||||
self.acknowledged = False # Whether alert has been acknowledged
|
self.acknowledged = False # Whether alert has been acknowledged
|
||||||
self.acknowledged_at = None # Timestamp when acknowledged
|
self.acknowledged_at = None # Timestamp when acknowledged
|
||||||
@@ -152,6 +154,15 @@ class AlertState:
|
|||||||
if self.formatted_message is not None:
|
if self.formatted_message is not None:
|
||||||
result["formatted_message"] = self.formatted_message
|
result["formatted_message"] = self.formatted_message
|
||||||
|
|
||||||
|
# Compute and expose the recovery threshold so the UI can display it
|
||||||
|
if (self.hysteresis and self.threshold_value is not None
|
||||||
|
and self.operator is not None):
|
||||||
|
ha = abs(self.threshold_value * self.hysteresis)
|
||||||
|
if self.operator in ('>', '>='):
|
||||||
|
result["recovery_threshold"] = round(self.threshold_value - ha, 4)
|
||||||
|
elif self.operator in ('<', '<='):
|
||||||
|
result["recovery_threshold"] = round(self.threshold_value + ha, 4)
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def __setstate__(self, state):
|
def __setstate__(self, state):
|
||||||
@@ -159,6 +170,8 @@ class AlertState:
|
|||||||
self.__dict__.update(state)
|
self.__dict__.update(state)
|
||||||
if not hasattr(self, 'consecutive_count'):
|
if not hasattr(self, 'consecutive_count'):
|
||||||
self.consecutive_count = 0
|
self.consecutive_count = 0
|
||||||
|
if not hasattr(self, 'hysteresis'):
|
||||||
|
self.hysteresis = None
|
||||||
|
|
||||||
def acknowledge(self):
|
def acknowledge(self):
|
||||||
"""Acknowledge this alert to stop reminder notifications."""
|
"""Acknowledge this alert to stop reminder notifications."""
|
||||||
@@ -227,6 +240,16 @@ class ThresholdConfig:
|
|||||||
if not self.enabled:
|
if not self.enabled:
|
||||||
return AlertLevel.OK
|
return AlertLevel.OK
|
||||||
|
|
||||||
|
# Nagios exit-code semantics: value IS the severity
|
||||||
|
if self.operator == ComparisonOperator.NAGIOS:
|
||||||
|
try:
|
||||||
|
code = int(value)
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
return AlertLevel.UNKNOWN
|
||||||
|
return {0: AlertLevel.OK, 1: AlertLevel.WARNING, 2: AlertLevel.CRITICAL}.get(
|
||||||
|
code, AlertLevel.UNKNOWN
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Convert value to float for comparison
|
# Convert value to float for comparison
|
||||||
value = float(value)
|
value = float(value)
|
||||||
@@ -263,6 +286,10 @@ class ThresholdConfig:
|
|||||||
"""
|
"""
|
||||||
new_level = self.evaluate(value)
|
new_level = self.evaluate(value)
|
||||||
|
|
||||||
|
# Nagios exit codes are discrete integers — hysteresis doesn't apply
|
||||||
|
if self.operator == ComparisonOperator.NAGIOS:
|
||||||
|
return new_level
|
||||||
|
|
||||||
# If no hysteresis, return new level
|
# If no hysteresis, return new level
|
||||||
if self.hysteresis == 0.0:
|
if self.hysteresis == 0.0:
|
||||||
return new_level
|
return new_level
|
||||||
@@ -396,10 +423,24 @@ class ThresholdChecker:
|
|||||||
Supports two formats:
|
Supports two formats:
|
||||||
1. Legacy format with direct 'thresholds' section
|
1. Legacy format with direct 'thresholds' section
|
||||||
2. New format with 'threshold_configs' and 'host_threshold_mapping'
|
2. New format with 'threshold_configs' and 'host_threshold_mapping'
|
||||||
|
|
||||||
|
In all cases, THRESHOLD_DEFAULTS are seeded into threshold_configs["default"]
|
||||||
|
so the Settings page always shows the built-in defaults.
|
||||||
|
_parse_multi_config() overwrites this with the fully-merged effective defaults.
|
||||||
"""
|
"""
|
||||||
|
# Always expose built-in defaults through threshold_configs["default"] so
|
||||||
|
# the Settings page has something to display even in legacy/no-config mode.
|
||||||
|
seed: Dict[str, ThresholdConfig] = {}
|
||||||
|
for plugin_name, plugin_thresholds in THRESHOLD_DEFAULTS.get("thresholds", {}).items():
|
||||||
|
if isinstance(plugin_thresholds, dict):
|
||||||
|
self._parse_plugin_thresholds(plugin_name, plugin_thresholds, target_dict=seed)
|
||||||
|
if seed:
|
||||||
|
self.threshold_configs["default"] = seed
|
||||||
|
self.threshold_raw_configs["default"] = {}
|
||||||
|
|
||||||
# Check for new multi-config format
|
# Check for new multi-config format
|
||||||
if "threshold_configs" in config:
|
if "threshold_configs" in config:
|
||||||
self._parse_multi_config(config)
|
self._parse_multi_config(config) # overwrites threshold_configs["default"]
|
||||||
elif "thresholds" in config:
|
elif "thresholds" in config:
|
||||||
# Legacy single threshold configuration
|
# Legacy single threshold configuration
|
||||||
self._parse_legacy_config(config)
|
self._parse_legacy_config(config)
|
||||||
@@ -545,11 +586,14 @@ class ThresholdChecker:
|
|||||||
warning = threshold_config.get("warning")
|
warning = threshold_config.get("warning")
|
||||||
critical = threshold_config.get("critical")
|
critical = threshold_config.get("critical")
|
||||||
operator = threshold_config.get("operator", ">")
|
operator = threshold_config.get("operator", ">")
|
||||||
display = threshold_config.get("display", "(threshold: {op_symbol} {threshold_value})")
|
# Nagios operator maps exit codes directly; no numeric thresholds needed
|
||||||
hysteresis = threshold_config.get("hysteresis", 0.1) # 10% default
|
is_nagios_op = (operator == "nagios")
|
||||||
|
default_display = "{check_name}: {output}" if is_nagios_op else "(threshold: {op_symbol} {threshold_value})"
|
||||||
|
display = threshold_config.get("display", default_display)
|
||||||
|
hysteresis = threshold_config.get("hysteresis", 0.0 if is_nagios_op else 0.02)
|
||||||
enabled = threshold_config.get("enabled", True)
|
enabled = threshold_config.get("enabled", True)
|
||||||
|
|
||||||
if warning is None and critical is None:
|
if warning is None and critical is None and not is_nagios_op:
|
||||||
logger.warning("No thresholds defined for %s, skipping", metric_path)
|
logger.warning("No thresholds defined for %s, skipping", metric_path)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -649,7 +693,7 @@ class ThresholdChecker:
|
|||||||
warning = rtt_thresholds.get("warning")
|
warning = rtt_thresholds.get("warning")
|
||||||
critical = rtt_thresholds.get("critical")
|
critical = rtt_thresholds.get("critical")
|
||||||
operator = rtt_thresholds.get("operator", ">")
|
operator = rtt_thresholds.get("operator", ">")
|
||||||
hysteresis = rtt_thresholds.get("hysteresis", 0.1) # 10% default
|
hysteresis = rtt_thresholds.get("hysteresis", 0.02) # 2% default
|
||||||
enabled = rtt_thresholds.get("enabled", True)
|
enabled = rtt_thresholds.get("enabled", True)
|
||||||
display = rtt_thresholds.get("display")
|
display = rtt_thresholds.get("display")
|
||||||
count = rtt_thresholds.get("count", 1)
|
count = rtt_thresholds.get("count", 1)
|
||||||
@@ -794,6 +838,12 @@ class ThresholdChecker:
|
|||||||
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
|
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
|
||||||
threshold_value = threshold.warning
|
threshold_value = threshold.warning
|
||||||
|
|
||||||
|
# Keep hysteresis on the state so the UI can show the recovery threshold
|
||||||
|
if new_level != AlertLevel.OK:
|
||||||
|
alert_state.hysteresis = threshold.hysteresis
|
||||||
|
else:
|
||||||
|
alert_state.hysteresis = None
|
||||||
|
|
||||||
# Update state and check for changes
|
# Update state and check for changes
|
||||||
old_level = alert_state.level
|
old_level = alert_state.level
|
||||||
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
||||||
@@ -805,26 +855,33 @@ class ThresholdChecker:
|
|||||||
return None
|
return None
|
||||||
def _find_threshold(
|
def _find_threshold(
|
||||||
self, thresholds: Dict[str, "ThresholdConfig"], metric_path: str
|
self, thresholds: Dict[str, "ThresholdConfig"], metric_path: str
|
||||||
) -> Optional["ThresholdConfig"]:
|
) -> Tuple[Optional["ThresholdConfig"], Optional[str]]:
|
||||||
"""Return the threshold for *metric_path*, falling back to suffix matches.
|
"""Return (threshold, check_name) for *metric_path*, falling back to suffix matches.
|
||||||
|
|
||||||
Allows generic thresholds like ``ping_monitor.rtt_avg`` to match
|
Allows generic thresholds like ``nagios_runner.status_code`` to match
|
||||||
fully-qualified paths like ``ping_monitor.8_8_8_8_rtt_avg``.
|
fully-qualified paths like ``nagios_runner.check_disk_root_status_code``.
|
||||||
The exact match is always tried first; then successive leading
|
The exact match is always tried first; then successive leading
|
||||||
underscore-delimited segments are stripped from the field name until
|
underscore-delimited segments are stripped from the field name until
|
||||||
a match is found or no segments remain.
|
a match is found or no segments remain.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
(ThresholdConfig, None) for an exact match.
|
||||||
|
(ThresholdConfig, "check_disk_root") for a suffix match — the second
|
||||||
|
element is the stripped prefix, available as ``{check_name}`` in
|
||||||
|
display format templates.
|
||||||
|
(None, None) when no threshold is found.
|
||||||
"""
|
"""
|
||||||
if metric_path in thresholds:
|
if metric_path in thresholds:
|
||||||
return thresholds[metric_path]
|
return thresholds[metric_path], None
|
||||||
plugin, sep, field = metric_path.partition(".")
|
plugin, sep, field = metric_path.partition(".")
|
||||||
if not sep:
|
if not sep:
|
||||||
return None
|
return None, None
|
||||||
parts = field.split("_")
|
parts = field.split("_")
|
||||||
for i in range(1, len(parts)):
|
for i in range(1, len(parts)):
|
||||||
candidate = plugin + "." + "_".join(parts[i:])
|
candidate = plugin + "." + "_".join(parts[i:])
|
||||||
if candidate in thresholds:
|
if candidate in thresholds:
|
||||||
return thresholds[candidate]
|
return thresholds[candidate], "_".join(parts[:i])
|
||||||
return None
|
return None, None
|
||||||
|
|
||||||
def check_plugin_data(
|
def check_plugin_data(
|
||||||
self,
|
self,
|
||||||
@@ -854,7 +911,7 @@ class ThresholdChecker:
|
|||||||
for metric_name, value in data.items():
|
for metric_name, value in data.items():
|
||||||
metric_path = f"{plugin_name}.{metric_name}"
|
metric_path = f"{plugin_name}.{metric_name}"
|
||||||
|
|
||||||
threshold = self._find_threshold(thresholds, metric_path)
|
threshold, check_name = self._find_threshold(thresholds, metric_path)
|
||||||
if threshold is None:
|
if threshold is None:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -877,13 +934,15 @@ class ThresholdChecker:
|
|||||||
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
|
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
|
||||||
threshold_value = threshold.warning
|
threshold_value = threshold.warning
|
||||||
|
|
||||||
|
alert_state.hysteresis = threshold.hysteresis if new_level != AlertLevel.OK else None
|
||||||
|
|
||||||
# Update state and check for changes
|
# Update state and check for changes
|
||||||
old_level = alert_state.level
|
old_level = alert_state.level
|
||||||
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
||||||
state_changes.append((metric_path, old_level, new_level, value))
|
state_changes.append((metric_path, old_level, new_level, value))
|
||||||
self._apply_grace(host_name, alert_state, metric_path, old_level, new_level, value, threshold, data)
|
self._apply_grace(host_name, alert_state, metric_path, old_level, new_level, value, threshold, data, check_name=check_name, metric_name=metric_name)
|
||||||
elif new_level != AlertLevel.OK:
|
elif new_level != AlertLevel.OK:
|
||||||
self._check_pending_or_renotify(host_name, alert_state, metric_path, value, threshold, data)
|
self._check_pending_or_renotify(host_name, alert_state, metric_path, value, threshold, data, check_name=check_name, metric_name=metric_name)
|
||||||
|
|
||||||
# Check nested metrics (e.g., partition data in disk_monitor)
|
# Check nested metrics (e.g., partition data in disk_monitor)
|
||||||
self._check_nested_metrics(
|
self._check_nested_metrics(
|
||||||
@@ -943,6 +1002,8 @@ class ThresholdChecker:
|
|||||||
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
|
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
|
||||||
threshold_value = threshold.warning
|
threshold_value = threshold.warning
|
||||||
|
|
||||||
|
alert_state.hysteresis = threshold.hysteresis if new_level != AlertLevel.OK else None
|
||||||
|
|
||||||
old_level = alert_state.level
|
old_level = alert_state.level
|
||||||
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
||||||
state_changes.append((metric_path, old_level, new_level, value))
|
state_changes.append((metric_path, old_level, new_level, value))
|
||||||
@@ -959,6 +1020,8 @@ class ThresholdChecker:
|
|||||||
value: Any,
|
value: Any,
|
||||||
threshold: ThresholdConfig,
|
threshold: ThresholdConfig,
|
||||||
plugin_data: Optional[Dict[str, Any]] = None,
|
plugin_data: Optional[Dict[str, Any]] = None,
|
||||||
|
check_name: Optional[str] = None,
|
||||||
|
metric_name: Optional[str] = None,
|
||||||
):
|
):
|
||||||
"""Trigger a notification for an alert state change.
|
"""Trigger a notification for an alert state change.
|
||||||
|
|
||||||
@@ -985,50 +1048,45 @@ class ThresholdChecker:
|
|||||||
import math
|
import math
|
||||||
display_value = "overdue" if isinstance(value, float) and math.isinf(value) else value
|
display_value = "overdue" if isinstance(value, float) and math.isinf(value) else value
|
||||||
|
|
||||||
# Format message
|
# Format message — for the nagios operator there is no numeric threshold_value;
|
||||||
|
# render the display template whenever one is available.
|
||||||
|
has_display = threshold_value is not None or threshold.operator == ComparisonOperator.NAGIOS
|
||||||
|
|
||||||
|
def _fmt():
|
||||||
|
return self._format_display(
|
||||||
|
threshold.display,
|
||||||
|
value=display_value,
|
||||||
|
threshold_value=threshold_value,
|
||||||
|
op_symbol=op_symbol,
|
||||||
|
plugin_data=plugin_data,
|
||||||
|
check_name=check_name,
|
||||||
|
metric_name=metric_name,
|
||||||
|
)
|
||||||
|
|
||||||
if new_level == AlertLevel.OK:
|
if new_level == AlertLevel.OK:
|
||||||
lvl = "RECOVER"
|
lvl = "RECOVER"
|
||||||
message = f"{metric_path} = {display_value} ({old_level.name} -> OK)"
|
message = f"{metric_path} = {display_value} ({old_level.name} -> OK)"
|
||||||
elif new_level == AlertLevel.WARNING:
|
elif new_level == AlertLevel.WARNING:
|
||||||
lvl = "WARNING"
|
lvl = "WARNING"
|
||||||
if threshold_value is not None:
|
if has_display:
|
||||||
threshold_info = self._format_display(
|
message = f"{metric_path} = {display_value} {_fmt()}"
|
||||||
threshold.display,
|
|
||||||
value=display_value,
|
|
||||||
threshold_value=threshold_value,
|
|
||||||
op_symbol=op_symbol,
|
|
||||||
plugin_data=plugin_data
|
|
||||||
)
|
|
||||||
message = f"{metric_path} = {display_value} {threshold_info}"
|
|
||||||
else:
|
else:
|
||||||
message = f"{metric_path} = {display_value}"
|
message = f"{metric_path} = {display_value}"
|
||||||
elif new_level == AlertLevel.CRITICAL:
|
elif new_level == AlertLevel.CRITICAL:
|
||||||
lvl = "CRITICAL"
|
lvl = "CRITICAL"
|
||||||
if threshold_value is not None:
|
if has_display:
|
||||||
threshold_info = self._format_display(
|
message = f"{metric_path} = {display_value} {_fmt()}"
|
||||||
threshold.display,
|
|
||||||
value=display_value,
|
|
||||||
threshold_value=threshold_value,
|
|
||||||
op_symbol=op_symbol,
|
|
||||||
plugin_data=plugin_data
|
|
||||||
)
|
|
||||||
message = f"{metric_path} = {display_value} {threshold_info}"
|
|
||||||
else:
|
else:
|
||||||
message = f"{metric_path} = {display_value}"
|
message = f"{metric_path} = {display_value}"
|
||||||
else:
|
else:
|
||||||
lvl = "UNKNOWN"
|
lvl = "UNKNOWN"
|
||||||
message = f"{metric_path} = {display_value}"
|
if has_display:
|
||||||
|
message = f"{metric_path} = {display_value} {_fmt()}"
|
||||||
|
else:
|
||||||
|
message = f"{metric_path} = {display_value}"
|
||||||
|
|
||||||
# Return the formatted threshold info for storing in AlertState
|
# Formatted threshold info stored on AlertState for the UI
|
||||||
formatted_threshold_msg = None
|
formatted_threshold_msg = _fmt() if has_display and new_level != AlertLevel.OK else None
|
||||||
if threshold_value is not None and new_level != AlertLevel.OK:
|
|
||||||
formatted_threshold_msg = self._format_display(
|
|
||||||
threshold.display,
|
|
||||||
value=display_value,
|
|
||||||
threshold_value=threshold_value,
|
|
||||||
op_symbol=op_symbol,
|
|
||||||
plugin_data=plugin_data
|
|
||||||
)
|
|
||||||
|
|
||||||
return lvl, message, formatted_threshold_msg
|
return lvl, message, formatted_threshold_msg
|
||||||
|
|
||||||
@@ -1077,18 +1135,24 @@ class ThresholdChecker:
|
|||||||
self,
|
self,
|
||||||
display_format: str,
|
display_format: str,
|
||||||
value: Any,
|
value: Any,
|
||||||
threshold_value: float,
|
threshold_value: Optional[float],
|
||||||
op_symbol: str,
|
op_symbol: str,
|
||||||
plugin_data: Optional[Dict[str, Any]] = None,
|
plugin_data: Optional[Dict[str, Any]] = None,
|
||||||
|
check_name: Optional[str] = None,
|
||||||
|
metric_name: Optional[str] = None,
|
||||||
) -> str:
|
) -> str:
|
||||||
"""Format the display string using available data.
|
"""Format the display string using available data.
|
||||||
|
|
||||||
Args:
|
Available template variables:
|
||||||
display_format: Format string from threshold config
|
{value} - current metric value
|
||||||
value: Current metric value
|
{threshold_value} - threshold that was exceeded
|
||||||
threshold_value: Threshold value that was exceeded
|
{op_symbol} - comparison operator (>, <, >=, <=, ==, !=)
|
||||||
op_symbol: Comparison operator symbol
|
{check_name} - prefix stripped for generic threshold match
|
||||||
plugin_data: Optional dictionary of plugin data fields
|
(e.g. "check_disk_root" when metric
|
||||||
|
"check_disk_root_status_code" matched generic
|
||||||
|
threshold "status_code")
|
||||||
|
{metric_name} - field name within the plugin data dict
|
||||||
|
Any key from plugin_data is also available.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Formatted display string
|
Formatted display string
|
||||||
@@ -1096,14 +1160,34 @@ class ThresholdChecker:
|
|||||||
# Build format context with standard variables
|
# Build format context with standard variables
|
||||||
format_context = {
|
format_context = {
|
||||||
'value': value,
|
'value': value,
|
||||||
'threshold_value': threshold_value,
|
|
||||||
'op_symbol': op_symbol,
|
'op_symbol': op_symbol,
|
||||||
}
|
}
|
||||||
|
if threshold_value is not None:
|
||||||
|
format_context['threshold_value'] = threshold_value
|
||||||
|
|
||||||
|
# Add generic-match context variables when available
|
||||||
|
if check_name is not None:
|
||||||
|
format_context['check_name'] = check_name
|
||||||
|
if metric_name is not None:
|
||||||
|
format_context['metric_name'] = metric_name
|
||||||
|
|
||||||
# Add all plugin data fields if available
|
# Add all plugin data fields if available
|
||||||
if plugin_data:
|
if plugin_data:
|
||||||
format_context.update(plugin_data)
|
format_context.update(plugin_data)
|
||||||
|
|
||||||
|
# For nagios_runner generic matches, expose the matched check's output
|
||||||
|
# and status as short aliases {output} and {status} so display templates
|
||||||
|
# don't need to use the full {check_disk_root_output} form.
|
||||||
|
if check_name and plugin_data:
|
||||||
|
if 'output' not in format_context:
|
||||||
|
output = plugin_data.get(f"{check_name}_output")
|
||||||
|
if output is not None:
|
||||||
|
format_context['output'] = output
|
||||||
|
if 'status' not in format_context:
|
||||||
|
status = plugin_data.get(f"{check_name}_status")
|
||||||
|
if status is not None:
|
||||||
|
format_context['status'] = status
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Format the display string
|
# Format the display string
|
||||||
return display_format.format(**format_context)
|
return display_format.format(**format_context)
|
||||||
@@ -1133,6 +1217,8 @@ class ThresholdChecker:
|
|||||||
value: Any,
|
value: Any,
|
||||||
threshold: ThresholdConfig,
|
threshold: ThresholdConfig,
|
||||||
plugin_data: Optional[Dict[str, Any]],
|
plugin_data: Optional[Dict[str, Any]],
|
||||||
|
check_name: Optional[str] = None,
|
||||||
|
metric_name: Optional[str] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Handle a state-change transition with grace-period logic.
|
"""Handle a state-change transition with grace-period logic.
|
||||||
|
|
||||||
@@ -1145,7 +1231,8 @@ class ThresholdChecker:
|
|||||||
- Past grace: fires the RECOVER notification normally.
|
- Past grace: fires the RECOVER notification normally.
|
||||||
"""
|
"""
|
||||||
lvl, message, formatted_msg = self._trigger_notification(
|
lvl, message, formatted_msg = self._trigger_notification(
|
||||||
host_name, metric_path, old_level, new_level, value, threshold, plugin_data
|
host_name, metric_path, old_level, new_level, value, threshold, plugin_data,
|
||||||
|
check_name=check_name, metric_name=metric_name,
|
||||||
)
|
)
|
||||||
alert_state.formatted_message = formatted_msg
|
alert_state.formatted_message = formatted_msg
|
||||||
|
|
||||||
@@ -1181,6 +1268,8 @@ class ThresholdChecker:
|
|||||||
value: Any,
|
value: Any,
|
||||||
threshold: ThresholdConfig,
|
threshold: ThresholdConfig,
|
||||||
plugin_data: Optional[Dict[str, Any]],
|
plugin_data: Optional[Dict[str, Any]],
|
||||||
|
check_name: Optional[str] = None,
|
||||||
|
metric_name: Optional[str] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Called when alert level is unchanged and non-OK.
|
"""Called when alert level is unchanged and non-OK.
|
||||||
|
|
||||||
@@ -1190,7 +1279,8 @@ class ThresholdChecker:
|
|||||||
if alert_state.pending_since is not None:
|
if alert_state.pending_since is not None:
|
||||||
if time.time() - alert_state.pending_since >= self.grace_seconds:
|
if time.time() - alert_state.pending_since >= self.grace_seconds:
|
||||||
lvl, message, formatted_msg = self._trigger_notification(
|
lvl, message, formatted_msg = self._trigger_notification(
|
||||||
host_name, metric_path, AlertLevel.OK, alert_state.level, value, threshold, plugin_data
|
host_name, metric_path, AlertLevel.OK, alert_state.level, value, threshold, plugin_data,
|
||||||
|
check_name=check_name, metric_name=metric_name,
|
||||||
)
|
)
|
||||||
alert_state.formatted_message = formatted_msg
|
alert_state.formatted_message = formatted_msg
|
||||||
self._send_notification(
|
self._send_notification(
|
||||||
@@ -1199,7 +1289,7 @@ class ThresholdChecker:
|
|||||||
alert_state.pending_since = None
|
alert_state.pending_since = None
|
||||||
# else: still within grace window, do nothing
|
# else: still within grace window, do nothing
|
||||||
else:
|
else:
|
||||||
self._check_renotify(host_name, alert_state, metric_path, value, threshold, plugin_data)
|
self._check_renotify(host_name, alert_state, metric_path, value, threshold, plugin_data, check_name=check_name, metric_name=metric_name)
|
||||||
|
|
||||||
def _check_renotify(
|
def _check_renotify(
|
||||||
self,
|
self,
|
||||||
@@ -1209,6 +1299,8 @@ class ThresholdChecker:
|
|||||||
value: Any,
|
value: Any,
|
||||||
threshold: ThresholdConfig,
|
threshold: ThresholdConfig,
|
||||||
plugin_data: Optional[Dict[str, Any]] = None,
|
plugin_data: Optional[Dict[str, Any]] = None,
|
||||||
|
check_name: Optional[str] = None,
|
||||||
|
metric_name: Optional[str] = None,
|
||||||
):
|
):
|
||||||
"""Check if we should send a repeat notification.
|
"""Check if we should send a repeat notification.
|
||||||
|
|
||||||
@@ -1255,7 +1347,9 @@ class ThresholdChecker:
|
|||||||
value=value,
|
value=value,
|
||||||
threshold_value=threshold_value,
|
threshold_value=threshold_value,
|
||||||
op_symbol=op_symbol,
|
op_symbol=op_symbol,
|
||||||
plugin_data=plugin_data
|
plugin_data=plugin_data,
|
||||||
|
check_name=check_name,
|
||||||
|
metric_name=metric_name,
|
||||||
)
|
)
|
||||||
message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} {threshold_info}, ongoing for {int(now - alert_state.since)}s"
|
message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} {threshold_info}, ongoing for {int(now - alert_state.since)}s"
|
||||||
else:
|
else:
|
||||||
@@ -1276,6 +1370,26 @@ class ThresholdChecker:
|
|||||||
alert_state.last_notification = now
|
alert_state.last_notification = now
|
||||||
alert_state.notification_count += 1
|
alert_state.notification_count += 1
|
||||||
|
|
||||||
|
def purge_stale_alerts(self, hbdclass) -> None:
    """Remove alert states that no longer match any configured threshold.

    Intended to be called after startup (pickle restore) and after each
    config reload so that alerts orphaned by configuration changes do not
    linger forever.

    For every host in ``hbdclass.Host.hosts`` the current thresholds are
    fetched with ``get_thresholds_for_host`` and each stored metric path is
    resolved through ``_find_threshold``.  A path therefore counts as
    configured when that lookup returns a threshold, whether by exact or by
    suffix match.  Paths for which the lookup returns no threshold are
    logged at INFO level and deleted from the host's ``alert_states``.

    Args:
        hbdclass: Object exposing ``Host.hosts``, a mapping of host name to
            a host object that carries an ``alert_states`` dict keyed by
            metric path.
    """
    for hostname, host in hbdclass.Host.hosts.items():
        # Nothing stored for this host: skip the threshold lookup entirely.
        if not host.alert_states:
            continue
        configured = self.get_thresholds_for_host(hostname)
        # Collect the stale keys first; deleting from the dict while
        # iterating over it would raise a RuntimeError.
        stale = [
            mp for mp in host.alert_states
            if self._find_threshold(configured, mp)[0] is None
        ]
        for mp in stale:
            logger.info(
                "Purging stale alert state for %s / %s (no threshold configured)",
                hostname, mp,
            )
            del host.alert_states[mp]
|
||||||
|
|
||||||
def get_active_alerts(self, alert_states: Dict[str, AlertState]) -> list:
|
def get_active_alerts(self, alert_states: Dict[str, AlertState]) -> list:
|
||||||
"""
|
"""
|
||||||
Get all currently active (non-OK) alerts.
|
Get all currently active (non-OK) alerts.
|
||||||
|
|||||||
+1
-1
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|||||||
|
|
||||||
[project]
|
[project]
|
||||||
name = "hbd"
|
name = "hbd"
|
||||||
version = "5.1.16"
|
version = "5.2.0"
|
||||||
description = "Heartbeat monitoring system — client (hbc) and server (hbd)"
|
description = "Heartbeat monitoring system — client (hbc) and server (hbd)"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
requires-python = ">=3.11"
|
requires-python = ">=3.11"
|
||||||
|
|||||||
+27
-10
@@ -41,7 +41,7 @@ from pathlib import Path
|
|||||||
from typing import Any, Dict, List, Optional, Tuple
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
# updated by scripts/bumpminor.sh
|
# updated by scripts/bumpminor.sh
|
||||||
__version__ = "5.1.16"
|
__version__ = "5.2.0"
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Protocol (mirrors hbd/common/proto.py)
|
# Protocol (mirrors hbd/common/proto.py)
|
||||||
@@ -388,7 +388,6 @@ class NagiosRunnerPlugin(MonitorPlugin):
|
|||||||
|
|
||||||
async def _collect_metrics(self) -> Dict[str, Any]:
|
async def _collect_metrics(self) -> Dict[str, Any]:
|
||||||
results: Dict[str, Any] = {}
|
results: Dict[str, Any] = {}
|
||||||
worst = 0
|
|
||||||
for cmd_cfg in self.commands:
|
for cmd_cfg in self.commands:
|
||||||
name = cmd_cfg.get("name")
|
name = cmd_cfg.get("name")
|
||||||
command = cmd_cfg.get("command")
|
command = cmd_cfg.get("command")
|
||||||
@@ -399,10 +398,6 @@ class NagiosRunnerPlugin(MonitorPlugin):
|
|||||||
results[f"{name}_status_code"] = rc
|
results[f"{name}_status_code"] = rc
|
||||||
results[f"{name}_output"] = msg
|
results[f"{name}_output"] = msg
|
||||||
results.update({f"{name}_{k}": v for k, v in perf.items()})
|
results.update({f"{name}_{k}": v for k, v in perf.items()})
|
||||||
worst = max(worst, rc)
|
|
||||||
results["overall_status"] = _NAGIOS_STATUS.get(worst, "UNKNOWN")
|
|
||||||
results["overall_status_code"] = worst
|
|
||||||
results["plugin_count"] = len(self.commands)
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
@@ -487,6 +482,12 @@ class CPUMonitorPlugin(MonitorPlugin):
|
|||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
with open("/proc/uptime") as fh:
|
||||||
|
data["uptime_seconds"] = int(float(fh.read().split()[0]))
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
@@ -535,6 +536,20 @@ class MemoryMonitorPlugin(MonitorPlugin):
|
|||||||
total = mi.get("MemTotal", 0)
|
total = mi.get("MemTotal", 0)
|
||||||
avail = mi.get("MemAvailable", mi.get("MemFree", 0))
|
avail = mi.get("MemAvailable", mi.get("MemFree", 0))
|
||||||
free = mi.get("MemFree", 0)
|
free = mi.get("MemFree", 0)
|
||||||
|
|
||||||
|
# ZFS ARC is reclaimable but not included in MemAvailable; add it.
|
||||||
|
arc_kb = 0
|
||||||
|
try:
|
||||||
|
with open("/proc/spl/kstat/zfs/arcstats") as _f:
|
||||||
|
for _line in _f:
|
||||||
|
_p = _line.split()
|
||||||
|
if len(_p) >= 3 and _p[0] == "size":
|
||||||
|
arc_kb = int(_p[2]) // 1024
|
||||||
|
break
|
||||||
|
except (OSError, ValueError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
avail = min(avail + arc_kb, total)
|
||||||
used = total - avail
|
used = total - avail
|
||||||
data: Dict[str, Any] = {
|
data: Dict[str, Any] = {
|
||||||
"memory_total": total * 1024,
|
"memory_total": total * 1024,
|
||||||
@@ -1052,8 +1067,8 @@ async def _async_main(args, cfg: Dict[str, Any]) -> int:
|
|||||||
if args.message:
|
if args.message:
|
||||||
bmsg["service"] = "service"
|
bmsg["service"] = "service"
|
||||||
bmsg["msg"] = args.message
|
bmsg["msg"] = args.message
|
||||||
for c in connections:
|
target = next((c for c in connections if c._transport), connections[0])
|
||||||
await c.sendto(bmsg)
|
await target.sendto(bmsg)
|
||||||
if args.message and not args.daemon:
|
if args.message and not args.daemon:
|
||||||
await asyncio.sleep(0.3)
|
await asyncio.sleep(0.3)
|
||||||
for c in connections:
|
for c in connections:
|
||||||
@@ -1085,11 +1100,13 @@ async def _async_main(args, cfg: Dict[str, Any]) -> int:
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
log.info("shutting down")
|
log.info("shutting down")
|
||||||
for conn in connections:
|
target = next((c for c in connections if c._transport), connections[0] if connections else None)
|
||||||
|
if target:
|
||||||
try:
|
try:
|
||||||
await conn.sendto({"shutdown": 1, "acks": conn.ackcount})
|
await target.sendto({"shutdown": 1, "acks": target.ackcount})
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
for conn in connections:
|
||||||
conn.close()
|
conn.close()
|
||||||
await asyncio.sleep(0.3)
|
await asyncio.sleep(0.3)
|
||||||
for plugin in plugins:
|
for plugin in plugins:
|
||||||
|
|||||||
+1
-2
@@ -68,8 +68,7 @@ async def test_nagios_runner():
|
|||||||
print(f" ✓ Collected {len(data)} data points")
|
print(f" ✓ Collected {len(data)} data points")
|
||||||
|
|
||||||
print(f"\n4. Results:")
|
print(f"\n4. Results:")
|
||||||
print(f" Overall Status: {data.get('overall_status')} (code: {data.get('overall_status_code')})")
|
print(f" Data points collected: {len(data)}")
|
||||||
print(f" Plugins Executed: {data.get('plugin_count')}")
|
|
||||||
|
|
||||||
# Show individual plugin results
|
# Show individual plugin results
|
||||||
print(f"\n5. Individual Plugin Results:")
|
print(f"\n5. Individual Plugin Results:")
|
||||||
|
|||||||
Reference in New Issue
Block a user