Add a ping monitor

This commit is contained in:
Andreas Wrede
2026-04-11 15:25:23 -04:00
parent 6217f7a124
commit ee3b72878f
5 changed files with 171 additions and 12 deletions
+5 -4
View File
@@ -4,6 +4,7 @@
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python: Run hbd (module)",
"type": "debugpy",
@@ -28,14 +29,14 @@
]
},
{
"name": "Python: Run hbd with debugpy (listen)",
"name": "Python: Run hbc (module)",
"type": "debugpy",
"request": "launch",
"module": "debugpy",
"args": ["--listen", "5678", "--wait-for-client", "-m", "hbd.server.cli", "-c", "~/.hb.yaml", "-f", "-v"],
"module": "hbd.client.main",
"args": ["-c", "~/.hbc.yaml", "-v", "winter"],
"cwd": "${workspaceFolder}",
"env": { "PYTHONPATH": "${workspaceFolder}" },
"console": "integratedTerminal",
"justMyCode": false
}
]
}
+8 -4
View File
@@ -2,6 +2,9 @@
import logging
import os
import logging
logger = logging.getLogger(__name__)
try:
import yaml
@@ -30,18 +33,19 @@ def load_config(path=None):
If YAML is not available or the file does not exist, defaults are returned.
Args:
path: Path to YAML config file (default: ~/.hb.yaml)
path: Path to YAML config file (default: ~/.hbc.yaml)
Returns:
Dictionary with configuration
"""
cfg = CLIENT_DEFAULTS.copy()
if not path:
# default path (~/.hb.yaml)
path = os.path.join(os.path.expanduser("~"), ".hb.yaml")
# default path (~/.hbc.yaml)
path = os.path.join(os.path.expanduser("~"), ".hbc.yaml")
if os.path.exists(path):
if yaml:
logger.info("Loading configuration from %s", path)
with open(path) as fh:
data = yaml.safe_load(fh)
# Merge YAML data with defaults
@@ -50,5 +54,5 @@ def load_config(path=None):
cfg[k] = v
else:
# yaml not installed: do not attempt to parse; user must ensure defaults
pass
logger.warning("PyYAML not available - cannot load config from %s, using defaults", path)
return cfg
+3 -3
View File
@@ -644,9 +644,6 @@ def main(argv=None):
parser = build_parser()
args = parser.parse_args(argv)
# Load config
config = load_config(args.configfile)
# Setup logging
log_level = logging.WARNING
if args.verbose:
@@ -659,6 +656,9 @@ def main(argv=None):
format="%(asctime)s %(name)s %(levelname)s: %(message)s",
datefmt="%Y-%m-%d %H:%M:%S"
)
# Load config
config = load_config(args.configfile)
# Daemonize if requested
if args.daemon:
+4 -1
View File
@@ -311,7 +311,10 @@ class PluginLoader:
return 0
loaded_count = 0
plugin_config = config or {}
raw_config = config or {}
# Per-plugin config lives under the 'plugins' key; fall back to top-level
# for backwards compatibility.
plugin_config = raw_config.get("plugins", raw_config)
# Scan for Python files
for plugin_file in directory.glob("*.py"):
+151
View File
@@ -0,0 +1,151 @@
"""Ping Monitor Plugin for Heartbeat.
Pings one or more hosts and reports round-trip time. Results are sent as
plugin metrics so the server-side threshold system can raise WARNING/CRITICAL
alerts using the same RTT threshold configuration format used for heartbeat RTT.
Example configuration in ~/.hbc.yaml (or the plugins section of ~/.hb.yaml):
```yaml
plugins:
ping_monitor:
interval: 60 # ping every 60 seconds (default)
count: 3 # ICMP packets per ping run (default 3)
timeout: 5 # seconds before a host is considered unreachable (default 5)
hosts:
8.8.8.8:
warning: 20.0 # ms
critical: 100.0 # ms
192.168.1.1:
warning: 5.0
critical: 20.0
```
Reported metrics per host (metric key uses the hostname with dots/colons replaced
by underscores so it is a valid identifier):
ping.<hostname>.rtt_avg average RTT in ms (float, or inf if unreachable)
ping.<hostname>.rtt_min minimum RTT in ms
ping.<hostname>.rtt_max maximum RTT in ms
ping.<hostname>.loss packet loss percentage (0100)
Server-side threshold config example:
```yaml
threshold_configs:
default:
thresholds:
ping_monitor:
8_8_8_8_rtt_avg:
warning: 20.0
critical: 100.0
```
"""
import asyncio
import re
import sys
from typing import Any, Dict, Optional
from hbd.client.plugin import MonitorPlugin
def _host_key(host: str) -> str:
"""Convert a hostname/IP to a safe metric key (replace . and : with _)."""
return re.sub(r"[^a-zA-Z0-9_]", "_", host)
class PingMonitorPlugin(MonitorPlugin):
"""Ping one or more configured hosts and report RTT metrics."""
name = "ping_monitor"
version = "1.0.0"
description = "ICMP ping latency monitoring"
interval = 60
def __init__(self, config: Optional[Dict[str, Any]] = None):
super().__init__(config)
cfg = config or {}
self.interval = cfg.get("interval", 60)
self.count = int(cfg.get("count", 3))
self.timeout = int(cfg.get("timeout", 5))
# hosts: dict of {hostname: {warning: x, critical: y}} or list of hostnames
raw_hosts = cfg.get("hosts", {})
if isinstance(raw_hosts, list):
self.hosts = {h: {} for h in raw_hosts}
else:
self.hosts = dict(raw_hosts)
async def initialize(self) -> bool:
if not self.hosts:
self.logger.warning("ping_monitor: no hosts configured, plugin disabled")
return False
self.logger.info(
"ping_monitor initialized: %d host(s), interval=%ds, count=%d, timeout=%ds",
len(self.hosts), self.interval, self.count, self.timeout,
)
return True
async def _ping(self, host: str) -> Dict[str, float]:
"""Run a system ping command and return rtt_min/avg/max/loss."""
if sys.platform == "win32":
cmd = ["ping", "-n", str(self.count), "-w", str(self.timeout * 1000), host]
else:
cmd = ["ping", "-c", str(self.count), "-W", str(self.timeout), host]
try:
proc = await asyncio.create_subprocess_exec(
*cmd,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
stdout, _ = await asyncio.wait_for(
proc.communicate(),
timeout=self.timeout * self.count + 2,
)
output = stdout.decode(errors="replace")
except (asyncio.TimeoutError, FileNotFoundError, OSError) as e:
self.logger.warning("ping_monitor: ping failed for %s: %s", host, e)
return {"rtt_min": float("inf"), "rtt_avg": float("inf"),
"rtt_max": float("inf"), "loss": 100.0}
# Parse packet loss
loss = 100.0
loss_match = re.search(r"(\d+(?:\.\d+)?)\s*%\s*packet\s*loss", output)
if loss_match:
loss = float(loss_match.group(1))
# Parse rtt min/avg/max — Linux: "rtt min/avg/max/mdev = x/x/x/x ms"
# macOS: "round-trip min/avg/max/stddev = x/x/x/x ms"
rtt_match = re.search(
r"(?:rtt|round-trip)\s+min/avg/max/\S+\s*=\s*([\d.]+)/([\d.]+)/([\d.]+)",
output,
)
if rtt_match:
return {
"rtt_min": float(rtt_match.group(1)),
"rtt_avg": float(rtt_match.group(2)),
"rtt_max": float(rtt_match.group(3)),
"loss": loss,
}
# Host unreachable or all packets lost
return {"rtt_min": float("inf"), "rtt_avg": float("inf"),
"rtt_max": float("inf"), "loss": loss}
async def _collect_metrics(self) -> Dict[str, Any]:
data: Dict[str, Any] = {}
tasks = {host: asyncio.create_task(self._ping(host)) for host in self.hosts}
for host, task in tasks.items():
try:
result = await task
except Exception as e:
self.logger.error("ping_monitor: error pinging %s: %s", host, e)
result = {"rtt_min": float("inf"), "rtt_avg": float("inf"),
"rtt_max": float("inf"), "loss": 100.0}
key = _host_key(host)
for metric, value in result.items():
data[f"{key}_{metric}"] = value
status = "unreachable" if result["loss"] == 100.0 else f"{result['rtt_avg']:.1f}ms"
self.logger.debug("ping_monitor: %s -> %s", host, status)
return data