Add a ping monitor
This commit is contained in:
Vendored
+5
-4
@@ -4,6 +4,7 @@
|
||||
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
||||
"version": "0.2.0",
|
||||
"configurations": [
|
||||
|
||||
{
|
||||
"name": "Python: Run hbd (module)",
|
||||
"type": "debugpy",
|
||||
@@ -28,14 +29,14 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Python: Run hbd with debugpy (listen)",
|
||||
"name": "Python: Run hbc (module)",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "debugpy",
|
||||
"args": ["--listen", "5678", "--wait-for-client", "-m", "hbd.server.cli", "-c", "~/.hb.yaml", "-f", "-v"],
|
||||
"module": "hbd.client.main",
|
||||
"args": ["-c", "~/.hbc.yaml", "-v", "winter"],
|
||||
"cwd": "${workspaceFolder}",
|
||||
"env": { "PYTHONPATH": "${workspaceFolder}" },
|
||||
"console": "integratedTerminal",
|
||||
"justMyCode": false
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
@@ -2,6 +2,9 @@
|
||||
|
||||
import logging
|
||||
import os
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
try:
|
||||
import yaml
|
||||
@@ -30,18 +33,19 @@ def load_config(path=None):
|
||||
If YAML is not available or the file does not exist, defaults are returned.
|
||||
|
||||
Args:
|
||||
path: Path to YAML config file (default: ~/.hb.yaml)
|
||||
path: Path to YAML config file (default: ~/.hbc.yaml)
|
||||
|
||||
Returns:
|
||||
Dictionary with configuration
|
||||
"""
|
||||
cfg = CLIENT_DEFAULTS.copy()
|
||||
if not path:
|
||||
# default path (~/.hb.yaml)
|
||||
path = os.path.join(os.path.expanduser("~"), ".hb.yaml")
|
||||
# default path (~/.hbc.yaml)
|
||||
path = os.path.join(os.path.expanduser("~"), ".hbc.yaml")
|
||||
|
||||
if os.path.exists(path):
|
||||
if yaml:
|
||||
logger.info("Loading configuration from %s", path)
|
||||
with open(path) as fh:
|
||||
data = yaml.safe_load(fh)
|
||||
# Merge YAML data with defaults
|
||||
@@ -50,5 +54,5 @@ def load_config(path=None):
|
||||
cfg[k] = v
|
||||
else:
|
||||
# yaml not installed: do not attempt to parse; user must ensure defaults
|
||||
pass
|
||||
logger.warning("PyYAML not available - cannot load config from %s, using defaults", path)
|
||||
return cfg
|
||||
|
||||
+3
-3
@@ -644,9 +644,6 @@ def main(argv=None):
|
||||
parser = build_parser()
|
||||
args = parser.parse_args(argv)
|
||||
|
||||
# Load config
|
||||
config = load_config(args.configfile)
|
||||
|
||||
# Setup logging
|
||||
log_level = logging.WARNING
|
||||
if args.verbose:
|
||||
@@ -659,6 +656,9 @@ def main(argv=None):
|
||||
format="%(asctime)s %(name)s %(levelname)s: %(message)s",
|
||||
datefmt="%Y-%m-%d %H:%M:%S"
|
||||
)
|
||||
|
||||
# Load config
|
||||
config = load_config(args.configfile)
|
||||
|
||||
# Daemonize if requested
|
||||
if args.daemon:
|
||||
|
||||
@@ -311,7 +311,10 @@ class PluginLoader:
|
||||
return 0
|
||||
|
||||
loaded_count = 0
|
||||
plugin_config = config or {}
|
||||
raw_config = config or {}
|
||||
# Per-plugin config lives under the 'plugins' key; fall back to top-level
|
||||
# for backwards compatibility.
|
||||
plugin_config = raw_config.get("plugins", raw_config)
|
||||
|
||||
# Scan for Python files
|
||||
for plugin_file in directory.glob("*.py"):
|
||||
|
||||
@@ -0,0 +1,151 @@
|
||||
"""Ping Monitor Plugin for Heartbeat.
|
||||
|
||||
Pings one or more hosts and reports round-trip time. Results are sent as
|
||||
plugin metrics so the server-side threshold system can raise WARNING/CRITICAL
|
||||
alerts using the same RTT threshold configuration format used for heartbeat RTT.
|
||||
|
||||
Example configuration in ~/.hbc.yaml (or the plugins section of ~/.hb.yaml):
|
||||
|
||||
```yaml
|
||||
plugins:
|
||||
ping_monitor:
|
||||
interval: 60 # ping every 60 seconds (default)
|
||||
count: 3 # ICMP packets per ping run (default 3)
|
||||
timeout: 5 # seconds before a host is considered unreachable (default 5)
|
||||
hosts:
|
||||
8.8.8.8:
|
||||
warning: 20.0 # ms
|
||||
critical: 100.0 # ms
|
||||
192.168.1.1:
|
||||
warning: 5.0
|
||||
critical: 20.0
|
||||
```
|
||||
|
||||
Reported metrics per host (<host_key> is the configured host with every character
|
||||
other than ASCII letters, digits and underscores replaced by "_", e.g. 8.8.8.8 -> 8_8_8_8):
|
||||
|
||||
<host_key>_rtt_avg – average RTT in ms (float, or inf if unreachable)
|
||||
<host_key>_rtt_min – minimum RTT in ms (float, or inf if unreachable)
|
||||
<host_key>_rtt_max – maximum RTT in ms (float, or inf if unreachable)
|
||||
<host_key>_loss – packet loss percentage (0–100)
|
||||
|
||||
Server-side threshold config example:
|
||||
|
||||
```yaml
|
||||
threshold_configs:
|
||||
default:
|
||||
thresholds:
|
||||
ping_monitor:
|
||||
8_8_8_8_rtt_avg:
|
||||
warning: 20.0
|
||||
critical: 100.0
|
||||
```
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import re
|
||||
import sys
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from hbd.client.plugin import MonitorPlugin
|
||||
|
||||
|
||||
def _host_key(host: str) -> str:
|
||||
"""Convert a hostname/IP to a safe metric key (replace . and : with _)."""
|
||||
return re.sub(r"[^a-zA-Z0-9_]", "_", host)
|
||||
|
||||
|
||||
class PingMonitorPlugin(MonitorPlugin):
    """Ping one or more configured hosts and report RTT metrics.

    Each collection run starts one system ``ping`` process per configured host
    and reports, per host, the keys ``<host_key>_rtt_min``,
    ``<host_key>_rtt_avg``, ``<host_key>_rtt_max`` (milliseconds, ``inf`` when
    no RTT summary could be parsed) and ``<host_key>_loss`` (percent).

    Recognized configuration keys: ``interval``, ``count``, ``timeout`` and
    ``hosts`` (a mapping ``{host: {...}}``, a list of hosts, or a single host
    string).
    """

    name = "ping_monitor"
    version = "1.0.0"
    description = "ICMP ping latency monitoring"
    interval = 60

    # Linux/macOS summary: "... 0% packet loss"; Windows summary: "(0% loss)".
    _LOSS_RE = re.compile(r"(\d+(?:\.\d+)?)\s*%\s*(?:packet\s*)?loss")
    # Linux:   "rtt min/avg/max/mdev = x/x/x/x ms"
    # macOS:   "round-trip min/avg/max/stddev = x/x/x/x ms"
    # BusyBox: "round-trip min/avg/max = x/x/x ms" (no fourth field)
    _UNIX_RTT_RE = re.compile(
        r"(?:rtt|round-trip)\s+min/avg/max(?:/\S+)?\s*=\s*([\d.]+)/([\d.]+)/([\d.]+)"
    )
    # Windows (English locale): "Minimum = 10ms, Maximum = 12ms, Average = 11ms"
    _WINDOWS_RTT_RE = re.compile(
        r"Minimum\s*=\s*(\d+)\s*ms,\s*Maximum\s*=\s*(\d+)\s*ms,\s*Average\s*=\s*(\d+)\s*ms"
    )

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Read plugin settings from *config*.

        Args:
            config: Plugin configuration mapping; ``None`` means "all defaults".
        """
        super().__init__(config)
        cfg = config or {}
        self.interval = cfg.get("interval", 60)
        # ping rejects a zero/negative packet count or wait time, so clamp to 1.
        self.count = max(1, int(cfg.get("count", 3)))
        self.timeout = max(1, int(cfg.get("timeout", 5)))

        # hosts: dict of {hostname: {warning: x, critical: y}}, a list of
        # hostnames, or a single hostname. An empty YAML "hosts:" key loads as
        # None and is treated as "no hosts".
        raw_hosts = cfg.get("hosts") or {}
        if isinstance(raw_hosts, str):
            raw_hosts = [raw_hosts]
        if isinstance(raw_hosts, (list, tuple, set)):
            self.hosts = {str(h): {} for h in raw_hosts}
        else:
            # Keys are coerced to str so they can be passed to ping and to
            # _host_key even if YAML parsed them as a non-string scalar.
            self.hosts = {str(h): (opts or {}) for h, opts in dict(raw_hosts).items()}

    async def initialize(self) -> bool:
        """Report whether the plugin has anything to do.

        Returns:
            ``False`` (after logging a warning) when no hosts are configured,
            ``True`` otherwise.
        """
        if not self.hosts:
            self.logger.warning("ping_monitor: no hosts configured, plugin disabled")
            return False
        self.logger.info(
            "ping_monitor initialized: %d host(s), interval=%ds, count=%d, timeout=%ds",
            len(self.hosts), self.interval, self.count, self.timeout,
        )
        return True

    @staticmethod
    def _unreachable(loss: float = 100.0) -> Dict[str, float]:
        """Build the result reported when no RTT figures are available."""
        return {"rtt_min": float("inf"), "rtt_avg": float("inf"),
                "rtt_max": float("inf"), "loss": loss}

    def _build_command(self, host: str) -> list:
        """Return the platform-specific ping argument list for *host*."""
        if sys.platform == "win32":
            # Windows: -n <count>, -w <per-reply timeout in milliseconds>
            return ["ping", "-n", str(self.count), "-w", str(self.timeout * 1000), host]
        if sys.platform.startswith(("darwin", "freebsd")):
            # BSD-derived ping: -W is the per-reply wait time in MILLISECONDS.
            return ["ping", "-c", str(self.count), "-W", str(self.timeout * 1000), host]
        # Linux iputils (and BusyBox): -W is the per-reply wait time in seconds.
        return ["ping", "-c", str(self.count), "-W", str(self.timeout), host]

    @classmethod
    def _parse_output(cls, output: str) -> Dict[str, float]:
        """Extract loss and min/avg/max RTT from ping's summary output.

        Returns the "unreachable" result (infinite RTTs) with the parsed loss
        value when no RTT summary line is found; loss defaults to 100.0 when
        no loss figure is found either.
        """
        loss = 100.0
        loss_match = cls._LOSS_RE.search(output)
        if loss_match:
            loss = float(loss_match.group(1))

        unix_match = cls._UNIX_RTT_RE.search(output)
        if unix_match:
            rtt_min, rtt_avg, rtt_max = (float(g) for g in unix_match.groups())
        else:
            windows_match = cls._WINDOWS_RTT_RE.search(output)
            if not windows_match:
                # Host unreachable or all packets lost
                return cls._unreachable(loss)
            # Windows lists the figures in min, max, avg order.
            rtt_min, rtt_max, rtt_avg = (float(g) for g in windows_match.groups())

        return {
            "rtt_min": rtt_min,
            "rtt_avg": rtt_avg,
            "rtt_max": rtt_max,
            "loss": loss,
        }

    async def _ping(self, host: str) -> Dict[str, float]:
        """Run a system ping command and return rtt_min/avg/max/loss.

        Failures to start the process and overall timeouts are logged and
        reported as an unreachable host (infinite RTTs, 100% loss) rather than
        raised.
        """
        cmd = self._build_command(host)

        try:
            proc = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
        except OSError as e:  # includes FileNotFoundError (no ping binary)
            self.logger.warning("ping_monitor: ping failed for %s: %s", host, e)
            return self._unreachable()

        try:
            stdout, _ = await asyncio.wait_for(
                proc.communicate(),
                timeout=self.timeout * self.count + 2,
            )
        except (asyncio.TimeoutError, OSError) as e:
            # asyncio.TimeoutError stringifies to "", so name the cause explicitly.
            self.logger.warning(
                "ping_monitor: ping failed for %s: %s", host, str(e) or "timed out"
            )
            return self._unreachable()
        finally:
            # On timeout (or cancellation) the child is still running; kill and
            # reap it so ping processes do not pile up across collection runs.
            if proc.returncode is None:
                try:
                    proc.kill()
                except ProcessLookupError:
                    pass  # exited between the returncode check and kill()
                await proc.wait()

        return self._parse_output(stdout.decode(errors="replace"))

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Ping every configured host concurrently and flatten the results.

        Returns:
            Mapping of ``"<host_key>_<metric>"`` to value for the metrics
            ``rtt_min``, ``rtt_avg``, ``rtt_max`` and ``loss`` of each host.
        """
        data: Dict[str, Any] = {}
        # Start all pings first so they run in parallel, then collect in order.
        tasks = {host: asyncio.create_task(self._ping(host)) for host in self.hosts}
        for host, task in tasks.items():
            try:
                result = await task
            except Exception as e:
                self.logger.error("ping_monitor: error pinging %s: %s", host, e)
                result = self._unreachable()
            key = _host_key(host)
            for metric, value in result.items():
                data[f"{key}_{metric}"] = value
            status = "unreachable" if result["loss"] == 100.0 else f"{result['rtt_avg']:.1f}ms"
            self.logger.debug("ping_monitor: %s -> %s", host, status)
        return data
|
||||
Reference in New Issue
Block a user