Add a ping monitor
This commit is contained in:
Vendored
+5
-4
@@ -4,6 +4,7 @@
|
|||||||
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
||||||
"version": "0.2.0",
|
"version": "0.2.0",
|
||||||
"configurations": [
|
"configurations": [
|
||||||
|
|
||||||
{
|
{
|
||||||
"name": "Python: Run hbd (module)",
|
"name": "Python: Run hbd (module)",
|
||||||
"type": "debugpy",
|
"type": "debugpy",
|
||||||
@@ -28,14 +29,14 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "Python: Run hbd with debugpy (listen)",
|
"name": "Python: Run hbc (module)",
|
||||||
"type": "debugpy",
|
"type": "debugpy",
|
||||||
"request": "launch",
|
"request": "launch",
|
||||||
"module": "debugpy",
|
"module": "hbd.client.main",
|
||||||
"args": ["--listen", "5678", "--wait-for-client", "-m", "hbd.server.cli", "-c", "~/.hb.yaml", "-f", "-v"],
|
"args": ["-c", "~/.hbc.yaml", "-v", "winter"],
|
||||||
|
"cwd": "${workspaceFolder}",
|
||||||
"env": { "PYTHONPATH": "${workspaceFolder}" },
|
"env": { "PYTHONPATH": "${workspaceFolder}" },
|
||||||
"console": "integratedTerminal",
|
"console": "integratedTerminal",
|
||||||
"justMyCode": false
|
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,6 +2,9 @@
|
|||||||
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
import logging
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import yaml
|
import yaml
|
||||||
@@ -30,18 +33,19 @@ def load_config(path=None):
|
|||||||
If YAML is not available or the file does not exist, defaults are returned.
|
If YAML is not available or the file does not exist, defaults are returned.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
path: Path to YAML config file (default: ~/.hb.yaml)
|
path: Path to YAML config file (default: ~/.hbc.yaml)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dictionary with configuration
|
Dictionary with configuration
|
||||||
"""
|
"""
|
||||||
cfg = CLIENT_DEFAULTS.copy()
|
cfg = CLIENT_DEFAULTS.copy()
|
||||||
if not path:
|
if not path:
|
||||||
# default path (~/.hb.yaml)
|
# default path (~/.hbc.yaml)
|
||||||
path = os.path.join(os.path.expanduser("~"), ".hb.yaml")
|
path = os.path.join(os.path.expanduser("~"), ".hbc.yaml")
|
||||||
|
|
||||||
if os.path.exists(path):
|
if os.path.exists(path):
|
||||||
if yaml:
|
if yaml:
|
||||||
|
logger.info("Loading configuration from %s", path)
|
||||||
with open(path) as fh:
|
with open(path) as fh:
|
||||||
data = yaml.safe_load(fh)
|
data = yaml.safe_load(fh)
|
||||||
# Merge YAML data with defaults
|
# Merge YAML data with defaults
|
||||||
@@ -50,5 +54,5 @@ def load_config(path=None):
|
|||||||
cfg[k] = v
|
cfg[k] = v
|
||||||
else:
|
else:
|
||||||
# yaml not installed: do not attempt to parse; user must ensure defaults
|
# yaml not installed: do not attempt to parse; user must ensure defaults
|
||||||
pass
|
logger.warning("PyYAML not available - cannot load config from %s, using defaults", path)
|
||||||
return cfg
|
return cfg
|
||||||
|
|||||||
+3
-3
@@ -644,9 +644,6 @@ def main(argv=None):
|
|||||||
parser = build_parser()
|
parser = build_parser()
|
||||||
args = parser.parse_args(argv)
|
args = parser.parse_args(argv)
|
||||||
|
|
||||||
# Load config
|
|
||||||
config = load_config(args.configfile)
|
|
||||||
|
|
||||||
# Setup logging
|
# Setup logging
|
||||||
log_level = logging.WARNING
|
log_level = logging.WARNING
|
||||||
if args.verbose:
|
if args.verbose:
|
||||||
@@ -659,6 +656,9 @@ def main(argv=None):
|
|||||||
format="%(asctime)s %(name)s %(levelname)s: %(message)s",
|
format="%(asctime)s %(name)s %(levelname)s: %(message)s",
|
||||||
datefmt="%Y-%m-%d %H:%M:%S"
|
datefmt="%Y-%m-%d %H:%M:%S"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Load config
|
||||||
|
config = load_config(args.configfile)
|
||||||
|
|
||||||
# Daemonize if requested
|
# Daemonize if requested
|
||||||
if args.daemon:
|
if args.daemon:
|
||||||
|
|||||||
@@ -311,7 +311,10 @@ class PluginLoader:
|
|||||||
return 0
|
return 0
|
||||||
|
|
||||||
loaded_count = 0
|
loaded_count = 0
|
||||||
plugin_config = config or {}
|
raw_config = config or {}
|
||||||
|
# Per-plugin config lives under the 'plugins' key; fall back to top-level
|
||||||
|
# for backwards compatibility.
|
||||||
|
plugin_config = raw_config.get("plugins", raw_config)
|
||||||
|
|
||||||
# Scan for Python files
|
# Scan for Python files
|
||||||
for plugin_file in directory.glob("*.py"):
|
for plugin_file in directory.glob("*.py"):
|
||||||
|
|||||||
@@ -0,0 +1,151 @@
|
|||||||
|
"""Ping Monitor Plugin for Heartbeat.
|
||||||
|
|
||||||
|
Pings one or more hosts and reports round-trip time. Results are sent as
|
||||||
|
plugin metrics so the server-side threshold system can raise WARNING/CRITICAL
|
||||||
|
alerts using the same RTT threshold configuration format used for heartbeat RTT.
|
||||||
|
|
||||||
|
Example configuration in ~/.hbc.yaml (or the plugins section of ~/.hb.yaml):
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
plugins:
|
||||||
|
ping_monitor:
|
||||||
|
interval: 60 # ping every 60 seconds (default)
|
||||||
|
count: 3 # ICMP packets per ping run (default 3)
|
||||||
|
timeout: 5 # seconds before a host is considered unreachable (default 5)
|
||||||
|
hosts:
|
||||||
|
8.8.8.8:
|
||||||
|
warning: 20.0 # ms
|
||||||
|
critical: 100.0 # ms
|
||||||
|
192.168.1.1:
|
||||||
|
warning: 5.0
|
||||||
|
critical: 20.0
|
||||||
|
```
|
||||||
|
|
||||||
|
Reported metrics per host (<host_key> is the hostname/IP with every character
|
||||||
|
other than ASCII letters, digits and "_" replaced by "_", e.g. 8.8.8.8 -> 8_8_8_8):
|
||||||
|
|
||||||
|
<host_key>_rtt_avg – average RTT in ms (float, or inf if unreachable)
|
||||||
|
<host_key>_rtt_min – minimum RTT in ms
|
||||||
|
<host_key>_rtt_max – maximum RTT in ms
|
||||||
|
<host_key>_loss – packet loss percentage (0–100)
|
||||||
|
|
||||||
|
Server-side threshold config example:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
threshold_configs:
|
||||||
|
default:
|
||||||
|
thresholds:
|
||||||
|
ping_monitor:
|
||||||
|
8_8_8_8_rtt_avg:
|
||||||
|
warning: 20.0
|
||||||
|
critical: 100.0
|
||||||
|
```
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from typing import Any, Dict, Optional
|
||||||
|
|
||||||
|
from hbd.client.plugin import MonitorPlugin
|
||||||
|
|
||||||
|
|
||||||
|
def _host_key(host: str) -> str:
|
||||||
|
"""Convert a hostname/IP to a safe metric key (replace . and : with _)."""
|
||||||
|
return re.sub(r"[^a-zA-Z0-9_]", "_", host)
|
||||||
|
|
||||||
|
|
||||||
|
class PingMonitorPlugin(MonitorPlugin):
    """Ping one or more configured hosts and report RTT metrics.

    Each collection cycle runs the system ``ping`` binary once per configured
    host (all hosts concurrently) and returns a flat dict whose keys are
    ``<host_key>_<metric>``, where ``<host_key>`` is produced by
    ``_host_key`` and ``<metric>`` is one of:

    * ``rtt_min`` / ``rtt_avg`` / ``rtt_max`` - round-trip time in ms, or
      ``inf`` when no RTT summary could be parsed;
    * ``loss`` - packet loss in percent (0-100).

    Configuration keys read from ``config``:

    * ``interval`` - seconds between collections (default 60);
    * ``count`` - packets per ping run (default 3, minimum 1);
    * ``timeout`` - per-packet reply timeout in seconds (default 5, minimum 1);
    * ``hosts`` - mapping ``{host: {...}}``, a list of hosts, or a single
      host string.

    NOTE(review): ``self.logger`` is assumed to be provided by
    ``MonitorPlugin``; the base class is not visible from this file.
    """

    name = "ping_monitor"
    version = "1.0.0"
    description = "ICMP ping latency monitoring"
    interval = 60

    # "0% packet loss" (Linux/macOS/BusyBox) and "(0% loss)" (Windows).
    _LOSS_RE = re.compile(r"(\d+(?:\.\d+)?)\s*%\s*(?:packet\s*)?loss")
    # Linux:   "rtt min/avg/max/mdev = x/x/x/x ms"
    # macOS:   "round-trip min/avg/max/stddev = x/x/x/x ms"
    # BusyBox: "round-trip min/avg/max = x/x/x ms" (no fourth field)
    _UNIX_RTT_RE = re.compile(
        r"(?:rtt|round-trip)\s+min/avg/max(?:/\S+)?\s*=\s*([\d.]+)/([\d.]+)/([\d.]+)"
    )
    # Windows: "Minimum = 1ms, Maximum = 3ms, Average = 2ms" (note the order).
    _WIN_RTT_RE = re.compile(
        r"Minimum\s*=\s*(\d+)\s*ms,\s*Maximum\s*=\s*(\d+)\s*ms,\s*Average\s*=\s*(\d+)\s*ms"
    )

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Read plugin settings from ``config`` and normalize the host list.

        Args:
            config: Plugin configuration mapping; ``None`` means defaults
                (which leaves the host list empty).
        """
        super().__init__(config)
        cfg = config or {}
        self.interval = cfg.get("interval", 60)
        # ping rejects a zero/negative packet count or timeout, which would
        # make every host look unreachable; clamp to the smallest valid value.
        self.count = max(1, int(cfg.get("count", 3)))
        self.timeout = max(1, int(cfg.get("timeout", 5)))

        # hosts: dict of {hostname: {warning: x, critical: y}}, a list of
        # hostnames, or a single hostname. A bare "hosts:" in YAML loads as
        # None, hence the "or {}".
        raw_hosts = cfg.get("hosts") or {}
        if isinstance(raw_hosts, str):
            raw_hosts = [raw_hosts]
        if isinstance(raw_hosts, dict):
            # YAML may parse keys as non-strings (e.g. a numeric hostname) and
            # leave the value None when no thresholds are given.
            self.hosts = {str(h): (opts or {}) for h, opts in raw_hosts.items()}
        else:
            self.hosts = {str(h): {} for h in raw_hosts}

    async def initialize(self) -> bool:
        """Report whether the plugin has anything to do.

        Returns:
            ``False`` (after logging a warning) when no hosts are configured,
            ``True`` otherwise.
        """
        if not self.hosts:
            self.logger.warning("ping_monitor: no hosts configured, plugin disabled")
            return False
        self.logger.info(
            "ping_monitor initialized: %d host(s), interval=%ds, count=%d, timeout=%ds",
            len(self.hosts), self.interval, self.count, self.timeout,
        )
        return True

    @staticmethod
    def _unreachable(loss: float = 100.0) -> Dict[str, float]:
        """Return the metric set used when no RTT could be measured."""
        inf = float("inf")
        return {"rtt_min": inf, "rtt_avg": inf, "rtt_max": inf, "loss": loss}

    def _build_command(self, host: str) -> list:
        """Build the platform-specific ``ping`` argument list for ``host``."""
        count = str(self.count)
        if sys.platform == "win32":
            # Windows: -n count, -w timeout in milliseconds.
            return ["ping", "-n", count, "-w", str(self.timeout * 1000), str(host)]
        if sys.platform == "darwin" or sys.platform.startswith("freebsd"):
            # BSD-derived ping: -W is the per-packet wait in MILLISECONDS.
            return ["ping", "-c", count, "-W", str(self.timeout * 1000), str(host)]
        # Linux (iputils/BusyBox): -W is in seconds.
        return ["ping", "-c", count, "-W", str(self.timeout), str(host)]

    @classmethod
    def _parse_output(cls, output: str) -> Dict[str, float]:
        """Extract loss and RTT statistics from ``ping`` output.

        Loss defaults to 100% when no loss summary is found; RTT values are
        ``inf`` when no RTT summary is found.
        """
        loss = 100.0
        loss_match = cls._LOSS_RE.search(output)
        if loss_match:
            loss = float(loss_match.group(1))

        unix_match = cls._UNIX_RTT_RE.search(output)
        if unix_match:
            return {
                "rtt_min": float(unix_match.group(1)),
                "rtt_avg": float(unix_match.group(2)),
                "rtt_max": float(unix_match.group(3)),
                "loss": loss,
            }

        win_match = cls._WIN_RTT_RE.search(output)
        if win_match:
            # Windows prints min, max, avg - in that order.
            return {
                "rtt_min": float(win_match.group(1)),
                "rtt_avg": float(win_match.group(3)),
                "rtt_max": float(win_match.group(2)),
                "loss": loss,
            }

        # Host unreachable or all packets lost
        return cls._unreachable(loss)

    @staticmethod
    def _kill_quietly(proc) -> None:
        """Kill ``proc`` if it is still running; ignore an already-gone child."""
        if proc is not None and proc.returncode is None:
            try:
                proc.kill()
            except ProcessLookupError:
                pass  # exited between the returncode check and kill()

    async def _ping(self, host: str) -> Dict[str, float]:
        """Run a system ping command and return rtt_min/avg/max/loss.

        Args:
            host: Hostname or IP address to ping.

        Returns:
            Dict with ``rtt_min``, ``rtt_avg``, ``rtt_max`` (ms) and ``loss``
            (percent). If the command cannot be started or exceeds the overall
            deadline, RTTs are ``inf`` and loss is 100.
        """
        cmd = self._build_command(host)
        proc = None
        try:
            proc = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
            stdout, _ = await asyncio.wait_for(
                proc.communicate(),
                timeout=self.timeout * self.count + 2,
            )
        except (asyncio.TimeoutError, OSError) as e:
            # OSError covers a missing ping binary (FileNotFoundError).
            # wait_for() only cancels communicate(); the child keeps running
            # unless it is killed and reaped here.
            self._kill_quietly(proc)
            if proc is not None:
                await proc.wait()
            self.logger.warning(
                "ping_monitor: ping failed for %s: %s",
                host, str(e) or type(e).__name__,
            )
            return self._unreachable()
        except asyncio.CancelledError:
            # Do not leave an orphaned ping behind when collection is cancelled.
            self._kill_quietly(proc)
            raise

        return self._parse_output(stdout.decode(errors="replace"))

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Ping every configured host concurrently and flatten the results.

        Returns:
            Dict mapping ``<host_key>_<metric>`` to its value for every host.
            A host whose ping raised is reported as unreachable.
        """
        hosts = list(self.hosts)
        # gather() propagates cancellation to the per-host pings, so none of
        # them outlive a cancelled collection cycle.
        results = await asyncio.gather(
            *(self._ping(host) for host in hosts),
            return_exceptions=True,
        )

        data: Dict[str, Any] = {}
        for host, result in zip(hosts, results):
            if isinstance(result, BaseException):
                self.logger.error("ping_monitor: error pinging %s: %s", host, result)
                result = self._unreachable()
            key = _host_key(host)
            for metric, value in result.items():
                data[f"{key}_{metric}"] = value
            # Loss can be below 100 with no RTT summary (e.g. "destination
            # unreachable" replies), so check the RTT as well.
            no_reply = result["loss"] >= 100.0 or result["rtt_avg"] == float("inf")
            status = "unreachable" if no_reply else f"{result['rtt_avg']:.1f}ms"
            self.logger.debug("ping_monitor: %s -> %s", host, status)
        return data
|
||||||
Reference in New Issue
Block a user