diff --git a/.vscode/launch.json b/.vscode/launch.json index bdf17e8..e4a51e6 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -4,6 +4,7 @@ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 "version": "0.2.0", "configurations": [ + { "name": "Python: Run hbd (module)", "type": "debugpy", @@ -28,14 +29,14 @@ ] }, { - "name": "Python: Run hbd with debugpy (listen)", + "name": "Python: Run hbc (module)", "type": "debugpy", "request": "launch", - "module": "debugpy", - "args": ["--listen", "5678", "--wait-for-client", "-m", "hbd.server.cli", "-c", "~/.hb.yaml", "-f", "-v"], + "module": "hbd.client.main", + "args": ["-c", "~/.hbc.yaml", "-v", "winter"], + "cwd": "${workspaceFolder}", "env": { "PYTHONPATH": "${workspaceFolder}" }, "console": "integratedTerminal", - "justMyCode": false } ] } diff --git a/hbd/client/config.py b/hbd/client/config.py index 90ce9d0..9b85fc7 100644 --- a/hbd/client/config.py +++ b/hbd/client/config.py @@ -2,6 +2,9 @@ import logging import os +import logging + +logger = logging.getLogger(__name__) try: import yaml @@ -30,18 +33,19 @@ def load_config(path=None): If YAML is not available or the file does not exist, defaults are returned. 
Args: - path: Path to YAML config file (default: ~/.hb.yaml) + path: Path to YAML config file (default: ~/.hbc.yaml) Returns: Dictionary with configuration """ cfg = CLIENT_DEFAULTS.copy() if not path: - # default path (~/.hb.yaml) - path = os.path.join(os.path.expanduser("~"), ".hb.yaml") + # default path (~/.hbc.yaml) + path = os.path.join(os.path.expanduser("~"), ".hbc.yaml") if os.path.exists(path): if yaml: + logger.info("Loading configuration from %s", path) with open(path) as fh: data = yaml.safe_load(fh) # Merge YAML data with defaults @@ -50,5 +54,5 @@ def load_config(path=None): cfg[k] = v else: # yaml not installed: do not attempt to parse; user must ensure defaults - pass + logger.warning("PyYAML not available - cannot load config from %s, using defaults", path) return cfg diff --git a/hbd/client/main.py b/hbd/client/main.py index 218d132..4dd85ec 100644 --- a/hbd/client/main.py +++ b/hbd/client/main.py @@ -644,9 +644,6 @@ def main(argv=None): parser = build_parser() args = parser.parse_args(argv) - # Load config - config = load_config(args.configfile) - # Setup logging log_level = logging.WARNING if args.verbose: @@ -659,6 +656,9 @@ def main(argv=None): format="%(asctime)s %(name)s %(levelname)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S" ) + + # Load config + config = load_config(args.configfile) # Daemonize if requested if args.daemon: diff --git a/hbd/client/plugin.py b/hbd/client/plugin.py index beb6bef..8c5b1fc 100644 --- a/hbd/client/plugin.py +++ b/hbd/client/plugin.py @@ -311,7 +311,10 @@ class PluginLoader: return 0 loaded_count = 0 - plugin_config = config or {} + raw_config = config or {} + # Per-plugin config lives under the 'plugins' key; fall back to top-level + # for backwards compatibility. 
+        plugin_config = raw_config.get("plugins", raw_config)
 
         # Scan for Python files
         for plugin_file in directory.glob("*.py"):
diff --git a/hbd/client/plugins/ping_monitor.py b/hbd/client/plugins/ping_monitor.py
new file mode 100644
index 0000000..eee1eb4
--- /dev/null
+++ b/hbd/client/plugins/ping_monitor.py
@@ -0,0 +1,213 @@
+"""Ping Monitor Plugin for Heartbeat.
+
+Pings one or more hosts and reports round-trip time.  Results are sent as
+plugin metrics so the server-side threshold system can raise WARNING/CRITICAL
+alerts using the same RTT threshold configuration format used for heartbeat RTT.
+
+Example configuration in ~/.hbc.yaml (or the plugins section of ~/.hb.yaml):
+
+```yaml
+plugins:
+  ping_monitor:
+    interval: 60        # ping every 60 seconds (default)
+    count: 3            # ICMP packets per ping run (default 3)
+    timeout: 5          # seconds before a host is considered unreachable (default 5)
+    hosts:
+      8.8.8.8:
+        warning: 20.0   # ms
+        critical: 100.0 # ms
+      192.168.1.1:
+        warning: 5.0
+        critical: 20.0
+```
+
+Reported metrics per host (the metric key is the host with every character other
+than letters, digits and underscores replaced by an underscore):
+
+  <host_key>_rtt_avg   – average RTT in ms (float, or inf if unreachable)
+  <host_key>_rtt_min   – minimum RTT in ms
+  <host_key>_rtt_max   – maximum RTT in ms
+  <host_key>_loss      – packet loss percentage (0–100)
+
+Server-side threshold config example:
+
+```yaml
+threshold_configs:
+  default:
+    thresholds:
+      ping_monitor:
+        8_8_8_8_rtt_avg:
+          warning: 20.0
+          critical: 100.0
+```
+"""
+
+import asyncio
+import re
+import sys
+from typing import Any, Dict, Optional
+
+from hbd.client.plugin import MonitorPlugin
+
+# "<n>% packet loss" (Linux/macOS/BusyBox) or "(<n>% loss)" (English-locale Windows).
+_LOSS_RE = re.compile(r"(\d+(?:\.\d+)?)\s*%\s*(?:packet\s*)?loss")
+# Linux:   "rtt min/avg/max/mdev = x/x/x/x ms"
+# macOS:   "round-trip min/avg/max/stddev = x/x/x/x ms"
+# BusyBox: "round-trip min/avg/max = x/x/x ms"
+_RTT_UNIX_RE = re.compile(
+    r"(?:rtt|round-trip)\s+min/avg/max(?:/\S+)?\s*=\s*([\d.]+)/([\d.]+)/([\d.]+)"
+)
+# English-locale Windows: "Minimum = 1ms, Maximum = 3ms, Average = 2ms"
+_RTT_WINDOWS_RE = re.compile(
+    r"Minimum\s*=\s*(\d+)\s*ms,\s*Maximum\s*=\s*(\d+)\s*ms,\s*Average\s*=\s*(\d+)\s*ms"
+)
+
+
+def _host_key(host: str) -> str:
+    """Convert a hostname/IP to a safe metric key.
+
+    Every character other than ASCII letters, digits and underscores
+    (for example ``.``, ``:`` and ``-``) is replaced with ``_``.
+    """
+    return re.sub(r"[^a-zA-Z0-9_]", "_", host)
+
+
+def _unreachable(loss: float = 100.0) -> Dict[str, float]:
+    """Return the result reported when no RTT could be measured."""
+    inf = float("inf")
+    return {"rtt_min": inf, "rtt_avg": inf, "rtt_max": inf, "loss": loss}
+
+
+def _parse_ping_output(output: str) -> Dict[str, float]:
+    """Extract rtt_min/rtt_avg/rtt_max/loss from the text printed by ``ping``.
+
+    Recognises the summary lines of Linux, macOS and BusyBox ``ping`` and of
+    English-locale Windows ``ping``.  Loss defaults to 100% when no loss figure
+    is found; the RTT values are ``inf`` when no RTT summary is found.
+    """
+    loss_match = _LOSS_RE.search(output)
+    loss = float(loss_match.group(1)) if loss_match else 100.0
+
+    unix_match = _RTT_UNIX_RE.search(output)
+    if unix_match:
+        rtt_min, rtt_avg, rtt_max = (float(v) for v in unix_match.groups())
+        return {"rtt_min": rtt_min, "rtt_avg": rtt_avg, "rtt_max": rtt_max, "loss": loss}
+
+    windows_match = _RTT_WINDOWS_RE.search(output)
+    if windows_match:
+        # Windows prints the values in Minimum, Maximum, Average order.
+        rtt_min, rtt_max, rtt_avg = (float(v) for v in windows_match.groups())
+        return {"rtt_min": rtt_min, "rtt_avg": rtt_avg, "rtt_max": rtt_max, "loss": loss}
+
+    # Host unreachable or all packets lost
+    return _unreachable(loss)
+
+
+class PingMonitorPlugin(MonitorPlugin):
+    """Ping one or more configured hosts and report RTT metrics."""
+
+    name = "ping_monitor"
+    version = "1.0.0"
+    description = "ICMP ping latency monitoring"
+    interval = 60
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        """Read interval, count, timeout and hosts from the plugin config.
+
+        Args:
+            config: This plugin's configuration section.  Missing keys fall
+                back to interval=60, count=3, timeout=5 and no hosts.
+        """
+        super().__init__(config)
+        cfg = config or {}
+        self.interval = cfg.get("interval", 60)
+        self.count = int(cfg.get("count", 3))
+        self.timeout = int(cfg.get("timeout", 5))
+        # hosts: dict of {hostname: {warning: x, critical: y}}, list of
+        # hostnames, or a single hostname.  An empty "hosts:" key loads from
+        # YAML as None, which is treated the same as no hosts at all.
+        raw_hosts = cfg.get("hosts") or {}
+        if isinstance(raw_hosts, str):
+            raw_hosts = [raw_hosts]
+        if isinstance(raw_hosts, list):
+            self.hosts = {h: {} for h in raw_hosts}
+        else:
+            self.hosts = dict(raw_hosts)
+
+    async def initialize(self) -> bool:
+        """Return True when at least one host is configured, else disable the plugin."""
+        if not self.hosts:
+            self.logger.warning("ping_monitor: no hosts configured, plugin disabled")
+            return False
+        self.logger.info(
+            "ping_monitor initialized: %d host(s), interval=%ds, count=%d, timeout=%ds",
+            len(self.hosts), self.interval, self.count, self.timeout,
+        )
+        return True
+
+    async def _ping(self, host: str) -> Dict[str, float]:
+        """Run the system ``ping`` command and return rtt_min/avg/max/loss.
+
+        A missing ``ping`` binary, an OS error or a run that exceeds the
+        overall deadline is logged and reported as 100% loss with infinite RTT.
+        """
+        if sys.platform == "win32":
+            # Windows: -n is the count, -w the per-reply timeout in milliseconds.
+            cmd = ["ping", "-n", str(self.count), "-w", str(self.timeout * 1000), host]
+        elif sys.platform == "darwin":
+            # macOS: -W is the per-reply wait in milliseconds, not seconds.
+            cmd = ["ping", "-c", str(self.count), "-W", str(self.timeout * 1000), host]
+        else:
+            cmd = ["ping", "-c", str(self.count), "-W", str(self.timeout), host]
+
+        try:
+            proc = await asyncio.create_subprocess_exec(
+                *cmd,
+                stdout=asyncio.subprocess.PIPE,
+                stderr=asyncio.subprocess.PIPE,
+            )
+        except OSError as e:  # includes FileNotFoundError (no ping binary)
+            self.logger.warning("ping_monitor: ping failed for %s: %s", host, e)
+            return _unreachable()
+
+        try:
+            stdout, _ = await asyncio.wait_for(
+                proc.communicate(),
+                timeout=self.timeout * self.count + 2,
+            )
+        except (asyncio.TimeoutError, OSError) as e:
+            self.logger.warning("ping_monitor: ping failed for %s: %s", host, e)
+            return _unreachable()
+        finally:
+            # wait_for() only cancels communicate(); after a timeout or a
+            # cancellation the child keeps running unless it is killed and reaped.
+            if proc.returncode is None:
+                try:
+                    proc.kill()
+                except ProcessLookupError:
+                    pass  # exited between the check and the kill
+                await proc.wait()
+
+        return _parse_ping_output(stdout.decode(errors="replace"))
+
+    async def _collect_metrics(self) -> Dict[str, Any]:
+        """Ping every configured host concurrently and flatten the results.
+
+        Returns:
+            Mapping of ``<host_key>_<metric>`` to value for each host, where
+            metric is rtt_min, rtt_avg, rtt_max or loss.
+        """
+        data: Dict[str, Any] = {}
+        tasks = {host: asyncio.create_task(self._ping(host)) for host in self.hosts}
+        for host, task in tasks.items():
+            try:
+                result = await task
+            except Exception as e:
+                self.logger.error("ping_monitor: error pinging %s: %s", host, e)
+                result = _unreachable()
+            key = _host_key(host)
+            for metric, value in result.items():
+                data[f"{key}_{metric}"] = value
+            status = "unreachable" if result["loss"] == 100.0 else f"{result['rtt_avg']:.1f}ms"
+            self.logger.debug("ping_monitor: %s -> %s", host, status)
+        return data