"""Nagios Plugin Runner for Heartbeat.

Executes Nagios-compatible monitoring plugins and parses their output.

Nagios Plugin Standard:
- Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
- Output format: Single line status message, optional performance data
- Performance data format: 'label'=value[UOM];[warn];[crit];[min];[max]

Example configuration in ~/.hb.yaml:

```yaml
nagios_runner:
  interval: 60
  commands:
    - name: check_disk_root
      command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
    - name: check_procs
      command: /usr/lib/nagios/plugins/check_procs -w 250 -c 400
    - name: check_load
      command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
```
"""

import asyncio
import os
import re
import shlex
from typing import Any, Dict, List, Optional, Tuple

from hbd.client.plugin import MonitorPlugin

# Nagios exit codes
NAGIOS_OK = 0
NAGIOS_WARNING = 1
NAGIOS_CRITICAL = 2
NAGIOS_UNKNOWN = 3

STATUS_NAMES = {
    NAGIOS_OK: "OK",
    NAGIOS_WARNING: "WARNING",
    NAGIOS_CRITICAL: "CRITICAL",
    NAGIOS_UNKNOWN: "UNKNOWN",
}

# A plain decimal number as it may appear in performance data: optional sign,
# integer and/or fractional part, optional exponent.
_PERF_NUMBER = r"[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?"
_PERF_NUMBER_RE = re.compile(_PERF_NUMBER)

# One whitespace-delimited performance-data token:
#   'label'=value[UOM];[warn];[crit];[min];[max]
# The token must start at the beginning of the text or right after whitespace
# and must end at whitespace or end of text, so that a token which fails to
# parse can never be re-read from the middle as part of another label.
_PERF_TOKEN_RE = re.compile(
    r"(?:^|(?<=\s))"
    r"(?:'(?P<quoted>(?:[^'=]|'')+)'|(?P<bare>[^\s'=]+))"
    r"=(?P<value>" + _PERF_NUMBER + r")"
    r"(?P<uom>[^\s;\d]*)"
    r"(?P<fields>(?:;[^\s;]*)*)"
    r"(?=\s|$)"
)

# Result-key suffixes, in the positional order the fields follow the value.
_PERF_FIELD_SUFFIXES = ("warn", "crit", "min", "max")


class NagiosRunnerPlugin(MonitorPlugin):
    """Run Nagios-compatible monitoring plugins.

    This plugin executes external Nagios plugins and collects their output,
    including status codes, messages, and performance data.

    Configuration:
        interval: Collection interval in seconds (default: 300)
        commands: List of command definitions with 'name' and 'command' keys
        timeout: Command execution timeout in seconds (default: 30)

    Example:
        nagios_runner:
          interval: 300  # Check every 5 minutes
          timeout: 30
          commands:
            - name: check_disk
              command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10%
            - name: check_load
              command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
    """

    name = "nagios_runner"
    version = "1.0.0"
    description = "Execute Nagios-compatible monitoring plugins"
    interval = 300  # MonitorPlugin: collect every 5 minutes by default

    # Upper bound, in seconds, on how long to wait for a killed plugin to
    # release its output pipes after a timeout.
    _KILL_GRACE_SECONDS = 5

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Store the runner settings taken from ``config``.

        Args:
            config: Plugin configuration mapping; may be None or empty, in
                which case the defaults documented on the class apply.
        """
        super().__init__(config)

        cfg = config or {}
        # ``commands:`` with no value in YAML loads as None; normalise it to
        # an empty list so later iteration and len() are always safe.
        self.commands: List[Dict[str, str]] = cfg.get("commands") or []
        self.timeout: int = cfg.get("timeout", 30)
        self.interval = cfg.get("interval", 300)

    async def initialize(self) -> bool:
        """Initialize the Nagios runner plugin.

        Logs the configured commands and warns about absolute executable
        paths that are missing or not executable. The warnings are advisory
        only; they never cause initialization to fail.

        Returns:
            True if at least one command is configured, False otherwise
        """
        self.logger.info(f"Initializing {self.name} plugin")

        if not self.commands:
            self.skip_reason = "no commands configured (add nagios_runner.commands to config)"
            return False

        self.logger.info(f"Configured to run {len(self.commands)} Nagios plugin(s)")
        for cmd_config in self.commands:
            name = cmd_config.get("name", "unnamed")
            self.logger.info(f"  - {name}: {cmd_config.get('command', 'N/A')}")

        # Validate absolute command paths early
        for cmd_config in self.commands:
            name = cmd_config.get("name", "unnamed")
            command = cmd_config.get("command", "")
            if not command:
                continue
            try:
                tokens = shlex.split(command)
            except ValueError:
                continue  # malformed command string; skip validation
            if not tokens:
                continue
            exe = tokens[0]
            # Relative names are resolved by the shell at run time, so only
            # absolute paths can be checked here.
            if os.path.isabs(exe):
                if not os.path.isfile(exe):
                    self.logger.warning(
                        f"Command '{name}': executable not found: {exe}"
                    )
                elif not os.access(exe, os.X_OK):
                    self.logger.warning(
                        f"Command '{name}': executable not executable: {exe}"
                    )

        return True

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Collect metrics from all configured Nagios plugins.

        Commands are run one after another. For each command ``<name>`` the
        result contains ``<name>_status``, ``<name>_status_code`` and
        ``<name>_output`` plus one ``<name>_<metric>`` entry per parsed
        performance-data value.

        Returns:
            Dictionary with results from all plugins, plus
            ``overall_status``, ``overall_status_code`` (the numerically
            highest status code seen) and ``plugin_count``.
        """
        results: Dict[str, Any] = {}

        # Track overall status (worst status wins)
        worst_status = NAGIOS_OK

        for cmd_config in self.commands:
            name = cmd_config.get("name")
            command = cmd_config.get("command")

            if not name or not command:
                self.logger.warning("Skipping command with missing name or command")
                continue

            # Execute plugin
            try:
                status_code, output, perfdata = await self._run_nagios_plugin(command)

                # Store results
                results[f"{name}_status"] = STATUS_NAMES.get(status_code, "UNKNOWN")
                results[f"{name}_status_code"] = status_code
                results[f"{name}_output"] = output

                # Track worst status
                if status_code > worst_status:
                    worst_status = status_code

                # Add performance data, never letting a metric label such as
                # "status" or "output" clobber the plugin's own result fields.
                reserved = {
                    f"{name}_status",
                    f"{name}_status_code",
                    f"{name}_output",
                }
                for metric_name, metric_value in perfdata.items():
                    key = f"{name}_{metric_name}"
                    if key in reserved:
                        self.logger.warning(
                            f"Ignoring performance metric '{metric_name}' from {name}: "
                            f"it collides with reserved result key '{key}'"
                        )
                        continue
                    results[key] = metric_value

                self.logger.info(
                    f"Executed {name}: {STATUS_NAMES.get(status_code, 'UNKNOWN')} - {output[:50]}"
                )

            except Exception as e:
                # Boundary handler: one failing command must not prevent the
                # remaining commands from being collected.
                self.logger.error(f"Error running {name}: {e}", exc_info=True)
                results[f"{name}_status"] = "ERROR"
                results[f"{name}_status_code"] = NAGIOS_UNKNOWN
                results[f"{name}_output"] = str(e)
                worst_status = NAGIOS_UNKNOWN

        # Add overall status
        results["overall_status"] = STATUS_NAMES.get(worst_status, "UNKNOWN")
        results["overall_status_code"] = worst_status
        results["plugin_count"] = len(self.commands)

        return results

    async def _run_nagios_plugin(
        self, command: str
    ) -> Tuple[int, str, Dict[str, Any]]:
        """Execute a Nagios plugin and parse its output.

        The command string is run through the shell, so it must come from
        trusted configuration only.

        Args:
            command: Shell command line of the plugin to run.

        Returns:
            Tuple of (status code, status message, performance data). Any
            failure to run the command, a timeout, death by signal, or an
            exit code above 3 is reported as NAGIOS_UNKNOWN; this method
            does not raise.
        """
        try:
            proc = await asyncio.create_subprocess_shell(
                command,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
            try:
                stdout_bytes, stderr_bytes = await asyncio.wait_for(
                    proc.communicate(), timeout=self.timeout
                )
            except asyncio.TimeoutError:
                await self._kill_process(proc)
                self.logger.error(f"Command timed out: {command}")
                return NAGIOS_UNKNOWN, f"Command timed out after {self.timeout}s", {}

            status_code = proc.returncode
            if status_code < 0:
                # A negative return code means the process died from a signal.
                return NAGIOS_UNKNOWN, f"Process killed by signal {-status_code}", {}
            if status_code > 3:
                status_code = NAGIOS_UNKNOWN

            stdout = stdout_bytes.decode(errors="replace").strip()
            stderr = stderr_bytes.decode(errors="replace").strip()

            # Parse perfdata from stdout before mixing in stderr
            perfdata = self._parse_perfdata(stdout)

            # Build status message: the text before the first pipe character
            status_part = stdout.split("|", 1)[0].strip()
            if not stdout and stderr:
                output_msg = stderr
            elif stdout and stderr:
                output_msg = f"{status_part} [stderr: {stderr}]"
            else:
                output_msg = status_part

            return status_code, output_msg, perfdata

        except Exception as e:
            # Boundary handler: spawn failures and the like become an
            # UNKNOWN result instead of propagating to the collector.
            self.logger.error(f"Error executing command: {e}")
            return NAGIOS_UNKNOWN, f"Execution error: {str(e)}", {}

    async def _kill_process(self, proc: "asyncio.subprocess.Process") -> None:
        """Kill a timed-out plugin process and reap it without blocking forever.

        Args:
            proc: The subprocess whose first ``communicate()`` timed out.
        """
        try:
            proc.kill()
        except ProcessLookupError:
            # The process exited on its own between the timeout and the kill.
            pass

        try:
            # Only the shell is killed; a child it spawned may keep the pipes
            # open, so the drain is bounded instead of awaited indefinitely.
            await asyncio.wait_for(
                proc.communicate(), timeout=self._KILL_GRACE_SECONDS
            )
        except asyncio.TimeoutError:
            self.logger.warning(
                "Killed plugin process did not release its output pipes within "
                f"{self._KILL_GRACE_SECONDS}s"
            )

    @staticmethod
    def _perf_float(text: str) -> Optional[float]:
        """Return ``text`` as a float, or None if it is not a plain number."""
        if _PERF_NUMBER_RE.fullmatch(text) is None:
            return None
        return float(text)

    def _parse_perfdata(self, output: str) -> Dict[str, Any]:
        """Parse Nagios performance data from plugin output.

        Nagios performance data format:
            'label'=value[UOM];[warn];[crit];[min];[max]

        Multiple metrics separated by spaces. Labels may be single-quoted
        (required when they contain spaces; a literal quote is written as
        two quotes). Values may be negative or in scientific notation.

        Only plain numeric thresholds are stored. Range thresholds such as
        ``10:20``, ``~:10`` or ``@5:8`` are skipped so that every
        ``*_warn``/``*_crit``/``*_min``/``*_max`` entry stays a float.
        Metrics whose value is not a number (for example ``U`` for
        "unknown") are skipped entirely.

        Args:
            output: Plugin output string

        Returns:
            Dictionary of metric_name: value, with optional
            ``<label>_uom``, ``<label>_warn``, ``<label>_crit``,
            ``<label>_min`` and ``<label>_max`` entries.
        """
        perfdata: Dict[str, Any] = {}

        # Performance data comes after the pipe character
        if "|" not in output:
            return perfdata

        perf_section = output.split("|", 1)[1].strip()

        for match in _PERF_TOKEN_RE.finditer(perf_section):
            quoted = match.group("quoted")
            if quoted is not None:
                # Inside a quoted label, '' stands for one literal quote.
                label = quoted.replace("''", "'").strip()
            else:
                label = match.group("bare").strip()
            if not label:
                continue

            # The pattern only matches well-formed numbers, so this cannot fail.
            perfdata[label] = float(match.group("value"))

            # Optionally store UOM as separate field
            uom = match.group("uom")
            if uom:
                perfdata[f"{label}_uom"] = uom

            # "fields" is "" or ";warn;crit;min;max"; drop the empty piece
            # before the first ';' and ignore anything past the fourth field.
            raw_fields = match.group("fields").split(";")[1:5]
            for suffix, raw in zip(_PERF_FIELD_SUFFIXES, raw_fields):
                number = self._perf_float(raw)
                if number is not None:
                    perfdata[f"{label}_{suffix}"] = number

        return perfdata