From b5963badd68ee231d174adbbf996787e8a48a0f8 Mon Sep 17 00:00:00 2001 From: Andreas Wrede Date: Sat, 25 Apr 2026 16:18:09 +0200 Subject: [PATCH] feat: async subprocess in nagios_runner with stderr capture and signal handling Co-Authored-By: Claude Sonnet 4.6 --- hbd/client/plugins/nagios_runner.py | 75 +++++++++++++++-------------- tests/test_nagios_runner.py | 40 +++++++++++++++ 2 files changed, 79 insertions(+), 36 deletions(-) diff --git a/hbd/client/plugins/nagios_runner.py b/hbd/client/plugins/nagios_runner.py index 804418e..ddb72dd 100644 --- a/hbd/client/plugins/nagios_runner.py +++ b/hbd/client/plugins/nagios_runner.py @@ -21,8 +21,9 @@ nagios_runner: ``` """ +import asyncio +import os import re -import subprocess from typing import Any, Dict, List, Optional, Tuple from hbd.client.plugin import MonitorPlugin @@ -76,7 +77,6 @@ class NagiosRunnerPlugin(MonitorPlugin): # Extract configuration self.commands: List[Dict[str, str]] = config.get("commands", []) if config else [] self.timeout: int = config.get("timeout", 30) if config else 30 - self.shell: bool = config.get("shell", True) if config else True self.interval = config.get("interval", 300) if config else 300 async def initialize(self) -> bool: @@ -135,7 +135,7 @@ class NagiosRunnerPlugin(MonitorPlugin): for metric_name, metric_value in perfdata.items(): results[f"{name}_{metric_name}"] = metric_value - self.logger.debug( + self.logger.info( f"Executed {name}: {STATUS_NAMES.get(status_code, 'UNKNOWN')} - {output[:50]}" ) @@ -157,46 +157,49 @@ class NagiosRunnerPlugin(MonitorPlugin): self, command: str ) -> Tuple[int, str, Dict[str, Any]]: - """Execute a Nagios plugin and parse its output. - - Args: - command: Command string to execute - - Returns: - Tuple of (status_code, output_message, performance_data_dict) - """ + """Execute a Nagios plugin and parse its output.""" try: - # Run command - result = subprocess.run( + proc = await asyncio.create_subprocess_shell( command, - shell=self.shell, - capture_output=True, - timeout=self.timeout, - text=True + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, ) - - status_code = result.returncode - output = result.stdout.strip() - - # Nagios plugins can return codes > 3, treat as UNKNOWN + try: + stdout_bytes, stderr_bytes = await asyncio.wait_for( + proc.communicate(), timeout=self.timeout + ) + except asyncio.TimeoutError: + proc.kill() + await proc.communicate() + self.logger.error(f"Command timed out: {command}") + return NAGIOS_UNKNOWN, f"Command timed out after {self.timeout}s", {} + + status_code = proc.returncode + + if status_code < 0: + return NAGIOS_UNKNOWN, f"Process killed by signal {-status_code}", {} + if status_code > 3: status_code = NAGIOS_UNKNOWN - - # Parse performance data - perfdata = self._parse_perfdata(output) - - # Extract just the status message (before the pipe if present) - if '|' in output: - output_msg = output.split('|')[0].strip() + + stdout = stdout_bytes.decode(errors="replace").strip() + stderr = stderr_bytes.decode(errors="replace").strip() + + # Parse perfdata from stdout before mixing in stderr + perfdata = self._parse_perfdata(stdout) + + # Build status message + status_part = stdout.split('|')[0].strip() if '|' in stdout else stdout + + if not stdout and stderr: + output_msg = stderr + elif stdout and stderr: + output_msg = f"{status_part} [stderr: {stderr}]" else: - output_msg = output - + output_msg = status_part + return status_code, output_msg, perfdata - - except subprocess.TimeoutExpired: - self.logger.error(f"Command timed out: {command}") - return NAGIOS_UNKNOWN, f"Command timed out after {self.timeout}s", {} - + except Exception as e: self.logger.error(f"Error executing command: {e}") return NAGIOS_UNKNOWN, f"Execution error: {str(e)}", {} diff --git a/tests/test_nagios_runner.py b/tests/test_nagios_runner.py index 4656a5e..b2e05c9 100644 --- a/tests/test_nagios_runner.py +++ b/tests/test_nagios_runner.py @@ -20,3 +20,43 @@ def test_no_commands_sets_skip_reason(): assert result is False assert plugin.skip_reason is not None assert "nagios_runner.commands" in plugin.skip_reason + + +def test_stderr_used_when_stdout_empty(tmp_path): + script = tmp_path / "check_err.sh" + script.write_text("#!/bin/sh\necho 'error from stderr' >&2\nexit 2\n") + script.chmod(script.stat().st_mode | stat.S_IEXEC) + + config = {"commands": [{"name": "t", "command": str(script)}], "timeout": 5} + plugin = NagiosRunnerPlugin(config=config) + asyncio.run(plugin.initialize()) + data = asyncio.run(plugin._collect_metrics()) + + assert "error from stderr" in data["t_output"] + assert data["t_status_code"] == NAGIOS_CRITICAL + + +def test_stderr_appended_when_both_present(tmp_path): + script = tmp_path / "check_both.sh" + script.write_text("#!/bin/sh\necho 'OK - all good'\necho 'extra detail' >&2\nexit 0\n") + script.chmod(script.stat().st_mode | stat.S_IEXEC) + + config = {"commands": [{"name": "t", "command": str(script)}], "timeout": 5} + plugin = NagiosRunnerPlugin(config=config) + asyncio.run(plugin.initialize()) + data = asyncio.run(plugin._collect_metrics()) + + assert "OK - all good" in data["t_output"] + assert "extra detail" in data["t_output"] + assert data["t_status_code"] == NAGIOS_OK + + +def test_negative_returncode_maps_to_unknown(): + # kill -9 $$ kills the shell itself; asyncio sees returncode -9 + config = {"commands": [{"name": "t", "command": "kill -9 $$"}], "timeout": 5} + plugin = NagiosRunnerPlugin(config=config) + asyncio.run(plugin.initialize()) + data = asyncio.run(plugin._collect_metrics()) + + assert data["t_status_code"] == NAGIOS_UNKNOWN + assert "signal" in data["t_output"].lower()