feat: async subprocess in nagios_runner with stderr capture and signal handling

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-25 16:18:09 +02:00
parent a76a39b4a0
commit b5963badd6
2 changed files with 79 additions and 36 deletions
+39 -36
View File
@@ -21,8 +21,9 @@ nagios_runner:
``` ```
""" """
import asyncio
import os
import re import re
import subprocess
from typing import Any, Dict, List, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
from hbd.client.plugin import MonitorPlugin from hbd.client.plugin import MonitorPlugin
@@ -76,7 +77,6 @@ class NagiosRunnerPlugin(MonitorPlugin):
# Extract configuration # Extract configuration
self.commands: List[Dict[str, str]] = config.get("commands", []) if config else [] self.commands: List[Dict[str, str]] = config.get("commands", []) if config else []
self.timeout: int = config.get("timeout", 30) if config else 30 self.timeout: int = config.get("timeout", 30) if config else 30
self.shell: bool = config.get("shell", True) if config else True
self.interval = config.get("interval", 300) if config else 300 self.interval = config.get("interval", 300) if config else 300
async def initialize(self) -> bool: async def initialize(self) -> bool:
@@ -135,7 +135,7 @@ class NagiosRunnerPlugin(MonitorPlugin):
for metric_name, metric_value in perfdata.items(): for metric_name, metric_value in perfdata.items():
results[f"{name}_{metric_name}"] = metric_value results[f"{name}_{metric_name}"] = metric_value
self.logger.debug( self.logger.info(
f"Executed {name}: {STATUS_NAMES.get(status_code, 'UNKNOWN')} - {output[:50]}" f"Executed {name}: {STATUS_NAMES.get(status_code, 'UNKNOWN')} - {output[:50]}"
) )
@@ -157,46 +157,49 @@ class NagiosRunnerPlugin(MonitorPlugin):
self, self,
command: str command: str
) -> Tuple[int, str, Dict[str, Any]]: ) -> Tuple[int, str, Dict[str, Any]]:
"""Execute a Nagios plugin and parse its output. """Execute a Nagios plugin and parse its output."""
Args:
command: Command string to execute
Returns:
Tuple of (status_code, output_message, performance_data_dict)
"""
try: try:
# Run command proc = await asyncio.create_subprocess_shell(
result = subprocess.run(
command, command,
shell=self.shell, stdout=asyncio.subprocess.PIPE,
capture_output=True, stderr=asyncio.subprocess.PIPE,
timeout=self.timeout,
text=True
) )
try:
status_code = result.returncode stdout_bytes, stderr_bytes = await asyncio.wait_for(
output = result.stdout.strip() proc.communicate(), timeout=self.timeout
)
# Nagios plugins can return codes > 3, treat as UNKNOWN except asyncio.TimeoutError:
proc.kill()
await proc.communicate()
self.logger.error(f"Command timed out: {command}")
return NAGIOS_UNKNOWN, f"Command timed out after {self.timeout}s", {}
status_code = proc.returncode
if status_code < 0:
return NAGIOS_UNKNOWN, f"Process killed by signal {-status_code}", {}
if status_code > 3: if status_code > 3:
status_code = NAGIOS_UNKNOWN status_code = NAGIOS_UNKNOWN
# Parse performance data stdout = stdout_bytes.decode(errors="replace").strip()
perfdata = self._parse_perfdata(output) stderr = stderr_bytes.decode(errors="replace").strip()
# Extract just the status message (before the pipe if present) # Parse perfdata from stdout before mixing in stderr
if '|' in output: perfdata = self._parse_perfdata(stdout)
output_msg = output.split('|')[0].strip()
# Build status message
status_part = stdout.split('|')[0].strip() if '|' in stdout else stdout
if not stdout and stderr:
output_msg = stderr
elif stdout and stderr:
output_msg = f"{status_part} [stderr: {stderr}]"
else: else:
output_msg = output output_msg = status_part
return status_code, output_msg, perfdata return status_code, output_msg, perfdata
except subprocess.TimeoutExpired:
self.logger.error(f"Command timed out: {command}")
return NAGIOS_UNKNOWN, f"Command timed out after {self.timeout}s", {}
except Exception as e: except Exception as e:
self.logger.error(f"Error executing command: {e}") self.logger.error(f"Error executing command: {e}")
return NAGIOS_UNKNOWN, f"Execution error: {str(e)}", {} return NAGIOS_UNKNOWN, f"Execution error: {str(e)}", {}
+40
View File
@@ -20,3 +20,43 @@ def test_no_commands_sets_skip_reason():
assert result is False assert result is False
assert plugin.skip_reason is not None assert plugin.skip_reason is not None
assert "nagios_runner.commands" in plugin.skip_reason assert "nagios_runner.commands" in plugin.skip_reason
def test_stderr_used_when_stdout_empty(tmp_path):
script = tmp_path / "check_err.sh"
script.write_text("#!/bin/sh\necho 'error from stderr' >&2\nexit 2\n")
script.chmod(script.stat().st_mode | stat.S_IEXEC)
config = {"commands": [{"name": "t", "command": str(script)}], "timeout": 5}
plugin = NagiosRunnerPlugin(config=config)
asyncio.run(plugin.initialize())
data = asyncio.run(plugin._collect_metrics())
assert "error from stderr" in data["t_output"]
assert data["t_status_code"] == NAGIOS_CRITICAL
def test_stderr_appended_when_both_present(tmp_path):
script = tmp_path / "check_both.sh"
script.write_text("#!/bin/sh\necho 'OK - all good'\necho 'extra detail' >&2\nexit 0\n")
script.chmod(script.stat().st_mode | stat.S_IEXEC)
config = {"commands": [{"name": "t", "command": str(script)}], "timeout": 5}
plugin = NagiosRunnerPlugin(config=config)
asyncio.run(plugin.initialize())
data = asyncio.run(plugin._collect_metrics())
assert "OK - all good" in data["t_output"]
assert "extra detail" in data["t_output"]
assert data["t_status_code"] == NAGIOS_OK
def test_negative_returncode_maps_to_unknown():
# kill -9 $$ kills the shell itself; asyncio sees returncode -9
config = {"commands": [{"name": "t", "command": "kill -9 $$"}], "timeout": 5}
plugin = NagiosRunnerPlugin(config=config)
asyncio.run(plugin.initialize())
data = asyncio.run(plugin._collect_metrics())
assert data["t_status_code"] == NAGIOS_UNKNOWN
assert "signal" in data["t_output"].lower()