feat: async subprocess in nagios_runner with stderr capture and signal handling

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-25 16:18:09 +02:00
parent a76a39b4a0
commit b5963badd6
2 changed files with 79 additions and 36 deletions
+33 -30
View File
@@ -21,8 +21,9 @@ nagios_runner:
```
"""
import asyncio
import os
import re
import subprocess
from typing import Any, Dict, List, Optional, Tuple
from hbd.client.plugin import MonitorPlugin
@@ -76,7 +77,6 @@ class NagiosRunnerPlugin(MonitorPlugin):
# Extract configuration
self.commands: List[Dict[str, str]] = config.get("commands", []) if config else []
self.timeout: int = config.get("timeout", 30) if config else 30
self.shell: bool = config.get("shell", True) if config else True
self.interval = config.get("interval", 300) if config else 300
async def initialize(self) -> bool:
@@ -135,7 +135,7 @@ class NagiosRunnerPlugin(MonitorPlugin):
for metric_name, metric_value in perfdata.items():
results[f"{name}_{metric_name}"] = metric_value
self.logger.debug(
self.logger.info(
f"Executed {name}: {STATUS_NAMES.get(status_code, 'UNKNOWN')} - {output[:50]}"
)
@@ -157,46 +157,49 @@ class NagiosRunnerPlugin(MonitorPlugin):
self,
command: str
) -> Tuple[int, str, Dict[str, Any]]:
"""Execute a Nagios plugin and parse its output.
Args:
command: Command string to execute
Returns:
Tuple of (status_code, output_message, performance_data_dict)
"""
"""Execute a Nagios plugin and parse its output."""
try:
# Run command
result = subprocess.run(
proc = await asyncio.create_subprocess_shell(
command,
shell=self.shell,
capture_output=True,
timeout=self.timeout,
text=True
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
try:
stdout_bytes, stderr_bytes = await asyncio.wait_for(
proc.communicate(), timeout=self.timeout
)
except asyncio.TimeoutError:
proc.kill()
await proc.communicate()
self.logger.error(f"Command timed out: {command}")
return NAGIOS_UNKNOWN, f"Command timed out after {self.timeout}s", {}
status_code = result.returncode
output = result.stdout.strip()
status_code = proc.returncode
if status_code < 0:
return NAGIOS_UNKNOWN, f"Process killed by signal {-status_code}", {}
# Nagios plugins can return codes > 3, treat as UNKNOWN
if status_code > 3:
status_code = NAGIOS_UNKNOWN
# Parse performance data
perfdata = self._parse_perfdata(output)
stdout = stdout_bytes.decode(errors="replace").strip()
stderr = stderr_bytes.decode(errors="replace").strip()
# Extract just the status message (before the pipe if present)
if '|' in output:
output_msg = output.split('|')[0].strip()
# Parse perfdata from stdout before mixing in stderr
perfdata = self._parse_perfdata(stdout)
# Build status message
status_part = stdout.split('|')[0].strip() if '|' in stdout else stdout
if not stdout and stderr:
output_msg = stderr
elif stdout and stderr:
output_msg = f"{status_part} [stderr: {stderr}]"
else:
output_msg = output
output_msg = status_part
return status_code, output_msg, perfdata
except subprocess.TimeoutExpired:
self.logger.error(f"Command timed out: {command}")
return NAGIOS_UNKNOWN, f"Command timed out after {self.timeout}s", {}
except Exception as e:
self.logger.error(f"Error executing command: {e}")
return NAGIOS_UNKNOWN, f"Execution error: {str(e)}", {}
+40
View File
@@ -20,3 +20,43 @@ def test_no_commands_sets_skip_reason():
assert result is False
assert plugin.skip_reason is not None
assert "nagios_runner.commands" in plugin.skip_reason
def test_stderr_used_when_stdout_empty(tmp_path):
script = tmp_path / "check_err.sh"
script.write_text("#!/bin/sh\necho 'error from stderr' >&2\nexit 2\n")
script.chmod(script.stat().st_mode | stat.S_IEXEC)
config = {"commands": [{"name": "t", "command": str(script)}], "timeout": 5}
plugin = NagiosRunnerPlugin(config=config)
asyncio.run(plugin.initialize())
data = asyncio.run(plugin._collect_metrics())
assert "error from stderr" in data["t_output"]
assert data["t_status_code"] == NAGIOS_CRITICAL
def test_stderr_appended_when_both_present(tmp_path):
script = tmp_path / "check_both.sh"
script.write_text("#!/bin/sh\necho 'OK - all good'\necho 'extra detail' >&2\nexit 0\n")
script.chmod(script.stat().st_mode | stat.S_IEXEC)
config = {"commands": [{"name": "t", "command": str(script)}], "timeout": 5}
plugin = NagiosRunnerPlugin(config=config)
asyncio.run(plugin.initialize())
data = asyncio.run(plugin._collect_metrics())
assert "OK - all good" in data["t_output"]
assert "extra detail" in data["t_output"]
assert data["t_status_code"] == NAGIOS_OK
def test_negative_returncode_maps_to_unknown():
# kill -9 $$ kills the shell itself; asyncio sees returncode -9
config = {"commands": [{"name": "t", "command": "kill -9 $$"}], "timeout": 5}
plugin = NagiosRunnerPlugin(config=config)
asyncio.run(plugin.initialize())
data = asyncio.run(plugin._collect_metrics())
assert data["t_status_code"] == NAGIOS_UNKNOWN
assert "signal" in data["t_output"].lower()