feat: async subprocess in nagios_runner with stderr capture and signal handling
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -21,8 +21,9 @@ nagios_runner:
|
|||||||
```
|
```
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
import re
|
import re
|
||||||
import subprocess
|
|
||||||
from typing import Any, Dict, List, Optional, Tuple
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
from hbd.client.plugin import MonitorPlugin
|
from hbd.client.plugin import MonitorPlugin
|
||||||
@@ -76,7 +77,6 @@ class NagiosRunnerPlugin(MonitorPlugin):
|
|||||||
# Extract configuration
|
# Extract configuration
|
||||||
self.commands: List[Dict[str, str]] = config.get("commands", []) if config else []
|
self.commands: List[Dict[str, str]] = config.get("commands", []) if config else []
|
||||||
self.timeout: int = config.get("timeout", 30) if config else 30
|
self.timeout: int = config.get("timeout", 30) if config else 30
|
||||||
self.shell: bool = config.get("shell", True) if config else True
|
|
||||||
self.interval = config.get("interval", 300) if config else 300
|
self.interval = config.get("interval", 300) if config else 300
|
||||||
|
|
||||||
async def initialize(self) -> bool:
|
async def initialize(self) -> bool:
|
||||||
@@ -135,7 +135,7 @@ class NagiosRunnerPlugin(MonitorPlugin):
|
|||||||
for metric_name, metric_value in perfdata.items():
|
for metric_name, metric_value in perfdata.items():
|
||||||
results[f"{name}_{metric_name}"] = metric_value
|
results[f"{name}_{metric_name}"] = metric_value
|
||||||
|
|
||||||
self.logger.debug(
|
self.logger.info(
|
||||||
f"Executed {name}: {STATUS_NAMES.get(status_code, 'UNKNOWN')} - {output[:50]}"
|
f"Executed {name}: {STATUS_NAMES.get(status_code, 'UNKNOWN')} - {output[:50]}"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -157,46 +157,49 @@ class NagiosRunnerPlugin(MonitorPlugin):
|
|||||||
self,
|
self,
|
||||||
command: str
|
command: str
|
||||||
) -> Tuple[int, str, Dict[str, Any]]:
|
) -> Tuple[int, str, Dict[str, Any]]:
|
||||||
"""Execute a Nagios plugin and parse its output.
|
"""Execute a Nagios plugin and parse its output."""
|
||||||
|
|
||||||
Args:
|
|
||||||
command: Command string to execute
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Tuple of (status_code, output_message, performance_data_dict)
|
|
||||||
"""
|
|
||||||
try:
|
try:
|
||||||
# Run command
|
proc = await asyncio.create_subprocess_shell(
|
||||||
result = subprocess.run(
|
|
||||||
command,
|
command,
|
||||||
shell=self.shell,
|
stdout=asyncio.subprocess.PIPE,
|
||||||
capture_output=True,
|
stderr=asyncio.subprocess.PIPE,
|
||||||
timeout=self.timeout,
|
|
||||||
text=True
|
|
||||||
)
|
)
|
||||||
|
try:
|
||||||
|
stdout_bytes, stderr_bytes = await asyncio.wait_for(
|
||||||
|
proc.communicate(), timeout=self.timeout
|
||||||
|
)
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
proc.kill()
|
||||||
|
await proc.communicate()
|
||||||
|
self.logger.error(f"Command timed out: {command}")
|
||||||
|
return NAGIOS_UNKNOWN, f"Command timed out after {self.timeout}s", {}
|
||||||
|
|
||||||
status_code = result.returncode
|
status_code = proc.returncode
|
||||||
output = result.stdout.strip()
|
|
||||||
|
if status_code < 0:
|
||||||
|
return NAGIOS_UNKNOWN, f"Process killed by signal {-status_code}", {}
|
||||||
|
|
||||||
# Nagios plugins can return codes > 3, treat as UNKNOWN
|
|
||||||
if status_code > 3:
|
if status_code > 3:
|
||||||
status_code = NAGIOS_UNKNOWN
|
status_code = NAGIOS_UNKNOWN
|
||||||
|
|
||||||
# Parse performance data
|
stdout = stdout_bytes.decode(errors="replace").strip()
|
||||||
perfdata = self._parse_perfdata(output)
|
stderr = stderr_bytes.decode(errors="replace").strip()
|
||||||
|
|
||||||
# Extract just the status message (before the pipe if present)
|
# Parse perfdata from stdout before mixing in stderr
|
||||||
if '|' in output:
|
perfdata = self._parse_perfdata(stdout)
|
||||||
output_msg = output.split('|')[0].strip()
|
|
||||||
|
# Build status message
|
||||||
|
status_part = stdout.split('|')[0].strip() if '|' in stdout else stdout
|
||||||
|
|
||||||
|
if not stdout and stderr:
|
||||||
|
output_msg = stderr
|
||||||
|
elif stdout and stderr:
|
||||||
|
output_msg = f"{status_part} [stderr: {stderr}]"
|
||||||
else:
|
else:
|
||||||
output_msg = output
|
output_msg = status_part
|
||||||
|
|
||||||
return status_code, output_msg, perfdata
|
return status_code, output_msg, perfdata
|
||||||
|
|
||||||
except subprocess.TimeoutExpired:
|
|
||||||
self.logger.error(f"Command timed out: {command}")
|
|
||||||
return NAGIOS_UNKNOWN, f"Command timed out after {self.timeout}s", {}
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.logger.error(f"Error executing command: {e}")
|
self.logger.error(f"Error executing command: {e}")
|
||||||
return NAGIOS_UNKNOWN, f"Execution error: {str(e)}", {}
|
return NAGIOS_UNKNOWN, f"Execution error: {str(e)}", {}
|
||||||
|
|||||||
@@ -20,3 +20,43 @@ def test_no_commands_sets_skip_reason():
|
|||||||
assert result is False
|
assert result is False
|
||||||
assert plugin.skip_reason is not None
|
assert plugin.skip_reason is not None
|
||||||
assert "nagios_runner.commands" in plugin.skip_reason
|
assert "nagios_runner.commands" in plugin.skip_reason
|
||||||
|
|
||||||
|
|
||||||
|
def test_stderr_used_when_stdout_empty(tmp_path):
|
||||||
|
script = tmp_path / "check_err.sh"
|
||||||
|
script.write_text("#!/bin/sh\necho 'error from stderr' >&2\nexit 2\n")
|
||||||
|
script.chmod(script.stat().st_mode | stat.S_IEXEC)
|
||||||
|
|
||||||
|
config = {"commands": [{"name": "t", "command": str(script)}], "timeout": 5}
|
||||||
|
plugin = NagiosRunnerPlugin(config=config)
|
||||||
|
asyncio.run(plugin.initialize())
|
||||||
|
data = asyncio.run(plugin._collect_metrics())
|
||||||
|
|
||||||
|
assert "error from stderr" in data["t_output"]
|
||||||
|
assert data["t_status_code"] == NAGIOS_CRITICAL
|
||||||
|
|
||||||
|
|
||||||
|
def test_stderr_appended_when_both_present(tmp_path):
|
||||||
|
script = tmp_path / "check_both.sh"
|
||||||
|
script.write_text("#!/bin/sh\necho 'OK - all good'\necho 'extra detail' >&2\nexit 0\n")
|
||||||
|
script.chmod(script.stat().st_mode | stat.S_IEXEC)
|
||||||
|
|
||||||
|
config = {"commands": [{"name": "t", "command": str(script)}], "timeout": 5}
|
||||||
|
plugin = NagiosRunnerPlugin(config=config)
|
||||||
|
asyncio.run(plugin.initialize())
|
||||||
|
data = asyncio.run(plugin._collect_metrics())
|
||||||
|
|
||||||
|
assert "OK - all good" in data["t_output"]
|
||||||
|
assert "extra detail" in data["t_output"]
|
||||||
|
assert data["t_status_code"] == NAGIOS_OK
|
||||||
|
|
||||||
|
|
||||||
|
def test_negative_returncode_maps_to_unknown():
|
||||||
|
# kill -9 $$ kills the shell itself; asyncio sees returncode -9
|
||||||
|
config = {"commands": [{"name": "t", "command": "kill -9 $$"}], "timeout": 5}
|
||||||
|
plugin = NagiosRunnerPlugin(config=config)
|
||||||
|
asyncio.run(plugin.initialize())
|
||||||
|
data = asyncio.run(plugin._collect_metrics())
|
||||||
|
|
||||||
|
assert data["t_status_code"] == NAGIOS_UNKNOWN
|
||||||
|
assert "signal" in data["t_output"].lower()
|
||||||
|
|||||||
Reference in New Issue
Block a user