Major refactoring of the codebase, including restructuring of files and directories, renaming of modules and classes, and improvements to the overall organization and readability of the code. This refactoring aims to enhance maintainability, scalability, and clarity of the codebase while preserving existing functionality. The changes include:

- Restructuring of the project directory into client and server components
- Renaming of modules and classes to better reflect their purpose and functionality
- Moving common utilities and configurations to a shared location
- Updating import statements to reflect the new structure
- Adding new documentation files for better clarity on various aspects of the project
- Removing deprecated or unused code to streamline the codebase
- Ensuring that all existing functionality is preserved and that the codebase remains functional after the refactoring.
This commit is contained in:
Andreas Wrede
2026-03-29 11:13:40 -04:00
parent 7e2038ecac
commit 0543266c92
65 changed files with 11371 additions and 140 deletions
+129
View File
@@ -0,0 +1,129 @@
"""CPU Monitoring Plugin for Heartbeat.
Collects CPU usage statistics including overall CPU percentage, per-core usage,
load average, and process counts.
"""
from typing import Any, Dict, Optional
import sys
from pathlib import Path
# Import from parent package
from hbd.client.plugin import MonitorPlugin
class CPUMonitorPlugin(MonitorPlugin):
    """Monitor CPU usage and load.

    Collects:
    - Overall CPU usage percentage
    - Per-core CPU usage (if enabled in config)
    - Load average (1min, 5min, 15min)
    - Process count
    - CPU frequency (if available)
    - CPU time split (user, system, idle, and iowait where reported)

    Configuration:
        per_core: Also report per-core usage (default: False)
        interval: Collection interval in seconds (default: 300)
    """

    name = "cpu_monitor"
    version = "1.0.0"
    description = "CPU usage and load monitoring"
    interval = 300  # MonitorPlugin: collect every 5 minutes by default

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Store the configuration; psutil itself is loaded in initialize().

        Args:
            config: Optional configuration dict with keys:
                - per_core: Report per-core usage (default: False)
                - interval: Collection interval in seconds (default: 300)
        """
        super().__init__(config)
        self.psutil = None  # set by initialize() once the import succeeds
        self.per_core = config.get("per_core", False) if config else False
        self.interval = config.get("interval", 300) if config else 300

    async def initialize(self) -> bool:
        """Initialize the CPU monitor plugin.

        Checks if psutil is available and keeps a reference to the module.

        Returns:
            True if psutil is available, False otherwise
        """
        self.logger.info(f"Initializing {self.name} plugin")
        try:
            import psutil
        except ImportError:
            self.logger.error(
                "psutil module not available. Install with: pip install psutil"
            )
            return False
        self.psutil = psutil
        self.logger.info(f"{self.name} initialized successfully")
        return True

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Collect CPU metrics.

        Returns:
            Dictionary with CPU metrics. Empty if psutil was never loaded or
            if an unexpected error occurred (the error is logged).
        """
        if not self.psutil:
            return {}

        # Local import: this module's header does not import asyncio.
        import asyncio

        psutil = self.psutil
        try:
            data: Dict[str, Any] = {}

            # Overall CPU usage, sampled over a one second window.
            # interval=1 makes psutil sleep for that second, so the call is
            # pushed to the default executor instead of stalling the event
            # loop (and every other coroutine scheduled on it).
            loop = asyncio.get_running_loop()
            data["cpu_percent"] = await loop.run_in_executor(
                None, lambda: psutil.cpu_percent(interval=1)
            )

            # Per-core CPU usage (if enabled). interval=0 returns immediately
            # and reports usage since the previous per-core call, so the very
            # first reading after start-up is not meaningful.
            if self.per_core:
                per_core_percents = psutil.cpu_percent(interval=0, percpu=True)
                data["cpu_per_core"] = per_core_percents
                data["cpu_core_count"] = len(per_core_percents)
            else:
                # Just report core count
                data["cpu_core_count"] = psutil.cpu_count()

            # Load average (Unix-like systems only)
            try:
                load_avg = psutil.getloadavg()
                data["load_1min"] = round(load_avg[0], 2)
                data["load_5min"] = round(load_avg[1], 2)
                data["load_15min"] = round(load_avg[2], 2)
            except (AttributeError, OSError):
                # Not available on Windows
                pass

            # Process count
            try:
                data["process_count"] = len(psutil.pids())
            except Exception as e:
                self.logger.warning(f"Could not get process count: {e}")

            # CPU frequency (if available)
            try:
                freq = psutil.cpu_freq()
                if freq:
                    data["cpu_freq_current"] = round(freq.current, 2)
                    data["cpu_freq_min"] = round(freq.min, 2)
                    data["cpu_freq_max"] = round(freq.max, 2)
            except (AttributeError, OSError, RuntimeError, SystemError) as e:
                # Not available on all systems, or may fail on FreeBSD with sysctl issues
                self.logger.debug(f"CPU frequency not available: {e}")

            # CPU times (user, system, idle, etc.); non-blocking like the
            # per-core call above.
            try:
                cpu_times = psutil.cpu_times_percent(interval=0)
                data["cpu_user"] = round(cpu_times.user, 1)
                data["cpu_system"] = round(cpu_times.system, 1)
                data["cpu_idle"] = round(cpu_times.idle, 1)
                if hasattr(cpu_times, "iowait"):
                    data["cpu_iowait"] = round(cpu_times.iowait, 1)
            except Exception as e:
                self.logger.debug(f"Could not get CPU times: {e}")

            self.logger.debug(
                f"Collected CPU metrics: {data.get('cpu_percent', 'N/A')}% usage"
            )
            return data
        except Exception as e:
            self.logger.error(f"Error collecting CPU metrics: {e}", exc_info=True)
            return {}
+199
View File
@@ -0,0 +1,199 @@
"""
Disk monitoring plugin for Heartbeat.
Collects disk usage and I/O statistics using psutil.
"""
import logging
from typing import Dict, Any, Optional, List
try:
import psutil
except ImportError:
psutil = None
from hbd.client.plugin import MonitorPlugin
logger = logging.getLogger(__name__)
class DiskMonitorPlugin(MonitorPlugin):
    """
    Periodic collector for disk space and disk I/O figures.

    Each collection reports:
    - Usage of every selected partition (device, fstype, total, used, free, percent)
    - Per-disk I/O counters (read/write counts and bytes)
    - I/O timing fields when psutil exposes them
    - Change in the I/O counters since the previous collection

    Configuration:
        interval: Collection interval in seconds (default: 300)
        partitions: List of mount points to monitor (default: all)
        include_io: Include disk I/O statistics (default: True)
        exclude_types: List of filesystem types to exclude (default: tmpfs, devtmpfs, squashfs)
    """

    name = "disk_monitor"
    interval = 300  # default: one collection every 5 minutes

    # Optional timing attributes copied from psutil's counters when present.
    _TIME_FIELDS = ('read_time', 'write_time', 'busy_time')

    # (result key, counter attribute) pairs reported as change since last run.
    _DELTA_FIELDS = (
        ('read_bytes_delta', 'read_bytes'),
        ('write_bytes_delta', 'write_bytes'),
        ('read_count_delta', 'read_count'),
        ('write_count_delta', 'write_count'),
    )

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Read the plugin settings and prepare the delta bookkeeping.

        Args:
            config: Optional configuration dict with keys:
                - interval: Collection interval in seconds (default: 300)
                - partitions: List of specific mount points to monitor
                - include_io: Include I/O statistics (default: True)
                - exclude_types: List of filesystem types to exclude

        Raises:
            ImportError: If psutil could not be imported.
        """
        super().__init__(config)
        settings = self.config
        self.partitions = settings.get('partitions', None)  # None selects every partition
        self.include_io = settings.get('include_io', True)
        self.exclude_types = set(
            settings.get('exclude_types', ['tmpfs', 'devtmpfs', 'squashfs'])
        )
        self.interval = settings.get('interval', 300)

        if psutil is None:
            raise ImportError("psutil library is required for disk_monitor plugin")

        # Counters seen on the previous run; the baseline for the deltas.
        self._prev_io = {}

    async def initialize(self):
        """Verify psutil is present and take the first I/O counter snapshot."""
        if psutil is None:
            logger.error("psutil not available - disk_monitor cannot run")
            return False

        logger.info(f"Disk monitor initialized (interval: {self.interval}s, io: {self.include_io})")

        if not self.include_io:
            return True

        # Baseline so the first collection can already report deltas.
        try:
            self._prev_io = psutil.disk_io_counters(perdisk=True)
        except Exception as e:
            logger.warning(f"Could not initialize disk I/O counters: {e}")
        return True

    async def collect(self) -> Dict[str, Any]:
        """
        Gather the current disk statistics.

        Returns:
            Dictionary with disk metrics:
                - partitions: Dict keyed by mount point, each entry holding
                  device, fstype, total, used, free (bytes) and percent
                - io_counters: Dict keyed by disk name (only if include_io),
                  each entry holding read_count, write_count, read_bytes,
                  write_bytes, the timing fields psutil provides, and the
                  *_delta values relative to the previous collection
            An empty dict when psutil is missing, or {"error": <message>}
            when collection raised.
        """
        if psutil is None:
            logger.error("psutil not available")
            return {}

        try:
            snapshot = await self._collect_metrics()
            logger.debug(f"Collected disk metrics: {len(snapshot.get('partitions', {}))} partitions")
            return snapshot
        except Exception as e:
            logger.error(f"Error collecting disk metrics: {e}")
            return {"error": str(e)}

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Assemble partition usage and, if enabled, I/O statistics."""
        metrics = {'partitions': self._read_partitions()}
        if not self.include_io:
            return metrics

        try:
            metrics['io_counters'] = self._read_io()
        except Exception as e:
            logger.warning(f"Could not collect disk I/O statistics: {e}")
        return metrics

    def _read_partitions(self) -> Dict[str, Any]:
        """Return usage figures for each selected partition, keyed by mount point."""
        usage_by_mount = {}
        for part in psutil.disk_partitions(all=False):
            # Drop excluded filesystem types, then anything outside an
            # explicit mount-point selection.
            if part.fstype in self.exclude_types or (
                self.partitions and part.mountpoint not in self.partitions
            ):
                continue

            try:
                usage = psutil.disk_usage(part.mountpoint)
                usage_by_mount[part.mountpoint] = {
                    'device': part.device,
                    'fstype': part.fstype,
                    'total': usage.total,
                    'used': usage.used,
                    'free': usage.free,
                    'percent': usage.percent
                }
            except PermissionError:
                logger.debug(f"Permission denied accessing {part.mountpoint}")
            except Exception as e:
                logger.warning(f"Error reading {part.mountpoint}: {e}")
        return usage_by_mount

    def _read_io(self) -> Dict[str, Any]:
        """Return per-disk I/O statistics and remember them as the next baseline."""
        current = psutil.disk_io_counters(perdisk=True)
        report = {}
        for disk_name, counters in current.items():
            stats = {
                'read_count': counters.read_count,
                'write_count': counters.write_count,
                'read_bytes': counters.read_bytes,
                'write_bytes': counters.write_bytes,
            }

            # Timing fields are only copied when psutil provides them.
            for field in self._TIME_FIELDS:
                if hasattr(counters, field):
                    stats[field] = getattr(counters, field)

            # Deltas need a previous reading for this particular disk.
            if disk_name in self._prev_io:
                before = self._prev_io[disk_name]
                for key, field in self._DELTA_FIELDS:
                    stats[key] = getattr(counters, field) - getattr(before, field)

            report[disk_name] = stats

        self._prev_io = current
        return report

    async def cleanup(self):
        """Log the shutdown; this plugin holds nothing that needs releasing."""
        logger.info("Disk monitor cleanup")
# Module-level alias for automatic discovery. Note that it is bound to the
# plugin class itself, not to an instance.
plugin = DiskMonitorPlugin
+168
View File
@@ -0,0 +1,168 @@
"""
Filesystem information plugin for Heartbeat.
Collects static filesystem and partition information using psutil.
"""
import logging
from typing import Dict, Any, Optional
try:
import psutil
except ImportError:
psutil = None
from hbd.client.plugin import InfoPlugin
logger = logging.getLogger(__name__)
class FilesystemInfoPlugin(InfoPlugin):
    """
    One-shot collector describing the mounted filesystems.

    As an InfoPlugin it gathers static information once during startup.
    Only physical mounted filesystems (e.g., ext4, xfs, btrfs) are reported
    unless include_pseudo=True, which adds pseudo filesystems (proc, sysfs,
    tmpfs, etc.).

    Collects:
    - List of mounted filesystems
    - Partition details (device, mount point, filesystem type, options)
    - Filename and path length limits where the platform reports them

    Configuration:
        include_pseudo: Include pseudo/virtual filesystems (default: False)
        exclude_types: List of additional filesystem types to exclude (default: [])
    """

    name = "filesystem_info"
    interval = 0  # InfoPlugin - collect once

    # (result key, os.pathconf name) pairs queried for every mount point.
    _PATHCONF_LIMITS = (
        ('maxfile', 'PC_NAME_MAX'),
        ('maxpath', 'PC_PATH_MAX'),
    )

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Read the plugin settings.

        Args:
            config: Optional configuration dict with keys:
                - include_pseudo: Include pseudo/virtual filesystems (default: False)
                - exclude_types: List of filesystem types to exclude (default: [])

        Raises:
            ImportError: If psutil could not be imported.
        """
        super().__init__(config)
        settings = self.config
        self.include_pseudo = settings.get('include_pseudo', False)
        # Empty by default: with all=False psutil already leaves out most
        # pseudo filesystems, so this is only for extra, user-chosen types.
        self.exclude_types = set(settings.get('exclude_types', []))

        if psutil is None:
            raise ImportError("psutil library is required for filesystem_info plugin")

    async def initialize(self):
        """Verify psutil is present and log the effective settings."""
        if psutil is None:
            logger.error("psutil not available - filesystem_info cannot run")
            return False

        logger.info(f"Filesystem info initialized (pseudo: {self.include_pseudo})")
        return True

    async def collect(self) -> Dict[str, Any]:
        """
        Gather the filesystem inventory.

        Returns:
            Dictionary with filesystem data:
                - filesystems: List of dicts with device, mountpoint, fstype,
                  opts, and maxfile / maxpath when available
                - filesystem_types: Sorted list of the distinct types found
                - mount_count: Number of filesystems reported
                - boot_time: Value of psutil.boot_time() when available
            An empty dict when psutil is missing, or {"error": <message>}
            when collection raised.
        """
        if psutil is None:
            logger.error("psutil not available")
            return {}

        try:
            inventory = await self._collect_info()
            logger.info(f"Collected filesystem info: {len(inventory.get('filesystems', []))} filesystems")
            return inventory
        except Exception as e:
            logger.error(f"Error collecting filesystem info: {e}")
            return {"error": str(e)}

    async def _collect_info(self) -> Dict[str, Any]:
        """Build the filesystem inventory from psutil and os.pathconf."""
        import os  # the module header does not import os

        found = []
        seen_types = set()

        # all=False limits psutil to physical devices; all=True adds the
        # pseudo filesystems (proc, sysfs, etc.).
        for part in psutil.disk_partitions(all=self.include_pseudo):
            if part.fstype in self.exclude_types:
                continue

            entry = {
                'device': part.device,
                'mountpoint': part.mountpoint,
                'fstype': part.fstype,
                'opts': part.opts,
            }

            # Length limits are optional extras: a failing lookup only skips
            # that one key.
            try:
                if hasattr(os, 'pathconf'):
                    for key, conf_name in self._PATHCONF_LIMITS:
                        try:
                            limit = os.pathconf(part.mountpoint, conf_name)
                        except (OSError, ValueError):
                            continue
                        if limit:
                            entry[key] = limit
            except Exception as e:
                logger.debug(f"Could not get pathconf for {part.mountpoint}: {e}")

            found.append(entry)
            seen_types.add(part.fstype)

        info = {
            'filesystems': found,
            'filesystem_types': sorted(seen_types),
            'mount_count': len(found),
        }

        # Boot time (useful for determining filesystem mount times).
        try:
            info['boot_time'] = psutil.boot_time()
        except Exception as e:
            logger.debug(f"Could not get boot time: {e}")

        return info

    async def cleanup(self):
        """Log the shutdown; this plugin holds nothing that needs releasing."""
        logger.info("Filesystem info cleanup")
# Module-level alias for automatic discovery. Note that it is bound to the
# plugin class itself, not to an instance.
plugin = FilesystemInfoPlugin
+147
View File
@@ -0,0 +1,147 @@
"""
Memory monitoring plugin for Heartbeat.
Collects memory and swap usage statistics using psutil.
"""
import logging
from typing import Dict, Any, Optional
try:
import psutil
except ImportError:
psutil = None
from hbd.client.plugin import MonitorPlugin
logger = logging.getLogger(__name__)
class MemoryMonitorPlugin(MonitorPlugin):
    """
    Periodic collector for RAM and swap figures.

    Each collection reports:
    - Physical memory totals and usage percentage
    - Platform-specific memory details when psutil provides them
    - Swap totals, usage and swap-in/out counters (optional)

    Configuration:
        interval: Collection interval in seconds (default: 300)
        include_swap: Include swap statistics (default: True)
    """

    name = "memory_monitor"
    interval = 300  # default: one collection every 5 minutes

    # (result key, psutil attribute) pairs, in reporting order.
    _CORE_FIELDS = (
        ('memory_total', 'total'),
        ('memory_available', 'available'),
        ('memory_used', 'used'),
        ('memory_free', 'free'),
        ('memory_percent', 'percent'),
    )
    # Only present on some platforms; copied when the attribute exists.
    _PLATFORM_FIELDS = (
        ('memory_active', 'active'),
        ('memory_inactive', 'inactive'),
        ('memory_buffers', 'buffers'),
        ('memory_cached', 'cached'),
        ('memory_shared', 'shared'),
    )
    _SWAP_FIELDS = (
        ('swap_total', 'total'),
        ('swap_used', 'used'),
        ('swap_free', 'free'),
        ('swap_percent', 'percent'),
    )
    # Swap in/out counters may not be available on all platforms.
    _SWAP_OPTIONAL_FIELDS = (
        ('swap_sin', 'sin'),
        ('swap_sout', 'sout'),
    )

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Read the plugin settings.

        Args:
            config: Optional configuration dict with keys:
                - interval: Collection interval in seconds (default: 300)
                - include_swap: Include swap statistics (default: True)

        Raises:
            ImportError: If psutil could not be imported.
        """
        super().__init__(config)
        settings = self.config
        self.include_swap = settings.get('include_swap', True)
        self.interval = settings.get('interval', 300)

        if psutil is None:
            raise ImportError("psutil library is required for memory_monitor plugin")

    async def initialize(self):
        """Verify psutil is present and log the effective settings."""
        if psutil is None:
            logger.error("psutil not available - memory_monitor cannot run")
            return False

        logger.info(f"Memory monitor initialized (interval: {self.interval}s, swap: {self.include_swap})")
        return True

    async def collect(self) -> Dict[str, Any]:
        """
        Gather the current memory statistics.

        Returns:
            Dictionary with memory metrics:
                - memory_total / memory_available / memory_used / memory_free
                - memory_percent: Memory usage percentage
                - memory_active, memory_inactive, memory_buffers,
                  memory_cached, memory_shared: only where psutil has them
                - swap_total / swap_used / swap_free / swap_percent
                  (if include_swap)
                - swap_sin / swap_sout: swap-in/out counters where available
                  (if include_swap)
            An empty dict when psutil is missing, or {"error": <message>}
            when collection raised.
        """
        if psutil is None:
            logger.error("psutil not available")
            return {}

        try:
            snapshot = await self._collect_metrics()
            logger.debug(f"Collected memory metrics: {len(snapshot)} fields")
            return snapshot
        except Exception as e:
            logger.error(f"Error collecting memory metrics: {e}")
            return {"error": str(e)}

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Read RAM figures, and swap figures if enabled, from psutil."""
        vmem = psutil.virtual_memory()

        metrics = {}
        for key, attr in self._CORE_FIELDS:
            metrics[key] = getattr(vmem, attr)
        for key, attr in self._PLATFORM_FIELDS:
            if hasattr(vmem, attr):
                metrics[key] = getattr(vmem, attr)

        if not self.include_swap:
            return metrics

        # A swap failure is logged and leaves the RAM figures intact.
        try:
            swap = psutil.swap_memory()
            for key, attr in self._SWAP_FIELDS:
                metrics[key] = getattr(swap, attr)
            for key, attr in self._SWAP_OPTIONAL_FIELDS:
                if hasattr(swap, attr):
                    metrics[key] = getattr(swap, attr)
        except Exception as e:
            logger.warning(f"Could not collect swap statistics: {e}")

        return metrics

    async def cleanup(self):
        """Log the shutdown; this plugin holds nothing that needs releasing."""
        logger.info("Memory monitor cleanup")
# Module-level alias for automatic discovery. Note that it is bound to the
# plugin class itself, not to an instance.
plugin = MemoryMonitorPlugin
+283
View File
@@ -0,0 +1,283 @@
"""Nagios Plugin Runner for Heartbeat.
Executes Nagios-compatible monitoring plugins and parses their output.
Nagios Plugin Standard:
- Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
- Output format: Single line status message, optional performance data
- Performance data format: 'label'=value[UOM];[warn];[crit];[min];[max]
Example configuration in ~/.hb.yaml:
```yaml
nagios_runner:
interval: 60
commands:
- name: check_disk_root
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
- name: check_procs
command: /usr/lib/nagios/plugins/check_procs -w 250 -c 400
- name: check_load
command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
```
"""
import re
import subprocess
from typing import Any, Dict, List, Optional, Tuple
from hbd.client.plugin import MonitorPlugin
# Exit codes of the Nagios plugin standard: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
NAGIOS_OK, NAGIOS_WARNING, NAGIOS_CRITICAL, NAGIOS_UNKNOWN = range(4)

# Human-readable label for each exit code.
STATUS_NAMES = {
    NAGIOS_OK: "OK",
    NAGIOS_WARNING: "WARNING",
    NAGIOS_CRITICAL: "CRITICAL",
    NAGIOS_UNKNOWN: "UNKNOWN",
}
class NagiosRunnerPlugin(MonitorPlugin):
    """Run Nagios-compatible monitoring plugins.

    This plugin executes external Nagios plugins and collects their output,
    including status codes, messages, and performance data.

    Configuration:
        interval: Collection interval in seconds (default: 300)
        commands: List of command definitions with 'name' and 'command' keys
        timeout: Command execution timeout in seconds (default: 30)
        shell: Whether to execute commands via shell (default: True)

    Example:
        nagios_runner:
          interval: 300  # Check every 5 minutes
          timeout: 30
          commands:
            - name: check_disk
              command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10%
            - name: check_load
              command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6

    NOTE(security): with shell=True (the default) every configured command
    line is interpreted by the system shell, so the configuration file must
    be treated as trusted input.
    """

    name = "nagios_runner"
    version = "1.0.0"
    description = "Execute Nagios-compatible monitoring plugins"
    interval = 300  # MonitorPlugin: collect every 5 minutes by default

    # A plain decimal number, optionally negative ("5", "-5.5", ".5", "5.").
    _NUMBER = r"-?(?:\d+\.?\d*|\.\d+)"
    # One perfdata item: a quoted or bare label, "=", then everything up to
    # the next whitespace. Inside quotes a literal quote is written as ''.
    _PERF_ITEM_RE = re.compile(r"('(?:[^']|'')*'|[^\s='|]+)=(\S*)")
    # The first ';'-separated field of an item: the value and optional unit.
    _PERF_VALUE_RE = re.compile(rf"({_NUMBER})([a-zA-Z%]*)")
    # Threshold/min/max fields are stored only when they are plain numbers.
    _PERF_NUMBER_RE = re.compile(_NUMBER)

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Read the runner settings from the plugin configuration.

        Args:
            config: Optional configuration dict with keys 'commands',
                'timeout', 'shell' and 'interval' (see the class docstring
                for defaults).
        """
        super().__init__(config)

        # Extract configuration
        self.commands: List[Dict[str, str]] = config.get("commands", []) if config else []
        self.timeout: int = config.get("timeout", 30) if config else 30
        self.shell: bool = config.get("shell", True) if config else True
        self.interval = config.get("interval", 300) if config else 300

        # Validate commands
        if not self.commands:
            self.logger.warning(
                "No Nagios commands configured. Add 'nagios_runner.commands' to config."
            )

    async def initialize(self) -> bool:
        """Initialize the Nagios runner plugin.

        Returns:
            True if at least one command is configured, False otherwise
        """
        self.logger.info(f"Initializing {self.name} plugin")

        if not self.commands:
            self.logger.error("No Nagios commands configured")
            return False

        self.logger.info(f"Configured to run {len(self.commands)} Nagios plugin(s)")
        for cmd_config in self.commands:
            name = cmd_config.get("name", "unnamed")
            self.logger.info(f"  - {name}: {cmd_config.get('command', 'N/A')}")
        return True

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Collect metrics from all configured Nagios plugins.

        Returns:
            Dictionary with, for every command, '<name>_status',
            '<name>_status_code', '<name>_output' and one '<name>_<metric>'
            entry per performance-data field, plus 'overall_status',
            'overall_status_code' and 'plugin_count'.
        """
        results: Dict[str, Any] = {}

        # Track overall status (worst status wins)
        worst_status = NAGIOS_OK

        for cmd_config in self.commands:
            check_name = cmd_config.get("name")
            command = cmd_config.get("command")

            if not check_name or not command:
                self.logger.warning("Skipping command with missing name or command")
                continue

            try:
                status_code, output, perfdata = await self._run_nagios_plugin(command)

                results[f"{check_name}_status"] = STATUS_NAMES.get(status_code, "UNKNOWN")
                results[f"{check_name}_status_code"] = status_code
                results[f"{check_name}_output"] = output

                worst_status = max(worst_status, status_code)

                for metric_name, metric_value in perfdata.items():
                    results[f"{check_name}_{metric_name}"] = metric_value

                self.logger.debug(
                    f"Executed {check_name}: {STATUS_NAMES.get(status_code, 'UNKNOWN')} - {output[:50]}"
                )
            except Exception as e:
                self.logger.error(f"Error running {check_name}: {e}", exc_info=True)
                results[f"{check_name}_status"] = "ERROR"
                results[f"{check_name}_status_code"] = NAGIOS_UNKNOWN
                results[f"{check_name}_output"] = str(e)
                worst_status = NAGIOS_UNKNOWN

        # Add overall status
        results["overall_status"] = STATUS_NAMES.get(worst_status, "UNKNOWN")
        results["overall_status_code"] = worst_status
        results["plugin_count"] = len(self.commands)
        return results

    async def _run_nagios_plugin(
        self,
        command: str
    ) -> Tuple[int, str, Dict[str, Any]]:
        """Execute a Nagios plugin and parse its output.

        The child process runs in the default executor so that waiting for it
        (up to ``self.timeout`` seconds) does not stall the event loop.

        Args:
            command: Command string to execute

        Returns:
            Tuple of (status_code, output_message, performance_data_dict).
            Timeouts and execution errors are reported as NAGIOS_UNKNOWN with
            an explanatory message rather than raised.
        """
        # Local imports: the module header does not provide these names.
        import asyncio
        import os
        import shlex

        try:
            # Without a shell, POSIX treats a plain string as the program
            # name, so "check_x -w 5" would fail to launch; split it into an
            # argument list instead. Windows accepts the string as is.
            if self.shell or os.name != "posix":
                args: Any = command
            else:
                args = shlex.split(command)

            loop = asyncio.get_running_loop()
            result = await loop.run_in_executor(
                None,
                lambda: subprocess.run(
                    args,
                    shell=self.shell,
                    capture_output=True,
                    timeout=self.timeout,
                    text=True,
                ),
            )

            status_code = result.returncode
            output = result.stdout.strip()

            # Anything outside 0..3 is UNKNOWN. This includes the negative
            # return codes subprocess reports for a child killed by a signal.
            if status_code not in STATUS_NAMES:
                status_code = NAGIOS_UNKNOWN

            perfdata = self._parse_perfdata(output)

            # The status message is whatever precedes the first pipe.
            output_msg = output.split('|')[0].strip() if '|' in output else output
            return status_code, output_msg, perfdata

        except subprocess.TimeoutExpired:
            self.logger.error(f"Command timed out: {command}")
            return NAGIOS_UNKNOWN, f"Command timed out after {self.timeout}s", {}
        except Exception as e:
            self.logger.error(f"Error executing command: {e}")
            return NAGIOS_UNKNOWN, f"Execution error: {str(e)}", {}

    def _parse_perfdata(self, output: str) -> Dict[str, Any]:
        """Parse Nagios performance data from plugin output.

        Nagios performance data format:
            'label'=value[UOM];[warn];[crit];[min];[max]
        Multiple metrics separated by spaces.

        Items are tokenised on whitespace first, so a field this parser
        cannot read (for example a range threshold such as ``10:20``) is
        skipped without disturbing the items that follow it.

        Args:
            output: Plugin output string

        Returns:
            Dictionary mapping each label to its float value, plus optional
            '<label>_uom', '<label>_warn', '<label>_crit', '<label>_min' and
            '<label>_max' entries. Empty if the output has no pipe character.
        """
        perfdata: Dict[str, Any] = {}

        # Performance data comes after the pipe character
        _, pipe, perf_section = output.partition('|')
        if not pipe:
            return perfdata

        for item in self._PERF_ITEM_RE.finditer(perf_section):
            raw_label, raw_fields = item.groups()
            if raw_label.startswith("'"):
                raw_label = raw_label[1:-1].replace("''", "'")
            label = raw_label.strip()

            fields = raw_fields.split(';')
            value_match = self._PERF_VALUE_RE.fullmatch(fields[0])
            if not label or value_match is None:
                # No usable label, or a non-numeric value: skip the item.
                continue

            perfdata[label] = float(value_match.group(1))

            uom = value_match.group(2)
            if uom:
                perfdata[f"{label}_uom"] = uom

            # Remaining fields are positional: warn, crit, min, max.
            for suffix, text in zip(("warn", "crit", "min", "max"), fields[1:]):
                if self._PERF_NUMBER_RE.fullmatch(text):
                    perfdata[f"{label}_{suffix}"] = float(text)

        return perfdata
+240
View File
@@ -0,0 +1,240 @@
"""
Network monitoring plugin for Heartbeat.
Collects network interface statistics and connection information using psutil.
"""
import logging
from typing import Dict, Any, Optional, List
try:
import psutil
except ImportError:
psutil = None
from hbd.client.plugin import MonitorPlugin
logger = logging.getLogger(__name__)
class NetworkMonitorPlugin(MonitorPlugin):
"""
Monitor network interface statistics and connections.
Collects:
- Network interface I/O counters (bytes sent/received, packets, errors, drops)
- Per-interface statistics
- Network connection counts by state
- Interface addresses and configuration
Configuration:
interval: Collection interval in seconds (default: 300)
interfaces: List of interfaces to monitor (default: all)
include_connections: Include connection statistics (default: True)
include_addresses: Include interface addresses (default: False)
"""
name = "network_monitor"
interval = 300 # Collect every 5 minutes by default
def __init__(self, config: Optional[Dict[str, Any]] = None):
"""
Initialize the network monitor plugin.
Args:
config: Optional configuration dict with keys:
- interval: Collection interval in seconds (default: 300)
- interfaces: List of specific interfaces to monitor
- include_connections: Include connection stats (default: True)
- include_addresses: Include interface addresses (default: False)
"""
super().__init__(config)
self.interfaces = self.config.get('interfaces', None) # None = all interfaces
self.include_connections = self.config.get('include_connections', True)
self.include_addresses = self.config.get('include_addresses', False)
self.interval = self.config.get('interval', 300)
if psutil is None:
raise ImportError("psutil library is required for network_monitor plugin")
# Store previous I/O counters for delta calculation
self._prev_io = {}
async def initialize(self):
"""Initialize the plugin (check psutil availability)."""
if psutil is None:
logger.error("psutil not available - network_monitor cannot run")
return False
logger.info(f"Network monitor initialized (interval: {self.interval}s, "
f"connections: {self.include_connections})")
# Initialize I/O counters
try:
self._prev_io = psutil.net_io_counters(pernic=True)
except Exception as e:
logger.warning(f"Could not initialize network I/O counters: {e}")
return True
async def collect(self) -> Dict[str, Any]:
"""
Collect current network statistics.
Returns:
Dictionary with network metrics:
- interfaces: Dict of interface statistics, keyed by interface name
- bytes_sent: Total bytes sent
- bytes_recv: Total bytes received
- packets_sent: Total packets sent
- packets_recv: Total packets received
- errin: Total incoming errors
- errout: Total outgoing errors
- dropin: Total incoming packets dropped
- dropout: Total outgoing packets dropped
- bytes_sent_delta: Bytes sent since last collection
- bytes_recv_delta: Bytes received since last collection
- packets_sent_delta: Packets sent since last collection
- packets_recv_delta: Packets received since last collection
- connections: Connection statistics by state (if include_connections)
- ESTABLISHED: Count of established connections
- LISTEN: Count of listening sockets
- TIME_WAIT: Count of TIME_WAIT connections
- etc.
- addresses: Interface address information (if include_addresses)
- Dict keyed by interface name with address details
"""
if psutil is None:
logger.error("psutil not available")
return {}
try:
data = await self._collect_metrics()
logger.debug(f"Collected network metrics: {len(data.get('interfaces', {}))} interfaces")
return data
except Exception as e:
logger.error(f"Error collecting network metrics: {e}")
return {"error": str(e)}
async def _collect_metrics(self) -> Dict[str, Any]:
"""Collect network metrics from psutil."""
metrics = {}
# Collect per-interface I/O counters
try:
io_counters = psutil.net_io_counters(pernic=True)
interfaces_data = {}
for iface_name, counters in io_counters.items():
# Skip if we're only monitoring specific interfaces
if self.interfaces and iface_name not in self.interfaces:
continue
iface_stats = {
'bytes_sent': counters.bytes_sent,
'bytes_recv': counters.bytes_recv,
'packets_sent': counters.packets_sent,
'packets_recv': counters.packets_recv,
'errin': counters.errin,
'errout': counters.errout,
'dropin': counters.dropin,
'dropout': counters.dropout,
}
# Calculate deltas from previous collection
if iface_name in self._prev_io:
prev = self._prev_io[iface_name]
iface_stats['bytes_sent_delta'] = counters.bytes_sent - prev.bytes_sent
iface_stats['bytes_recv_delta'] = counters.bytes_recv - prev.bytes_recv
iface_stats['packets_sent_delta'] = counters.packets_sent - prev.packets_sent
iface_stats['packets_recv_delta'] = counters.packets_recv - prev.packets_recv
interfaces_data[iface_name] = iface_stats
metrics['interfaces'] = interfaces_data
# Store current counters for next delta calculation
self._prev_io = io_counters
except Exception as e:
logger.warning(f"Could not collect network I/O counters: {e}")
# Collect connection statistics
if self.include_connections:
try:
connections = psutil.net_connections(kind='inet')
conn_stats = {}
# Count connections by state
for conn in connections:
state = conn.status
conn_stats[state] = conn_stats.get(state, 0) + 1
metrics['connections'] = conn_stats
except (PermissionError, psutil.AccessDenied):
logger.debug("Permission denied for net_connections (requires root/admin)")
except Exception as e:
logger.warning(f"Could not collect connection statistics: {e}")
# Collect interface addresses
if self.include_addresses:
try:
addresses = psutil.net_if_addrs()
addr_data = {}
for iface_name, addrs in addresses.items():
# Skip if we're only monitoring specific interfaces
if self.interfaces and iface_name not in self.interfaces:
continue
iface_addrs = []
for addr in addrs:
addr_info = {
'family': str(addr.family),
'address': addr.address,
}
if addr.netmask:
addr_info['netmask'] = addr.netmask
if addr.broadcast:
addr_info['broadcast'] = addr.broadcast
iface_addrs.append(addr_info)
addr_data[iface_name] = iface_addrs
metrics['addresses'] = addr_data
except Exception as e:
logger.warning(f"Could not collect interface addresses: {e}")
# Add interface stats (up/down status, speed, mtu)
try:
if_stats = psutil.net_if_stats()
stats_data = {}
for iface_name, stats in if_stats.items():
# Skip if we're only monitoring specific interfaces
if self.interfaces and iface_name not in self.interfaces:
continue
stats_data[iface_name] = {
'isup': stats.isup,
'duplex': str(stats.duplex) if hasattr(stats, 'duplex') else None,
'speed': stats.speed,
'mtu': stats.mtu,
}
metrics['interface_stats'] = stats_data
except Exception as e:
logger.warning(f"Could not collect interface stats: {e}")
return metrics
async def cleanup(self):
    """Shut the plugin down.

    There is nothing to release for this plugin, so the call is only
    recorded in the log.
    """
    logger.info("Network monitor cleanup")
# Exported for automatic plugin discovery. Note that this binds
# NetworkMonitorPlugin itself (the name is not called here), not an
# instance of it.
plugin = NetworkMonitorPlugin
+136
View File
@@ -0,0 +1,136 @@
"""OS Information Plugin for Heartbeat.
Collects static operating system information including OS name, version,
kernel, architecture, and distribution details.
"""
import platform
import sys
from pathlib import Path
from typing import Any, Dict, Optional
# Import from parent package
from hbd.client.plugin import InfoPlugin
class OSInfoPlugin(InfoPlugin):
    """Collect operating system information.

    This plugin gathers static OS information that rarely changes:
    - OS name and version
    - Kernel version
    - Architecture (x86_64, arm64, etc.)
    - Distribution details (for Linux)
    - Release details for macOS and Windows
    - Python version and implementation (used by hbc)
    """

    # Class-level plugin metadata.
    # NOTE(review): presumably consumed by the plugin framework in
    # hbd.client.plugin -- confirm there.
    name = "os_info"
    version = "1.0.0"
    description = "Operating system and platform information"
    interval = 0  # InfoPlugin: collect once at startup
def __init__(self, config: Optional[Dict[str, Any]] = None):
    """Create the plugin, handing the configuration to the base class.

    Args:
        config: Optional plugin configuration mapping; passed through
            unchanged to ``InfoPlugin.__init__``.
    """
    super().__init__(config)
async def initialize(self) -> bool:
    """Prepare the plugin for use.

    Only the standard-library ``platform`` module is needed, so there is
    nothing to set up; the call is just recorded in the log.

    Returns:
        Always True.
    """
    startup_message = f"Initializing {self.name} plugin"
    self.logger.info(startup_message)
    return True
async def _collect_info(self) -> Dict[str, Any]:
"""Collect OS information.
Returns:
Dictionary with OS details
"""
try:
data = {
"system": platform.system(), # e.g., "Linux", "Darwin", "Windows"
"node": platform.node(), # hostname
"release": platform.release(), # kernel version
"version": platform.version(), # detailed version
"machine": platform.machine(), # e.g., "x86_64", "arm64"
"processor": platform.processor(), # processor name
"architecture": platform.architecture()[0], # e.g., "64bit"
"python_version": platform.python_version(),
"python_implementation": platform.python_implementation(),
}
# Add Linux-specific distribution info
if platform.system() == "Linux":
data.update(self._get_linux_distro())
# Add macOS-specific info
elif platform.system() == "Darwin":
data["macos_version"] = platform.mac_ver()[0]
# Add Windows-specific info
elif platform.system() == "Windows":
win_ver = platform.win32_ver()
data["windows_release"] = win_ver[0]
data["windows_version"] = win_ver[1]
data["windows_sp"] = win_ver[2]
data["windows_type"] = win_ver[3]
self.logger.debug(f"Collected OS info: {data['system']} {data['release']}")
return data
except Exception as e:
self.logger.error(f"Error collecting OS info: {e}", exc_info=True)
return {}
def _get_linux_distro(self) -> Dict[str, str]:
"""Get Linux distribution information.
Returns:
Dictionary with distribution details
"""
distro_info = {}
# Try reading /etc/os-release (standard on modern Linux)
os_release = Path("/etc/os-release")
if os_release.exists():
try:
with open(os_release) as f:
for line in f:
line = line.strip()
if "=" in line and not line.startswith("#"):
key, value = line.split("=", 1)
# Remove quotes from value
value = value.strip('"').strip("'")
# Map common keys
if key == "NAME":
distro_info["distro_name"] = value
elif key == "VERSION":
distro_info["distro_version"] = value
elif key == "ID":
distro_info["distro_id"] = value
elif key == "VERSION_ID":
distro_info["distro_version_id"] = value
elif key == "PRETTY_NAME":
distro_info["distro_pretty_name"] = value
except Exception as e:
self.logger.warning(f"Could not read /etc/os-release: {e}")
# Fallback: try lsb_release (older systems)
elif Path("/etc/lsb-release").exists():
try:
with open("/etc/lsb-release") as f:
for line in f:
line = line.strip()
if "=" in line:
key, value = line.split("=", 1)
if key == "DISTRIB_ID":
distro_info["distro_id"] = value
elif key == "DISTRIB_RELEASE":
distro_info["distro_version"] = value
elif key == "DISTRIB_DESCRIPTION":
distro_info["distro_name"] = value
except Exception as e:
self.logger.warning(f"Could not read /etc/lsb-release: {e}")
return distro_info