e790663f9f
memory_monitor / hbc_mini: ZFS ARC is reclaimable but not reflected in MemAvailable by the Linux kernel (not in SReclaimable). Read ARC size from /proc/spl/kstat/zfs/arcstats and add it to available memory before computing memory_percent and memory_used. No-op on systems without ZFS. cpu_monitor: report uptime_seconds via psutil.boot_time() (full client) and /proc/uptime (hbc_mini). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
137 lines
4.9 KiB
Python
137 lines
4.9 KiB
Python
"""CPU Monitoring Plugin for Heartbeat.
|
|
|
|
Collects CPU usage statistics including overall CPU percentage, per-core usage,
|
|
load average, and process counts.
|
|
"""
|
|
|
|
from typing import Any, Dict, Optional
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Import from parent package
|
|
from hbd.client.plugin import MonitorPlugin
|
|
|
|
|
|
class CPUMonitorPlugin(MonitorPlugin):
    """Monitor CPU usage and load.

    Collects:
    - Overall CPU usage percentage
    - Per-core CPU usage (if enabled in config)
    - Load average (1min, 5min, 15min)
    - Process count
    - CPU frequency (if available)
    - CPU time breakdown (user, system, idle, and iowait where reported)
    - Uptime in seconds
    """

    name = "cpu_monitor"
    version = "1.0.0"
    description = "CPU usage and load monitoring"
    interval = 300  # MonitorPlugin: collect every 5 minutes by default

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Create the plugin from an optional configuration mapping.

        Args:
            config: Optional settings. Recognised keys are ``per_core``
                (include per-core usage, default ``False``) and ``interval``
                (collection interval, default ``300``).
        """
        super().__init__(config)
        # psutil is imported lazily in initialize() so that a missing
        # dependency disables the plugin instead of breaking the import.
        self.psutil = None
        self.per_core = config.get("per_core", False) if config else False
        self.interval = config.get("interval", 300) if config else 300

    async def initialize(self) -> bool:
        """Initialize the CPU monitor plugin.

        Checks if psutil is available and keeps a reference to the module
        on ``self.psutil`` for later use by the collector.

        Returns:
            True if psutil is available, False otherwise
        """
        self.logger.info(f"Initializing {self.name} plugin")

        try:
            import psutil
        except ImportError:
            self.logger.error(
                "psutil module not available. Install with: pip install psutil"
            )
            return False

        self.psutil = psutil
        self.logger.info(f"{self.name} initialized successfully")
        return True

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Collect CPU metrics.

        Every optional metric is gathered on a best-effort basis: a metric
        that is unavailable on the current platform is simply left out of
        the result rather than failing the whole collection.

        Returns:
            Dictionary with CPU metrics, or an empty dictionary when psutil
            is not loaded or an unexpected error occurs.
        """
        if not self.psutil:
            return {}

        import asyncio
        import time

        psutil = self.psutil

        try:
            data: Dict[str, Any] = {}

            # Overall CPU usage percentage, sampled over one second for
            # accuracy. With interval=1 psutil sleeps for that second, so the
            # sample runs in the default executor to keep the event loop
            # responsive while it waits.
            loop = asyncio.get_running_loop()
            data["cpu_percent"] = await loop.run_in_executor(
                None, lambda: psutil.cpu_percent(interval=1)
            )

            # Per-core CPU usage (if enabled). interval=0 does not block:
            # psutil compares against the previous call, so the very first
            # sample is not meaningful. Kept on the loop thread on purpose so
            # the "previous call" baseline is the same as before.
            if self.per_core:
                per_core_percents = psutil.cpu_percent(interval=0, percpu=True)
                data["cpu_per_core"] = per_core_percents
                data["cpu_core_count"] = len(per_core_percents)
            else:
                # Just report core count
                data["cpu_core_count"] = psutil.cpu_count()

            # Load average (Unix-like systems only)
            try:
                load_avg = psutil.getloadavg()
                data["load_1min"] = round(load_avg[0], 2)
                data["load_5min"] = round(load_avg[1], 2)
                data["load_15min"] = round(load_avg[2], 2)
            except (AttributeError, OSError):
                # Not available on Windows
                pass

            # Process count
            try:
                data["process_count"] = len(psutil.pids())
            except Exception as e:
                self.logger.warning(f"Could not get process count: {e}")

            # CPU frequency (if available)
            try:
                freq = psutil.cpu_freq()
                if freq:
                    data["cpu_freq_current"] = round(freq.current, 2)
                    data["cpu_freq_min"] = round(freq.min, 2)
                    data["cpu_freq_max"] = round(freq.max, 2)
            except (AttributeError, OSError, RuntimeError, SystemError) as e:
                # Not available on all systems, or may fail on FreeBSD with
                # sysctl issues
                self.logger.debug(f"CPU frequency not available: {e}")

            # CPU times (user, system, idle, etc.); non-blocking like the
            # per-core sample above.
            try:
                cpu_times = psutil.cpu_times_percent(interval=0)
                data["cpu_user"] = round(cpu_times.user, 1)
                data["cpu_system"] = round(cpu_times.system, 1)
                data["cpu_idle"] = round(cpu_times.idle, 1)
                # iowait is only present on some platforms
                if hasattr(cpu_times, "iowait"):
                    data["cpu_iowait"] = round(cpu_times.iowait, 1)
            except Exception as e:
                self.logger.debug(f"Could not get CPU times: {e}")

            # Uptime in seconds: wall-clock now minus the boot timestamp
            try:
                data["uptime_seconds"] = int(time.time() - psutil.boot_time())
            except Exception as e:
                self.logger.debug(f"Could not get uptime: {e}")

            self.logger.debug(
                f"Collected CPU metrics: {data.get('cpu_percent', 'N/A')}% usage"
            )
            return data

        except Exception as e:
            self.logger.error(f"Error collecting CPU metrics: {e}", exc_info=True)
            return {}
|