Files
heartbeat/hbd/client/plugins/cpu_monitor.py
T
andreas e790663f9f feat: exclude ZFS ARC from memory_percent; add uptime_seconds to cpu_monitor
memory_monitor / hbc_mini: ZFS ARC is reclaimable but not reflected in
MemAvailable by the Linux kernel (not in SReclaimable). Read ARC size
from /proc/spl/kstat/zfs/arcstats and add it to available memory before
computing memory_percent and memory_used. No-op on systems without ZFS.

cpu_monitor: report uptime_seconds via psutil.boot_time() (full client)
and /proc/uptime (hbc_mini).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-04 12:09:58 -04:00

137 lines
4.9 KiB
Python

"""CPU Monitoring Plugin for Heartbeat.
Collects CPU usage statistics including overall CPU percentage, per-core usage,
load average, and process counts.
"""
from typing import Any, Dict, Optional
import sys
from pathlib import Path
# Import from parent package
from hbd.client.plugin import MonitorPlugin
class CPUMonitorPlugin(MonitorPlugin):
"""Monitor CPU usage and load.
Collects:
- Overall CPU usage percentage
- Per-core CPU usage (if enabled in config)
- Load average (1min, 5min, 15min)
- Process count
- CPU frequency (if available)
"""
name = "cpu_monitor"
version = "1.0.0"
description = "CPU usage and load monitoring"
interval = 300 # MonitorPlugin: collect every 5 minutes by default
def __init__(self, config: Optional[Dict[str, Any]] = None):
super().__init__(config)
self.psutil = None
self.per_core = config.get("per_core", False) if config else False
self.interval = config.get("interval", 300) if config else 300
async def initialize(self) -> bool:
"""Initialize the CPU monitor plugin.
Checks if psutil is available.
Returns:
True if psutil is available, False otherwise
"""
self.logger.info(f"Initializing {self.name} plugin")
try:
import psutil
self.psutil = psutil
self.logger.info(f"{self.name} initialized successfully")
return True
except ImportError:
self.logger.error(
"psutil module not available. Install with: pip install psutil"
)
return False
async def _collect_metrics(self) -> Dict[str, Any]:
"""Collect CPU metrics.
Returns:
Dictionary with CPU metrics
"""
if not self.psutil:
return {}
try:
data = {}
# Overall CPU usage percentage (non-blocking, interval=1 for accuracy)
# Note: first call to cpu_percent() returns 0.0, subsequent calls work correctly
data["cpu_percent"] = self.psutil.cpu_percent(interval=1)
# Per-core CPU usage (if enabled)
if self.per_core:
per_core_percents = self.psutil.cpu_percent(interval=0, percpu=True)
data["cpu_per_core"] = per_core_percents
data["cpu_core_count"] = len(per_core_percents)
else:
# Just report core count
data["cpu_core_count"] = self.psutil.cpu_count()
# Load average (Unix-like systems only)
try:
load_avg = self.psutil.getloadavg()
data["load_1min"] = round(load_avg[0], 2)
data["load_5min"] = round(load_avg[1], 2)
data["load_15min"] = round(load_avg[2], 2)
except (AttributeError, OSError):
# Not available on Windows
pass
# Process count
try:
data["process_count"] = len(self.psutil.pids())
except Exception as e:
self.logger.warning(f"Could not get process count: {e}")
# CPU frequency (if available)
try:
freq = self.psutil.cpu_freq()
if freq:
data["cpu_freq_current"] = round(freq.current, 2)
data["cpu_freq_min"] = round(freq.min, 2)
data["cpu_freq_max"] = round(freq.max, 2)
except (AttributeError, OSError, RuntimeError, SystemError) as e:
# Not available on all systems, or may fail on FreeBSD with sysctl issues
self.logger.debug(f"CPU frequency not available: {e}")
pass
# CPU times (user, system, idle, etc.)
try:
cpu_times = self.psutil.cpu_times_percent(interval=0)
data["cpu_user"] = round(cpu_times.user, 1)
data["cpu_system"] = round(cpu_times.system, 1)
data["cpu_idle"] = round(cpu_times.idle, 1)
if hasattr(cpu_times, "iowait"):
data["cpu_iowait"] = round(cpu_times.iowait, 1)
except Exception as e:
self.logger.debug(f"Could not get CPU times: {e}")
# Uptime in seconds
try:
import time
data["uptime_seconds"] = int(time.time() - self.psutil.boot_time())
except Exception as e:
self.logger.debug(f"Could not get uptime: {e}")
self.logger.debug(
f"Collected CPU metrics: {data.get('cpu_percent', 'N/A')}% usage"
)
return data
except Exception as e:
self.logger.error(f"Error collecting CPU metrics: {e}", exc_info=True)
return {}