Files
heartbeat/hbd/client/plugins/memory_monitor.py
T
andreas e790663f9f feat: exclude ZFS ARC from memory_percent; add uptime_seconds to cpu_monitor
memory_monitor / hbc_mini: ZFS ARC is reclaimable but not reflected in
MemAvailable by the Linux kernel (not in SReclaimable). Read ARC size
from /proc/spl/kstat/zfs/arcstats and add it to available memory before
computing memory_percent and memory_used. No-op on systems without ZFS.

cpu_monitor: report uptime_seconds via psutil.boot_time() (full client)
and /proc/uptime (hbc_mini).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-04 12:09:58 -04:00

176 lines
6.1 KiB
Python

"""
Memory monitoring plugin for Heartbeat.
Collects memory and swap usage statistics using psutil.
"""
import logging
from typing import Dict, Any, Optional
try:
import psutil
except ImportError:
psutil = None
from hbd.client.plugin import MonitorPlugin
def _zfs_arc_bytes() -> int:
"""Return current ZFS ARC size in bytes, or 0 if ZFS is not present.
ZFS ARC is reclaimable but is not included in MemAvailable by the Linux
kernel (it is not in SReclaimable), so it would otherwise be counted as
used memory.
"""
try:
with open("/proc/spl/kstat/zfs/arcstats") as fh:
for line in fh:
parts = line.split()
if len(parts) >= 3 and parts[0] == "size":
return int(parts[2])
except (OSError, ValueError):
pass
return 0
logger = logging.getLogger(__name__)
class MemoryMonitorPlugin(MonitorPlugin):
"""
Monitor memory and swap usage.
Collects:
- Physical memory (RAM) usage and statistics
- Virtual memory details
- Swap memory usage and statistics
- Memory available for applications
Configuration:
interval: Collection interval in seconds (default: 300)
include_swap: Include swap statistics (default: True)
"""
name = "memory_monitor"
interval = 300 # Collect every 5 minutes by default
def __init__(self, config: Optional[Dict[str, Any]] = None):
"""
Initialize the memory monitor plugin.
Args:
config: Optional configuration dict with keys:
- interval: Collection interval in seconds (default: 300)
- include_swap: Include swap statistics (default: True)
"""
super().__init__(config)
self.include_swap = self.config.get('include_swap', True)
self.interval = self.config.get('interval', 300)
if psutil is None:
raise ImportError("psutil library is required for memory_monitor plugin")
async def initialize(self):
"""Initialize the plugin (check psutil availability)."""
if psutil is None:
logger.error("psutil not available - memory_monitor cannot run")
return False
logger.info(f"Memory monitor initialized (interval: {self.interval}s, swap: {self.include_swap})")
return True
async def collect(self) -> Dict[str, Any]:
"""
Collect current memory statistics.
Returns:
Dictionary with memory metrics:
- memory_total: Total physical RAM in bytes
- memory_available: Available memory in bytes
- memory_used: Used memory in bytes
- memory_free: Free memory in bytes
- memory_percent: Memory usage percentage
- memory_active: Active memory (Unix)
- memory_inactive: Inactive memory (Unix)
- memory_buffers: Buffers (Linux)
- memory_cached: Cached (Linux)
- memory_shared: Shared (Linux)
- swap_total: Total swap in bytes (if include_swap)
- swap_used: Used swap in bytes (if include_swap)
- swap_free: Free swap in bytes (if include_swap)
- swap_percent: Swap usage percentage (if include_swap)
- swap_sin: Bytes swapped in from disk (if include_swap)
- swap_sout: Bytes swapped out to disk (if include_swap)
"""
if psutil is None:
logger.error("psutil not available")
return {}
try:
data = await self._collect_metrics()
logger.debug(f"Collected memory metrics: {len(data)} fields")
return data
except Exception as e:
logger.error(f"Error collecting memory metrics: {e}")
return {"error": str(e)}
async def _collect_metrics(self) -> Dict[str, Any]:
"""Collect memory metrics from psutil."""
metrics = {}
# Virtual (physical) memory statistics
vmem = psutil.virtual_memory()
# psutil's available already excludes page cache / file buffers
# (uses MemAvailable on Linux). Add ZFS ARC on top because the kernel
# does not include it in SReclaimable / MemAvailable even though it is
# reclaimable.
arc_bytes = _zfs_arc_bytes()
available = min(vmem.available + arc_bytes, vmem.total)
used = vmem.total - available
percent = round(used / vmem.total * 100, 1) if vmem.total else 0.0
metrics['memory_total'] = vmem.total
metrics['memory_available'] = available
metrics['memory_used'] = used
metrics['memory_free'] = vmem.free
metrics['memory_percent'] = percent
# Platform-specific memory details
if hasattr(vmem, 'active'):
metrics['memory_active'] = vmem.active
if hasattr(vmem, 'inactive'):
metrics['memory_inactive'] = vmem.inactive
if hasattr(vmem, 'buffers'):
metrics['memory_buffers'] = vmem.buffers
if hasattr(vmem, 'cached'):
metrics['memory_cached'] = vmem.cached
if hasattr(vmem, 'shared'):
metrics['memory_shared'] = vmem.shared
# Swap memory statistics
if self.include_swap:
try:
swap = psutil.swap_memory()
metrics['swap_total'] = swap.total
metrics['swap_used'] = swap.used
metrics['swap_free'] = swap.free
metrics['swap_percent'] = swap.percent
# Swap in/out counters (may not be available on all platforms)
if hasattr(swap, 'sin'):
metrics['swap_sin'] = swap.sin
if hasattr(swap, 'sout'):
metrics['swap_sout'] = swap.sout
except Exception as e:
logger.warning(f"Could not collect swap statistics: {e}")
return metrics
async def cleanup(self):
"""Cleanup (nothing to do for this plugin)."""
logger.info("Memory monitor cleanup")
# Plugin instance for automatic discovery
plugin = MemoryMonitorPlugin