Major refactoring of the codebase, including restructuring of files and directories, renaming of modules and classes, and improvements to the overall organization and readability of the code. This refactoring aims to enhance maintainability, scalability, and clarity of the codebase while preserving existing functionality. The changes include:
- Restructuring of the project directory into client and server components - Renaming of modules and classes to better reflect their purpose and functionality - Moving common utilities and configurations to a shared location - Updating import statements to reflect the new structure - Adding new documentation files for better clarity on various aspects of the project - Removing deprecated or unused code to streamline the codebase - Ensuring that all existing functionality is preserved and that the codebase remains functional after the refactoring.
This commit is contained in:
@@ -0,0 +1,3 @@
|
||||
"""HeartBeat Client (hbc) - System monitoring client."""
|
||||
|
||||
__version__ = "5.0.5"
|
||||
@@ -0,0 +1,54 @@
|
||||
"""Configuration loader and defaults for hbc (HeartBeat Client)."""
|
||||
|
||||
import logging
|
||||
import os
|
||||
|
||||
try:
|
||||
import yaml
|
||||
except Exception:
|
||||
yaml = None
|
||||
|
||||
CLIENT_DEFAULTS = {
|
||||
# Network settings
|
||||
"hb_port": 50003, # Port where hbd servers listen
|
||||
"interval": 10, # Heartbeat interval in seconds
|
||||
|
||||
# Runtime flags
|
||||
"foreground": False,
|
||||
"verbose": False,
|
||||
"debug": 0,
|
||||
|
||||
# Plugin configuration
|
||||
"plugins": {}, # Per-plugin configuration
|
||||
"thresholds": {}, # Threshold configuration for monitoring
|
||||
}
|
||||
|
||||
|
||||
def load_config(path=None):
|
||||
"""Load configuration from a YAML file and merge with client defaults.
|
||||
|
||||
If YAML is not available or the file does not exist, defaults are returned.
|
||||
|
||||
Args:
|
||||
path: Path to YAML config file (default: ~/.hb.yaml)
|
||||
|
||||
Returns:
|
||||
Dictionary with configuration
|
||||
"""
|
||||
cfg = CLIENT_DEFAULTS.copy()
|
||||
if not path:
|
||||
# default path (~/.hb.yaml)
|
||||
path = os.path.join(os.path.expanduser("~"), ".hb.yaml")
|
||||
|
||||
if os.path.exists(path):
|
||||
if yaml:
|
||||
with open(path) as fh:
|
||||
data = yaml.safe_load(fh)
|
||||
# Merge YAML data with defaults
|
||||
# Keep all keys from YAML to support plugin configs and future extensions
|
||||
for k, v in data.items():
|
||||
cfg[k] = v
|
||||
else:
|
||||
# yaml not installed: do not attempt to parse; user must ensure defaults
|
||||
pass
|
||||
return cfg
|
||||
@@ -0,0 +1,643 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
HeartBeat Client (hbc) - Async version with plugin support.
|
||||
|
||||
Sends heartbeat messages to HeartBeat Daemon (hbd) servers and collects
|
||||
system information via plugins.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import signal
|
||||
import socket
|
||||
import sys
|
||||
import time
|
||||
from hashlib import md5
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
# Import protocol and config
|
||||
from .config import load_config
|
||||
from ..common.proto import dicttos, stodict
|
||||
|
||||
# Import plugin system
|
||||
from .plugin import PluginRegistry, PluginLoader, InfoPlugin, MonitorPlugin
|
||||
|
||||
# Constants
|
||||
PORT = 50003
|
||||
INTERVAL = 10
|
||||
VER = 6
|
||||
MAXRECV = 32767
|
||||
|
||||
# Global state
|
||||
running = True
|
||||
dorestart = False
|
||||
|
||||
|
||||
class AsyncConnection:
|
||||
"""Async UDP connection to a heartbeat server."""
|
||||
|
||||
def __init__(self, conn_id: int, addr: str, port: int, af: int, name: str):
|
||||
self.conn_id = conn_id
|
||||
self.addr = addr
|
||||
self.port = port
|
||||
self.af = af
|
||||
self.name = name
|
||||
|
||||
self.ackcount = 0
|
||||
self.lastack = 0.0
|
||||
self.send_count = 0
|
||||
self.lastsend = 0.0
|
||||
self.rtts = [0.0]
|
||||
|
||||
self.transport: Optional[asyncio.DatagramTransport] = None
|
||||
self.protocol: Optional[asyncio.DatagramProtocol] = None
|
||||
|
||||
self.logger = logging.getLogger(f"hbc.conn.{addr}")
|
||||
|
||||
async def open(self) -> bool:
|
||||
"""Open the UDP connection.
|
||||
|
||||
Returns:
|
||||
True if successful, False otherwise
|
||||
"""
|
||||
try:
|
||||
loop = asyncio.get_event_loop()
|
||||
|
||||
# Create datagram endpoint
|
||||
self.transport, self.protocol = await loop.create_datagram_endpoint(
|
||||
lambda: HeartbeatProtocol(self),
|
||||
family=self.af
|
||||
)
|
||||
self.logger.debug(f"Opened connection to {self.addr}:{self.port}")
|
||||
return True
|
||||
except Exception as e:
|
||||
self.logger.error(f"Failed to open connection: {e}")
|
||||
return False
|
||||
|
||||
def close(self):
|
||||
"""Close the connection."""
|
||||
if self.transport:
|
||||
self.transport.close()
|
||||
self.transport = None
|
||||
self.protocol = None
|
||||
|
||||
async def sendto(self, msg: dict, msg_id: str = "HTB"):
|
||||
"""Send a message to the server.
|
||||
|
||||
Args:
|
||||
msg: Message dictionary
|
||||
msg_id: Message ID (HTB, PLG, etc.)
|
||||
"""
|
||||
if not self.transport:
|
||||
await self.open()
|
||||
|
||||
if not self.transport:
|
||||
self.logger.error("Cannot send - no transport")
|
||||
return
|
||||
|
||||
# Add standard fields
|
||||
msg["name"] = shortname(self.name)
|
||||
msg["id"] = self.conn_id
|
||||
msg["ver"] = VER
|
||||
msg["time"] = time.time()
|
||||
|
||||
# Encode message
|
||||
data = dicttos(msg_id, msg, compress=True)
|
||||
|
||||
# Send
|
||||
self.transport.sendto(data, (self.addr, self.port))
|
||||
self.send_count += 1
|
||||
self.lastsend = time.time()
|
||||
|
||||
self.logger.debug(f"Sent {msg_id} message ({len(data)} bytes)")
|
||||
|
||||
def handle_ack(self, msg: dict, now: float):
|
||||
"""Handle ACK message from server."""
|
||||
try:
|
||||
self.lastack = msg.get("time", now)
|
||||
rtt = (self.lastack - self.lastsend) * 2000.0 # Convert to ms
|
||||
except Exception:
|
||||
self.lastack = now
|
||||
rtt = (self.lastack - self.lastsend) * 1000.0
|
||||
|
||||
self.rtts.append(rtt)
|
||||
if len(self.rtts) > 10:
|
||||
self.rtts.pop(0)
|
||||
|
||||
self.ackcount += 1
|
||||
self.logger.debug(f"ACK received, RTT: {rtt:.1f}ms")
|
||||
|
||||
|
||||
class HeartbeatProtocol(asyncio.DatagramProtocol):
|
||||
"""Protocol handler for incoming UDP messages."""
|
||||
|
||||
def __init__(self, connection: AsyncConnection):
|
||||
self.connection = connection
|
||||
self.logger = logging.getLogger("hbc.protocol")
|
||||
|
||||
def datagram_received(self, data: bytes, addr):
|
||||
"""Handle incoming datagram."""
|
||||
try:
|
||||
msg = stodict(data)
|
||||
if not msg:
|
||||
self.logger.warning(f"Failed to parse message from {addr}")
|
||||
return
|
||||
|
||||
now = time.time()
|
||||
msg_id = msg.get("ID")
|
||||
|
||||
if msg_id == "ACK":
|
||||
self.connection.handle_ack(msg, now)
|
||||
elif msg_id == "CMD":
|
||||
# Command from server
|
||||
asyncio.create_task(handle_command(self.connection, msg))
|
||||
elif msg_id == "UPD":
|
||||
# Update from server
|
||||
asyncio.create_task(handle_update(self.connection, msg))
|
||||
else:
|
||||
self.logger.warning(f"Unknown message type: {msg_id}")
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error processing datagram: {e}", exc_info=True)
|
||||
|
||||
def error_received(self, exc):
|
||||
"""Handle protocol errors."""
|
||||
self.logger.error(f"Protocol error: {exc}")
|
||||
|
||||
|
||||
async def handle_command(conn: AsyncConnection, msg: dict):
|
||||
"""Execute a command received from server."""
|
||||
import subprocess
|
||||
|
||||
cmd = msg.get("cmd", "")
|
||||
if not cmd:
|
||||
return
|
||||
|
||||
logger = logging.getLogger("hbc.command")
|
||||
logger.info(f"Executing command: {cmd}")
|
||||
|
||||
try:
|
||||
result = subprocess.check_output(
|
||||
cmd, shell=True, stderr=subprocess.STDOUT, timeout=30
|
||||
).decode()
|
||||
status = "OK"
|
||||
except subprocess.CalledProcessError as e:
|
||||
result = str(e)
|
||||
status = "CalledProcessError"
|
||||
except subprocess.TimeoutExpired:
|
||||
result = "Command timed out"
|
||||
status = "Timeout"
|
||||
except Exception as e:
|
||||
result = str(e)
|
||||
status = "Error"
|
||||
|
||||
# Send response
|
||||
response = {
|
||||
"service": "command",
|
||||
"msg": f"{status} {result}"
|
||||
}
|
||||
await conn.sendto(response)
|
||||
|
||||
|
||||
async def handle_update(conn: AsyncConnection, msg: dict):
|
||||
"""Handle self-update from server."""
|
||||
import codecs
|
||||
import shutil
|
||||
|
||||
logger = logging.getLogger("hbc.update")
|
||||
|
||||
try:
|
||||
code = codecs.decode(msg["code"], "base64").decode()
|
||||
csum = msg["csum"]
|
||||
except Exception as e:
|
||||
error = f"Missing code/csum: {e}"
|
||||
logger.error(error)
|
||||
await conn.sendto({"service": "update", "msg": error})
|
||||
return
|
||||
|
||||
# Verify checksum
|
||||
m = md5()
|
||||
m.update(code.encode())
|
||||
if m.hexdigest() != csum:
|
||||
error = "Checksum mismatch"
|
||||
logger.error(error)
|
||||
await conn.sendto({"service": "update", "msg": error})
|
||||
return
|
||||
|
||||
# Backup current file
|
||||
fn = sys.argv[0]
|
||||
ofn = f"{fn}.sav"
|
||||
try:
|
||||
shutil.copy2(fn, ofn)
|
||||
except Exception as e:
|
||||
error = f"Backup failed: {e}"
|
||||
logger.error(error)
|
||||
await conn.sendto({"service": "update", "msg": error})
|
||||
return
|
||||
|
||||
# Write new code
|
||||
try:
|
||||
with open(fn, "w") as fh:
|
||||
fh.write(code)
|
||||
except Exception as e:
|
||||
error = f"Write failed: {e}"
|
||||
logger.error(error)
|
||||
await conn.sendto({"service": "update", "msg": error})
|
||||
return
|
||||
|
||||
logger.info("Update successful, restart required")
|
||||
await conn.sendto({"service": "update", "msg": "OK"})
|
||||
|
||||
# Trigger restart
|
||||
global dorestart
|
||||
dorestart = True
|
||||
stop()
|
||||
|
||||
|
||||
async def heartbeat_sender(conn: AsyncConnection, interval: int):
|
||||
"""Send periodic heartbeats.
|
||||
|
||||
Args:
|
||||
conn: Connection to send on
|
||||
interval: Heartbeat interval in seconds
|
||||
"""
|
||||
logger = logging.getLogger("hbc.heartbeat")
|
||||
|
||||
while running:
|
||||
try:
|
||||
msg = {
|
||||
"acks": conn.ackcount,
|
||||
"rtt": conn.rtts[-1],
|
||||
"interval": interval
|
||||
}
|
||||
await conn.sendto(msg, "HTB")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error sending heartbeat: {e}", exc_info=True)
|
||||
|
||||
# Wait for next interval
|
||||
await asyncio.sleep(interval)
|
||||
|
||||
|
||||
async def plugin_collector(conn: AsyncConnection, registry: PluginRegistry):
|
||||
"""Collect and send plugin data.
|
||||
|
||||
Args:
|
||||
conn: Connection to send on
|
||||
registry: Plugin registry
|
||||
"""
|
||||
logger = logging.getLogger("hbc.plugins")
|
||||
|
||||
# Collect InfoPlugins once at startup
|
||||
info_plugins = registry.get_by_type(InfoPlugin)
|
||||
for plugin in info_plugins:
|
||||
try:
|
||||
data = await plugin.collect()
|
||||
if data:
|
||||
# Create PLG message with plugin name
|
||||
plugin_msg = {"plugin": plugin.name, **data}
|
||||
await conn.sendto(plugin_msg, "PLG")
|
||||
logger.info(f"Sent {plugin.name} data")
|
||||
except Exception as e:
|
||||
logger.error(f"Error collecting {plugin.name}: {e}", exc_info=True)
|
||||
|
||||
# Schedule MonitorPlugins
|
||||
# Group plugins by interval
|
||||
from collections import defaultdict
|
||||
by_interval = defaultdict(list)
|
||||
|
||||
monitor_plugins = registry.get_by_type(MonitorPlugin)
|
||||
for plugin in monitor_plugins:
|
||||
by_interval[plugin.interval].append(plugin)
|
||||
|
||||
# Create tasks for each interval
|
||||
tasks = []
|
||||
for interval, plugins in by_interval.items():
|
||||
task = asyncio.create_task(
|
||||
plugin_collector_interval(conn, plugins, interval)
|
||||
)
|
||||
tasks.append(task)
|
||||
|
||||
# Wait for all tasks
|
||||
if tasks:
|
||||
await asyncio.gather(*tasks)
|
||||
|
||||
|
||||
async def plugin_collector_interval(
|
||||
conn: AsyncConnection,
|
||||
plugins: List,
|
||||
interval: int
|
||||
):
|
||||
"""Collect plugins on a specific interval.
|
||||
|
||||
Args:
|
||||
conn: Connection to send on
|
||||
plugins: List of plugins to collect
|
||||
interval: Collection interval in seconds
|
||||
"""
|
||||
logger = logging.getLogger(f"hbc.plugins.{interval}s")
|
||||
|
||||
while running:
|
||||
for plugin in plugins:
|
||||
try:
|
||||
data = await plugin.collect()
|
||||
if data:
|
||||
# Don't use encode_plugin_data - create dict directly
|
||||
plugin_msg = {"plugin": plugin.name, **data}
|
||||
await conn.sendto(plugin_msg, "PLG")
|
||||
logger.debug(f"Sent {plugin.name} data")
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Error collecting {plugin.name}: {e}",
|
||||
exc_info=True
|
||||
)
|
||||
|
||||
await asyncio.sleep(interval)
|
||||
|
||||
|
||||
def shortname(name: str) -> str:
|
||||
"""Extract short hostname."""
|
||||
return name.split(".")[0]
|
||||
|
||||
|
||||
def stop():
|
||||
"""Stop the event loop."""
|
||||
global running
|
||||
running = False
|
||||
|
||||
|
||||
async def cleanup(connections: List[AsyncConnection]):
|
||||
"""Cleanup connections on shutdown."""
|
||||
logger = logging.getLogger("hbc.cleanup")
|
||||
logger.info("Cleaning up connections")
|
||||
|
||||
for conn in connections:
|
||||
try:
|
||||
msg = {
|
||||
"shutdown": 1,
|
||||
"acks": conn.ackcount
|
||||
}
|
||||
await conn.sendto(msg)
|
||||
except Exception as e:
|
||||
logger.error(f"Error sending shutdown: {e}")
|
||||
|
||||
conn.close()
|
||||
|
||||
# Give messages time to send
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
|
||||
async def async_main(args, config):
|
||||
"""Async main function."""
|
||||
global running
|
||||
|
||||
logger = logging.getLogger("hbc.main")
|
||||
|
||||
# Setup
|
||||
iam = socket.gethostname()
|
||||
if args.name:
|
||||
iam = args.name
|
||||
|
||||
hb_hosts = args.hosts
|
||||
hb_port = config.get("hb_port", PORT)
|
||||
interval = config.get("interval", INTERVAL)
|
||||
|
||||
logger.info(f"Starting hbc for {iam} -> {hb_hosts}")
|
||||
logger.info(f"Port: {hb_port}, Interval: {interval}s")
|
||||
|
||||
# Create connections
|
||||
connections = []
|
||||
conn_id = 1
|
||||
|
||||
for host in hb_hosts:
|
||||
try:
|
||||
addrs = socket.getaddrinfo(host, hb_port, 0, 0, socket.SOL_UDP)
|
||||
except socket.gaierror as e:
|
||||
logger.error(f"Cannot resolve {host}: {e}")
|
||||
continue
|
||||
|
||||
for addr_info in addrs:
|
||||
af = addr_info[0]
|
||||
addr = addr_info[4][0]
|
||||
|
||||
conn = AsyncConnection(conn_id, addr, hb_port, af, iam)
|
||||
if await conn.open():
|
||||
connections.append(conn)
|
||||
conn_id += 1
|
||||
|
||||
if not connections:
|
||||
logger.error("No connections established")
|
||||
return 1
|
||||
|
||||
logger.info(f"Created {len(connections)} connections")
|
||||
|
||||
# Send boot/message if requested
|
||||
if args.boot or args.message:
|
||||
boot_msg = {}
|
||||
if args.boot:
|
||||
boot_msg["boot"] = 1
|
||||
if args.message:
|
||||
boot_msg["service"] = "service"
|
||||
boot_msg["msg"] = args.message
|
||||
|
||||
boot_msg["acks"] = 0
|
||||
for conn in connections:
|
||||
await conn.sendto(boot_msg)
|
||||
|
||||
if args.message and not args.daemon:
|
||||
# Message-only mode
|
||||
await cleanup(connections)
|
||||
return 0
|
||||
|
||||
# Load plugins
|
||||
registry = PluginRegistry()
|
||||
loader = PluginLoader(registry)
|
||||
|
||||
plugin_dir = Path(__file__).parent / "plugins"
|
||||
if plugin_dir.exists():
|
||||
count = await loader.load_from_directory(plugin_dir, config)
|
||||
logger.info(f"Loaded {count} plugins")
|
||||
else:
|
||||
logger.warning(f"Plugin directory not found: {plugin_dir}")
|
||||
|
||||
# Start async tasks
|
||||
tasks = []
|
||||
|
||||
# Heartbeat senders (one per connection)
|
||||
for conn in connections:
|
||||
task = asyncio.create_task(heartbeat_sender(conn, interval))
|
||||
tasks.append(task)
|
||||
|
||||
# Plugin collector (uses all connections, but we'll use first one)
|
||||
if connections and registry.get_enabled():
|
||||
task = asyncio.create_task(plugin_collector(connections[0], registry))
|
||||
tasks.append(task)
|
||||
|
||||
# Setup signal handlers
|
||||
loop = asyncio.get_event_loop()
|
||||
for sig in (signal.SIGTERM, signal.SIGINT):
|
||||
loop.add_signal_handler(sig, stop)
|
||||
|
||||
# Wait for stop or tasks to complete
|
||||
try:
|
||||
await asyncio.gather(*tasks)
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
|
||||
# Cleanup
|
||||
await cleanup(connections)
|
||||
await loader.unload_all()
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
def daemonize(
|
||||
working_dir="/",
|
||||
stdin="/dev/zero",
|
||||
stdout="/dev/null",
|
||||
stderr="/dev/null"
|
||||
):
|
||||
"""UNIX double-fork daemonization."""
|
||||
try:
|
||||
pid = os.fork()
|
||||
if pid > 0:
|
||||
os._exit(0)
|
||||
except OSError as e:
|
||||
sys.stderr.write(f"fork #1 failed: {e}\n")
|
||||
os._exit(1)
|
||||
|
||||
os.chdir(working_dir)
|
||||
os.setsid()
|
||||
os.umask(0)
|
||||
|
||||
try:
|
||||
pid = os.fork()
|
||||
if pid > 0:
|
||||
os._exit(0)
|
||||
except OSError as e:
|
||||
sys.stderr.write(f"fork #2 failed: {e}\n")
|
||||
sys.exit(1)
|
||||
|
||||
sys.stdout.flush()
|
||||
sys.stderr.flush()
|
||||
|
||||
si = open(stdin, "r")
|
||||
so = open(stdout, "a+")
|
||||
se = open(stderr, "a+")
|
||||
|
||||
os.dup2(si.fileno(), sys.stdin.fileno())
|
||||
os.dup2(so.fileno(), sys.stdout.fileno())
|
||||
os.dup2(se.fileno(), sys.stderr.fileno())
|
||||
|
||||
|
||||
def build_parser():
|
||||
"""Build argument parser."""
|
||||
parser = argparse.ArgumentParser(
|
||||
prog="hbc",
|
||||
description="HeartBeatClient - send heartbeat messages to HeartBeatDaemon",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
)
|
||||
parser.add_argument(
|
||||
"-b", "--boot",
|
||||
action="store_true",
|
||||
help="Send a boot message"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-c", "--config",
|
||||
dest="configfile",
|
||||
help="Config file path (YAML)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-m", "--message",
|
||||
dest="message",
|
||||
help="Send a message"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-n", "--name",
|
||||
dest="name",
|
||||
help="Name to use in heartbeat message"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-d", "--daemon",
|
||||
action="store_true",
|
||||
help="Run in daemon mode"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-v", "--verbose",
|
||||
action="store_true",
|
||||
help="Verbose output"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-x", "--debug",
|
||||
action="count",
|
||||
default=0,
|
||||
help="Increase debug level"
|
||||
)
|
||||
parser.add_argument(
|
||||
"hosts",
|
||||
nargs="+",
|
||||
help="Heartbeat daemon hosts to send to"
|
||||
)
|
||||
return parser
|
||||
|
||||
|
||||
def main(argv=None):
|
||||
"""Main entry point."""
|
||||
global running, dorestart
|
||||
|
||||
parser = build_parser()
|
||||
args = parser.parse_args(argv)
|
||||
|
||||
# Load config
|
||||
config = load_config(args.configfile)
|
||||
|
||||
# Setup logging
|
||||
log_level = logging.INFO
|
||||
if args.verbose:
|
||||
log_level = logging.DEBUG
|
||||
if args.debug:
|
||||
log_level = logging.DEBUG
|
||||
|
||||
logging.basicConfig(
|
||||
level=log_level,
|
||||
format="%(asctime)s %(name)s %(levelname)s: %(message)s",
|
||||
datefmt="%Y-%m-%d %H:%M:%S"
|
||||
)
|
||||
|
||||
# Daemonize if requested
|
||||
if args.daemon:
|
||||
print("Daemonizing...")
|
||||
import syslog
|
||||
syslog.openlog("hbc", syslog.LOG_PID, syslog.LOG_DAEMON)
|
||||
syslog.syslog(syslog.LOG_INFO, f"Starting heartbeat to {', '.join(args.hosts)}")
|
||||
daemonize()
|
||||
|
||||
# Reconfigure logging for syslog
|
||||
logging.basicConfig(
|
||||
level=log_level,
|
||||
format="hbc[%(process)d]: %(name)s %(levelname)s: %(message)s"
|
||||
)
|
||||
|
||||
# Run async main
|
||||
try:
|
||||
exit_code = asyncio.run(async_main(args, config))
|
||||
except KeyboardInterrupt:
|
||||
logging.info("Interrupted by user")
|
||||
exit_code = 0
|
||||
except Exception as e:
|
||||
logging.error(f"Fatal error: {e}", exc_info=True)
|
||||
exit_code = 1
|
||||
|
||||
# Handle restart
|
||||
if dorestart:
|
||||
logging.info("Restarting...")
|
||||
os.execv(sys.argv[0], sys.argv)
|
||||
|
||||
sys.exit(exit_code)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,410 @@
|
||||
"""Plugin system for extending Heartbeat data collection and monitoring.
|
||||
|
||||
This module provides the base classes and infrastructure for the plugin system
|
||||
that enables extending hbc (client) data collection and hbd (server) processing.
|
||||
|
||||
Plugin Types:
|
||||
- InfoPlugin: Collects static or rarely-changing information (OS, hardware)
|
||||
- MonitorPlugin: Collects periodic monitoring data (CPU, memory, disk usage)
|
||||
|
||||
Plugins run on the client (hbc) to gather data, which is then sent to the server
|
||||
(hbd) for storage, threshold checking, and display.
|
||||
"""
|
||||
|
||||
import importlib.util
|
||||
import inspect
|
||||
import logging
|
||||
import sys
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Type
|
||||
|
||||
|
||||
class Plugin(ABC):
|
||||
"""Base class for all plugins.
|
||||
|
||||
Attributes:
|
||||
name: Unique plugin identifier (e.g., "os_info", "cpu_monitor")
|
||||
version: Plugin version string
|
||||
description: Human-readable description
|
||||
interval: Collection interval in seconds (0 for InfoPlugin = collect once)
|
||||
enabled: Whether plugin is active (can be disabled via config)
|
||||
"""
|
||||
|
||||
name: str = ""
|
||||
version: str = "1.0.0"
|
||||
description: str = ""
|
||||
interval: int = 0
|
||||
enabled: bool = True
|
||||
|
||||
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
||||
"""Initialize plugin with optional configuration.
|
||||
|
||||
Args:
|
||||
config: Plugin-specific configuration from YAML (e.g., thresholds, paths)
|
||||
"""
|
||||
self.config = config or {}
|
||||
self.logger = logging.getLogger(f"plugin.{self.name}")
|
||||
self._initialized = False
|
||||
|
||||
@abstractmethod
|
||||
async def initialize(self) -> bool:
|
||||
"""Initialize plugin (load resources, check dependencies).
|
||||
|
||||
Called once when plugin is loaded. Plugins should validate dependencies
|
||||
(e.g., check if psutil is available) and prepare any resources.
|
||||
|
||||
Returns:
|
||||
True if initialization succeeded, False otherwise
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def collect(self) -> Dict[str, Any]:
|
||||
"""Collect data from the system.
|
||||
|
||||
This is the main method called on each collection interval. Should return
|
||||
a dictionary of key-value pairs representing the collected data.
|
||||
|
||||
Keys should be strings (metric names). Values can be:
|
||||
- Scalars: int, float, str, bool
|
||||
- Lists/dicts (will be serialized appropriately)
|
||||
|
||||
Returns:
|
||||
Dictionary of collected metrics, or empty dict on error
|
||||
"""
|
||||
pass
|
||||
|
||||
async def cleanup(self) -> None:
|
||||
"""Cleanup plugin resources before shutdown.
|
||||
|
||||
Called when plugin is being unloaded or on system shutdown.
|
||||
Override to release resources, close connections, etc.
|
||||
"""
|
||||
pass
|
||||
|
||||
def validate_data(self, data: Dict[str, Any]) -> bool:
|
||||
"""Validate collected data before sending to server.
|
||||
|
||||
Override to implement custom validation logic.
|
||||
|
||||
Args:
|
||||
data: Data returned from collect()
|
||||
|
||||
Returns:
|
||||
True if data is valid, False otherwise
|
||||
"""
|
||||
return isinstance(data, dict)
|
||||
|
||||
|
||||
class InfoPlugin(Plugin):
|
||||
"""Plugin for collecting static or rarely-changing information.
|
||||
|
||||
InfoPlugins collect data that doesn't change frequently:
|
||||
- OS name and version
|
||||
- Hardware specifications (CPU model, RAM size)
|
||||
- Network interface MAC addresses
|
||||
|
||||
Characteristics:
|
||||
- interval = 0 (collected once at startup by default)
|
||||
- Can specify interval > 0 for periodic refresh (e.g., check for hardware changes)
|
||||
- Data is cached and reused until next collection
|
||||
"""
|
||||
|
||||
interval: int = 0 # Collect once at startup
|
||||
|
||||
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
||||
super().__init__(config)
|
||||
self._cached_data: Optional[Dict[str, Any]] = None
|
||||
|
||||
async def get_cached_data(self) -> Optional[Dict[str, Any]]:
|
||||
"""Get cached data if available (avoids re-collection).
|
||||
|
||||
Returns:
|
||||
Cached data dict, or None if not yet collected
|
||||
"""
|
||||
return self._cached_data
|
||||
|
||||
async def collect(self) -> Dict[str, Any]:
|
||||
"""Collect and cache static information."""
|
||||
if self._cached_data is None:
|
||||
self._cached_data = await self._collect_info()
|
||||
return self._cached_data
|
||||
|
||||
@abstractmethod
|
||||
async def _collect_info(self) -> Dict[str, Any]:
|
||||
"""Internal method to perform actual data collection.
|
||||
|
||||
Override this method instead of collect() for InfoPlugins.
|
||||
"""
|
||||
pass
|
||||
|
||||
def invalidate_cache(self) -> None:
|
||||
"""Force re-collection on next collect() call."""
|
||||
self._cached_data = None
|
||||
|
||||
|
||||
class MonitorPlugin(Plugin):
|
||||
"""Plugin for collecting periodic monitoring data.
|
||||
|
||||
MonitorPlugins collect time-series metrics that change frequently:
|
||||
- CPU usage percentage
|
||||
- Memory consumption
|
||||
- Disk I/O statistics
|
||||
- Network traffic
|
||||
|
||||
Characteristics:
|
||||
- interval > 0 (e.g., 30 seconds for CPU, 60 for disk)
|
||||
- Collected continuously on schedule
|
||||
- Data includes timestamps for time-series tracking
|
||||
"""
|
||||
|
||||
interval: int = 30 # Default: collect every 30 seconds
|
||||
|
||||
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
||||
super().__init__(config)
|
||||
self._last_reading: Optional[Dict[str, Any]] = None
|
||||
|
||||
def get_last_reading(self) -> Optional[Dict[str, Any]]:
|
||||
"""Get the last collected reading.
|
||||
|
||||
Returns:
|
||||
Last reading dict with timestamp, or None if not yet collected
|
||||
"""
|
||||
return self._last_reading
|
||||
|
||||
async def collect(self) -> Dict[str, Any]:
|
||||
"""Collect monitoring data and store as last reading."""
|
||||
data = await self._collect_metrics()
|
||||
if data:
|
||||
# Add collection timestamp
|
||||
import time
|
||||
data['_timestamp'] = time.time()
|
||||
self._last_reading = data
|
||||
return data
|
||||
|
||||
@abstractmethod
|
||||
async def _collect_metrics(self) -> Dict[str, Any]:
|
||||
"""Internal method to perform actual metric collection.
|
||||
|
||||
Override this method instead of collect() for MonitorPlugins.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
class PluginRegistry:
|
||||
"""Registry for managing loaded plugins.
|
||||
|
||||
Maintains a collection of loaded plugins and provides methods to
|
||||
query plugins by name, type, or interval.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self._plugins: Dict[str, Plugin] = {}
|
||||
self.logger = logging.getLogger("plugin.registry")
|
||||
|
||||
def register(self, plugin: Plugin) -> bool:
|
||||
"""Register a plugin instance.
|
||||
|
||||
Args:
|
||||
plugin: Plugin instance to register
|
||||
|
||||
Returns:
|
||||
True if registered successfully, False if name conflict
|
||||
"""
|
||||
if plugin.name in self._plugins:
|
||||
self.logger.error(f"Plugin '{plugin.name}' already registered")
|
||||
return False
|
||||
|
||||
self._plugins[plugin.name] = plugin
|
||||
self.logger.info(f"Registered plugin: {plugin.name} v{plugin.version}")
|
||||
return True
|
||||
|
||||
def unregister(self, name: str) -> bool:
|
||||
"""Unregister a plugin by name.
|
||||
|
||||
Args:
|
||||
name: Plugin name to unregister
|
||||
|
||||
Returns:
|
||||
True if unregistered, False if not found
|
||||
"""
|
||||
if name in self._plugins:
|
||||
del self._plugins[name]
|
||||
self.logger.info(f"Unregistered plugin: {name}")
|
||||
return True
|
||||
return False
|
||||
|
||||
def get(self, name: str) -> Optional[Plugin]:
|
||||
"""Get plugin by name.
|
||||
|
||||
Args:
|
||||
name: Plugin name
|
||||
|
||||
Returns:
|
||||
Plugin instance or None if not found
|
||||
"""
|
||||
return self._plugins.get(name)
|
||||
|
||||
def get_all(self) -> List[Plugin]:
|
||||
"""Get all registered plugins."""
|
||||
return list(self._plugins.values())
|
||||
|
||||
def get_enabled(self) -> List[Plugin]:
|
||||
"""Get all enabled plugins."""
|
||||
return [p for p in self._plugins.values() if p.enabled]
|
||||
|
||||
def get_by_type(self, plugin_type: Type[Plugin]) -> List[Plugin]:
|
||||
"""Get all plugins of a specific type.
|
||||
|
||||
Args:
|
||||
plugin_type: Plugin class (InfoPlugin or MonitorPlugin)
|
||||
|
||||
Returns:
|
||||
List of plugins matching the type
|
||||
"""
|
||||
return [p for p in self._plugins.values() if isinstance(p, plugin_type)]
|
||||
|
||||
def get_by_interval(self, interval: int) -> List[Plugin]:
|
||||
"""Get all plugins with a specific collection interval.
|
||||
|
||||
Args:
|
||||
interval: Interval in seconds (0 for one-time collection)
|
||||
|
||||
Returns:
|
||||
List of plugins with matching interval
|
||||
"""
|
||||
return [p for p in self._plugins.values() if p.interval == interval]
|
||||
|
||||
|
||||
class PluginLoader:
|
||||
"""Load plugins from filesystem and instantiate them.
|
||||
|
||||
Scans plugin directories for Python modules containing Plugin subclasses,
|
||||
loads them dynamically, and registers them with the PluginRegistry.
|
||||
"""
|
||||
|
||||
def __init__(self, registry: PluginRegistry):
|
||||
self.registry = registry
|
||||
self.logger = logging.getLogger("plugin.loader")
|
||||
self._loaded_modules: Dict[str, Any] = {}
|
||||
|
||||
async def load_from_directory(
|
||||
self,
|
||||
directory: Path,
|
||||
config: Optional[Dict[str, Any]] = None
|
||||
) -> int:
|
||||
"""Load all plugins from a directory.
|
||||
|
||||
Scans for .py files, imports them, finds Plugin subclasses,
|
||||
instantiates them with config, initializes, and registers.
|
||||
|
||||
Args:
|
||||
directory: Path to plugin directory
|
||||
config: Configuration dict (may contain per-plugin config)
|
||||
|
||||
Returns:
|
||||
Number of plugins successfully loaded
|
||||
"""
|
||||
if not directory.exists() or not directory.is_dir():
|
||||
self.logger.warning(f"Plugin directory not found: {directory}")
|
||||
return 0
|
||||
|
||||
loaded_count = 0
|
||||
plugin_config = config or {}
|
||||
|
||||
# Scan for Python files
|
||||
for plugin_file in directory.glob("*.py"):
|
||||
if plugin_file.name.startswith("_"):
|
||||
continue # Skip __init__.py and private modules
|
||||
|
||||
self.logger.debug(f"Processing plugin file: {plugin_file.name}")
|
||||
|
||||
try:
|
||||
# Load module dynamically
|
||||
module_name = f"plugins.{plugin_file.stem}"
|
||||
spec = importlib.util.spec_from_file_location(module_name, plugin_file)
|
||||
if not spec or not spec.loader:
|
||||
self.logger.warning(f"Could not create spec for {plugin_file}")
|
||||
continue
|
||||
|
||||
module = importlib.util.module_from_spec(spec)
|
||||
sys.modules[module_name] = module
|
||||
spec.loader.exec_module(module)
|
||||
self._loaded_modules[module_name] = module
|
||||
|
||||
self.logger.debug(f"Loaded module: {module_name}")
|
||||
|
||||
# Track which plugin classes we've already processed to avoid duplicates
|
||||
processed_classes = set()
|
||||
|
||||
# Find Plugin subclasses in module
|
||||
for name, obj in inspect.getmembers(module, inspect.isclass):
|
||||
# Skip base classes and non-Plugin classes
|
||||
if obj in (Plugin, InfoPlugin, MonitorPlugin):
|
||||
self.logger.debug(f"Skipping base class: {name}")
|
||||
continue
|
||||
if not issubclass(obj, Plugin):
|
||||
self.logger.debug(f"Skipping non-Plugin class: {name}")
|
||||
continue
|
||||
|
||||
# Skip if we've already processed this class (handles module-level aliases)
|
||||
if id(obj) in processed_classes:
|
||||
self.logger.debug(f"Skipping duplicate reference to: {obj.__name__}")
|
||||
continue
|
||||
processed_classes.add(id(obj))
|
||||
|
||||
self.logger.debug(f"Found plugin class: {name}")
|
||||
|
||||
# Instantiate plugin with config
|
||||
plugin_instance_config = plugin_config.get(obj.name, {})
|
||||
plugin = obj(config=plugin_instance_config)
|
||||
|
||||
# Initialize plugin
|
||||
try:
|
||||
initialized = await plugin.initialize()
|
||||
if not initialized:
|
||||
self.logger.warning(
|
||||
f"Plugin {plugin.name} failed initialization, skipping"
|
||||
)
|
||||
continue
|
||||
except Exception as e:
|
||||
self.logger.error(
|
||||
f"Error initializing plugin {plugin.name}: {e}",
|
||||
exc_info=True
|
||||
)
|
||||
continue
|
||||
|
||||
# Register with registry
|
||||
if self.registry.register(plugin):
|
||||
loaded_count += 1
|
||||
self.logger.info(
|
||||
f"Loaded plugin: {plugin.name} v{plugin.version} "
|
||||
f"(interval: {plugin.interval}s)"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(
|
||||
f"Error loading plugin from {plugin_file}: {e}",
|
||||
exc_info=True
|
||||
)
|
||||
|
||||
return loaded_count
|
||||
|
||||
async def unload_all(self) -> None:
|
||||
"""Unload all plugins and cleanup resources."""
|
||||
for plugin in self.registry.get_all():
|
||||
try:
|
||||
await plugin.cleanup()
|
||||
except Exception as e:
|
||||
self.logger.error(
|
||||
f"Error cleaning up plugin {plugin.name}: {e}",
|
||||
exc_info=True
|
||||
)
|
||||
self.registry.unregister(plugin.name)
|
||||
|
||||
# Remove loaded modules
|
||||
for module_name in self._loaded_modules:
|
||||
if module_name in sys.modules:
|
||||
del sys.modules[module_name]
|
||||
self._loaded_modules.clear()
|
||||
@@ -0,0 +1,129 @@
|
||||
"""CPU Monitoring Plugin for Heartbeat.
|
||||
|
||||
Collects CPU usage statistics including overall CPU percentage, per-core usage,
|
||||
load average, and process counts.
|
||||
"""
|
||||
|
||||
from typing import Any, Dict, Optional
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Import from parent package
|
||||
from hbd.client.plugin import MonitorPlugin
|
||||
|
||||
|
||||
class CPUMonitorPlugin(MonitorPlugin):
|
||||
"""Monitor CPU usage and load.
|
||||
|
||||
Collects:
|
||||
- Overall CPU usage percentage
|
||||
- Per-core CPU usage (if enabled in config)
|
||||
- Load average (1min, 5min, 15min)
|
||||
- Process count
|
||||
- CPU frequency (if available)
|
||||
"""
|
||||
|
||||
name = "cpu_monitor"
|
||||
version = "1.0.0"
|
||||
description = "CPU usage and load monitoring"
|
||||
interval = 300 # MonitorPlugin: collect every 5 minutes by default
|
||||
|
||||
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
||||
super().__init__(config)
|
||||
self.psutil = None
|
||||
self.per_core = config.get("per_core", False) if config else False
|
||||
self.interval = config.get("interval", 300) if config else 300
|
||||
|
||||
async def initialize(self) -> bool:
|
||||
"""Initialize the CPU monitor plugin.
|
||||
|
||||
Checks if psutil is available.
|
||||
|
||||
Returns:
|
||||
True if psutil is available, False otherwise
|
||||
"""
|
||||
self.logger.info(f"Initializing {self.name} plugin")
|
||||
|
||||
try:
|
||||
import psutil
|
||||
self.psutil = psutil
|
||||
self.logger.info(f"{self.name} initialized successfully")
|
||||
return True
|
||||
except ImportError:
|
||||
self.logger.error(
|
||||
"psutil module not available. Install with: pip install psutil"
|
||||
)
|
||||
return False
|
||||
|
||||
async def _collect_metrics(self) -> Dict[str, Any]:
|
||||
"""Collect CPU metrics.
|
||||
|
||||
Returns:
|
||||
Dictionary with CPU metrics
|
||||
"""
|
||||
if not self.psutil:
|
||||
return {}
|
||||
|
||||
try:
|
||||
data = {}
|
||||
|
||||
# Overall CPU usage percentage (non-blocking, interval=1 for accuracy)
|
||||
# Note: first call to cpu_percent() returns 0.0, subsequent calls work correctly
|
||||
data["cpu_percent"] = self.psutil.cpu_percent(interval=1)
|
||||
|
||||
# Per-core CPU usage (if enabled)
|
||||
if self.per_core:
|
||||
per_core_percents = self.psutil.cpu_percent(interval=0, percpu=True)
|
||||
data["cpu_per_core"] = per_core_percents
|
||||
data["cpu_core_count"] = len(per_core_percents)
|
||||
else:
|
||||
# Just report core count
|
||||
data["cpu_core_count"] = self.psutil.cpu_count()
|
||||
|
||||
# Load average (Unix-like systems only)
|
||||
try:
|
||||
load_avg = self.psutil.getloadavg()
|
||||
data["load_1min"] = round(load_avg[0], 2)
|
||||
data["load_5min"] = round(load_avg[1], 2)
|
||||
data["load_15min"] = round(load_avg[2], 2)
|
||||
except (AttributeError, OSError):
|
||||
# Not available on Windows
|
||||
pass
|
||||
|
||||
# Process count
|
||||
try:
|
||||
data["process_count"] = len(self.psutil.pids())
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Could not get process count: {e}")
|
||||
|
||||
# CPU frequency (if available)
|
||||
try:
|
||||
freq = self.psutil.cpu_freq()
|
||||
if freq:
|
||||
data["cpu_freq_current"] = round(freq.current, 2)
|
||||
data["cpu_freq_min"] = round(freq.min, 2)
|
||||
data["cpu_freq_max"] = round(freq.max, 2)
|
||||
except (AttributeError, OSError, RuntimeError, SystemError) as e:
|
||||
# Not available on all systems, or may fail on FreeBSD with sysctl issues
|
||||
self.logger.debug(f"CPU frequency not available: {e}")
|
||||
pass
|
||||
|
||||
# CPU times (user, system, idle, etc.)
|
||||
try:
|
||||
cpu_times = self.psutil.cpu_times_percent(interval=0)
|
||||
data["cpu_user"] = round(cpu_times.user, 1)
|
||||
data["cpu_system"] = round(cpu_times.system, 1)
|
||||
data["cpu_idle"] = round(cpu_times.idle, 1)
|
||||
if hasattr(cpu_times, "iowait"):
|
||||
data["cpu_iowait"] = round(cpu_times.iowait, 1)
|
||||
except Exception as e:
|
||||
self.logger.debug(f"Could not get CPU times: {e}")
|
||||
|
||||
self.logger.debug(
|
||||
f"Collected CPU metrics: {data.get('cpu_percent', 'N/A')}% usage"
|
||||
)
|
||||
return data
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error collecting CPU metrics: {e}", exc_info=True)
|
||||
return {}
|
||||
@@ -0,0 +1,199 @@
|
||||
"""
|
||||
Disk monitoring plugin for Heartbeat.
|
||||
|
||||
Collects disk usage and I/O statistics using psutil.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Dict, Any, Optional, List
|
||||
|
||||
try:
|
||||
import psutil
|
||||
except ImportError:
|
||||
psutil = None
|
||||
|
||||
from hbd.client.plugin import MonitorPlugin
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DiskMonitorPlugin(MonitorPlugin):
|
||||
"""
|
||||
Monitor disk usage and I/O statistics.
|
||||
|
||||
Collects:
|
||||
- Disk partition information
|
||||
- Disk usage per partition (total, used, free, percent)
|
||||
- Disk I/O counters (read/write bytes, read/write count)
|
||||
- Disk I/O time statistics
|
||||
|
||||
Configuration:
|
||||
interval: Collection interval in seconds (default: 300)
|
||||
partitions: List of mount points to monitor (default: all)
|
||||
include_io: Include disk I/O statistics (default: True)
|
||||
exclude_types: List of filesystem types to exclude (default: tmpfs, devtmpfs, squashfs)
|
||||
"""
|
||||
|
||||
name = "disk_monitor"
|
||||
interval = 300 # Collect every 5 minutes by default
|
||||
|
||||
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
||||
"""
|
||||
Initialize the disk monitor plugin.
|
||||
|
||||
Args:
|
||||
config: Optional configuration dict with keys:
|
||||
- interval: Collection interval in seconds (default: 300)
|
||||
- partitions: List of specific mount points to monitor
|
||||
- include_io: Include I/O statistics (default: True)
|
||||
- exclude_types: List of filesystem types to exclude
|
||||
"""
|
||||
super().__init__(config)
|
||||
self.partitions = self.config.get('partitions', None) # None = all partitions
|
||||
self.include_io = self.config.get('include_io', True)
|
||||
self.exclude_types = set(self.config.get('exclude_types', ['tmpfs', 'devtmpfs', 'squashfs']))
|
||||
self.interval = self.config.get('interval', 300)
|
||||
|
||||
if psutil is None:
|
||||
raise ImportError("psutil library is required for disk_monitor plugin")
|
||||
|
||||
# Store previous I/O counters for delta calculation
|
||||
self._prev_io = {}
|
||||
|
||||
async def initialize(self):
|
||||
"""Initialize the plugin (check psutil availability)."""
|
||||
if psutil is None:
|
||||
logger.error("psutil not available - disk_monitor cannot run")
|
||||
return False
|
||||
|
||||
logger.info(f"Disk monitor initialized (interval: {self.interval}s, io: {self.include_io})")
|
||||
|
||||
# Initialize I/O counters if available
|
||||
if self.include_io:
|
||||
try:
|
||||
self._prev_io = psutil.disk_io_counters(perdisk=True)
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not initialize disk I/O counters: {e}")
|
||||
|
||||
return True
|
||||
|
||||
async def collect(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Collect current disk statistics.
|
||||
|
||||
Returns:
|
||||
Dictionary with disk metrics organized by partition:
|
||||
- partitions: Dict of partition data, keyed by mount point
|
||||
- device: Device name (e.g., /dev/sda1)
|
||||
- fstype: Filesystem type (e.g., ext4)
|
||||
- total: Total space in bytes
|
||||
- used: Used space in bytes
|
||||
- free: Free space in bytes
|
||||
- percent: Usage percentage
|
||||
- io_counters: Dict of I/O statistics, keyed by disk name (if include_io)
|
||||
- read_count: Number of reads
|
||||
- write_count: Number of writes
|
||||
- read_bytes: Bytes read
|
||||
- write_bytes: Bytes written
|
||||
- read_time: Time spent reading in ms
|
||||
- write_time: Time spent writing in ms
|
||||
- read_bytes_delta: Bytes read since last collection
|
||||
- write_bytes_delta: Bytes written since last collection
|
||||
"""
|
||||
if psutil is None:
|
||||
logger.error("psutil not available")
|
||||
return {}
|
||||
|
||||
try:
|
||||
data = await self._collect_metrics()
|
||||
logger.debug(f"Collected disk metrics: {len(data.get('partitions', {}))} partitions")
|
||||
return data
|
||||
except Exception as e:
|
||||
logger.error(f"Error collecting disk metrics: {e}")
|
||||
return {"error": str(e)}
|
||||
|
||||
async def _collect_metrics(self) -> Dict[str, Any]:
|
||||
"""Collect disk metrics from psutil."""
|
||||
metrics = {}
|
||||
|
||||
# Collect partition usage
|
||||
partitions_data = {}
|
||||
partitions = psutil.disk_partitions(all=False)
|
||||
|
||||
for partition in partitions:
|
||||
# Skip unwanted filesystem types
|
||||
if partition.fstype in self.exclude_types:
|
||||
continue
|
||||
|
||||
# Skip if we're only monitoring specific partitions
|
||||
if self.partitions and partition.mountpoint not in self.partitions:
|
||||
continue
|
||||
|
||||
try:
|
||||
usage = psutil.disk_usage(partition.mountpoint)
|
||||
partitions_data[partition.mountpoint] = {
|
||||
'device': partition.device,
|
||||
'fstype': partition.fstype,
|
||||
'total': usage.total,
|
||||
'used': usage.used,
|
||||
'free': usage.free,
|
||||
'percent': usage.percent
|
||||
}
|
||||
except PermissionError:
|
||||
logger.debug(f"Permission denied accessing {partition.mountpoint}")
|
||||
continue
|
||||
except Exception as e:
|
||||
logger.warning(f"Error reading {partition.mountpoint}: {e}")
|
||||
continue
|
||||
|
||||
metrics['partitions'] = partitions_data
|
||||
|
||||
# Collect I/O statistics
|
||||
if self.include_io:
|
||||
try:
|
||||
io_counters = psutil.disk_io_counters(perdisk=True)
|
||||
io_data = {}
|
||||
|
||||
for disk_name, counters in io_counters.items():
|
||||
disk_stats = {
|
||||
'read_count': counters.read_count,
|
||||
'write_count': counters.write_count,
|
||||
'read_bytes': counters.read_bytes,
|
||||
'write_bytes': counters.write_bytes,
|
||||
}
|
||||
|
||||
# Add time statistics if available
|
||||
if hasattr(counters, 'read_time'):
|
||||
disk_stats['read_time'] = counters.read_time
|
||||
if hasattr(counters, 'write_time'):
|
||||
disk_stats['write_time'] = counters.write_time
|
||||
if hasattr(counters, 'busy_time'):
|
||||
disk_stats['busy_time'] = counters.busy_time
|
||||
|
||||
# Calculate deltas from previous collection
|
||||
if disk_name in self._prev_io:
|
||||
prev = self._prev_io[disk_name]
|
||||
disk_stats['read_bytes_delta'] = counters.read_bytes - prev.read_bytes
|
||||
disk_stats['write_bytes_delta'] = counters.write_bytes - prev.write_bytes
|
||||
disk_stats['read_count_delta'] = counters.read_count - prev.read_count
|
||||
disk_stats['write_count_delta'] = counters.write_count - prev.write_count
|
||||
|
||||
io_data[disk_name] = disk_stats
|
||||
|
||||
metrics['io_counters'] = io_data
|
||||
|
||||
# Store current counters for next delta calculation
|
||||
self._prev_io = io_counters
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not collect disk I/O statistics: {e}")
|
||||
|
||||
return metrics
|
||||
|
||||
async def cleanup(self):
|
||||
"""Cleanup (nothing to do for this plugin)."""
|
||||
logger.info("Disk monitor cleanup")
|
||||
|
||||
|
||||
# Plugin instance for automatic discovery
|
||||
plugin = DiskMonitorPlugin
|
||||
@@ -0,0 +1,168 @@
|
||||
"""
|
||||
Filesystem information plugin for Heartbeat.
|
||||
|
||||
Collects static filesystem and partition information using psutil.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
try:
|
||||
import psutil
|
||||
except ImportError:
|
||||
psutil = None
|
||||
|
||||
from hbd.client.plugin import InfoPlugin
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class FilesystemInfoPlugin(InfoPlugin):
|
||||
"""
|
||||
Collect filesystem and partition information.
|
||||
|
||||
This is an InfoPlugin that collects static information once during startup.
|
||||
|
||||
By default, only reports physical mounted filesystems (e.g., ext4, xfs, btrfs).
|
||||
Set include_pseudo=True to also include pseudo filesystems (proc, sysfs, tmpfs, etc.).
|
||||
|
||||
Collects:
|
||||
- List of mounted filesystems
|
||||
- Partition details (device, mount point, filesystem type, options)
|
||||
- Filesystem capabilities and features
|
||||
|
||||
Configuration:
|
||||
include_pseudo: Include pseudo/virtual filesystems (default: False)
|
||||
exclude_types: List of additional filesystem types to exclude (default: [])
|
||||
"""
|
||||
|
||||
name = "filesystem_info"
|
||||
interval = 0 # InfoPlugin - collect once
|
||||
|
||||
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
||||
"""
|
||||
Initialize the filesystem info plugin.
|
||||
|
||||
Args:
|
||||
config: Optional configuration dict with keys:
|
||||
- include_pseudo: Include pseudo/virtual filesystems (default: False)
|
||||
- exclude_types: List of filesystem types to exclude (default: [])
|
||||
"""
|
||||
super().__init__(config)
|
||||
self.include_pseudo = self.config.get('include_pseudo', False)
|
||||
# By default, no exclusions since all=False filters most pseudo filesystems
|
||||
# Users can add specific types to exclude if needed
|
||||
self.exclude_types = set(self.config.get('exclude_types', []))
|
||||
|
||||
if psutil is None:
|
||||
raise ImportError("psutil library is required for filesystem_info plugin")
|
||||
|
||||
async def initialize(self):
|
||||
"""Initialize the plugin (check psutil availability)."""
|
||||
if psutil is None:
|
||||
logger.error("psutil not available - filesystem_info cannot run")
|
||||
return False
|
||||
|
||||
logger.info(f"Filesystem info initialized (pseudo: {self.include_pseudo})")
|
||||
return True
|
||||
|
||||
async def collect(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Collect filesystem information.
|
||||
|
||||
Returns only physical mounted filesystems by default.
|
||||
|
||||
Returns:
|
||||
Dictionary with filesystem data:
|
||||
- filesystems: List of filesystem dictionaries:
|
||||
- device: Device name (e.g., /dev/sda1)
|
||||
- mountpoint: Mount point path
|
||||
- fstype: Filesystem type (e.g., ext4, xfs, btrfs)
|
||||
- opts: Mount options (comma-separated string)
|
||||
- maxfile: Maximum filename length
|
||||
- maxpath: Maximum path length
|
||||
- filesystem_types: List of unique filesystem types found
|
||||
- mount_count: Total number of mounted filesystems
|
||||
"""
|
||||
if psutil is None:
|
||||
logger.error("psutil not available")
|
||||
return {}
|
||||
|
||||
try:
|
||||
data = await self._collect_info()
|
||||
logger.info(f"Collected filesystem info: {len(data.get('filesystems', []))} filesystems")
|
||||
return data
|
||||
except Exception as e:
|
||||
logger.error(f"Error collecting filesystem info: {e}")
|
||||
return {"error": str(e)}
|
||||
|
||||
async def _collect_info(self) -> Dict[str, Any]:
|
||||
"""Collect filesystem information from psutil."""
|
||||
info = {}
|
||||
filesystems = []
|
||||
filesystem_types = set()
|
||||
|
||||
# Get mounted disk partitions
|
||||
# all=False returns only physical devices (real mounted filesystems)
|
||||
# all=True would include pseudo filesystems (proc, sysfs, etc.)
|
||||
partitions = psutil.disk_partitions(all=self.include_pseudo)
|
||||
|
||||
for partition in partitions:
|
||||
# Additional filtering if exclude_types is specified
|
||||
if partition.fstype in self.exclude_types:
|
||||
continue
|
||||
|
||||
fs_info = {
|
||||
'device': partition.device,
|
||||
'mountpoint': partition.mountpoint,
|
||||
'fstype': partition.fstype,
|
||||
'opts': partition.opts,
|
||||
}
|
||||
|
||||
# Try to get filesystem capabilities
|
||||
try:
|
||||
# Get path configuration for this mount point
|
||||
import os
|
||||
if hasattr(os, 'pathconf'):
|
||||
try:
|
||||
# Maximum filename length
|
||||
max_name = os.pathconf(partition.mountpoint, 'PC_NAME_MAX')
|
||||
if max_name:
|
||||
fs_info['maxfile'] = max_name
|
||||
except (OSError, ValueError):
|
||||
pass
|
||||
|
||||
try:
|
||||
# Maximum path length
|
||||
max_path = os.pathconf(partition.mountpoint, 'PC_PATH_MAX')
|
||||
if max_path:
|
||||
fs_info['maxpath'] = max_path
|
||||
except (OSError, ValueError):
|
||||
pass
|
||||
except Exception as e:
|
||||
logger.debug(f"Could not get pathconf for {partition.mountpoint}: {e}")
|
||||
|
||||
filesystems.append(fs_info)
|
||||
filesystem_types.add(partition.fstype)
|
||||
|
||||
info['filesystems'] = filesystems
|
||||
info['filesystem_types'] = sorted(list(filesystem_types))
|
||||
info['mount_count'] = len(filesystems)
|
||||
|
||||
# Add some additional filesystem statistics
|
||||
try:
|
||||
# Get boot time (useful for determining filesystem mount times)
|
||||
boot_time = psutil.boot_time()
|
||||
info['boot_time'] = boot_time
|
||||
except Exception as e:
|
||||
logger.debug(f"Could not get boot time: {e}")
|
||||
|
||||
return info
|
||||
|
||||
async def cleanup(self):
|
||||
"""Cleanup (nothing to do for this plugin)."""
|
||||
logger.info("Filesystem info cleanup")
|
||||
|
||||
|
||||
# Plugin instance for automatic discovery
|
||||
plugin = FilesystemInfoPlugin
|
||||
@@ -0,0 +1,147 @@
|
||||
"""
|
||||
Memory monitoring plugin for Heartbeat.
|
||||
|
||||
Collects memory and swap usage statistics using psutil.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
try:
|
||||
import psutil
|
||||
except ImportError:
|
||||
psutil = None
|
||||
|
||||
from hbd.client.plugin import MonitorPlugin
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MemoryMonitorPlugin(MonitorPlugin):
|
||||
"""
|
||||
Monitor memory and swap usage.
|
||||
|
||||
Collects:
|
||||
- Physical memory (RAM) usage and statistics
|
||||
- Virtual memory details
|
||||
- Swap memory usage and statistics
|
||||
- Memory available for applications
|
||||
|
||||
Configuration:
|
||||
interval: Collection interval in seconds (default: 300)
|
||||
include_swap: Include swap statistics (default: True)
|
||||
"""
|
||||
|
||||
name = "memory_monitor"
|
||||
interval = 300 # Collect every 5 minutes by default
|
||||
|
||||
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
||||
"""
|
||||
Initialize the memory monitor plugin.
|
||||
|
||||
Args:
|
||||
config: Optional configuration dict with keys:
|
||||
- interval: Collection interval in seconds (default: 300)
|
||||
- include_swap: Include swap statistics (default: True)
|
||||
"""
|
||||
super().__init__(config)
|
||||
self.include_swap = self.config.get('include_swap', True)
|
||||
self.interval = self.config.get('interval', 300)
|
||||
|
||||
if psutil is None:
|
||||
raise ImportError("psutil library is required for memory_monitor plugin")
|
||||
|
||||
async def initialize(self):
|
||||
"""Initialize the plugin (check psutil availability)."""
|
||||
if psutil is None:
|
||||
logger.error("psutil not available - memory_monitor cannot run")
|
||||
return False
|
||||
|
||||
logger.info(f"Memory monitor initialized (interval: {self.interval}s, swap: {self.include_swap})")
|
||||
return True
|
||||
|
||||
async def collect(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Collect current memory statistics.
|
||||
|
||||
Returns:
|
||||
Dictionary with memory metrics:
|
||||
- memory_total: Total physical RAM in bytes
|
||||
- memory_available: Available memory in bytes
|
||||
- memory_used: Used memory in bytes
|
||||
- memory_free: Free memory in bytes
|
||||
- memory_percent: Memory usage percentage
|
||||
- memory_active: Active memory (Unix)
|
||||
- memory_inactive: Inactive memory (Unix)
|
||||
- memory_buffers: Buffers (Linux)
|
||||
- memory_cached: Cached (Linux)
|
||||
- memory_shared: Shared (Linux)
|
||||
- swap_total: Total swap in bytes (if include_swap)
|
||||
- swap_used: Used swap in bytes (if include_swap)
|
||||
- swap_free: Free swap in bytes (if include_swap)
|
||||
- swap_percent: Swap usage percentage (if include_swap)
|
||||
- swap_sin: Bytes swapped in from disk (if include_swap)
|
||||
- swap_sout: Bytes swapped out to disk (if include_swap)
|
||||
"""
|
||||
if psutil is None:
|
||||
logger.error("psutil not available")
|
||||
return {}
|
||||
|
||||
try:
|
||||
data = await self._collect_metrics()
|
||||
logger.debug(f"Collected memory metrics: {len(data)} fields")
|
||||
return data
|
||||
except Exception as e:
|
||||
logger.error(f"Error collecting memory metrics: {e}")
|
||||
return {"error": str(e)}
|
||||
|
||||
async def _collect_metrics(self) -> Dict[str, Any]:
|
||||
"""Collect memory metrics from psutil."""
|
||||
metrics = {}
|
||||
|
||||
# Virtual (physical) memory statistics
|
||||
vmem = psutil.virtual_memory()
|
||||
metrics['memory_total'] = vmem.total
|
||||
metrics['memory_available'] = vmem.available
|
||||
metrics['memory_used'] = vmem.used
|
||||
metrics['memory_free'] = vmem.free
|
||||
metrics['memory_percent'] = vmem.percent
|
||||
|
||||
# Platform-specific memory details
|
||||
if hasattr(vmem, 'active'):
|
||||
metrics['memory_active'] = vmem.active
|
||||
if hasattr(vmem, 'inactive'):
|
||||
metrics['memory_inactive'] = vmem.inactive
|
||||
if hasattr(vmem, 'buffers'):
|
||||
metrics['memory_buffers'] = vmem.buffers
|
||||
if hasattr(vmem, 'cached'):
|
||||
metrics['memory_cached'] = vmem.cached
|
||||
if hasattr(vmem, 'shared'):
|
||||
metrics['memory_shared'] = vmem.shared
|
||||
|
||||
# Swap memory statistics
|
||||
if self.include_swap:
|
||||
try:
|
||||
swap = psutil.swap_memory()
|
||||
metrics['swap_total'] = swap.total
|
||||
metrics['swap_used'] = swap.used
|
||||
metrics['swap_free'] = swap.free
|
||||
metrics['swap_percent'] = swap.percent
|
||||
|
||||
# Swap in/out counters (may not be available on all platforms)
|
||||
if hasattr(swap, 'sin'):
|
||||
metrics['swap_sin'] = swap.sin
|
||||
if hasattr(swap, 'sout'):
|
||||
metrics['swap_sout'] = swap.sout
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not collect swap statistics: {e}")
|
||||
|
||||
return metrics
|
||||
|
||||
async def cleanup(self):
|
||||
"""Cleanup (nothing to do for this plugin)."""
|
||||
logger.info("Memory monitor cleanup")
|
||||
|
||||
|
||||
# Plugin instance for automatic discovery
|
||||
plugin = MemoryMonitorPlugin
|
||||
@@ -0,0 +1,283 @@
|
||||
"""Nagios Plugin Runner for Heartbeat.
|
||||
|
||||
Executes Nagios-compatible monitoring plugins and parses their output.
|
||||
|
||||
Nagios Plugin Standard:
|
||||
- Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
|
||||
- Output format: Single line status message, optional performance data
|
||||
- Performance data format: 'label'=value[UOM];[warn];[crit];[min];[max]
|
||||
|
||||
Example configuration in ~/.hb.yaml:
|
||||
```yaml
|
||||
nagios_runner:
|
||||
interval: 60
|
||||
commands:
|
||||
- name: check_disk_root
|
||||
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
|
||||
- name: check_procs
|
||||
command: /usr/lib/nagios/plugins/check_procs -w 250 -c 400
|
||||
- name: check_load
|
||||
command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
|
||||
```
|
||||
"""
|
||||
|
||||
import re
|
||||
import subprocess
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from hbd.client.plugin import MonitorPlugin
|
||||
|
||||
|
||||
# Nagios exit codes
|
||||
NAGIOS_OK = 0
|
||||
NAGIOS_WARNING = 1
|
||||
NAGIOS_CRITICAL = 2
|
||||
NAGIOS_UNKNOWN = 3
|
||||
|
||||
STATUS_NAMES = {
|
||||
NAGIOS_OK: "OK",
|
||||
NAGIOS_WARNING: "WARNING",
|
||||
NAGIOS_CRITICAL: "CRITICAL",
|
||||
NAGIOS_UNKNOWN: "UNKNOWN"
|
||||
}
|
||||
|
||||
|
||||
class NagiosRunnerPlugin(MonitorPlugin):
|
||||
"""Run Nagios-compatible monitoring plugins.
|
||||
|
||||
This plugin executes external Nagios plugins and collects their output,
|
||||
including status codes, messages, and performance data.
|
||||
|
||||
Configuration:
|
||||
interval: Collection interval in seconds (default: 300)
|
||||
commands: List of command definitions with 'name' and 'command' keys
|
||||
timeout: Command execution timeout in seconds (default: 30)
|
||||
shell: Whether to execute commands via shell (default: True)
|
||||
|
||||
Example:
|
||||
nagios_runner:
|
||||
interval: 300 # Check every 5 minutes
|
||||
timeout: 30
|
||||
commands:
|
||||
- name: check_disk
|
||||
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10%
|
||||
- name: check_load
|
||||
command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
|
||||
"""
|
||||
|
||||
name = "nagios_runner"
|
||||
version = "1.0.0"
|
||||
description = "Execute Nagios-compatible monitoring plugins"
|
||||
interval = 300 # MonitorPlugin: collect every 5 minutes by default
|
||||
|
||||
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
||||
super().__init__(config)
|
||||
|
||||
# Extract configuration
|
||||
self.commands: List[Dict[str, str]] = config.get("commands", []) if config else []
|
||||
self.timeout: int = config.get("timeout", 30) if config else 30
|
||||
self.shell: bool = config.get("shell", True) if config else True
|
||||
self.interval = config.get("interval", 300) if config else 300
|
||||
|
||||
# Validate commands
|
||||
if not self.commands:
|
||||
self.logger.warning(
|
||||
"No Nagios commands configured. Add 'nagios_runner.commands' to config."
|
||||
)
|
||||
|
||||
async def initialize(self) -> bool:
|
||||
"""Initialize the Nagios runner plugin.
|
||||
|
||||
Returns:
|
||||
True if at least one command is configured, False otherwise
|
||||
"""
|
||||
self.logger.info(f"Initializing {self.name} plugin")
|
||||
|
||||
if not self.commands:
|
||||
self.logger.error("No Nagios commands configured")
|
||||
return False
|
||||
|
||||
self.logger.info(f"Configured to run {len(self.commands)} Nagios plugin(s)")
|
||||
for cmd_config in self.commands:
|
||||
name = cmd_config.get("name", "unnamed")
|
||||
self.logger.info(f" - {name}: {cmd_config.get('command', 'N/A')}")
|
||||
|
||||
return True
|
||||
|
||||
async def _collect_metrics(self) -> Dict[str, Any]:
|
||||
"""Collect metrics from all configured Nagios plugins.
|
||||
|
||||
Returns:
|
||||
Dictionary with results from all plugins
|
||||
"""
|
||||
results = {}
|
||||
|
||||
# Track overall status (worst status wins)
|
||||
worst_status = NAGIOS_OK
|
||||
|
||||
for cmd_config in self.commands:
|
||||
name = cmd_config.get("name")
|
||||
command = cmd_config.get("command")
|
||||
|
||||
if not name or not command:
|
||||
self.logger.warning("Skipping command with missing name or command")
|
||||
continue
|
||||
|
||||
# Execute plugin
|
||||
try:
|
||||
status_code, output, perfdata = await self._run_nagios_plugin(command)
|
||||
|
||||
# Store results
|
||||
results[f"{name}_status"] = STATUS_NAMES.get(status_code, "UNKNOWN")
|
||||
results[f"{name}_status_code"] = status_code
|
||||
results[f"{name}_output"] = output
|
||||
|
||||
# Track worst status
|
||||
if status_code > worst_status:
|
||||
worst_status = status_code
|
||||
|
||||
# Parse and add performance data
|
||||
if perfdata:
|
||||
for metric_name, metric_value in perfdata.items():
|
||||
results[f"{name}_{metric_name}"] = metric_value
|
||||
|
||||
self.logger.debug(
|
||||
f"Executed {name}: {STATUS_NAMES.get(status_code, 'UNKNOWN')} - {output[:50]}"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error running {name}: {e}", exc_info=True)
|
||||
results[f"{name}_status"] = "ERROR"
|
||||
results[f"{name}_status_code"] = NAGIOS_UNKNOWN
|
||||
results[f"{name}_output"] = str(e)
|
||||
worst_status = NAGIOS_UNKNOWN
|
||||
|
||||
# Add overall status
|
||||
results["overall_status"] = STATUS_NAMES.get(worst_status, "UNKNOWN")
|
||||
results["overall_status_code"] = worst_status
|
||||
results["plugin_count"] = len(self.commands)
|
||||
|
||||
return results
|
||||
|
||||
async def _run_nagios_plugin(
|
||||
self,
|
||||
command: str
|
||||
) -> Tuple[int, str, Dict[str, Any]]:
|
||||
"""Execute a Nagios plugin and parse its output.
|
||||
|
||||
Args:
|
||||
command: Command string to execute
|
||||
|
||||
Returns:
|
||||
Tuple of (status_code, output_message, performance_data_dict)
|
||||
"""
|
||||
try:
|
||||
# Run command
|
||||
result = subprocess.run(
|
||||
command,
|
||||
shell=self.shell,
|
||||
capture_output=True,
|
||||
timeout=self.timeout,
|
||||
text=True
|
||||
)
|
||||
|
||||
status_code = result.returncode
|
||||
output = result.stdout.strip()
|
||||
|
||||
# Nagios plugins can return codes > 3, treat as UNKNOWN
|
||||
if status_code > 3:
|
||||
status_code = NAGIOS_UNKNOWN
|
||||
|
||||
# Parse performance data
|
||||
perfdata = self._parse_perfdata(output)
|
||||
|
||||
# Extract just the status message (before the pipe if present)
|
||||
if '|' in output:
|
||||
output_msg = output.split('|')[0].strip()
|
||||
else:
|
||||
output_msg = output
|
||||
|
||||
return status_code, output_msg, perfdata
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
self.logger.error(f"Command timed out: {command}")
|
||||
return NAGIOS_UNKNOWN, f"Command timed out after {self.timeout}s", {}
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error executing command: {e}")
|
||||
return NAGIOS_UNKNOWN, f"Execution error: {str(e)}", {}
|
||||
|
||||
def _parse_perfdata(self, output: str) -> Dict[str, Any]:
|
||||
"""Parse Nagios performance data from plugin output.
|
||||
|
||||
Nagios performance data format:
|
||||
'label'=value[UOM];[warn];[crit];[min];[max]
|
||||
|
||||
Multiple metrics separated by spaces.
|
||||
|
||||
Args:
|
||||
output: Plugin output string
|
||||
|
||||
Returns:
|
||||
Dictionary of metric_name: value
|
||||
"""
|
||||
perfdata = {}
|
||||
|
||||
# Performance data comes after the pipe character
|
||||
if '|' not in output:
|
||||
return perfdata
|
||||
|
||||
perf_section = output.split('|', 1)[1].strip()
|
||||
|
||||
# Regex to match performance data format
|
||||
# Matches: 'label'=value or label=value
|
||||
perf_regex = r"'?([^'=]+)'?=([\d.]+)([a-zA-Z%]*);?([\d.]*);?([\d.]*);?([\d.]*);?([\d.]*)"
|
||||
|
||||
for match in re.finditer(perf_regex, perf_section):
|
||||
label = match.group(1).strip()
|
||||
value_str = match.group(2)
|
||||
uom = match.group(3) or ""
|
||||
warn = match.group(4)
|
||||
crit = match.group(5)
|
||||
min_val = match.group(6)
|
||||
max_val = match.group(7)
|
||||
|
||||
# Convert value to float
|
||||
try:
|
||||
value = float(value_str)
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
# Store the value
|
||||
perfdata[label] = value
|
||||
|
||||
# Optionally store UOM as separate field
|
||||
if uom:
|
||||
perfdata[f"{label}_uom"] = uom
|
||||
|
||||
# Store thresholds if present
|
||||
if warn:
|
||||
try:
|
||||
perfdata[f"{label}_warn"] = float(warn)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
if crit:
|
||||
try:
|
||||
perfdata[f"{label}_crit"] = float(crit)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
if min_val:
|
||||
try:
|
||||
perfdata[f"{label}_min"] = float(min_val)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
if max_val:
|
||||
try:
|
||||
perfdata[f"{label}_max"] = float(max_val)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
return perfdata
|
||||
@@ -0,0 +1,240 @@
|
||||
"""
|
||||
Network monitoring plugin for Heartbeat.
|
||||
|
||||
Collects network interface statistics and connection information using psutil.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Dict, Any, Optional, List
|
||||
|
||||
try:
|
||||
import psutil
|
||||
except ImportError:
|
||||
psutil = None
|
||||
|
||||
from hbd.client.plugin import MonitorPlugin
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class NetworkMonitorPlugin(MonitorPlugin):
|
||||
"""
|
||||
Monitor network interface statistics and connections.
|
||||
|
||||
Collects:
|
||||
- Network interface I/O counters (bytes sent/received, packets, errors, drops)
|
||||
- Per-interface statistics
|
||||
- Network connection counts by state
|
||||
- Interface addresses and configuration
|
||||
|
||||
Configuration:
|
||||
interval: Collection interval in seconds (default: 300)
|
||||
interfaces: List of interfaces to monitor (default: all)
|
||||
include_connections: Include connection statistics (default: True)
|
||||
include_addresses: Include interface addresses (default: False)
|
||||
"""
|
||||
|
||||
name = "network_monitor"
|
||||
interval = 300 # Collect every 5 minutes by default
|
||||
|
||||
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
||||
"""
|
||||
Initialize the network monitor plugin.
|
||||
|
||||
Args:
|
||||
config: Optional configuration dict with keys:
|
||||
- interval: Collection interval in seconds (default: 300)
|
||||
- interfaces: List of specific interfaces to monitor
|
||||
- include_connections: Include connection stats (default: True)
|
||||
- include_addresses: Include interface addresses (default: False)
|
||||
"""
|
||||
super().__init__(config)
|
||||
self.interfaces = self.config.get('interfaces', None) # None = all interfaces
|
||||
self.include_connections = self.config.get('include_connections', True)
|
||||
self.include_addresses = self.config.get('include_addresses', False)
|
||||
self.interval = self.config.get('interval', 300)
|
||||
|
||||
if psutil is None:
|
||||
raise ImportError("psutil library is required for network_monitor plugin")
|
||||
|
||||
# Store previous I/O counters for delta calculation
|
||||
self._prev_io = {}
|
||||
|
||||
async def initialize(self):
|
||||
"""Initialize the plugin (check psutil availability)."""
|
||||
if psutil is None:
|
||||
logger.error("psutil not available - network_monitor cannot run")
|
||||
return False
|
||||
|
||||
logger.info(f"Network monitor initialized (interval: {self.interval}s, "
|
||||
f"connections: {self.include_connections})")
|
||||
|
||||
# Initialize I/O counters
|
||||
try:
|
||||
self._prev_io = psutil.net_io_counters(pernic=True)
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not initialize network I/O counters: {e}")
|
||||
|
||||
return True
|
||||
|
||||
async def collect(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Collect current network statistics.
|
||||
|
||||
Returns:
|
||||
Dictionary with network metrics:
|
||||
- interfaces: Dict of interface statistics, keyed by interface name
|
||||
- bytes_sent: Total bytes sent
|
||||
- bytes_recv: Total bytes received
|
||||
- packets_sent: Total packets sent
|
||||
- packets_recv: Total packets received
|
||||
- errin: Total incoming errors
|
||||
- errout: Total outgoing errors
|
||||
- dropin: Total incoming packets dropped
|
||||
- dropout: Total outgoing packets dropped
|
||||
- bytes_sent_delta: Bytes sent since last collection
|
||||
- bytes_recv_delta: Bytes received since last collection
|
||||
- packets_sent_delta: Packets sent since last collection
|
||||
- packets_recv_delta: Packets received since last collection
|
||||
- connections: Connection statistics by state (if include_connections)
|
||||
- ESTABLISHED: Count of established connections
|
||||
- LISTEN: Count of listening sockets
|
||||
- TIME_WAIT: Count of TIME_WAIT connections
|
||||
- etc.
|
||||
- addresses: Interface address information (if include_addresses)
|
||||
- Dict keyed by interface name with address details
|
||||
"""
|
||||
if psutil is None:
|
||||
logger.error("psutil not available")
|
||||
return {}
|
||||
|
||||
try:
|
||||
data = await self._collect_metrics()
|
||||
logger.debug(f"Collected network metrics: {len(data.get('interfaces', {}))} interfaces")
|
||||
return data
|
||||
except Exception as e:
|
||||
logger.error(f"Error collecting network metrics: {e}")
|
||||
return {"error": str(e)}
|
||||
|
||||
async def _collect_metrics(self) -> Dict[str, Any]:
|
||||
"""Collect network metrics from psutil."""
|
||||
metrics = {}
|
||||
|
||||
# Collect per-interface I/O counters
|
||||
try:
|
||||
io_counters = psutil.net_io_counters(pernic=True)
|
||||
interfaces_data = {}
|
||||
|
||||
for iface_name, counters in io_counters.items():
|
||||
# Skip if we're only monitoring specific interfaces
|
||||
if self.interfaces and iface_name not in self.interfaces:
|
||||
continue
|
||||
|
||||
iface_stats = {
|
||||
'bytes_sent': counters.bytes_sent,
|
||||
'bytes_recv': counters.bytes_recv,
|
||||
'packets_sent': counters.packets_sent,
|
||||
'packets_recv': counters.packets_recv,
|
||||
'errin': counters.errin,
|
||||
'errout': counters.errout,
|
||||
'dropin': counters.dropin,
|
||||
'dropout': counters.dropout,
|
||||
}
|
||||
|
||||
# Calculate deltas from previous collection
|
||||
if iface_name in self._prev_io:
|
||||
prev = self._prev_io[iface_name]
|
||||
iface_stats['bytes_sent_delta'] = counters.bytes_sent - prev.bytes_sent
|
||||
iface_stats['bytes_recv_delta'] = counters.bytes_recv - prev.bytes_recv
|
||||
iface_stats['packets_sent_delta'] = counters.packets_sent - prev.packets_sent
|
||||
iface_stats['packets_recv_delta'] = counters.packets_recv - prev.packets_recv
|
||||
|
||||
interfaces_data[iface_name] = iface_stats
|
||||
|
||||
metrics['interfaces'] = interfaces_data
|
||||
|
||||
# Store current counters for next delta calculation
|
||||
self._prev_io = io_counters
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not collect network I/O counters: {e}")
|
||||
|
||||
# Collect connection statistics
|
||||
if self.include_connections:
|
||||
try:
|
||||
connections = psutil.net_connections(kind='inet')
|
||||
conn_stats = {}
|
||||
|
||||
# Count connections by state
|
||||
for conn in connections:
|
||||
state = conn.status
|
||||
conn_stats[state] = conn_stats.get(state, 0) + 1
|
||||
|
||||
metrics['connections'] = conn_stats
|
||||
|
||||
except (PermissionError, psutil.AccessDenied):
|
||||
logger.debug("Permission denied for net_connections (requires root/admin)")
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not collect connection statistics: {e}")
|
||||
|
||||
# Collect interface addresses
|
||||
if self.include_addresses:
|
||||
try:
|
||||
addresses = psutil.net_if_addrs()
|
||||
addr_data = {}
|
||||
|
||||
for iface_name, addrs in addresses.items():
|
||||
# Skip if we're only monitoring specific interfaces
|
||||
if self.interfaces and iface_name not in self.interfaces:
|
||||
continue
|
||||
|
||||
iface_addrs = []
|
||||
for addr in addrs:
|
||||
addr_info = {
|
||||
'family': str(addr.family),
|
||||
'address': addr.address,
|
||||
}
|
||||
if addr.netmask:
|
||||
addr_info['netmask'] = addr.netmask
|
||||
if addr.broadcast:
|
||||
addr_info['broadcast'] = addr.broadcast
|
||||
iface_addrs.append(addr_info)
|
||||
|
||||
addr_data[iface_name] = iface_addrs
|
||||
|
||||
metrics['addresses'] = addr_data
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not collect interface addresses: {e}")
|
||||
|
||||
# Add interface stats (up/down status, speed, mtu)
|
||||
try:
|
||||
if_stats = psutil.net_if_stats()
|
||||
stats_data = {}
|
||||
|
||||
for iface_name, stats in if_stats.items():
|
||||
# Skip if we're only monitoring specific interfaces
|
||||
if self.interfaces and iface_name not in self.interfaces:
|
||||
continue
|
||||
|
||||
stats_data[iface_name] = {
|
||||
'isup': stats.isup,
|
||||
'duplex': str(stats.duplex) if hasattr(stats, 'duplex') else None,
|
||||
'speed': stats.speed,
|
||||
'mtu': stats.mtu,
|
||||
}
|
||||
|
||||
metrics['interface_stats'] = stats_data
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not collect interface stats: {e}")
|
||||
|
||||
return metrics
|
||||
|
||||
async def cleanup(self):
|
||||
"""Cleanup (nothing to do for this plugin)."""
|
||||
logger.info("Network monitor cleanup")
|
||||
|
||||
|
||||
# Plugin instance for automatic discovery
|
||||
plugin = NetworkMonitorPlugin
|
||||
@@ -0,0 +1,136 @@
|
||||
"""OS Information Plugin for Heartbeat.
|
||||
|
||||
Collects static operating system information including OS name, version,
|
||||
kernel, architecture, and distribution details.
|
||||
"""
|
||||
|
||||
import platform
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
# Import from parent package
|
||||
from hbd.client.plugin import InfoPlugin
|
||||
|
||||
|
||||
class OSInfoPlugin(InfoPlugin):
|
||||
"""Collect operating system information.
|
||||
|
||||
This plugin gathers static OS information that rarely changes:
|
||||
- OS name and version
|
||||
- Kernel version
|
||||
- Architecture (x86_64, arm64, etc.)
|
||||
- Distribution details (for Linux)
|
||||
- Python version (used by hbc)
|
||||
"""
|
||||
|
||||
name = "os_info"
|
||||
version = "1.0.0"
|
||||
description = "Operating system and platform information"
|
||||
interval = 0 # InfoPlugin: collect once at startup
|
||||
|
||||
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
||||
super().__init__(config)
|
||||
|
||||
async def initialize(self) -> bool:
|
||||
"""Initialize the OS info plugin.
|
||||
|
||||
Returns:
|
||||
True (always succeeds - platform module is stdlib)
|
||||
"""
|
||||
self.logger.info(f"Initializing {self.name} plugin")
|
||||
return True
|
||||
|
||||
async def _collect_info(self) -> Dict[str, Any]:
|
||||
"""Collect OS information.
|
||||
|
||||
Returns:
|
||||
Dictionary with OS details
|
||||
"""
|
||||
try:
|
||||
data = {
|
||||
"system": platform.system(), # e.g., "Linux", "Darwin", "Windows"
|
||||
"node": platform.node(), # hostname
|
||||
"release": platform.release(), # kernel version
|
||||
"version": platform.version(), # detailed version
|
||||
"machine": platform.machine(), # e.g., "x86_64", "arm64"
|
||||
"processor": platform.processor(), # processor name
|
||||
"architecture": platform.architecture()[0], # e.g., "64bit"
|
||||
"python_version": platform.python_version(),
|
||||
"python_implementation": platform.python_implementation(),
|
||||
}
|
||||
|
||||
# Add Linux-specific distribution info
|
||||
if platform.system() == "Linux":
|
||||
data.update(self._get_linux_distro())
|
||||
|
||||
# Add macOS-specific info
|
||||
elif platform.system() == "Darwin":
|
||||
data["macos_version"] = platform.mac_ver()[0]
|
||||
|
||||
# Add Windows-specific info
|
||||
elif platform.system() == "Windows":
|
||||
win_ver = platform.win32_ver()
|
||||
data["windows_release"] = win_ver[0]
|
||||
data["windows_version"] = win_ver[1]
|
||||
data["windows_sp"] = win_ver[2]
|
||||
data["windows_type"] = win_ver[3]
|
||||
|
||||
self.logger.debug(f"Collected OS info: {data['system']} {data['release']}")
|
||||
return data
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error collecting OS info: {e}", exc_info=True)
|
||||
return {}
|
||||
|
||||
def _get_linux_distro(self) -> Dict[str, str]:
|
||||
"""Get Linux distribution information.
|
||||
|
||||
Returns:
|
||||
Dictionary with distribution details
|
||||
"""
|
||||
distro_info = {}
|
||||
|
||||
# Try reading /etc/os-release (standard on modern Linux)
|
||||
os_release = Path("/etc/os-release")
|
||||
if os_release.exists():
|
||||
try:
|
||||
with open(os_release) as f:
|
||||
for line in f:
|
||||
line = line.strip()
|
||||
if "=" in line and not line.startswith("#"):
|
||||
key, value = line.split("=", 1)
|
||||
# Remove quotes from value
|
||||
value = value.strip('"').strip("'")
|
||||
# Map common keys
|
||||
if key == "NAME":
|
||||
distro_info["distro_name"] = value
|
||||
elif key == "VERSION":
|
||||
distro_info["distro_version"] = value
|
||||
elif key == "ID":
|
||||
distro_info["distro_id"] = value
|
||||
elif key == "VERSION_ID":
|
||||
distro_info["distro_version_id"] = value
|
||||
elif key == "PRETTY_NAME":
|
||||
distro_info["distro_pretty_name"] = value
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Could not read /etc/os-release: {e}")
|
||||
|
||||
# Fallback: try lsb_release (older systems)
|
||||
elif Path("/etc/lsb-release").exists():
|
||||
try:
|
||||
with open("/etc/lsb-release") as f:
|
||||
for line in f:
|
||||
line = line.strip()
|
||||
if "=" in line:
|
||||
key, value = line.split("=", 1)
|
||||
if key == "DISTRIB_ID":
|
||||
distro_info["distro_id"] = value
|
||||
elif key == "DISTRIB_RELEASE":
|
||||
distro_info["distro_version"] = value
|
||||
elif key == "DISTRIB_DESCRIPTION":
|
||||
distro_info["distro_name"] = value
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Could not read /etc/lsb-release: {e}")
|
||||
|
||||
return distro_info
|
||||
@@ -0,0 +1,579 @@
|
||||
"""
|
||||
Threshold checking and alerting for plugin metrics.
|
||||
|
||||
This module provides a flexible threshold checking system that:
|
||||
- Evaluates plugin metrics against configured warning/critical thresholds
|
||||
- Tracks alert states per host and metric
|
||||
- Prevents alert flapping with hysteresis
|
||||
- Triggers notifications only on state changes
|
||||
- Supports multiple comparison operators
|
||||
"""
|
||||
|
||||
import logging
|
||||
import time
|
||||
from enum import Enum
|
||||
from typing import Dict, Any, Optional, Tuple, Callable
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AlertLevel(Enum):
|
||||
"""Alert severity levels."""
|
||||
OK = 0
|
||||
WARNING = 1
|
||||
CRITICAL = 2
|
||||
UNKNOWN = 3
|
||||
|
||||
|
||||
class ComparisonOperator(Enum):
|
||||
"""Supported comparison operators for threshold checks."""
|
||||
GT = ">" # Greater than
|
||||
GTE = ">=" # Greater than or equal
|
||||
LT = "<" # Less than
|
||||
LTE = "<=" # Less than or equal
|
||||
EQ = "==" # Equal to
|
||||
NEQ = "!=" # Not equal to
|
||||
|
||||
|
||||
class AlertState:
|
||||
"""Represents the current alert state for a specific metric."""
|
||||
|
||||
def __init__(self, metric_path: str):
|
||||
"""
|
||||
Initialize alert state.
|
||||
|
||||
Args:
|
||||
metric_path: Full path to metric (e.g., "cpu_monitor.cpu_percent")
|
||||
"""
|
||||
self.metric_path = metric_path
|
||||
self.level = AlertLevel.OK
|
||||
self.since = time.time()
|
||||
self.last_value = None
|
||||
self.last_check = time.time()
|
||||
self.notification_count = 0
|
||||
self.last_notification = None
|
||||
|
||||
def update(self, level: AlertLevel, value: Any) -> bool:
|
||||
"""
|
||||
Update alert state.
|
||||
|
||||
Args:
|
||||
level: New alert level
|
||||
value: Current metric value
|
||||
|
||||
Returns:
|
||||
True if state changed (notification needed), False otherwise
|
||||
"""
|
||||
now = time.time()
|
||||
self.last_check = now
|
||||
self.last_value = value
|
||||
|
||||
# Check if state changed
|
||||
if level != self.level:
|
||||
logger.info(
|
||||
"Alert state change for %s: %s -> %s (value: %s)",
|
||||
self.metric_path,
|
||||
self.level.name,
|
||||
level.name,
|
||||
value
|
||||
)
|
||||
self.level = level
|
||||
self.since = now
|
||||
self.notification_count = 0
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
"""Convert alert state to dictionary for serialization."""
|
||||
return {
|
||||
"metric_path": self.metric_path,
|
||||
"level": self.level.name,
|
||||
"since": self.since,
|
||||
"last_value": self.last_value,
|
||||
"last_check": self.last_check,
|
||||
"notification_count": self.notification_count,
|
||||
}
|
||||
|
||||
|
||||
class ThresholdConfig:
|
||||
"""Configuration for a single threshold check."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
metric_path: str,
|
||||
warning: Optional[float] = None,
|
||||
critical: Optional[float] = None,
|
||||
operator: str = ">",
|
||||
hysteresis: float = 0.0,
|
||||
enabled: bool = True,
|
||||
):
|
||||
"""
|
||||
Initialize threshold configuration.
|
||||
|
||||
Args:
|
||||
metric_path: Full path to metric (e.g., "cpu_monitor.cpu_percent")
|
||||
warning: Warning threshold value
|
||||
critical: Critical threshold value
|
||||
operator: Comparison operator (>, >=, <, <=, ==, !=)
|
||||
hysteresis: Hysteresis percentage to prevent flapping (0.0-1.0)
|
||||
enabled: Whether this threshold is enabled
|
||||
"""
|
||||
self.metric_path = metric_path
|
||||
self.warning = warning
|
||||
self.critical = critical
|
||||
self.enabled = enabled
|
||||
self.hysteresis = hysteresis
|
||||
|
||||
# Parse operator
|
||||
try:
|
||||
self.operator = ComparisonOperator(operator)
|
||||
except ValueError:
|
||||
logger.warning(
|
||||
"Invalid operator '%s' for %s, using '>' as default",
|
||||
operator,
|
||||
metric_path
|
||||
)
|
||||
self.operator = ComparisonOperator.GT
|
||||
|
||||
def evaluate(self, value: float) -> AlertLevel:
|
||||
"""
|
||||
Evaluate a value against this threshold.
|
||||
|
||||
Args:
|
||||
value: Metric value to check
|
||||
|
||||
Returns:
|
||||
AlertLevel indicating the severity
|
||||
"""
|
||||
if not self.enabled:
|
||||
return AlertLevel.OK
|
||||
|
||||
try:
|
||||
# Convert value to float for comparison
|
||||
value = float(value)
|
||||
except (TypeError, ValueError):
|
||||
logger.warning("Cannot convert value %s to float for %s", value, self.metric_path)
|
||||
return AlertLevel.UNKNOWN
|
||||
|
||||
# Check critical threshold first
|
||||
if self.critical is not None:
|
||||
if self._compare(value, self.critical):
|
||||
return AlertLevel.CRITICAL
|
||||
|
||||
# Then check warning threshold
|
||||
if self.warning is not None:
|
||||
if self._compare(value, self.warning):
|
||||
return AlertLevel.WARNING
|
||||
|
||||
return AlertLevel.OK
|
||||
|
||||
def evaluate_with_hysteresis(
|
||||
self,
|
||||
value: float,
|
||||
current_level: AlertLevel
|
||||
) -> AlertLevel:
|
||||
"""
|
||||
Evaluate with hysteresis to prevent flapping.
|
||||
|
||||
Args:
|
||||
value: Current metric value
|
||||
current_level: Current alert level
|
||||
|
||||
Returns:
|
||||
New alert level considering hysteresis
|
||||
"""
|
||||
new_level = self.evaluate(value)
|
||||
|
||||
# If no hysteresis, return new level
|
||||
if self.hysteresis == 0.0:
|
||||
return new_level
|
||||
|
||||
# If improving (going to a lower severity), apply hysteresis
|
||||
if new_level.value < current_level.value:
|
||||
# For recovery, value must be better by hysteresis amount
|
||||
if current_level == AlertLevel.CRITICAL and self.critical is not None:
|
||||
threshold = self.critical
|
||||
elif current_level == AlertLevel.WARNING and self.warning is not None:
|
||||
threshold = self.warning
|
||||
else:
|
||||
return new_level
|
||||
|
||||
# Calculate hysteresis threshold
|
||||
hysteresis_amount = abs(threshold * self.hysteresis)
|
||||
|
||||
if self.operator in [ComparisonOperator.GT, ComparisonOperator.GTE]:
|
||||
# For "greater than" thresholds, value must go below by hysteresis
|
||||
recovery_threshold = threshold - hysteresis_amount
|
||||
if value >= recovery_threshold:
|
||||
# Not enough improvement, keep current level
|
||||
return current_level
|
||||
elif self.operator in [ComparisonOperator.LT, ComparisonOperator.LTE]:
|
||||
# For "less than" thresholds, value must go above by hysteresis
|
||||
recovery_threshold = threshold + hysteresis_amount
|
||||
if value <= recovery_threshold:
|
||||
# Not enough improvement, keep current level
|
||||
return current_level
|
||||
|
||||
return new_level
|
||||
|
||||
def _compare(self, value: float, threshold: float) -> bool:
|
||||
"""Perform comparison based on operator."""
|
||||
if self.operator == ComparisonOperator.GT:
|
||||
return value > threshold
|
||||
elif self.operator == ComparisonOperator.GTE:
|
||||
return value >= threshold
|
||||
elif self.operator == ComparisonOperator.LT:
|
||||
return value < threshold
|
||||
elif self.operator == ComparisonOperator.LTE:
|
||||
return value <= threshold
|
||||
elif self.operator == ComparisonOperator.EQ:
|
||||
return abs(value - threshold) < 1e-9 # Float comparison
|
||||
elif self.operator == ComparisonOperator.NEQ:
|
||||
return abs(value - threshold) >= 1e-9
|
||||
return False
|
||||
|
||||
|
||||
class ThresholdChecker:
|
||||
"""Main threshold checking and alerting system."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
config: Dict[str, Any],
|
||||
notification_callback: Optional[Callable] = None,
|
||||
renotify_interval: int = 3600,
|
||||
journal: Optional[Any] = None,
|
||||
):
|
||||
"""
|
||||
Initialize threshold checker.
|
||||
|
||||
Args:
|
||||
config: Threshold configuration dictionary from YAML
|
||||
notification_callback: Function to call for notifications
|
||||
renotify_interval: Seconds between repeat notifications (default: 1 hour)
|
||||
journal: Optional MessageJournal instance for logging threshold events
|
||||
"""
|
||||
self.thresholds = {} # {metric_path: ThresholdConfig}
|
||||
self.notification_callback = notification_callback
|
||||
self.renotify_interval = renotify_interval
|
||||
self.journal = journal
|
||||
|
||||
# Parse configuration
|
||||
self._parse_config(config)
|
||||
|
||||
logger.info("ThresholdChecker initialized with %d thresholds", len(self.thresholds))
|
||||
|
||||
def _parse_config(self, config: Dict[str, Any]):
|
||||
"""Parse threshold configuration from YAML structure."""
|
||||
if not config or "thresholds" not in config:
|
||||
logger.info("No thresholds configured")
|
||||
return
|
||||
|
||||
thresholds_config = config["thresholds"]
|
||||
|
||||
for plugin_name, plugin_thresholds in thresholds_config.items():
|
||||
if not isinstance(plugin_thresholds, dict):
|
||||
continue
|
||||
|
||||
self._parse_plugin_thresholds(plugin_name, plugin_thresholds)
|
||||
|
||||
def _parse_plugin_thresholds(self, plugin_name: str, thresholds: Dict[str, Any]):
|
||||
"""Parse thresholds for a specific plugin."""
|
||||
for metric_name, threshold_config in thresholds.items():
|
||||
if not isinstance(threshold_config, dict):
|
||||
continue
|
||||
|
||||
# Handle nested metrics (e.g., partitions./.percent)
|
||||
if metric_name == "partitions":
|
||||
self._parse_partition_thresholds(plugin_name, threshold_config)
|
||||
continue
|
||||
|
||||
metric_path = f"{plugin_name}.{metric_name}"
|
||||
|
||||
# Extract threshold values
|
||||
warning = threshold_config.get("warning")
|
||||
critical = threshold_config.get("critical")
|
||||
operator = threshold_config.get("operator", ">")
|
||||
hysteresis = threshold_config.get("hysteresis", 0.1) # 10% default
|
||||
enabled = threshold_config.get("enabled", True)
|
||||
|
||||
if warning is None and critical is None:
|
||||
logger.warning("No thresholds defined for %s, skipping", metric_path)
|
||||
continue
|
||||
|
||||
threshold = ThresholdConfig(
|
||||
metric_path=metric_path,
|
||||
warning=warning,
|
||||
critical=critical,
|
||||
operator=operator,
|
||||
hysteresis=hysteresis,
|
||||
enabled=enabled,
|
||||
)
|
||||
|
||||
self.thresholds[metric_path] = threshold
|
||||
logger.debug(
|
||||
"Registered threshold for %s: warn=%s, crit=%s, op=%s",
|
||||
metric_path,
|
||||
warning,
|
||||
critical,
|
||||
operator
|
||||
)
|
||||
|
||||
def _parse_partition_thresholds(self, plugin_name: str, partitions: Dict[str, Any]):
|
||||
"""Parse partition-specific thresholds for disk monitoring."""
|
||||
for partition, metrics in partitions.items():
|
||||
if not isinstance(metrics, dict):
|
||||
continue
|
||||
|
||||
for metric_name, threshold_config in metrics.items():
|
||||
if not isinstance(threshold_config, dict):
|
||||
continue
|
||||
|
||||
# Create metric path like "disk_monitor./dev/sda1.percent"
|
||||
metric_path = f"{plugin_name}.{partition}.{metric_name}"
|
||||
|
||||
warning = threshold_config.get("warning")
|
||||
critical = threshold_config.get("critical")
|
||||
operator = threshold_config.get("operator", ">")
|
||||
hysteresis = threshold_config.get("hysteresis", 0.1)
|
||||
enabled = threshold_config.get("enabled", True)
|
||||
|
||||
if warning is None and critical is None:
|
||||
continue
|
||||
|
||||
threshold = ThresholdConfig(
|
||||
metric_path=metric_path,
|
||||
warning=warning,
|
||||
critical=critical,
|
||||
operator=operator,
|
||||
hysteresis=hysteresis,
|
||||
enabled=enabled,
|
||||
)
|
||||
|
||||
self.thresholds[metric_path] = threshold
|
||||
|
||||
def check_plugin_data(
|
||||
self,
|
||||
host_name: str,
|
||||
plugin_name: str,
|
||||
data: Dict[str, Any],
|
||||
alert_states: Dict[str, AlertState],
|
||||
) -> list:
|
||||
"""
|
||||
Check plugin data against configured thresholds.
|
||||
|
||||
Args:
|
||||
host_name: Name of the host
|
||||
plugin_name: Name of the plugin
|
||||
data: Plugin data dictionary
|
||||
alert_states: Host's alert_states dictionary
|
||||
|
||||
Returns:
|
||||
List of (metric_path, old_level, new_level, value) tuples for state changes
|
||||
"""
|
||||
state_changes = []
|
||||
|
||||
# Check flat metrics
|
||||
for metric_name, value in data.items():
|
||||
metric_path = f"{plugin_name}.{metric_name}"
|
||||
|
||||
if metric_path not in self.thresholds:
|
||||
continue
|
||||
|
||||
threshold = self.thresholds[metric_path]
|
||||
|
||||
# Get or create alert state
|
||||
if metric_path not in alert_states:
|
||||
alert_states[metric_path] = AlertState(metric_path)
|
||||
|
||||
alert_state = alert_states[metric_path]
|
||||
|
||||
# Evaluate threshold with hysteresis
|
||||
new_level = threshold.evaluate_with_hysteresis(
|
||||
value,
|
||||
alert_state.level
|
||||
)
|
||||
|
||||
# Update state and check for changes
|
||||
old_level = alert_state.level
|
||||
if alert_state.update(new_level, value):
|
||||
state_changes.append((metric_path, old_level, new_level, value))
|
||||
self._trigger_notification(host_name, metric_path, old_level, new_level, value)
|
||||
elif new_level != AlertLevel.OK:
|
||||
# Check if we should re-notify
|
||||
self._check_renotify(host_name, alert_state, metric_path, value)
|
||||
|
||||
# Check nested metrics (e.g., partition data in disk_monitor)
|
||||
self._check_nested_metrics(
|
||||
host_name,
|
||||
plugin_name,
|
||||
data,
|
||||
alert_states,
|
||||
state_changes
|
||||
)
|
||||
|
||||
return state_changes
|
||||
|
||||
def _check_nested_metrics(
|
||||
self,
|
||||
host_name: str,
|
||||
plugin_name: str,
|
||||
data: Dict[str, Any],
|
||||
alert_states: Dict[str, AlertState],
|
||||
state_changes: list,
|
||||
):
|
||||
"""Check nested metrics like partition-specific thresholds."""
|
||||
# Look for partition data in disk_monitor
|
||||
if plugin_name == "disk_monitor" and "partitions" in data:
|
||||
partitions = data["partitions"]
|
||||
if not isinstance(partitions, dict):
|
||||
return
|
||||
|
||||
for partition, metrics in partitions.items():
|
||||
if not isinstance(metrics, dict):
|
||||
continue
|
||||
|
||||
for metric_name, value in metrics.items():
|
||||
metric_path = f"{plugin_name}.{partition}.{metric_name}"
|
||||
|
||||
if metric_path not in self.thresholds:
|
||||
continue
|
||||
|
||||
threshold = self.thresholds[metric_path]
|
||||
|
||||
if metric_path not in alert_states:
|
||||
alert_states[metric_path] = AlertState(metric_path)
|
||||
|
||||
alert_state = alert_states[metric_path]
|
||||
|
||||
new_level = threshold.evaluate_with_hysteresis(
|
||||
value,
|
||||
alert_state.level
|
||||
)
|
||||
|
||||
old_level = alert_state.level
|
||||
if alert_state.update(new_level, value):
|
||||
state_changes.append((metric_path, old_level, new_level, value))
|
||||
self._trigger_notification(
|
||||
host_name,
|
||||
metric_path,
|
||||
old_level,
|
||||
new_level,
|
||||
value
|
||||
)
|
||||
elif new_level != AlertLevel.OK:
|
||||
self._check_renotify(host_name, alert_state, metric_path, value)
|
||||
|
||||
def _trigger_notification(
|
||||
self,
|
||||
host_name: str,
|
||||
metric_path: str,
|
||||
old_level: AlertLevel,
|
||||
new_level: AlertLevel,
|
||||
value: Any,
|
||||
):
|
||||
"""Trigger a notification for an alert state change."""
|
||||
# Format message
|
||||
if new_level == AlertLevel.OK:
|
||||
message = f"RECOVERED: {host_name} - {metric_path} = {value} ({old_level.name} -> OK)"
|
||||
elif new_level == AlertLevel.WARNING:
|
||||
message = f"WARNING: {host_name} - {metric_path} = {value}"
|
||||
elif new_level == AlertLevel.CRITICAL:
|
||||
message = f"CRITICAL: {host_name} - {metric_path} = {value}"
|
||||
else:
|
||||
message = f"UNKNOWN: {host_name} - {metric_path} = {value}"
|
||||
|
||||
# Send notification
|
||||
if self.notification_callback is not None:
|
||||
try:
|
||||
self.notification_callback(message)
|
||||
logger.info("Notification sent: %s", message)
|
||||
except Exception as e:
|
||||
logger.error("Failed to send notification: %s", e)
|
||||
|
||||
# Log to journal
|
||||
if self.journal is not None:
|
||||
try:
|
||||
import asyncio
|
||||
loop = asyncio.get_event_loop()
|
||||
loop.create_task(self.journal.log_threshold_event(
|
||||
host_name=host_name,
|
||||
metric_path=metric_path,
|
||||
old_level=old_level.name,
|
||||
new_level=new_level.name,
|
||||
value=value,
|
||||
))
|
||||
except Exception as e:
|
||||
logger.debug(f"Failed to log threshold event to journal: {e}")
|
||||
|
||||
def _check_renotify(
|
||||
self,
|
||||
host_name: str,
|
||||
alert_state: AlertState,
|
||||
metric_path: str,
|
||||
value: Any,
|
||||
):
|
||||
"""Check if we should send a repeat notification."""
|
||||
if alert_state.level == AlertLevel.OK:
|
||||
return
|
||||
|
||||
now = time.time()
|
||||
|
||||
# Check if we should re-notify
|
||||
if alert_state.last_notification is None:
|
||||
# First notification already sent during state change
|
||||
alert_state.last_notification = now
|
||||
alert_state.notification_count = 1
|
||||
return
|
||||
|
||||
if (now - alert_state.last_notification) >= self.renotify_interval:
|
||||
# Time to re-notify
|
||||
message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (ongoing for {int(now - alert_state.since)}s)"
|
||||
|
||||
if self.notification_callback:
|
||||
try:
|
||||
self.notification_callback(message)
|
||||
alert_state.last_notification = now
|
||||
alert_state.notification_count += 1
|
||||
logger.info("Re-notification sent: %s", message)
|
||||
except Exception as e:
|
||||
logger.error("Failed to send re-notification: %s", e)
|
||||
|
||||
def get_active_alerts(self, alert_states: Dict[str, AlertState]) -> list:
|
||||
"""
|
||||
Get all currently active (non-OK) alerts.
|
||||
|
||||
Args:
|
||||
alert_states: Host's alert_states dictionary
|
||||
|
||||
Returns:
|
||||
List of AlertState objects that are not OK
|
||||
"""
|
||||
return [
|
||||
state for state in alert_states.values()
|
||||
if state.level != AlertLevel.OK
|
||||
]
|
||||
|
||||
def get_alert_summary(self, alert_states: Dict[str, AlertState]) -> Dict[str, int]:
|
||||
"""
|
||||
Get summary counts of alert levels.
|
||||
|
||||
Args:
|
||||
alert_states: Host's alert_states dictionary
|
||||
|
||||
Returns:
|
||||
Dictionary with counts: {"ok": N, "warning": N, "critical": N}
|
||||
"""
|
||||
summary = {"ok": 0, "warning": 0, "critical": 0, "unknown": 0}
|
||||
|
||||
for state in alert_states.values():
|
||||
if state.level == AlertLevel.OK:
|
||||
summary["ok"] += 1
|
||||
elif state.level == AlertLevel.WARNING:
|
||||
summary["warning"] += 1
|
||||
elif state.level == AlertLevel.CRITICAL:
|
||||
summary["critical"] += 1
|
||||
elif state.level == AlertLevel.UNKNOWN:
|
||||
summary["unknown"] += 1
|
||||
|
||||
return summary
|
||||
Reference in New Issue
Block a user