Major refactoring of the codebase, including restructuring of files and directories, renaming of modules and classes, and improvements to the overall organization and readability of the code. This refactoring aims to enhance maintainability, scalability, and clarity of the codebase while preserving existing functionality. The changes include:

- Restructuring of the project directory into client and server components
- Renaming of modules and classes to better reflect their purpose and functionality
- Moving common utilities and configurations to a shared location
- Updating import statements to reflect the new structure
- Adding new documentation files for better clarity on various aspects of the project
- Removing deprecated or unused code to streamline the codebase
- Ensuring that all existing functionality is preserved and that the codebase remains functional after the refactoring.
This commit is contained in:
Andreas Wrede
2026-03-29 11:13:40 -04:00
parent 7e2038ecac
commit 0543266c92
65 changed files with 11371 additions and 140 deletions
+9
View File
@@ -0,0 +1,9 @@
Plan
Heartbeat is a client/server-based network monitor and host observer. hbd, the server portion, receives heartbeat and state messages from clients and maintains the state and history of the information it receives.
hbc, the client portion, gathers information on various aspects of the
system it is running on and sends it to hbd. Initially this info is basic: OS make and version, hardware info (CPU type, memory and disks), filesystem info and some resource info. hbc/hbd support a plugin system to extend the info gathered and stored.
hbd can also send notifications based on missed hbc updates and on violations of pre-set limits for various state parameters.
+13 -7
View File
@@ -1,11 +1,17 @@
"""hbd package - scaffolding for heartbeat daemon
"""hbd package - heartbeat monitoring system
This package contains the refactored modules for the original monolithic
`hbd` script. The initial implementation contains small scaffolds so you can
start moving functionality into the package.
This package contains both the heartbeat client (hbc) and server (hbd) components,
organized into separate subpackages:
- hbd.client: Client component with system monitoring plugins
- hbd.server: Server/daemon component with web UI and notifications
- hbd.common: Shared utilities and protocol definitions
Install options:
- pip install hbd[client] # Client only
- pip install hbd[server] # Server only
- pip install hbd[all] # Both client and server
"""
__all__ = ["main", "__version__"]
__all__ = ["__version__"]
__version__ = "5.0.5"
from .cli import main
+3
View File
@@ -0,0 +1,3 @@
"""HeartBeat Client (hbc) - System monitoring client."""
__version__ = "5.0.5"
+54
View File
@@ -0,0 +1,54 @@
"""Configuration loader and defaults for hbc (HeartBeat Client)."""
import logging
import os
try:
import yaml
except Exception:
yaml = None
CLIENT_DEFAULTS = {
    # Network settings
    "hb_port": 50003,  # Port where hbd servers listen
    "interval": 10,  # Heartbeat interval in seconds
    # Runtime flags
    "foreground": False,
    "verbose": False,
    "debug": 0,
    # Plugin configuration
    "plugins": {},  # Per-plugin configuration
    "thresholds": {},  # Threshold configuration for monitoring
}


def load_config(path=None):
    """Load configuration from a YAML file and merge it over the client defaults.

    The defaults are returned unchanged when the file does not exist, when
    PyYAML is not installed, when the file is empty, or when its top level is
    not a mapping. Every top-level key found in the file overrides (or adds
    to) the defaults, so plugin sections and future keys pass through.

    Args:
        path: Path to the YAML config file (default: ~/.hb.yaml).

    Returns:
        A new dictionary with the merged configuration. It never shares
        mutable values with ``CLIENT_DEFAULTS``.
    """
    import copy

    # Deep copy: a shallow copy would hand out the very same "plugins" and
    # "thresholds" dicts on every call, so one caller's edits would leak into
    # the module-level defaults.
    cfg = copy.deepcopy(CLIENT_DEFAULTS)

    if not path:
        # default path (~/.hb.yaml)
        path = os.path.join(os.path.expanduser("~"), ".hb.yaml")

    if not os.path.exists(path):
        return cfg

    log = logging.getLogger(__name__)
    if not yaml:
        # yaml not installed: the file cannot be parsed, so say so instead of
        # silently ignoring the user's configuration.
        log.warning("PyYAML is not installed; ignoring config file %s", path)
        return cfg

    with open(path, encoding="utf-8") as fh:
        data = yaml.safe_load(fh)

    if data is None:
        # An empty (or comment-only) YAML document parses to None.
        return cfg
    if not isinstance(data, dict):
        log.warning("Ignoring config file %s: top level is not a mapping", path)
        return cfg

    # Keep all keys from YAML to support plugin configs and future extensions
    cfg.update(data)
    return cfg
+643
View File
@@ -0,0 +1,643 @@
#!/usr/bin/env python3
"""
HeartBeat Client (hbc) - Async version with plugin support.
Sends heartbeat messages to HeartBeat Daemon (hbd) servers and collects
system information via plugins.
"""
import argparse
import asyncio
import logging
import os
import signal
import socket
import sys
import time
from hashlib import md5
from pathlib import Path
from typing import Dict, List, Optional
# Import protocol and config
from .config import load_config
from ..common.proto import dicttos, stodict
# Import plugin system
from .plugin import PluginRegistry, PluginLoader, InfoPlugin, MonitorPlugin
# Constants
PORT = 50003  # Fallback UDP port, used when the config has no "hb_port" key
INTERVAL = 10  # Fallback heartbeat interval (seconds) when config has no "interval"
VER = 6  # Value sent as the "ver" field of every outgoing message
MAXRECV = 32767  # NOTE(review): presumably a max datagram size; not referenced in the visible code
# Global state
running = True  # Polled by the sender/collector loops; cleared by stop()
dorestart = False  # Set by handle_update(); main() re-executes the program when True
class AsyncConnection:
    """UDP endpoint for one heartbeat server address, plus simple statistics.

    Tracks how many messages were sent and acknowledged, when the last send
    and the last ACK happened, and a history of at most ten round-trip
    estimates in ``rtts``.
    """

    def __init__(self, conn_id: int, addr: str, port: int, af: int, name: str):
        # Who we are talking to and how we identify ourselves.
        self.conn_id, self.addr, self.port = conn_id, addr, port
        self.af, self.name = af, name

        # Traffic counters and timestamps.
        self.send_count = 0
        self.lastsend = 0.0
        self.ackcount = 0
        self.lastack = 0.0
        self.rtts = [0.0]

        # Filled in by open().
        self.transport: Optional[asyncio.DatagramTransport] = None
        self.protocol: Optional[asyncio.DatagramProtocol] = None

        self.logger = logging.getLogger(f"hbc.conn.{addr}")

    async def open(self) -> bool:
        """Create the datagram endpoint for this connection's address family.

        Returns:
            True if the endpoint was created, False if any error occurred
            (the error is logged).
        """
        try:
            endpoint = await asyncio.get_event_loop().create_datagram_endpoint(
                lambda: HeartbeatProtocol(self),
                family=self.af,
            )
            self.transport, self.protocol = endpoint
            self.logger.debug(f"Opened connection to {self.addr}:{self.port}")
        except Exception as e:
            self.logger.error(f"Failed to open connection: {e}")
            return False
        return True

    def close(self):
        """Close the transport, if one is open, and forget it."""
        if not self.transport:
            return
        self.transport.close()
        self.transport = None
        self.protocol = None

    async def sendto(self, msg: dict, msg_id: str = "HTB"):
        """Stamp, encode and send one message to the server.

        The transport is opened on demand. The ``name``, ``id``, ``ver`` and
        ``time`` fields are written into ``msg`` itself before encoding.

        Args:
            msg: Message dictionary (modified in place)
            msg_id: Message ID (HTB, PLG, etc.)
        """
        if not self.transport:
            await self.open()
        transport = self.transport
        if not transport:
            self.logger.error("Cannot send - no transport")
            return

        # Standard fields carried by every message.
        msg.update(
            name=shortname(self.name),
            id=self.conn_id,
            ver=VER,
            time=time.time(),
        )

        payload = dicttos(msg_id, msg, compress=True)
        transport.sendto(payload, (self.addr, self.port))

        self.send_count += 1
        self.lastsend = time.time()
        self.logger.debug(f"Sent {msg_id} message ({len(payload)} bytes)")

    def handle_ack(self, msg: dict, now: float):
        """Record an ACK and append a round-trip estimate to ``rtts``.

        Args:
            msg: Decoded ACK message; its "time" field is used when present
            now: Local receive time, used when "time" is missing or unusable
        """
        # NOTE(review): presumably the 2000.0 factor doubles a one-way delay
        # (timestamp in the ACK minus local send time) and converts to ms,
        # while the fallback uses the locally measured round trip - confirm.
        try:
            self.lastack = msg.get("time", now)
            rtt = (self.lastack - self.lastsend) * 2000.0
        except Exception:
            self.lastack = now
            rtt = (self.lastack - self.lastsend) * 1000.0

        history = self.rtts
        history.append(rtt)
        if len(history) > 10:
            del history[0]

        self.ackcount += 1
        self.logger.debug(f"ACK received, RTT: {rtt:.1f}ms")
class HeartbeatProtocol(asyncio.DatagramProtocol):
    """Protocol handler for datagrams arriving on one connection's socket.

    ACK messages are processed inline. CMD and UPD messages are handed to
    background tasks, which this object keeps referenced until they finish.
    """

    def __init__(self, connection: AsyncConnection):
        self.connection = connection
        self.logger = logging.getLogger("hbc.protocol")
        # The event loop keeps only weak references to tasks; hold strong
        # ones here so a handler cannot be garbage-collected mid-execution.
        self._tasks = set()

    def _spawn(self, coro):
        """Run *coro* as a background task and keep it alive until done."""
        task = asyncio.create_task(coro)
        self._tasks.add(task)
        task.add_done_callback(self._task_done)
        return task

    def _task_done(self, task):
        """Drop a finished task and log any exception it raised."""
        self._tasks.discard(task)
        if task.cancelled():
            return
        exc = task.exception()
        if exc is not None:
            self.logger.error(f"Background handler failed: {exc}", exc_info=exc)

    def datagram_received(self, data: bytes, addr):
        """Decode one incoming datagram and dispatch it on its "ID" field.

        Args:
            data: Raw datagram payload
            addr: Sender address as reported by the transport
        """
        # NOTE(review): ``addr`` is only used for logging; the sender is not
        # checked against the server this connection talks to, yet CMD/UPD
        # trigger command execution and self-update. Consider validating the
        # source and authenticating messages.
        try:
            msg = stodict(data)
            if not msg:
                self.logger.warning(f"Failed to parse message from {addr}")
                return

            now = time.time()
            msg_id = msg.get("ID")

            if msg_id == "ACK":
                self.connection.handle_ack(msg, now)
            elif msg_id == "CMD":
                # Command from server
                self._spawn(handle_command(self.connection, msg))
            elif msg_id == "UPD":
                # Update from server
                self._spawn(handle_update(self.connection, msg))
            else:
                self.logger.warning(f"Unknown message type: {msg_id}")
        except Exception as e:
            self.logger.error(f"Error processing datagram: {e}", exc_info=True)

    def error_received(self, exc):
        """Log an error reported by the transport."""
        self.logger.error(f"Protocol error: {exc}")
async def handle_command(conn: AsyncConnection, msg: dict):
    """Run the shell command carried in ``msg["cmd"]`` and report the outcome.

    The command runs in a worker thread so that the 30-second subprocess
    timeout cannot stall the event loop (and with it every heartbeat).

    Args:
        conn: Connection on which the response is sent
        msg: Decoded CMD message; nothing happens if "cmd" is missing or empty

    The response has ``service`` set to "command" and ``msg`` set to
    "<status> <result>", where status is OK, CalledProcessError, Timeout or
    Error.
    """
    import functools
    import subprocess

    cmd = msg.get("cmd", "")
    if not cmd:
        return

    logger = logging.getLogger("hbc.command")
    logger.info(f"Executing command: {cmd}")

    # SECURITY(review): ``cmd`` comes straight from a UDP datagram and is run
    # through the shell. Nothing in this function authenticates the sender;
    # confirm that an upstream check exists or add one.
    run = functools.partial(
        subprocess.check_output,
        cmd,
        shell=True,
        stderr=subprocess.STDOUT,
        timeout=30,
    )

    try:
        loop = asyncio.get_running_loop()
        output = await loop.run_in_executor(None, run)
        # Command output is arbitrary bytes; do not let undecodable output
        # turn a successful run into an error.
        result = output.decode(errors="replace")
        status = "OK"
    except subprocess.CalledProcessError as e:
        result = str(e)
        status = "CalledProcessError"
    except subprocess.TimeoutExpired:
        result = "Command timed out"
        status = "Timeout"
    except Exception as e:
        result = str(e)
        status = "Error"

    # Send response
    response = {
        "service": "command",
        "msg": f"{status} {result}"
    }
    await conn.sendto(response)
async def handle_update(conn: AsyncConnection, msg: dict):
    """Replace the running script with code received in an UPD message.

    Steps: decode ``msg["code"]`` from base64, compare its MD5 hex digest with
    ``msg["csum"]``, copy ``sys.argv[0]`` to ``<argv0>.sav``, overwrite
    ``sys.argv[0]`` with the new code, then set ``dorestart`` and call
    ``stop()``. Every failure is logged and reported to the server as an
    "update" service message; success is reported as "OK".

    Args:
        conn: Connection used for status replies
        msg: Decoded UPD message with "code" and "csum" entries
    """
    # SECURITY(review): the MD5 comparison only detects corruption; it does
    # not authenticate the sender. Confirm the update source is verified
    # elsewhere before trusting this path.
    global dorestart

    import codecs
    import shutil

    logger = logging.getLogger("hbc.update")

    async def report_failure(error):
        # Log the problem locally and tell the server about it.
        logger.error(error)
        await conn.sendto({"service": "update", "msg": error})

    try:
        code = codecs.decode(msg["code"], "base64").decode()
        csum = msg["csum"]
    except Exception as e:
        await report_failure(f"Missing code/csum: {e}")
        return

    # Verify checksum
    if md5(code.encode()).hexdigest() != csum:
        await report_failure("Checksum mismatch")
        return

    # Backup current file
    fn = sys.argv[0]
    ofn = f"{fn}.sav"
    try:
        shutil.copy2(fn, ofn)
    except Exception as e:
        await report_failure(f"Backup failed: {e}")
        return

    # Write new code
    try:
        with open(fn, "w") as fh:
            fh.write(code)
    except Exception as e:
        await report_failure(f"Write failed: {e}")
        return

    logger.info("Update successful, restart required")
    await conn.sendto({"service": "update", "msg": "OK"})

    # Trigger restart
    dorestart = True
    stop()
async def heartbeat_sender(conn: AsyncConnection, interval: int):
    """Emit an HTB message on ``conn`` every ``interval`` seconds.

    Each heartbeat carries the connection's ACK count, its most recent RTT
    value and the interval itself. Send errors are logged and do not end the
    loop, which runs for as long as the module-level ``running`` flag is true.

    Args:
        conn: Connection to send on
        interval: Heartbeat interval in seconds
    """
    logger = logging.getLogger("hbc.heartbeat")

    while running:
        try:
            await conn.sendto(
                dict(acks=conn.ackcount, rtt=conn.rtts[-1], interval=interval),
                "HTB",
            )
        except Exception as exc:
            logger.error(f"Error sending heartbeat: {exc}", exc_info=True)

        # Pause until the next beat is due.
        await asyncio.sleep(interval)
async def plugin_collector(conn: AsyncConnection, registry: PluginRegistry):
    """Collect and send plugin data for all enabled plugins.

    InfoPlugins are collected once, immediately. MonitorPlugins are grouped
    by their ``interval`` and each group is driven by its own
    ``plugin_collector_interval`` task; this coroutine then waits for those
    tasks. Plugins whose ``enabled`` attribute is false are skipped.

    Args:
        conn: Connection to send on
        registry: Plugin registry
    """
    from collections import defaultdict

    logger = logging.getLogger("hbc.plugins")

    # Collect InfoPlugins once at startup
    for plugin in registry.get_by_type(InfoPlugin):
        if not plugin.enabled:
            logger.debug(f"Skipping disabled plugin {plugin.name}")
            continue
        try:
            data = await plugin.collect()
            if data:
                # Create PLG message with plugin name
                plugin_msg = {"plugin": plugin.name, **data}
                await conn.sendto(plugin_msg, "PLG")
                logger.info(f"Sent {plugin.name} data")
        except Exception as e:
            logger.error(f"Error collecting {plugin.name}: {e}", exc_info=True)

    # Schedule MonitorPlugins, grouped by interval so that plugins sharing an
    # interval are served by a single task.
    by_interval = defaultdict(list)
    for plugin in registry.get_by_type(MonitorPlugin):
        if not plugin.enabled:
            logger.debug(f"Skipping disabled plugin {plugin.name}")
            continue
        by_interval[plugin.interval].append(plugin)

    tasks = [
        asyncio.create_task(plugin_collector_interval(conn, plugins, interval))
        for interval, plugins in by_interval.items()
    ]

    # Wait for all tasks
    if tasks:
        await asyncio.gather(*tasks)
async def plugin_collector_interval(
    conn: AsyncConnection,
    plugins: List,
    interval: int
):
    """Poll a group of plugins that share one collection interval.

    While the module-level ``running`` flag is true, every plugin is
    collected in turn, non-empty results are sent as PLG messages, and the
    coroutine then sleeps for ``interval`` seconds. A failing plugin is
    logged and does not affect the others.

    Args:
        conn: Connection to send on
        plugins: List of plugins to collect
        interval: Collection interval in seconds
    """
    logger = logging.getLogger(f"hbc.plugins.{interval}s")

    async def collect_and_send(plugin):
        # One plugin, one PLG message; errors stay contained here.
        try:
            data = await plugin.collect()
            if not data:
                return
            plugin_msg = {"plugin": plugin.name, **data}
            await conn.sendto(plugin_msg, "PLG")
            logger.debug(f"Sent {plugin.name} data")
        except Exception as e:
            logger.error(
                f"Error collecting {plugin.name}: {e}",
                exc_info=True
            )

    while running:
        for plugin in plugins:
            await collect_and_send(plugin)
        await asyncio.sleep(interval)
def shortname(name: str) -> str:
    """Return the part of ``name`` before its first dot (the whole name if none)."""
    head, _sep, _rest = name.partition(".")
    return head
def stop():
    """Request shutdown by clearing the module-level ``running`` flag.

    This does not stop the event loop itself: it only sets ``running`` to
    False, and the loops that test the flag end the next time they check it.
    """
    global running
    running = False
async def cleanup(connections: List[AsyncConnection]):
    """Send a shutdown notice on every connection, then close it.

    Each connection gets a message with ``shutdown`` set to 1 and its current
    ACK count. A failed send is logged and the connection is closed anyway.
    After the last connection a half-second pause is taken.

    Args:
        connections: Connections to notify and close
    """
    log = logging.getLogger("hbc.cleanup")
    log.info("Cleaning up connections")

    for connection in connections:
        try:
            await connection.sendto({"shutdown": 1, "acks": connection.ackcount})
        except Exception as exc:
            log.error(f"Error sending shutdown: {exc}")
        connection.close()

    # Give messages time to send
    await asyncio.sleep(0.5)
async def async_main(args, config):
    """Resolve servers, start the heartbeat/plugin tasks and run until stopped.

    Args:
        args: Parsed command line (uses name, hosts, boot, message, daemon)
        config: Configuration dict; "hb_port" and "interval" are read here and
            the whole dict is passed to the plugin loader

    Returns:
        0 on normal shutdown (including message-only mode), 1 if no
        connection could be opened.

    Raises:
        Exception: whatever a worker task raised, re-raised after cleanup.
    """
    logger = logging.getLogger("hbc.main")

    # Setup
    iam = args.name or socket.gethostname()
    hb_hosts = args.hosts
    hb_port = config.get("hb_port", PORT)
    interval = config.get("interval", INTERVAL)

    logger.info(f"Starting hbc for {iam} -> {hb_hosts}")
    logger.info(f"Port: {hb_port}, Interval: {interval}s")

    # Create connections: one per resolved address of every host
    connections = []
    conn_id = 1
    for host in hb_hosts:
        try:
            addrs = socket.getaddrinfo(host, hb_port, 0, 0, socket.SOL_UDP)
        except socket.gaierror as e:
            logger.error(f"Cannot resolve {host}: {e}")
            continue
        for addr_info in addrs:
            af = addr_info[0]
            addr = addr_info[4][0]
            conn = AsyncConnection(conn_id, addr, hb_port, af, iam)
            if await conn.open():
                connections.append(conn)
                conn_id += 1

    if not connections:
        logger.error("No connections established")
        return 1
    logger.info(f"Created {len(connections)} connections")

    # Send boot/message if requested
    if args.boot or args.message:
        boot_msg = {}
        if args.boot:
            boot_msg["boot"] = 1
        if args.message:
            boot_msg["service"] = "service"
            boot_msg["msg"] = args.message
        boot_msg["acks"] = 0
        for conn in connections:
            await conn.sendto(boot_msg)
        if args.message and not args.daemon:
            # Message-only mode
            await cleanup(connections)
            return 0

    # Load plugins
    registry = PluginRegistry()
    loader = PluginLoader(registry)
    plugin_dir = Path(__file__).parent / "plugins"
    if plugin_dir.exists():
        count = await loader.load_from_directory(plugin_dir, config)
        logger.info(f"Loaded {count} plugins")
    else:
        logger.warning(f"Plugin directory not found: {plugin_dir}")

    # Start async tasks
    tasks = []
    # Heartbeat senders (one per connection)
    for conn in connections:
        tasks.append(asyncio.create_task(heartbeat_sender(conn, interval)))
    # Plugin collector (uses all connections, but we'll use first one)
    if connections and registry.get_enabled():
        tasks.append(
            asyncio.create_task(plugin_collector(connections[0], registry))
        )

    # Setup signal handlers
    loop = asyncio.get_running_loop()
    for sig in (signal.SIGTERM, signal.SIGINT):
        loop.add_signal_handler(sig, stop)

    async def _until_stopped():
        # stop() only clears ``running``; poll it here so that both callers
        # (signal handler and handle_update) lead to a prompt shutdown.
        while running:
            await asyncio.sleep(0.5)

    # Run until either every worker has finished/failed or a stop was
    # requested. Without the watcher, shutdown would wait for each worker to
    # wake from its own sleep, which can be minutes for slow plugin intervals.
    workers = asyncio.gather(*tasks)
    stopper = asyncio.create_task(_until_stopped())
    try:
        await asyncio.wait(
            [workers, stopper], return_when=asyncio.FIRST_COMPLETED
        )
    except asyncio.CancelledError:
        pass

    # Tear down whatever is still running and collect a worker failure, if any.
    stopper.cancel()
    for task in tasks:
        task.cancel()  # no effect on tasks that already finished
    failure = None
    try:
        await workers
    except asyncio.CancelledError:
        pass
    except Exception as e:
        failure = e

    # Cleanup
    await cleanup(connections)
    await loader.unload_all()

    if failure is not None:
        raise failure
    return 0
def daemonize(
    working_dir="/",
    stdin="/dev/zero",
    stdout="/dev/null",
    stderr="/dev/null"
):
    """Detach the process from its terminal using the UNIX double fork.

    Args:
        working_dir: Directory to change into after the first fork
        stdin: Path opened for reading and duplicated onto standard input
        stdout: Path opened in append mode and duplicated onto standard output
        stderr: Path opened in append mode and duplicated onto standard error

    Only the grandchild process returns from this function; both intermediate
    parents exit via ``os._exit(0)``.
    """
    # First fork: the original process exits, the child carries on.
    try:
        pid = os.fork()
        if pid > 0:
            os._exit(0)
    except OSError as e:
        sys.stderr.write(f"fork #1 failed: {e}\n")
        os._exit(1)
    # Leave the starting directory, start a new session, and clear the umask.
    os.chdir(working_dir)
    os.setsid()
    os.umask(0)
    # Second fork: the session leader exits, the grandchild carries on.
    try:
        pid = os.fork()
        if pid > 0:
            os._exit(0)
    except OSError as e:
        sys.stderr.write(f"fork #2 failed: {e}\n")
        # NOTE(review): this path uses sys.exit() while the first fork's
        # failure path uses os._exit() - confirm the difference is intended.
        sys.exit(1)
    # Flush pending output before the standard descriptors are replaced.
    sys.stdout.flush()
    sys.stderr.flush()
    si = open(stdin, "r")
    so = open(stdout, "a+")
    se = open(stderr, "a+")
    # Point the descriptors behind sys.stdin/stdout/stderr at the new files.
    os.dup2(si.fileno(), sys.stdin.fileno())
    os.dup2(so.fileno(), sys.stdout.fileno())
    os.dup2(se.fileno(), sys.stderr.fileno())
def build_parser():
    """Create the ``hbc`` command-line parser.

    Options: -b/--boot, -c/--config (stored as ``configfile``), -m/--message,
    -n/--name, -d/--daemon, -v/--verbose, -x/--debug (counted), followed by
    one or more positional ``hosts``.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(
        prog="hbc",
        description="HeartBeatClient - send heartbeat messages to HeartBeatDaemon",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # (flags, keyword arguments) in the order they appear in --help.
    argument_table = (
        (("-b", "--boot"),
         {"action": "store_true", "help": "Send a boot message"}),
        (("-c", "--config"),
         {"dest": "configfile", "help": "Config file path (YAML)"}),
        (("-m", "--message"),
         {"dest": "message", "help": "Send a message"}),
        (("-n", "--name"),
         {"dest": "name", "help": "Name to use in heartbeat message"}),
        (("-d", "--daemon"),
         {"action": "store_true", "help": "Run in daemon mode"}),
        (("-v", "--verbose"),
         {"action": "store_true", "help": "Verbose output"}),
        (("-x", "--debug"),
         {"action": "count", "default": 0, "help": "Increase debug level"}),
        (("hosts",),
         {"nargs": "+", "help": "Heartbeat daemon hosts to send to"}),
    )
    for flags, options in argument_table:
        parser.add_argument(*flags, **options)

    return parser
def _log_to_syslog(log_level):
    """Replace the root logger's handlers with one that writes to syslog.

    Needed after daemonizing: the handler installed by ``basicConfig`` writes
    to stderr, which the daemon has redirected, and a second ``basicConfig``
    call does nothing once the root logger has a handler.
    """
    import syslog

    # Highest matching logging level wins; anything lower maps to LOG_DEBUG.
    priorities = (
        (logging.CRITICAL, syslog.LOG_CRIT),
        (logging.ERROR, syslog.LOG_ERR),
        (logging.WARNING, syslog.LOG_WARNING),
        (logging.INFO, syslog.LOG_INFO),
    )

    class _SyslogHandler(logging.Handler):
        def emit(self, record):
            try:
                priority = next(
                    (prio for level, prio in priorities if record.levelno >= level),
                    syslog.LOG_DEBUG,
                )
                syslog.syslog(priority, self.format(record))
            except Exception:
                self.handleError(record)

    handler = _SyslogHandler()
    # No "hbc[pid]:" prefix here: openlog("hbc", LOG_PID) already adds it.
    handler.setFormatter(
        logging.Formatter("%(name)s %(levelname)s: %(message)s")
    )

    root = logging.getLogger()
    for old in list(root.handlers):
        root.removeHandler(old)
    root.addHandler(handler)
    root.setLevel(log_level)


def main(argv=None):
    """Command-line entry point for hbc.

    Parses arguments, loads the configuration, sets up logging, optionally
    daemonizes, runs ``async_main`` and finally either re-executes the
    program (when an update requested a restart) or exits with the result
    code.

    Args:
        argv: Argument list to parse (default: taken from ``sys.argv``)
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    # Load config
    config = load_config(args.configfile)

    # Setup logging: both -v and -x select DEBUG
    log_level = logging.DEBUG if (args.verbose or args.debug) else logging.INFO
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s %(name)s %(levelname)s: %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S"
    )

    # Daemonize if requested
    if args.daemon:
        print("Daemonizing...")
        import syslog
        syslog.openlog("hbc", syslog.LOG_PID, syslog.LOG_DAEMON)
        syslog.syslog(syslog.LOG_INFO, f"Starting heartbeat to {', '.join(args.hosts)}")
        daemonize()
        # Standard error is gone now; send log records to syslog instead.
        _log_to_syslog(log_level)

    # Run async main
    try:
        exit_code = asyncio.run(async_main(args, config))
    except KeyboardInterrupt:
        logging.info("Interrupted by user")
        exit_code = 0
    except Exception as e:
        logging.error(f"Fatal error: {e}", exc_info=True)
        exit_code = 1

    # Handle restart
    if dorestart:
        logging.info("Restarting...")
        # NOTE(review): after daemonize() the working directory has changed,
        # so a relative sys.argv[0] may no longer resolve - confirm.
        os.execv(sys.argv[0], sys.argv)

    sys.exit(exit_code)
if __name__ == "__main__":
main()
+410
View File
@@ -0,0 +1,410 @@
"""Plugin system for extending Heartbeat data collection and monitoring.
This module provides the base classes and infrastructure for the plugin system
that enables extending hbc (client) data collection and hbd (server) processing.
Plugin Types:
- InfoPlugin: Collects static or rarely-changing information (OS, hardware)
- MonitorPlugin: Collects periodic monitoring data (CPU, memory, disk usage)
Plugins run on the client (hbc) to gather data, which is then sent to the server
(hbd) for storage, threshold checking, and display.
"""
import importlib.util
import inspect
import logging
import sys
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Dict, List, Optional, Type
class Plugin(ABC):
    """Abstract root of the plugin hierarchy.

    Subclasses describe themselves through class attributes and implement
    ``initialize()`` and ``collect()``.

    Attributes:
        name: Unique plugin identifier (e.g., "os_info", "cpu_monitor")
        version: Plugin version string
        description: Human-readable description
        interval: Collection interval in seconds (0 means collect once)
        enabled: Whether the plugin is active
    """

    name: str = ""
    version: str = "1.0.0"
    description: str = ""
    interval: int = 0
    enabled: bool = True

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Store the plugin's configuration and create its logger.

        Args:
            config: Plugin-specific settings; an empty dict is used when
                nothing (or an empty mapping) is given
        """
        self.config = config if config else {}
        self.logger = logging.getLogger(f"plugin.{self.name}")
        self._initialized = False

    @abstractmethod
    async def initialize(self) -> bool:
        """Prepare the plugin for use (check dependencies, load resources).

        Returns:
            True if the plugin is ready, False if it cannot be used
        """

    @abstractmethod
    async def collect(self) -> Dict[str, Any]:
        """Gather one set of data from the system.

        Returns:
            Dictionary mapping metric names (strings) to values, or an empty
            dict on error
        """

    async def cleanup(self) -> None:
        """Release resources before shutdown; the base implementation does nothing."""
        return None

    def validate_data(self, data: Dict[str, Any]) -> bool:
        """Check data returned from ``collect()``.

        The base implementation only checks that ``data`` is a dict;
        subclasses may override it with stricter rules.

        Args:
            data: Data returned from collect()

        Returns:
            True if data is valid, False otherwise
        """
        return isinstance(data, dict)
class InfoPlugin(Plugin):
    """Plugin for information that is static or changes rarely.

    Typical examples are the OS name and version, hardware specifications or
    network interface MAC addresses.

    The first ``collect()`` call delegates to ``_collect_info()`` and caches
    the result; later calls return the cached value until
    ``invalidate_cache()`` is called. Subclasses implement
    ``_collect_info()`` rather than ``collect()``.
    """

    interval: int = 0  # Collect once at startup

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        super().__init__(config)
        # None means "nothing collected yet".
        self._cached_data: Optional[Dict[str, Any]] = None

    async def get_cached_data(self) -> Optional[Dict[str, Any]]:
        """Return the cached data without triggering a collection.

        Returns:
            Cached data dict, or None if nothing has been collected yet
        """
        return self._cached_data

    async def collect(self) -> Dict[str, Any]:
        """Return the cached information, collecting it first if necessary."""
        cached = self._cached_data
        if cached is None:
            cached = self._cached_data = await self._collect_info()
        return cached

    @abstractmethod
    async def _collect_info(self) -> Dict[str, Any]:
        """Perform the actual data collection (implemented by subclasses)."""

    def invalidate_cache(self) -> None:
        """Discard the cached data so the next ``collect()`` gathers it again."""
        self._cached_data = None
class MonitorPlugin(Plugin):
    """Plugin for metrics that are sampled repeatedly.

    Typical examples are CPU usage, memory consumption, disk I/O statistics
    or network traffic.

    ``collect()`` delegates to ``_collect_metrics()``, stamps a non-empty
    result with the collection time under ``_timestamp`` and remembers it as
    the last reading. Subclasses implement ``_collect_metrics()`` rather than
    ``collect()``.
    """

    interval: int = 30  # Default: collect every 30 seconds

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        super().__init__(config)
        # Most recent non-empty result of collect().
        self._last_reading: Optional[Dict[str, Any]] = None

    def get_last_reading(self) -> Optional[Dict[str, Any]]:
        """Return the most recent reading.

        Returns:
            Last reading dict (including its timestamp), or None if nothing
            has been collected yet
        """
        return self._last_reading

    async def collect(self) -> Dict[str, Any]:
        """Collect metrics, timestamp them and keep them as the last reading."""
        data = await self._collect_metrics()
        if not data:
            # Nothing collected: return it untouched, keep the old reading.
            return data

        import time
        data["_timestamp"] = time.time()
        self._last_reading = data
        return data

    @abstractmethod
    async def _collect_metrics(self) -> Dict[str, Any]:
        """Perform the actual metric collection (implemented by subclasses)."""
class PluginRegistry:
    """Name-indexed collection of loaded plugin instances.

    Plugins are stored under their ``name`` and can be looked up by name or
    filtered by enabled state, type or collection interval.
    """

    def __init__(self):
        self._plugins: Dict[str, Plugin] = {}
        self.logger = logging.getLogger("plugin.registry")

    def _select(self, predicate) -> List[Plugin]:
        """Return the registered plugins for which *predicate* is truthy."""
        return [plugin for plugin in self._plugins.values() if predicate(plugin)]

    def register(self, plugin: Plugin) -> bool:
        """Add a plugin instance under its name.

        Args:
            plugin: Plugin instance to register

        Returns:
            True if registered, False if the name is already taken
        """
        key = plugin.name
        if key in self._plugins:
            self.logger.error(f"Plugin '{plugin.name}' already registered")
            return False
        self._plugins[key] = plugin
        self.logger.info(f"Registered plugin: {plugin.name} v{plugin.version}")
        return True

    def unregister(self, name: str) -> bool:
        """Remove the plugin registered under ``name``.

        Args:
            name: Plugin name to unregister

        Returns:
            True if a plugin was removed, False if the name was unknown
        """
        if name not in self._plugins:
            return False
        del self._plugins[name]
        self.logger.info(f"Unregistered plugin: {name}")
        return True

    def get(self, name: str) -> Optional[Plugin]:
        """Look up a plugin by name.

        Args:
            name: Plugin name

        Returns:
            Plugin instance or None if not found
        """
        return self._plugins.get(name)

    def get_all(self) -> List[Plugin]:
        """Return every registered plugin as a new list."""
        return [*self._plugins.values()]

    def get_enabled(self) -> List[Plugin]:
        """Return the plugins whose ``enabled`` attribute is truthy."""
        return self._select(lambda plugin: plugin.enabled)

    def get_by_type(self, plugin_type: Type[Plugin]) -> List[Plugin]:
        """Return the plugins that are instances of ``plugin_type``.

        Args:
            plugin_type: Plugin class (InfoPlugin or MonitorPlugin)

        Returns:
            List of plugins matching the type
        """
        return self._select(lambda plugin: isinstance(plugin, plugin_type))

    def get_by_interval(self, interval: int) -> List[Plugin]:
        """Return the plugins whose ``interval`` equals the given value.

        Args:
            interval: Interval in seconds (0 for one-time collection)

        Returns:
            List of plugins with matching interval
        """
        return self._select(lambda plugin: plugin.interval == interval)
class PluginLoader:
    """Load plugins from the filesystem and register them.

    Scans a directory for Python modules, imports them, instantiates and
    initializes the concrete Plugin subclasses they define, and registers the
    instances with the PluginRegistry.
    """

    def __init__(self, registry: PluginRegistry):
        self.registry = registry
        self.logger = logging.getLogger("plugin.loader")
        # module name -> module object, for removal in unload_all()
        self._loaded_modules: Dict[str, Any] = {}

    async def load_from_directory(
        self,
        directory: Path,
        config: Optional[Dict[str, Any]] = None
    ) -> int:
        """Load all plugins from a directory.

        Files are processed in sorted order so that the result does not
        depend on directory listing order. A problem with one file or one
        plugin class is logged and does not stop the others from loading.

        Args:
            directory: Path to plugin directory
            config: Configuration dict. A plugin's settings are taken from
                ``config["plugins"][<plugin name>]`` when that is a dict,
                otherwise from ``config[<plugin name>]``.

        Returns:
            Number of plugins successfully loaded
        """
        if not directory.exists() or not directory.is_dir():
            self.logger.warning(f"Plugin directory not found: {directory}")
            return 0

        loaded_count = 0
        plugin_config = config or {}

        for plugin_file in sorted(directory.glob("*.py")):
            if plugin_file.name.startswith("_"):
                continue  # Skip __init__.py and private modules

            self.logger.debug(f"Processing plugin file: {plugin_file.name}")
            try:
                module = self._import_module(plugin_file)
                if module is None:
                    continue
                for plugin_class in self._plugin_classes(module):
                    if await self._load_plugin(plugin_class, plugin_config):
                        loaded_count += 1
            except Exception as e:
                self.logger.error(
                    f"Error loading plugin from {plugin_file}: {e}",
                    exc_info=True
                )

        return loaded_count

    def _import_module(self, plugin_file: Path):
        """Import one plugin file as ``plugins.<stem>``; return None if no spec."""
        module_name = f"plugins.{plugin_file.stem}"
        spec = importlib.util.spec_from_file_location(module_name, plugin_file)
        if not spec or not spec.loader:
            self.logger.warning(f"Could not create spec for {plugin_file}")
            return None

        module = importlib.util.module_from_spec(spec)
        sys.modules[module_name] = module
        try:
            spec.loader.exec_module(module)
        except BaseException:
            # Do not leave a half-initialised module importable.
            sys.modules.pop(module_name, None)
            raise

        self._loaded_modules[module_name] = module
        self.logger.debug(f"Loaded module: {module_name}")
        return module

    def _plugin_classes(self, module) -> List[Type[Plugin]]:
        """Return the concrete Plugin subclasses defined in *module* itself."""
        found = []
        seen = set()
        for name, obj in inspect.getmembers(module, inspect.isclass):
            if obj in (Plugin, InfoPlugin, MonitorPlugin):
                self.logger.debug(f"Skipping base class: {name}")
                continue
            if not issubclass(obj, Plugin):
                self.logger.debug(f"Skipping non-Plugin class: {name}")
                continue
            if obj in seen:
                # Same class bound to a second module-level name.
                self.logger.debug(f"Skipping duplicate reference to: {obj.__name__}")
                continue
            seen.add(obj)
            if obj.__module__ != module.__name__:
                # Imported from elsewhere (e.g. as a base class); the module
                # that defines it is responsible for loading it.
                self.logger.debug(f"Skipping imported class: {name}")
                continue
            if inspect.isabstract(obj):
                self.logger.debug(f"Skipping abstract class: {name}")
                continue
            self.logger.debug(f"Found plugin class: {name}")
            found.append(obj)
        return found

    @staticmethod
    def _config_for(name: str, config: Dict[str, Any]) -> Any:
        """Pick the configuration section belonging to plugin *name*."""
        section = config.get("plugins")
        if isinstance(section, dict) and isinstance(section.get(name), dict):
            return section[name]
        # Fall back to a top-level key named after the plugin.
        return config.get(name, {})

    async def _load_plugin(
        self,
        plugin_class: Type[Plugin],
        config: Dict[str, Any]
    ) -> bool:
        """Instantiate, initialize and register one plugin class."""
        try:
            plugin = plugin_class(config=self._config_for(plugin_class.name, config))
        except Exception as e:
            self.logger.error(
                f"Error instantiating plugin {plugin_class.__name__}: {e}",
                exc_info=True
            )
            return False

        try:
            initialized = await plugin.initialize()
        except Exception as e:
            self.logger.error(
                f"Error initializing plugin {plugin.name}: {e}",
                exc_info=True
            )
            return False
        if not initialized:
            self.logger.warning(
                f"Plugin {plugin.name} failed initialization, skipping"
            )
            return False

        if not self.registry.register(plugin):
            return False
        self.logger.info(
            f"Loaded plugin: {plugin.name} v{plugin.version} "
            f"(interval: {plugin.interval}s)"
        )
        return True

    async def unload_all(self) -> None:
        """Unload all plugins and cleanup resources."""
        for plugin in self.registry.get_all():
            try:
                await plugin.cleanup()
            except Exception as e:
                self.logger.error(
                    f"Error cleaning up plugin {plugin.name}: {e}",
                    exc_info=True
                )
            self.registry.unregister(plugin.name)

        # Remove loaded modules
        for module_name in self._loaded_modules:
            sys.modules.pop(module_name, None)
        self._loaded_modules.clear()
+129
View File
@@ -0,0 +1,129 @@
"""CPU Monitoring Plugin for Heartbeat.
Collects CPU usage statistics including overall CPU percentage, per-core usage,
load average, and process counts.
"""
from typing import Any, Dict, Optional
import sys
from pathlib import Path
# Import from parent package
from hbd.client.plugin import MonitorPlugin
class CPUMonitorPlugin(MonitorPlugin):
    """Monitor CPU usage and load.

    Collects:
    - Overall CPU usage percentage
    - Per-core CPU usage (if enabled in config)
    - Load average (1min, 5min, 15min)
    - Process count
    - CPU frequency (if available)
    - CPU time percentages (user, system, idle, iowait where present)

    Configuration keys:
        per_core: Include per-core usage (default: False)
        interval: Collection interval in seconds (default: 300)
    """

    name = "cpu_monitor"
    version = "1.0.0"
    description = "CPU usage and load monitoring"
    interval = 300  # MonitorPlugin: collect every 5 minutes by default

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        super().__init__(config)
        self.psutil = None  # set by initialize() once psutil is importable
        self.per_core = self.config.get("per_core", False)
        self.interval = self.config.get("interval", 300)

    async def initialize(self) -> bool:
        """Initialize the CPU monitor plugin.

        Checks if psutil is available.

        Returns:
            True if psutil is available, False otherwise
        """
        self.logger.info(f"Initializing {self.name} plugin")
        try:
            import psutil
            self.psutil = psutil
            self.logger.info(f"{self.name} initialized successfully")
            return True
        except ImportError:
            self.logger.error(
                "psutil module not available. Install with: pip install psutil"
            )
            return False

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Collect CPU metrics.

        Returns:
            Dictionary with CPU metrics, or an empty dict if psutil is not
            loaded or collection fails
        """
        import asyncio
        import functools

        if not self.psutil:
            return {}

        try:
            data = {}

            # cpu_percent(interval=1) sleeps for a full second while it
            # samples, so run it in a worker thread instead of stalling the
            # event loop (and every heartbeat) for that long.
            loop = asyncio.get_running_loop()
            data["cpu_percent"] = await loop.run_in_executor(
                None, functools.partial(self.psutil.cpu_percent, interval=1)
            )

            # Per-core CPU usage (if enabled)
            # Note: with interval=0 the very first call returns 0.0 values;
            # subsequent calls report usage since the previous call.
            if self.per_core:
                per_core_percents = self.psutil.cpu_percent(interval=0, percpu=True)
                data["cpu_per_core"] = per_core_percents
                data["cpu_core_count"] = len(per_core_percents)
            else:
                # Just report core count
                data["cpu_core_count"] = self.psutil.cpu_count()

            # Load average (Unix-like systems only)
            try:
                load_avg = self.psutil.getloadavg()
                data["load_1min"] = round(load_avg[0], 2)
                data["load_5min"] = round(load_avg[1], 2)
                data["load_15min"] = round(load_avg[2], 2)
            except (AttributeError, OSError):
                # Not available on Windows
                pass

            # Process count
            try:
                data["process_count"] = len(self.psutil.pids())
            except Exception as e:
                self.logger.warning(f"Could not get process count: {e}")

            # CPU frequency (if available)
            try:
                freq = self.psutil.cpu_freq()
                if freq:
                    data["cpu_freq_current"] = round(freq.current, 2)
                    data["cpu_freq_min"] = round(freq.min, 2)
                    data["cpu_freq_max"] = round(freq.max, 2)
            except (AttributeError, OSError, RuntimeError, SystemError) as e:
                # Not available on all systems, or may fail on FreeBSD with sysctl issues
                self.logger.debug(f"CPU frequency not available: {e}")

            # CPU times (user, system, idle, etc.)
            try:
                cpu_times = self.psutil.cpu_times_percent(interval=0)
                data["cpu_user"] = round(cpu_times.user, 1)
                data["cpu_system"] = round(cpu_times.system, 1)
                data["cpu_idle"] = round(cpu_times.idle, 1)
                if hasattr(cpu_times, "iowait"):
                    data["cpu_iowait"] = round(cpu_times.iowait, 1)
            except Exception as e:
                self.logger.debug(f"Could not get CPU times: {e}")

            self.logger.debug(
                f"Collected CPU metrics: {data.get('cpu_percent', 'N/A')}% usage"
            )
            return data
        except Exception as e:
            self.logger.error(f"Error collecting CPU metrics: {e}", exc_info=True)
            return {}
+199
View File
@@ -0,0 +1,199 @@
"""
Disk monitoring plugin for Heartbeat.
Collects disk usage and I/O statistics using psutil.
"""
import logging
from typing import Dict, Any, Optional, List
try:
import psutil
except ImportError:
psutil = None
from hbd.client.plugin import MonitorPlugin
logger = logging.getLogger(__name__)
class DiskMonitorPlugin(MonitorPlugin):
    """
    Periodic disk usage and disk I/O monitor backed by psutil.

    Reported data:
    - Usage of every selected partition (device, filesystem type,
      total/used/free bytes, percent used)
    - Cumulative per-disk I/O counters (read/write counts and bytes, plus
      the time fields psutil exposes on the current platform)
    - Change of the read/write counters since the previous collection

    Configuration:
        interval: Collection interval in seconds (default: 300)
        partitions: List of mount points to monitor (default: all)
        include_io: Include disk I/O statistics (default: True)
        exclude_types: List of filesystem types to exclude
            (default: tmpfs, devtmpfs, squashfs)
    """

    name = "disk_monitor"
    interval = 300  # class-level default (5 minutes); __init__ re-reads it from config

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Read the plugin settings and verify that psutil was importable.

        Args:
            config: Optional configuration dict; recognised keys are
                ``interval``, ``partitions``, ``include_io`` and
                ``exclude_types`` (see the class docstring for defaults).

        Raises:
            ImportError: If the psutil import at module level failed.
        """
        super().__init__(config)
        settings = self.config
        self.partitions = settings.get('partitions', None)  # falsy means "every partition"
        self.include_io = settings.get('include_io', True)
        self.exclude_types = set(settings.get('exclude_types', ['tmpfs', 'devtmpfs', 'squashfs']))
        self.interval = settings.get('interval', 300)

        if psutil is None:
            raise ImportError("psutil library is required for disk_monitor plugin")

        # Last per-disk counter snapshot; the baseline for the *_delta fields.
        self._prev_io = {}

    async def initialize(self):
        """
        Prime the I/O baseline so the first collection can report deltas.

        Returns:
            False when psutil is unavailable, True otherwise (a failure to
            read the initial counters is only logged).
        """
        if psutil is None:
            logger.error("psutil not available - disk_monitor cannot run")
            return False

        logger.info(f"Disk monitor initialized (interval: {self.interval}s, io: {self.include_io})")

        if not self.include_io:
            return True

        try:
            self._prev_io = psutil.disk_io_counters(perdisk=True)
        except Exception as e:
            logger.warning(f"Could not initialize disk I/O counters: {e}")
        return True

    async def collect(self) -> Dict[str, Any]:
        """
        Gather the current disk statistics.

        Returns:
            A dictionary with:
            - partitions: mapping of mount point to ``device``, ``fstype``,
              ``total``, ``used``, ``free`` (bytes) and ``percent``
            - io_counters (only if include_io): mapping of disk name to
              ``read_count``, ``write_count``, ``read_bytes``,
              ``write_bytes``, the optional ``read_time`` / ``write_time`` /
              ``busy_time`` fields, and ``*_delta`` values for the byte and
              count counters when a previous snapshot exists

            ``{}`` is returned when psutil is missing and
            ``{"error": <message>}`` when collection raises.
        """
        if psutil is None:
            logger.error("psutil not available")
            return {}

        try:
            snapshot = await self._collect_metrics()
            logger.debug(f"Collected disk metrics: {len(snapshot.get('partitions', {}))} partitions")
            return snapshot
        except Exception as e:
            logger.error(f"Error collecting disk metrics: {e}")
            return {"error": str(e)}

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Combine partition usage with the I/O counters (when enabled)."""
        metrics = {'partitions': self._partition_usage()}

        if self.include_io:
            try:
                metrics['io_counters'] = self._io_statistics()
            except Exception as e:
                logger.warning(f"Could not collect disk I/O statistics: {e}")

        return metrics

    def _partition_usage(self) -> Dict[str, Any]:
        """Return usage figures for each selected partition, keyed by mount point."""
        usage_by_mount = {}

        for part in psutil.disk_partitions(all=False):
            # Drop excluded filesystem types, and anything outside the
            # explicit mount-point list when one was configured.
            if part.fstype in self.exclude_types or (
                self.partitions and part.mountpoint not in self.partitions
            ):
                continue

            try:
                usage = psutil.disk_usage(part.mountpoint)
                usage_by_mount[part.mountpoint] = {
                    'device': part.device,
                    'fstype': part.fstype,
                    'total': usage.total,
                    'used': usage.used,
                    'free': usage.free,
                    'percent': usage.percent
                }
            except PermissionError:
                logger.debug(f"Permission denied accessing {part.mountpoint}")
            except Exception as e:
                logger.warning(f"Error reading {part.mountpoint}: {e}")

        return usage_by_mount

    def _io_statistics(self) -> Dict[str, Any]:
        """Return per-disk I/O counters with deltas and store the new baseline."""
        current = psutil.disk_io_counters(perdisk=True)
        per_disk = {}

        for disk, counters in current.items():
            entry = {
                field: getattr(counters, field)
                for field in ('read_count', 'write_count', 'read_bytes', 'write_bytes')
            }

            # Time fields are only copied when the counter record has them.
            for field in ('read_time', 'write_time', 'busy_time'):
                if hasattr(counters, field):
                    entry[field] = getattr(counters, field)

            # Deltas need an earlier snapshot of the same disk.
            if disk in self._prev_io:
                earlier = self._prev_io[disk]
                for field in ('read_bytes', 'write_bytes', 'read_count', 'write_count'):
                    entry[f'{field}_delta'] = getattr(counters, field) - getattr(earlier, field)

            per_disk[disk] = entry

        # Only reached when every disk was processed without error.
        self._prev_io = current
        return per_disk

    async def cleanup(self):
        """Log shutdown; the plugin holds no resources to release."""
        logger.info("Disk monitor cleanup")
# Module-level hook used for automatic plugin discovery.
# NOTE(review): this binds the class itself, not an instance - presumably
# the loader instantiates it with the plugin config; confirm against the loader.
plugin = DiskMonitorPlugin
+168
View File
@@ -0,0 +1,168 @@
"""
Filesystem information plugin for Heartbeat.
Collects static filesystem and partition information using psutil.
"""
import logging
from typing import Dict, Any, Optional
try:
import psutil
except ImportError:
psutil = None
from hbd.client.plugin import InfoPlugin
logger = logging.getLogger(__name__)
class FilesystemInfoPlugin(InfoPlugin):
    """
    One-shot inventory of mounted filesystems.

    The partition list comes from ``psutil.disk_partitions``; with the default
    ``include_pseudo=False`` that call is made with ``all=False``, and with
    ``include_pseudo=True`` with ``all=True`` (which also lists pseudo
    filesystems such as proc, sysfs, tmpfs).

    Collects:
    - Device, mount point, filesystem type and mount options per filesystem
    - Maximum file-name and path lengths where ``os.pathconf`` reports them
    - The set of filesystem types seen, the mount count and the boot time

    Configuration:
        include_pseudo: Include pseudo/virtual filesystems (default: False)
        exclude_types: List of additional filesystem types to exclude (default: [])
    """

    name = "filesystem_info"
    interval = 0  # InfoPlugin - collect once

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Read the plugin settings and verify that psutil was importable.

        Args:
            config: Optional configuration dict with the keys
                ``include_pseudo`` (default: False) and ``exclude_types``
                (default: []).

        Raises:
            ImportError: If the psutil import at module level failed.
        """
        super().__init__(config)
        settings = self.config
        self.include_pseudo = settings.get('include_pseudo', False)
        # Nothing is excluded unless the user lists filesystem types explicitly.
        self.exclude_types = set(settings.get('exclude_types', []))

        if psutil is None:
            raise ImportError("psutil library is required for filesystem_info plugin")

    async def initialize(self):
        """Return True when psutil is available, False (with an error log) otherwise."""
        if psutil is None:
            logger.error("psutil not available - filesystem_info cannot run")
            return False

        logger.info(f"Filesystem info initialized (pseudo: {self.include_pseudo})")
        return True

    async def collect(self) -> Dict[str, Any]:
        """
        Gather the filesystem inventory.

        Returns:
            A dictionary with:
            - filesystems: list of dicts holding ``device``, ``mountpoint``,
              ``fstype``, ``opts`` and, when available, ``maxfile`` and
              ``maxpath``
            - filesystem_types: sorted list of the distinct filesystem types
            - mount_count: number of entries in ``filesystems``
            - boot_time: value of ``psutil.boot_time()`` when it can be read

            ``{}`` is returned when psutil is missing and
            ``{"error": <message>}`` when collection raises.
        """
        if psutil is None:
            logger.error("psutil not available")
            return {}

        try:
            inventory = await self._collect_info()
            logger.info(f"Collected filesystem info: {len(inventory.get('filesystems', []))} filesystems")
            return inventory
        except Exception as e:
            logger.error(f"Error collecting filesystem info: {e}")
            return {"error": str(e)}

    async def _collect_info(self) -> Dict[str, Any]:
        """Build the inventory from psutil's partition list and os.pathconf."""
        import os

        can_pathconf = hasattr(os, 'pathconf')
        # Result key -> pathconf name queried for it.
        limits = (('maxfile', 'PC_NAME_MAX'), ('maxpath', 'PC_PATH_MAX'))

        filesystems = []
        seen_types = set()

        for part in psutil.disk_partitions(all=self.include_pseudo):
            if part.fstype in self.exclude_types:
                continue

            entry = {
                'device': part.device,
                'mountpoint': part.mountpoint,
                'fstype': part.fstype,
                'opts': part.opts,
            }

            try:
                if can_pathconf:
                    for key, conf_name in limits:
                        try:
                            limit = os.pathconf(part.mountpoint, conf_name)
                        except (OSError, ValueError):
                            # This limit cannot be queried for this mount point.
                            continue
                        if limit:
                            entry[key] = limit
            except Exception as e:
                logger.debug(f"Could not get pathconf for {part.mountpoint}: {e}")

            filesystems.append(entry)
            seen_types.add(part.fstype)

        info = {
            'filesystems': filesystems,
            'filesystem_types': sorted(seen_types),
            'mount_count': len(filesystems),
        }

        try:
            info['boot_time'] = psutil.boot_time()
        except Exception as e:
            logger.debug(f"Could not get boot time: {e}")

        return info

    async def cleanup(self):
        """Log shutdown; the plugin holds no resources to release."""
        logger.info("Filesystem info cleanup")
# Module-level hook used for automatic plugin discovery.
# NOTE(review): this binds the class itself, not an instance - presumably
# the loader instantiates it with the plugin config; confirm against the loader.
plugin = FilesystemInfoPlugin
+147
View File
@@ -0,0 +1,147 @@
"""
Memory monitoring plugin for Heartbeat.
Collects memory and swap usage statistics using psutil.
"""
import logging
from typing import Dict, Any, Optional
try:
import psutil
except ImportError:
psutil = None
from hbd.client.plugin import MonitorPlugin
logger = logging.getLogger(__name__)
class MemoryMonitorPlugin(MonitorPlugin):
    """
    Periodic RAM and swap usage monitor backed by psutil.

    Collects:
    - Physical memory totals, availability and usage percentage
    - Platform-dependent memory fields (active, inactive, buffers, cached,
      shared) whenever psutil's result carries them
    - Swap usage and swap-in/swap-out counters (optional)

    Configuration:
        interval: Collection interval in seconds (default: 300)
        include_swap: Include swap statistics (default: True)
    """

    name = "memory_monitor"
    interval = 300  # class-level default (5 minutes); __init__ re-reads it from config

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Read the plugin settings and verify that psutil was importable.

        Args:
            config: Optional configuration dict with the keys ``interval``
                (default: 300) and ``include_swap`` (default: True).

        Raises:
            ImportError: If the psutil import at module level failed.
        """
        super().__init__(config)
        settings = self.config
        self.include_swap = settings.get('include_swap', True)
        self.interval = settings.get('interval', 300)

        if psutil is None:
            raise ImportError("psutil library is required for memory_monitor plugin")

    async def initialize(self):
        """Return True when psutil is available, False (with an error log) otherwise."""
        if psutil is None:
            logger.error("psutil not available - memory_monitor cannot run")
            return False

        logger.info(f"Memory monitor initialized (interval: {self.interval}s, swap: {self.include_swap})")
        return True

    async def collect(self) -> Dict[str, Any]:
        """
        Gather the current memory statistics.

        Returns:
            A flat dictionary with ``memory_total``, ``memory_available``,
            ``memory_used``, ``memory_free`` and ``memory_percent``; the
            optional ``memory_active``, ``memory_inactive``,
            ``memory_buffers``, ``memory_cached`` and ``memory_shared``; and,
            if include_swap is set, ``swap_total``, ``swap_used``,
            ``swap_free``, ``swap_percent`` plus the optional ``swap_sin`` /
            ``swap_sout`` counters.

            ``{}`` is returned when psutil is missing and
            ``{"error": <message>}`` when collection raises.
        """
        if psutil is None:
            logger.error("psutil not available")
            return {}

        try:
            snapshot = await self._collect_metrics()
            logger.debug(f"Collected memory metrics: {len(snapshot)} fields")
            return snapshot
        except Exception as e:
            logger.error(f"Error collecting memory metrics: {e}")
            return {"error": str(e)}

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Copy psutil's virtual-memory and swap figures into a flat dict."""
        vmem = psutil.virtual_memory()
        metrics = {
            'memory_total': vmem.total,
            'memory_available': vmem.available,
            'memory_used': vmem.used,
            'memory_free': vmem.free,
            'memory_percent': vmem.percent,
        }

        # These fields exist only on some platforms, so probe before copying.
        for field in ('active', 'inactive', 'buffers', 'cached', 'shared'):
            if hasattr(vmem, field):
                metrics[f'memory_{field}'] = getattr(vmem, field)

        if not self.include_swap:
            return metrics

        try:
            swap = psutil.swap_memory()
            metrics['swap_total'] = swap.total
            metrics['swap_used'] = swap.used
            metrics['swap_free'] = swap.free
            metrics['swap_percent'] = swap.percent

            # Swap in/out counters may be missing on some platforms.
            for field in ('sin', 'sout'):
                if hasattr(swap, field):
                    metrics[f'swap_{field}'] = getattr(swap, field)
        except Exception as e:
            logger.warning(f"Could not collect swap statistics: {e}")

        return metrics

    async def cleanup(self):
        """Log shutdown; the plugin holds no resources to release."""
        logger.info("Memory monitor cleanup")
# Module-level hook used for automatic plugin discovery.
# NOTE(review): this binds the class itself, not an instance - presumably
# the loader instantiates it with the plugin config; confirm against the loader.
plugin = MemoryMonitorPlugin
+283
View File
@@ -0,0 +1,283 @@
"""Nagios Plugin Runner for Heartbeat.
Executes Nagios-compatible monitoring plugins and parses their output.
Nagios Plugin Standard:
- Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
- Output format: Single line status message, optional performance data
- Performance data format: 'label'=value[UOM];[warn];[crit];[min];[max]
Example configuration in ~/.hb.yaml:
```yaml
nagios_runner:
interval: 60
commands:
- name: check_disk_root
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
- name: check_procs
command: /usr/lib/nagios/plugins/check_procs -w 250 -c 400
- name: check_load
command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
```
"""
import asyncio
import os
import re
import shlex
import subprocess
from typing import Any, Dict, List, Optional, Tuple

from hbd.client.plugin import MonitorPlugin
# Exit codes a Nagios plugin reports: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
NAGIOS_OK, NAGIOS_WARNING, NAGIOS_CRITICAL, NAGIOS_UNKNOWN = range(4)

# Human-readable label for each exit code.
STATUS_NAMES = dict(
    zip(
        (NAGIOS_OK, NAGIOS_WARNING, NAGIOS_CRITICAL, NAGIOS_UNKNOWN),
        ("OK", "WARNING", "CRITICAL", "UNKNOWN"),
    )
)
class NagiosRunnerPlugin(MonitorPlugin):
    """Run Nagios-compatible monitoring plugins.

    Each configured command is executed as an external process; its exit
    code, first-line status message and performance data are flattened into
    one result dictionary, together with an overall (worst) status.

    Configuration:
        interval: Collection interval in seconds (default: 300)
        commands: List of command definitions with 'name' and 'command' keys
        timeout: Command execution timeout in seconds (default: 30)
        shell: Whether to execute commands via shell (default: True). When
            False, the command string is split into an argument list with
            ``shlex.split`` on POSIX systems.

    Example:
        nagios_runner:
          interval: 300  # Check every 5 minutes
          timeout: 30
          commands:
            - name: check_disk
              command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10%
            - name: check_load
              command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
    """

    name = "nagios_runner"
    version = "1.0.0"
    description = "Execute Nagios-compatible monitoring plugins"
    interval = 300  # MonitorPlugin: collect every 5 minutes by default

    # One performance-data token: 'label'=value[UOM];[warn];[crit];[min];[max]
    # The token must start and end on a whitespace boundary so that leftovers
    # of one metric (e.g. a range threshold "10:20") can never be absorbed
    # into the label of the next one.
    _PERF_TOKEN = re.compile(
        r"""
        (?:^|(?<=\s))
        (?:'(?P<quoted>(?:[^']|'')+)'|(?P<bare>[^\s='|]+))
        =
        (?P<value>[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)
        (?P<uom>[a-zA-Z%]*)
        (?:;(?P<warn>[^;\s]*))?
        (?:;(?P<crit>[^;\s]*))?
        (?:;(?P<min>[^;\s]*))?
        (?:;(?P<max>[^;\s]*))?
        (?=\s|$)
        """,
        re.VERBOSE,
    )

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Read the runner settings from ``config``.

        Args:
            config: Optional configuration dict with the keys ``commands``,
                ``timeout``, ``shell`` and ``interval``.
        """
        super().__init__(config)

        settings = config or {}
        # "commands:" left empty in YAML loads as None; normalise to a list so
        # later iteration and len() cannot fail.
        self.commands: List[Dict[str, str]] = settings.get("commands") or []
        self.timeout: int = settings.get("timeout", 30)
        self.shell: bool = settings.get("shell", True)
        self.interval = settings.get("interval", 300)

        if not self.commands:
            self.logger.warning(
                "No Nagios commands configured. Add 'nagios_runner.commands' to config."
            )

    async def initialize(self) -> bool:
        """Initialize the Nagios runner plugin.

        Returns:
            True if at least one command is configured, False otherwise
        """
        self.logger.info(f"Initializing {self.name} plugin")

        if not self.commands:
            self.logger.error("No Nagios commands configured")
            return False

        self.logger.info(f"Configured to run {len(self.commands)} Nagios plugin(s)")
        for cmd_config in self.commands:
            name = cmd_config.get("name", "unnamed")
            self.logger.info(f"  - {name}: {cmd_config.get('command', 'N/A')}")

        return True

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Collect metrics from all configured Nagios plugins.

        Returns:
            Dictionary holding, per command ``<name>``, the keys
            ``<name>_status``, ``<name>_status_code``, ``<name>_output`` and
            one ``<name>_<metric>`` entry per parsed performance value, plus
            ``overall_status``, ``overall_status_code`` and ``plugin_count``.
        """
        results = {}

        # Track overall status (numerically highest exit code wins)
        worst_status = NAGIOS_OK

        for cmd_config in self.commands:
            name = cmd_config.get("name")
            command = cmd_config.get("command")

            if not name or not command:
                self.logger.warning("Skipping command with missing name or command")
                continue

            try:
                status_code, output, perfdata = await self._run_nagios_plugin(command)

                results[f"{name}_status"] = STATUS_NAMES.get(status_code, "UNKNOWN")
                results[f"{name}_status_code"] = status_code
                results[f"{name}_output"] = output

                if status_code > worst_status:
                    worst_status = status_code

                for metric_name, metric_value in perfdata.items():
                    results[f"{name}_{metric_name}"] = metric_value

                self.logger.debug(
                    f"Executed {name}: {STATUS_NAMES.get(status_code, 'UNKNOWN')} - {output[:50]}"
                )

            except Exception as e:
                self.logger.error(f"Error running {name}: {e}", exc_info=True)
                results[f"{name}_status"] = "ERROR"
                results[f"{name}_status_code"] = NAGIOS_UNKNOWN
                results[f"{name}_output"] = str(e)
                worst_status = NAGIOS_UNKNOWN

        results["overall_status"] = STATUS_NAMES.get(worst_status, "UNKNOWN")
        results["overall_status_code"] = worst_status
        results["plugin_count"] = len(self.commands)

        return results

    def _execute(self, command: str) -> "subprocess.CompletedProcess":
        """Run ``command`` synchronously and return the completed process.

        Without a shell a single string would be taken as the program name
        on POSIX, so it is split into an argument list there. On other
        platforms the string is passed through unchanged.

        Raises:
            subprocess.TimeoutExpired: If the command exceeds ``self.timeout``.
        """
        args: Any = command
        if not self.shell and os.name == "posix":
            args = shlex.split(command)

        return subprocess.run(
            args,
            shell=self.shell,
            capture_output=True,
            timeout=self.timeout,
            text=True
        )

    async def _run_nagios_plugin(
        self,
        command: str
    ) -> Tuple[int, str, Dict[str, Any]]:
        """Execute a Nagios plugin and parse its output.

        The process runs in the event loop's default executor so a slow
        plugin does not block other coroutines while it runs.

        Args:
            command: Command string to execute

        Returns:
            Tuple of (status_code, output_message, performance_data_dict).
            Timeouts and execution errors are reported as NAGIOS_UNKNOWN with
            an explanatory message and empty performance data.
        """
        try:
            loop = asyncio.get_running_loop()
            result = await loop.run_in_executor(None, self._execute, command)

            status_code = result.returncode
            stdout = (result.stdout or "").strip()

            # Anything outside 0..3 is UNKNOWN; this includes the negative
            # codes subprocess reports when the plugin was killed by a signal.
            if status_code not in STATUS_NAMES:
                status_code = NAGIOS_UNKNOWN

            # Performance data is only ever taken from stdout.
            perfdata = self._parse_perfdata(stdout)

            # The status message is the text before the first pipe.
            output_msg = stdout.split('|', 1)[0].strip()

            # With an empty stdout (e.g. "command not found" from the shell)
            # the only diagnostic is on stderr; surface it instead of "".
            if not stdout:
                output_msg = (result.stderr or "").strip()

            return status_code, output_msg, perfdata

        except subprocess.TimeoutExpired:
            self.logger.error(f"Command timed out: {command}")
            return NAGIOS_UNKNOWN, f"Command timed out after {self.timeout}s", {}
        except Exception as e:
            self.logger.error(f"Error executing command: {e}")
            return NAGIOS_UNKNOWN, f"Execution error: {str(e)}", {}

    def _parse_perfdata(self, output: str) -> Dict[str, Any]:
        """Parse Nagios performance data from plugin output.

        Nagios performance data format:
            'label'=value[UOM];[warn];[crit];[min];[max]

        Multiple metrics are separated by whitespace. Labels may be quoted
        (a doubled quote inside stands for one literal quote); values may be
        negative. Thresholds that are not plain numbers, such as the range
        form ``10:20``, are skipped instead of corrupting later metrics.

        Args:
            output: Plugin output string

        Returns:
            Dictionary mapping ``label`` to its float value, with the
            optional keys ``label_uom`` (string) and ``label_warn`` /
            ``label_crit`` / ``label_min`` / ``label_max`` (floats). Empty if
            the output has no performance-data section.
        """
        perfdata: Dict[str, Any] = {}

        # Performance data comes after the first pipe character.
        if '|' not in output:
            return perfdata

        perf_section = output.split('|', 1)[1].strip()

        for match in self._PERF_TOKEN.finditer(perf_section):
            quoted = match.group("quoted")
            if quoted is not None:
                label = quoted.replace("''", "'").strip()
            else:
                label = match.group("bare").strip()
            if not label:
                continue

            # The pattern only admits text that float() accepts.
            perfdata[label] = float(match.group("value"))

            uom = match.group("uom")
            if uom:
                perfdata[f"{label}_uom"] = uom

            for suffix in ("warn", "crit", "min", "max"):
                raw = match.group(suffix)
                if not raw:
                    continue
                try:
                    perfdata[f"{label}_{suffix}"] = float(raw)
                except ValueError:
                    # Range expressions ("10:20", "~:5", "@1:2") are valid
                    # thresholds but not single numbers; leave them out.
                    pass

        return perfdata
+240
View File
@@ -0,0 +1,240 @@
"""
Network monitoring plugin for Heartbeat.
Collects network interface statistics and connection information using psutil.
"""
import logging
from typing import Dict, Any, Optional, List
try:
import psutil
except ImportError:
psutil = None
from hbd.client.plugin import MonitorPlugin
logger = logging.getLogger(__name__)
class NetworkMonitorPlugin(MonitorPlugin):
    """
    Periodic network interface and connection monitor backed by psutil.

    Collects:
    - Per-interface I/O counters (bytes, packets, errors, drops) and their
      change since the previous collection
    - Connection counts grouped by state (optional)
    - Interface addresses (optional)
    - Interface status: up flag, duplex, speed and MTU

    Configuration:
        interval: Collection interval in seconds (default: 300)
        interfaces: List of interfaces to monitor (default: all)
        include_connections: Include connection statistics (default: True)
        include_addresses: Include interface addresses (default: False)
    """

    name = "network_monitor"
    interval = 300  # class-level default (5 minutes); __init__ re-reads it from config

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Read the plugin settings and verify that psutil was importable.

        Args:
            config: Optional configuration dict; recognised keys are
                ``interval``, ``interfaces``, ``include_connections`` and
                ``include_addresses`` (see the class docstring for defaults).

        Raises:
            ImportError: If the psutil import at module level failed.
        """
        super().__init__(config)
        settings = self.config
        self.interfaces = settings.get('interfaces', None)  # falsy means "every interface"
        self.include_connections = settings.get('include_connections', True)
        self.include_addresses = settings.get('include_addresses', False)
        self.interval = settings.get('interval', 300)

        if psutil is None:
            raise ImportError("psutil library is required for network_monitor plugin")

        # Last per-interface counter snapshot; the baseline for the *_delta fields.
        self._prev_io = {}

    async def initialize(self):
        """
        Prime the I/O baseline so the first collection can report deltas.

        Returns:
            False when psutil is unavailable, True otherwise (a failure to
            read the initial counters is only logged).
        """
        if psutil is None:
            logger.error("psutil not available - network_monitor cannot run")
            return False

        logger.info(f"Network monitor initialized (interval: {self.interval}s, "
                    f"connections: {self.include_connections})")

        try:
            self._prev_io = psutil.net_io_counters(pernic=True)
        except Exception as e:
            logger.warning(f"Could not initialize network I/O counters: {e}")

        return True

    async def collect(self) -> Dict[str, Any]:
        """
        Gather the current network statistics.

        Returns:
            A dictionary with:
            - interfaces: mapping of interface name to ``bytes_sent``,
              ``bytes_recv``, ``packets_sent``, ``packets_recv``, ``errin``,
              ``errout``, ``dropin``, ``dropout`` and, once a previous
              snapshot exists, ``*_delta`` values for bytes and packets
            - connections (only if include_connections): count of inet
              connections per state (ESTABLISHED, LISTEN, TIME_WAIT, ...)
            - addresses (only if include_addresses): mapping of interface
              name to a list of address dicts
            - interface_stats: mapping of interface name to ``isup``,
              ``duplex``, ``speed`` and ``mtu``

            ``{}`` is returned when psutil is missing and
            ``{"error": <message>}`` when collection raises.
        """
        if psutil is None:
            logger.error("psutil not available")
            return {}

        try:
            snapshot = await self._collect_metrics()
            logger.debug(f"Collected network metrics: {len(snapshot.get('interfaces', {}))} interfaces")
            return snapshot
        except Exception as e:
            logger.error(f"Error collecting network metrics: {e}")
            return {"error": str(e)}

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Run each collection step; a failing step is logged and left out."""
        metrics = {}

        try:
            metrics['interfaces'] = self._interface_counters()
        except Exception as e:
            logger.warning(f"Could not collect network I/O counters: {e}")

        if self.include_connections:
            try:
                metrics['connections'] = self._connection_counts()
            except (PermissionError, psutil.AccessDenied):
                logger.debug("Permission denied for net_connections (requires root/admin)")
            except Exception as e:
                logger.warning(f"Could not collect connection statistics: {e}")

        if self.include_addresses:
            try:
                metrics['addresses'] = self._interface_addresses()
            except Exception as e:
                logger.warning(f"Could not collect interface addresses: {e}")

        try:
            metrics['interface_stats'] = self._interface_status()
        except Exception as e:
            logger.warning(f"Could not collect interface stats: {e}")

        return metrics

    def _is_selected(self, iface_name: str) -> bool:
        """Tell whether ``iface_name`` passes the optional interface filter."""
        return not self.interfaces or iface_name in self.interfaces

    def _interface_counters(self) -> Dict[str, Any]:
        """Return per-interface I/O counters with deltas and store the new baseline."""
        current = psutil.net_io_counters(pernic=True)
        per_iface = {}

        for iface, counters in current.items():
            if not self._is_selected(iface):
                continue

            entry = {
                field: getattr(counters, field)
                for field in (
                    'bytes_sent', 'bytes_recv', 'packets_sent', 'packets_recv',
                    'errin', 'errout', 'dropin', 'dropout',
                )
            }

            # Deltas need an earlier snapshot of the same interface.
            if iface in self._prev_io:
                earlier = self._prev_io[iface]
                for field in ('bytes_sent', 'bytes_recv', 'packets_sent', 'packets_recv'):
                    entry[f'{field}_delta'] = getattr(counters, field) - getattr(earlier, field)

            per_iface[iface] = entry

        # Only reached when every interface was processed without error.
        self._prev_io = current
        return per_iface

    def _connection_counts(self) -> Dict[str, int]:
        """Count inet connections by their status string."""
        tally = {}
        for conn in psutil.net_connections(kind='inet'):
            status = conn.status
            tally[status] = tally.get(status, 0) + 1
        return tally

    def _interface_addresses(self) -> Dict[str, Any]:
        """List the addresses of every selected interface."""
        by_iface = {}

        for iface, addrs in psutil.net_if_addrs().items():
            if not self._is_selected(iface):
                continue

            described = []
            for addr in addrs:
                item = {
                    'family': str(addr.family),
                    'address': addr.address,
                }
                # Netmask and broadcast are added only when they are set.
                if addr.netmask:
                    item['netmask'] = addr.netmask
                if addr.broadcast:
                    item['broadcast'] = addr.broadcast
                described.append(item)

            by_iface[iface] = described

        return by_iface

    def _interface_status(self) -> Dict[str, Any]:
        """Report up/down flag, duplex, speed and MTU of every selected interface."""
        return {
            iface: {
                'isup': stats.isup,
                'duplex': str(stats.duplex) if hasattr(stats, 'duplex') else None,
                'speed': stats.speed,
                'mtu': stats.mtu,
            }
            for iface, stats in psutil.net_if_stats().items()
            if self._is_selected(iface)
        }

    async def cleanup(self):
        """Log shutdown; the plugin holds no resources to release."""
        logger.info("Network monitor cleanup")
# Module-level hook used for automatic plugin discovery.
# NOTE(review): this binds the class itself, not an instance - presumably
# the loader instantiates it with the plugin config; confirm against the loader.
plugin = NetworkMonitorPlugin
+136
View File
@@ -0,0 +1,136 @@
"""OS Information Plugin for Heartbeat.
Collects static operating system information including OS name, version,
kernel, architecture, and distribution details.
"""
import platform
import sys
from pathlib import Path
from typing import Any, Dict, Optional
# Import from parent package
from hbd.client.plugin import InfoPlugin
class OSInfoPlugin(InfoPlugin):
    """Collect operating system information.

    This plugin gathers static OS information that rarely changes:
    - OS name and version
    - Kernel version
    - Architecture (x86_64, arm64, etc.)
    - Distribution details (for Linux)
    - Python version (used by hbc)
    """

    name = "os_info"
    version = "1.0.0"
    description = "Operating system and platform information"
    interval = 0  # InfoPlugin: collect once at startup

    # Release-file key -> key used in the reported data.
    _OS_RELEASE_KEYS = {
        "NAME": "distro_name",
        "VERSION": "distro_version",
        "ID": "distro_id",
        "VERSION_ID": "distro_version_id",
        "PRETTY_NAME": "distro_pretty_name",
    }
    _LSB_RELEASE_KEYS = {
        "DISTRIB_ID": "distro_id",
        "DISTRIB_RELEASE": "distro_version",
        "DISTRIB_DESCRIPTION": "distro_name",
    }

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Forward ``config`` to the base plugin; this plugin has no settings."""
        super().__init__(config)

    async def initialize(self) -> bool:
        """Initialize the OS info plugin.

        Returns:
            True (always succeeds - platform module is stdlib)
        """
        self.logger.info(f"Initializing {self.name} plugin")
        return True

    async def _collect_info(self) -> Dict[str, Any]:
        """Collect OS information.

        Returns:
            Dictionary with the generic ``platform`` fields plus
            system-specific extras (Linux distribution keys, ``macos_version``
            or the ``windows_*`` keys). An empty dict if collection raises.
        """
        try:
            system = platform.system()  # e.g., "Linux", "Darwin", "Windows"
            data = {
                "system": system,
                "node": platform.node(),  # hostname
                "release": platform.release(),  # kernel version
                "version": platform.version(),  # detailed version
                "machine": platform.machine(),  # e.g., "x86_64", "arm64"
                "processor": platform.processor(),  # processor name
                "architecture": platform.architecture()[0],  # e.g., "64bit"
                "python_version": platform.python_version(),
                "python_implementation": platform.python_implementation(),
            }

            if system == "Linux":
                data.update(self._get_linux_distro())
            elif system == "Darwin":
                data["macos_version"] = platform.mac_ver()[0]
            elif system == "Windows":
                win_ver = platform.win32_ver()
                data["windows_release"] = win_ver[0]
                data["windows_version"] = win_ver[1]
                data["windows_sp"] = win_ver[2]
                data["windows_type"] = win_ver[3]

            self.logger.debug(f"Collected OS info: {data['system']} {data['release']}")
            return data

        except Exception as e:
            self.logger.error(f"Error collecting OS info: {e}", exc_info=True)
            return {}

    def _get_linux_distro(self) -> Dict[str, str]:
        """Get Linux distribution information.

        ``/etc/os-release`` is used when it exists; ``/etc/lsb-release`` is
        consulted only when it does not.

        Returns:
            Dictionary with whichever of ``distro_name``, ``distro_version``,
            ``distro_id``, ``distro_version_id`` and ``distro_pretty_name``
            the file provides (possibly empty).
        """
        os_release = Path("/etc/os-release")
        if os_release.exists():
            return self._read_release_file(os_release, self._OS_RELEASE_KEYS)

        # Fallback for older systems without os-release.
        lsb_release = Path("/etc/lsb-release")
        if lsb_release.exists():
            return self._read_release_file(lsb_release, self._LSB_RELEASE_KEYS)

        return {}

    def _read_release_file(self, path: Path, key_map: Dict[str, str]) -> Dict[str, str]:
        """Parse a ``KEY=value`` release file into the keys named by ``key_map``.

        Args:
            path: File to read.
            key_map: Mapping of file keys to output keys; other keys are ignored.

        Returns:
            The mapped values with surrounding quotes removed. On a read
            error a warning is logged and whatever was parsed so far is
            returned.
        """
        distro_info: Dict[str, str] = {}
        try:
            # Read as UTF-8 explicitly: the locale default (e.g. ASCII under
            # LANG=C) can fail on non-ASCII distribution names.
            with open(path, encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if "=" not in line or line.startswith("#"):
                        continue
                    key, value = line.split("=", 1)
                    target = key_map.get(key)
                    if target is not None:
                        # Values are commonly wrapped in quotes in both file formats.
                        distro_info[target] = value.strip('"').strip("'")
        except Exception as e:
            self.logger.warning(f"Could not read {path}: {e}")
        return distro_info
+579
View File
@@ -0,0 +1,579 @@
"""
Threshold checking and alerting for plugin metrics.
This module provides a flexible threshold checking system that:
- Evaluates plugin metrics against configured warning/critical thresholds
- Tracks alert states per host and metric
- Prevents alert flapping with hysteresis
- Triggers notifications only on state changes
- Supports multiple comparison operators
"""
import logging
import time
from enum import Enum
from typing import Dict, Any, Optional, Tuple, Callable
logger = logging.getLogger(__name__)
class AlertLevel(Enum):
    """Severity of an alert.

    Members carry integer values in ascending order; UNKNOWN has the
    highest value.
    """

    OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3
class ComparisonOperator(Enum):
    """Comparison operators accepted in threshold definitions.

    Each member's value is the operator's textual spelling, so a member can
    be looked up directly from that text, e.g. ``ComparisonOperator(">=")``.
    """

    # Strict and inclusive "above threshold" checks
    GT = ">"
    GTE = ">="
    # Strict and inclusive "below threshold" checks
    LT = "<"
    LTE = "<="
    # Equality and inequality checks
    EQ = "=="
    NEQ = "!="
class AlertState:
    """Represents the current alert state for a specific metric."""

    def __init__(self, metric_path: str):
        """
        Initialize alert state at level OK.

        Args:
            metric_path: Full path to metric (e.g., "cpu_monitor.cpu_percent")
        """
        now = time.time()
        self.metric_path = metric_path
        self.level = AlertLevel.OK
        # Time at which the current level was entered
        self.since = now
        # Most recent value passed to update(), and when it was passed
        self.last_value = None
        self.last_check = now
        # Notification bookkeeping for the current level
        self.notification_count = 0
        self.last_notification = None

    def update(self, level: AlertLevel, value: Any) -> bool:
        """
        Update alert state.

        Args:
            level: New alert level
            value: Current metric value

        Returns:
            True if state changed (notification needed), False otherwise
        """
        now = time.time()
        self.last_check = now
        self.last_value = value

        if level == self.level:
            return False

        logger.info(
            "Alert state change for %s: %s -> %s (value: %s)",
            self.metric_path,
            self.level.name,
            level.name,
            value
        )
        self.level = level
        self.since = now
        # A level change starts a new episode: clear all notification
        # bookkeeping. Leaving last_notification at the previous episode's
        # timestamp would make the first re-notification check of a later
        # alert see an already-elapsed interval and send a reminder at once.
        self.notification_count = 0
        self.last_notification = None
        return True

    def to_dict(self) -> dict:
        """Convert alert state to dictionary for serialization.

        ``last_notification`` is not part of the serialized form.
        """
        return {
            "metric_path": self.metric_path,
            "level": self.level.name,
            "since": self.since,
            "last_value": self.last_value,
            "last_check": self.last_check,
            "notification_count": self.notification_count,
        }
class ThresholdConfig:
    """Configuration for a single threshold check."""

    def __init__(
        self,
        metric_path: str,
        warning: Optional[float] = None,
        critical: Optional[float] = None,
        operator: str = ">",
        hysteresis: float = 0.0,
        enabled: bool = True,
    ):
        """
        Initialize threshold configuration.

        Args:
            metric_path: Full path to metric (e.g., "cpu_monitor.cpu_percent")
            warning: Warning threshold value
            critical: Critical threshold value
            operator: Comparison operator (>, >=, <, <=, ==, !=); an
                unrecognised operator falls back to ">" with a warning
            hysteresis: Fraction of the threshold (e.g. 0.1 = 10%) by which
                the value must clear it before the alert level is lowered
            enabled: Whether this threshold is enabled
        """
        self.metric_path = metric_path
        # Values arrive from YAML and may be strings or null; normalise them
        # here so evaluation never fails on a type mismatch.
        self.warning = self._as_number(warning, "warning", metric_path)
        self.critical = self._as_number(critical, "critical", metric_path)
        self.enabled = enabled
        margin = self._as_number(hysteresis, "hysteresis", metric_path)
        self.hysteresis = 0.0 if margin is None else margin

        try:
            self.operator = ComparisonOperator(operator)
        except ValueError:
            logger.warning(
                "Invalid operator '%s' for %s, using '>' as default",
                operator,
                metric_path
            )
            self.operator = ComparisonOperator.GT

    @staticmethod
    def _as_number(raw: Any, label: str, metric_path: str) -> Optional[float]:
        """Return ``raw`` as a number, or None if it is missing or not numeric.

        Ints and floats are returned unchanged; anything else is passed
        through ``float()``.
        """
        if raw is None or isinstance(raw, (int, float)):
            return raw
        try:
            return float(raw)
        except (TypeError, ValueError):
            logger.warning(
                "Ignoring non-numeric %s value %r for %s",
                label,
                raw,
                metric_path
            )
            return None

    def evaluate(self, value: float) -> AlertLevel:
        """
        Evaluate a value against this threshold.

        Args:
            value: Metric value to check

        Returns:
            OK when disabled or when no threshold matches, UNKNOWN when the
            value cannot be converted to float, otherwise the most severe
            level whose threshold matches.
        """
        if not self.enabled:
            return AlertLevel.OK

        try:
            value = float(value)
        except (TypeError, ValueError):
            logger.warning("Cannot convert value %s to float for %s", value, self.metric_path)
            return AlertLevel.UNKNOWN

        # Critical takes precedence over warning.
        if self.critical is not None and self._compare(value, self.critical):
            return AlertLevel.CRITICAL
        if self.warning is not None and self._compare(value, self.warning):
            return AlertLevel.WARNING
        return AlertLevel.OK

    def evaluate_with_hysteresis(
        self,
        value: float,
        current_level: AlertLevel
    ) -> AlertLevel:
        """
        Evaluate with hysteresis to prevent flapping.

        Hysteresis only applies when the level would drop from WARNING or
        CRITICAL and the operator is one of >, >=, <, <=: the value must
        then clear the threshold of the current level by
        ``abs(threshold * hysteresis)`` before the lower level is reported.

        Args:
            value: Current metric value
            current_level: Current alert level

        Returns:
            New alert level considering hysteresis
        """
        new_level = self.evaluate(value)

        # A disabled threshold must not keep an earlier alert alive through
        # the recovery margin below.
        if not self.enabled:
            return new_level

        if self.hysteresis == 0.0:
            return new_level

        # Only a move to a lower severity is subject to hysteresis.
        if new_level.value >= current_level.value:
            return new_level

        if current_level == AlertLevel.CRITICAL and self.critical is not None:
            threshold = self.critical
        elif current_level == AlertLevel.WARNING and self.warning is not None:
            threshold = self.warning
        else:
            return new_level

        # evaluate() returned a level below current_level, so it did not
        # return UNKNOWN and the same float conversion succeeded there.
        # Converting here lets numeric strings be compared.
        numeric = float(value)
        hysteresis_amount = abs(threshold * self.hysteresis)

        if self.operator in (ComparisonOperator.GT, ComparisonOperator.GTE):
            # For "greater than" thresholds, value must go below by hysteresis
            if numeric >= threshold - hysteresis_amount:
                return current_level
        elif self.operator in (ComparisonOperator.LT, ComparisonOperator.LTE):
            # For "less than" thresholds, value must go above by hysteresis
            if numeric <= threshold + hysteresis_amount:
                return current_level

        return new_level

    def _compare(self, value: float, threshold: float) -> bool:
        """Return True when ``value`` matches ``threshold`` under the operator."""
        if self.operator == ComparisonOperator.GT:
            return value > threshold
        if self.operator == ComparisonOperator.GTE:
            return value >= threshold
        if self.operator == ComparisonOperator.LT:
            return value < threshold
        if self.operator == ComparisonOperator.LTE:
            return value <= threshold
        # Equality uses a small tolerance because values are floats.
        if self.operator == ComparisonOperator.EQ:
            return abs(value - threshold) < 1e-9
        if self.operator == ComparisonOperator.NEQ:
            return abs(value - threshold) >= 1e-9
        return False
class ThresholdChecker:
    """Main threshold checking and alerting system.

    Holds one ``ThresholdConfig`` per configured metric path and evaluates
    plugin data against them. Alert state is kept in the ``alert_states``
    dictionary supplied by the caller on each check.
    """

    def __init__(
        self,
        config: Dict[str, Any],
        notification_callback: Optional[Callable] = None,
        renotify_interval: int = 3600,
        journal: Optional[Any] = None,
    ):
        """
        Initialize threshold checker.

        Args:
            config: Threshold configuration dictionary from YAML
            notification_callback: Function called with the message text for
                each notification
            renotify_interval: Seconds between repeat notifications (default: 1 hour)
            journal: Optional MessageJournal instance for logging threshold events
        """
        self.thresholds = {}  # {metric_path: ThresholdConfig}
        self.notification_callback = notification_callback
        self.renotify_interval = renotify_interval
        self.journal = journal
        # Strong references to in-flight journal tasks: the event loop keeps
        # only weak references, so an unreferenced task may be collected
        # before it finishes.
        self._journal_tasks = set()

        self._parse_config(config)
        logger.info("ThresholdChecker initialized with %d thresholds", len(self.thresholds))

    def _parse_config(self, config: Dict[str, Any]):
        """Parse threshold configuration from YAML structure."""
        if not config or "thresholds" not in config:
            logger.info("No thresholds configured")
            return

        thresholds_config = config["thresholds"]
        # A bare "thresholds:" key in YAML loads as None rather than {}.
        if thresholds_config is None:
            logger.info("No thresholds configured")
            return
        if not isinstance(thresholds_config, dict):
            logger.warning(
                "Ignoring 'thresholds' section: expected a mapping, got %s",
                type(thresholds_config).__name__
            )
            return

        for plugin_name, plugin_thresholds in thresholds_config.items():
            if isinstance(plugin_thresholds, dict):
                self._parse_plugin_thresholds(plugin_name, plugin_thresholds)

    def _parse_plugin_thresholds(self, plugin_name: str, thresholds: Dict[str, Any]):
        """Parse thresholds for a specific plugin."""
        for metric_name, threshold_config in thresholds.items():
            if not isinstance(threshold_config, dict):
                continue

            # Handle nested metrics (e.g., partitions./.percent)
            if metric_name == "partitions":
                self._parse_partition_thresholds(plugin_name, threshold_config)
                continue

            self._register_threshold(f"{plugin_name}.{metric_name}", threshold_config)

    def _parse_partition_thresholds(self, plugin_name: str, partitions: Dict[str, Any]):
        """Parse partition-specific thresholds for disk monitoring."""
        for partition, metrics in partitions.items():
            if not isinstance(metrics, dict):
                continue

            for metric_name, threshold_config in metrics.items():
                if not isinstance(threshold_config, dict):
                    continue

                # Metric path looks like "disk_monitor./dev/sda1.percent"
                self._register_threshold(
                    f"{plugin_name}.{partition}.{metric_name}",
                    threshold_config
                )

    def _register_threshold(self, metric_path: str, threshold_config: Dict[str, Any]) -> bool:
        """Build a ThresholdConfig from one YAML entry and store it.

        Returns:
            True if a threshold was registered, False if the entry defines
            neither ``warning`` nor ``critical`` and was skipped.
        """
        warning = threshold_config.get("warning")
        critical = threshold_config.get("critical")
        if warning is None and critical is None:
            logger.warning("No thresholds defined for %s, skipping", metric_path)
            return False

        operator = threshold_config.get("operator", ">")
        self.thresholds[metric_path] = ThresholdConfig(
            metric_path=metric_path,
            warning=warning,
            critical=critical,
            operator=operator,
            hysteresis=threshold_config.get("hysteresis", 0.1),  # 10% default
            enabled=threshold_config.get("enabled", True),
        )
        logger.debug(
            "Registered threshold for %s: warn=%s, crit=%s, op=%s",
            metric_path,
            warning,
            critical,
            operator
        )
        return True

    def check_plugin_data(
        self,
        host_name: str,
        plugin_name: str,
        data: Dict[str, Any],
        alert_states: Dict[str, AlertState],
    ) -> list:
        """
        Check plugin data against configured thresholds.

        Args:
            host_name: Name of the host
            plugin_name: Name of the plugin
            data: Plugin data dictionary
            alert_states: Host's alert_states dictionary; entries are created
                on demand and updated in place

        Returns:
            List of (metric_path, old_level, new_level, value) tuples for state changes
        """
        state_changes = []

        # Flat metrics: one threshold per top-level key
        for metric_name, value in data.items():
            self._evaluate_metric(
                host_name,
                f"{plugin_name}.{metric_name}",
                value,
                alert_states,
                state_changes
            )

        # Nested metrics (e.g., partition data in disk_monitor)
        self._check_nested_metrics(
            host_name,
            plugin_name,
            data,
            alert_states,
            state_changes
        )
        return state_changes

    def _check_nested_metrics(
        self,
        host_name: str,
        plugin_name: str,
        data: Dict[str, Any],
        alert_states: Dict[str, AlertState],
        state_changes: list,
    ):
        """Check nested metrics like partition-specific thresholds."""
        # NOTE(review): partition thresholds are parsed for any plugin but
        # only evaluated for disk_monitor - confirm this is intended.
        if plugin_name != "disk_monitor" or "partitions" not in data:
            return

        partitions = data["partitions"]
        if not isinstance(partitions, dict):
            return

        for partition, metrics in partitions.items():
            if not isinstance(metrics, dict):
                continue
            for metric_name, value in metrics.items():
                self._evaluate_metric(
                    host_name,
                    f"{plugin_name}.{partition}.{metric_name}",
                    value,
                    alert_states,
                    state_changes
                )

    def _evaluate_metric(
        self,
        host_name: str,
        metric_path: str,
        value: Any,
        alert_states: Dict[str, AlertState],
        state_changes: list,
    ):
        """Evaluate one metric value and notify on change or ongoing alert.

        Does nothing when no threshold is registered for ``metric_path``.
        A level change is appended to ``state_changes``.
        """
        if metric_path not in self.thresholds:
            return
        threshold = self.thresholds[metric_path]

        # Get or create alert state
        if metric_path not in alert_states:
            alert_states[metric_path] = AlertState(metric_path)
        alert_state = alert_states[metric_path]

        old_level = alert_state.level
        new_level = threshold.evaluate_with_hysteresis(value, old_level)

        if alert_state.update(new_level, value):
            state_changes.append((metric_path, old_level, new_level, value))
            self._trigger_notification(host_name, metric_path, old_level, new_level, value)
        elif new_level != AlertLevel.OK:
            # Level unchanged but still alerting: maybe send a reminder
            self._check_renotify(host_name, alert_state, metric_path, value)

    def _trigger_notification(
        self,
        host_name: str,
        metric_path: str,
        old_level: AlertLevel,
        new_level: AlertLevel,
        value: Any,
    ):
        """Trigger a notification for an alert state change."""
        if new_level == AlertLevel.OK:
            message = f"RECOVERED: {host_name} - {metric_path} = {value} ({old_level.name} -> OK)"
        else:
            # WARNING, CRITICAL and UNKNOWN share the "<LEVEL>: ..." format
            message = f"{new_level.name}: {host_name} - {metric_path} = {value}"

        # Send notification
        if self.notification_callback is not None:
            try:
                self.notification_callback(message)
                logger.info("Notification sent: %s", message)
            except Exception as e:
                logger.error("Failed to send notification: %s", e)

        # Log to journal (best effort; failures are only logged at debug)
        if self.journal is not None:
            try:
                import asyncio
                # Raises RuntimeError when no loop is running in this thread;
                # this happens before the coroutine is created, so nothing
                # is left un-awaited.
                loop = asyncio.get_running_loop()
                task = loop.create_task(self.journal.log_threshold_event(
                    host_name=host_name,
                    metric_path=metric_path,
                    old_level=old_level.name,
                    new_level=new_level.name,
                    value=value,
                ))
                self._journal_tasks.add(task)
                task.add_done_callback(self._on_journal_task_done)
            except Exception as e:
                logger.debug(f"Failed to log threshold event to journal: {e}")

    def _on_journal_task_done(self, task):
        """Release a finished journal task and report its failure, if any."""
        self._journal_tasks.discard(task)
        if task.cancelled():
            return
        error = task.exception()
        if error is not None:
            logger.debug("Failed to log threshold event to journal: %s", error)

    def _check_renotify(
        self,
        host_name: str,
        alert_state: AlertState,
        metric_path: str,
        value: Any,
    ):
        """Check if we should send a repeat notification."""
        if alert_state.level == AlertLevel.OK:
            return

        now = time.time()

        if alert_state.last_notification is None:
            # First notification already sent during state change; start the
            # re-notification clock from this check.
            alert_state.last_notification = now
            alert_state.notification_count = 1
            return

        if (now - alert_state.last_notification) < self.renotify_interval:
            return

        # Time to re-notify
        message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (ongoing for {int(now - alert_state.since)}s)"
        if self.notification_callback:
            try:
                self.notification_callback(message)
                # Bookkeeping only advances when the callback succeeded, so a
                # failed reminder is retried on the next check.
                alert_state.last_notification = now
                alert_state.notification_count += 1
                logger.info("Re-notification sent: %s", message)
            except Exception as e:
                logger.error("Failed to send re-notification: %s", e)

    def get_active_alerts(self, alert_states: Dict[str, AlertState]) -> list:
        """
        Get all currently active (non-OK) alerts.

        Args:
            alert_states: Host's alert_states dictionary

        Returns:
            List of AlertState objects that are not OK
        """
        return [
            state for state in alert_states.values()
            if state.level != AlertLevel.OK
        ]

    def get_alert_summary(self, alert_states: Dict[str, AlertState]) -> Dict[str, int]:
        """
        Get summary counts of alert levels.

        Args:
            alert_states: Host's alert_states dictionary

        Returns:
            Dictionary with counts:
            {"ok": N, "warning": N, "critical": N, "unknown": N}
        """
        bucket_for = {
            AlertLevel.OK: "ok",
            AlertLevel.WARNING: "warning",
            AlertLevel.CRITICAL: "critical",
            AlertLevel.UNKNOWN: "unknown",
        }
        summary = {"ok": 0, "warning": 0, "critical": 0, "unknown": 0}
        for state in alert_states.values():
            bucket = bucket_for.get(state.level)
            if bucket is not None:
                summary[bucket] += 1
        return summary
+3
View File
@@ -0,0 +1,3 @@
"""Common utilities shared between hbc and hbd."""
__version__ = "5.0.5"
+160
View File
@@ -0,0 +1,160 @@
"""Message encoding/decoding utilities for hbd protocol.
Message Types:
HTB: Heartbeat message (client -> server)
ACK: Acknowledgment (server -> client)
CMD: Command message (server -> client)
UPD: Update message (server -> client)
PLG: Plugin data message (client -> server)
"""
from typing import Dict, Any, Union
import json
import zlib
def encode_value(v: Any) -> str:
    """Encode a value for protocol transmission.

    Floats are rendered with five decimals, lists and dicts as JSON prefixed
    with "@", booleans as "1"/"0", and everything else via ``str()``.

    Args:
        v: Value to encode (int, float, str, bool, list, dict, etc.)

    Returns:
        String representation suitable for protocol
    """
    if isinstance(v, float):
        return f"{v:0.5f}"
    if isinstance(v, (list, dict)):
        # ";" separates fields on the wire, so a literal ";" inside the JSON
        # would cut the value short when the message is split. In JSON text
        # produced by json.dumps a ";" can only occur inside a string
        # literal, where the \u003b escape decodes to the same character.
        return "@" + json.dumps(v).replace(";", "\\u003b")
    if isinstance(v, bool):
        return str(int(v))  # True->1, False->0
    # NOTE: plain strings are sent as-is; a ";" in one still breaks framing.
    return str(v)
def decode_value(val: str) -> Any:
    """Decode a value from protocol format.

    Values starting with "@" are parsed as JSON (falling back to the text
    after the "@" if that fails). Values that start with a digit, or with
    "-" followed by a digit, are parsed as a Python literal; anything that
    is not a valid literal is returned unchanged as a string.

    Args:
        val: String value from protocol

    Returns:
        Decoded Python object
    """
    # Local import so this function does not depend on the module's import
    # block.
    import ast

    if not val:
        return val

    # JSON-encoded complex types
    if val.startswith("@"):
        try:
            return json.loads(val[1:])
        except (ValueError, RecursionError):
            return val[1:]  # Return as string without @

    looks_numeric = val[0].isdigit() or (
        val[0] == "-" and len(val) > 1 and val[1].isdigit()
    )
    if not looks_numeric:
        return val

    # SECURITY: this text comes off the network. eval() would execute
    # arbitrary expressions (e.g. "0 or __import__('os').system(...)") and
    # would also compute "2026-10-29" as arithmetic. literal_eval accepts
    # literals only.
    try:
        return ast.literal_eval(val)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return val
def dicttos(ID: str, d: Dict[str, Any], compress: bool = False):
    """Serialize a dict to protocol message bytes.

    Every entry becomes ``key=<encoded value>`` and entries are joined with
    ";". Without compression the result is ``ID:payload``; with compression
    the payload is zlib-compressed (level 6) and the header becomes ``!ID:``.
    """
    payload = ";".join(f"{key}={encode_value(value)}" for key, value in d.items())
    if not compress:
        return (ID + ":" + payload).encode()
    header = ("!" + ID + ":").encode()
    return header + zlib.compress(payload.encode(), 6)
def stodict(msg: bytes):
    """Deserialize a protocol message into a dict.

    Two layouts are accepted:

    * ``b"ID:key=value;..."`` - plain text
    * ``b"!ID:" + zlib-compressed payload`` - marked by the leading "!"

    In both, the ID is everything before the first colon, so IDs need not
    be exactly three characters long.

    Returns:
        A dict with key 'ID' set to the message ID plus the parsed key/value
        pairs; a field without "=" maps to None. An empty dict is returned
        for any malformed message (no colon, bad compression, or bytes that
        are not valid text).
    """
    try:
        if msg[:1] == b"!":
            header, sep, body = msg[1:].partition(b":")
            if not sep:
                return {}
            pk = zlib.decompress(body).decode()
        else:
            # Unpacking fails with ValueError when there is no colon.
            header, body = msg.split(b":", 1)
            pk = body.decode()
        # Decoded inside the try so a non-text ID is treated as malformed
        # input rather than raising to the caller.
        msg_id = header.decode()
    except (ValueError, zlib.error):
        # UnicodeDecodeError is a ValueError subclass.
        return {}

    d = {"ID": msg_id}
    for field in pk.split(";"):
        if not field:
            continue
        key, sep, raw = field.partition("=")
        d[key.strip()] = decode_value(raw.strip()) if sep else None
    return d
def oldmtodict(msg: bytes):
    """Parse an old-style message that carries no ID prefix.

    The message is given an ``HTB:`` prefix and handed to ``stodict``.
    """
    prefixed = b"HTB:" + msg
    return stodict(prefixed)
def encode_plugin_data(plugin_name: str, data: Dict[str, Any], compress: bool = False) -> bytes:
    """Build a PLG message from a plugin's data.

    Args:
        plugin_name: Name of the plugin (e.g., "os_info", "cpu_monitor")
        data: Plugin data dictionary
        compress: Whether to compress the payload

    Returns:
        Encoded message bytes
    """
    # The "plugin" field goes first; a "plugin" key inside data replaces it.
    payload = {"plugin": plugin_name}
    payload.update(data)
    return dicttos("PLG", payload, compress)
def decode_plugin_data(msg: bytes) -> Dict[str, Any]:
    """Parse a PLG message by delegating to ``stodict``.

    Args:
        msg: Raw message bytes

    Returns:
        Whatever ``stodict`` produces for the message: the 'ID' entry plus
        the key/value fields carried in the payload.
    """
    decoded = stodict(msg)
    return decoded
+12 -5
View File
@@ -39,6 +39,15 @@ DEFAULTS = {
"cert_path": "/usr/local/etc/ssl/",
"wss_pem": "fullchain.pem",
"wss_key": "privkey.pem",
# Message journal configuration
"journal_enabled": True,
"journal_dir": "/var/log/heartbeat",
"journal_file": "messages.journal",
"journal_max_size": 100 * 1024 * 1024, # 100MB
"journal_max_backups": 10,
"plugins": {},
"thresholds": {},
"threshold_renotify_interval": 3600,
}
@@ -56,12 +65,10 @@ def load_config(path=None):
if yaml:
with open(path) as fh:
data = yaml.safe_load(fh)
# only keep known keys
# Merge YAML data with defaults
# Keep all keys from YAML to support plugin configs and future extensions
for k, v in data.items():
if k in cfg:
cfg[k] = v
else:
logging.warning("unknown config key %s in %s", k, path)
cfg[k] = v
else:
# yaml not installed: do not attempt to parse; user must ensure defaults
pass
+196
View File
@@ -0,0 +1,196 @@
# Example Heartbeat Client Configuration
# This file demonstrates all available configuration options for the heartbeat client (hbc)
# and its plugin system.
# ==============================================================================
# Server Configuration
# ==============================================================================
server: hbd.example.com # Heartbeat server hostname or IP
port: 50003 # Server UDP port (default: 50003)
interval: 30 # Heartbeat interval in seconds (default: 30)
# ==============================================================================
# Plugin Configuration
# ==============================================================================
# Plugins are configured under the "plugins" section. Each plugin can be enabled/disabled
# and configured with plugin-specific settings.
plugins:
# --------------------------------------------------------------------------
# OS Information Plugin (InfoPlugin - runs once at startup)
# --------------------------------------------------------------------------
os_info:
enabled: true
# No additional configuration needed
# --------------------------------------------------------------------------
# CPU Monitor Plugin (MonitorPlugin - periodic collection)
# --------------------------------------------------------------------------
cpu_monitor:
enabled: true
interval: 300 # Collection interval in seconds (default: 300 = 5 minutes)
per_core: false # Collect per-core CPU statistics (default: false)
# When per_core is true, will report CPU usage for each core separately
# --------------------------------------------------------------------------
# Memory Monitor Plugin (MonitorPlugin)
# --------------------------------------------------------------------------
memory_monitor:
enabled: true
interval: 300 # Collection interval in seconds (default: 300 = 5 minutes)
include_swap: true # Include swap memory statistics (default: true)
# --------------------------------------------------------------------------
# Disk Monitor Plugin (MonitorPlugin)
# --------------------------------------------------------------------------
disk_monitor:
enabled: true
interval: 300 # Collection interval in seconds (default: 300 = 5 minutes)
include_io: true # Include I/O statistics (default: true)
# Optional: Monitor only specific partitions
# partitions:
# - /
# - /home
# - /var
# Optional: Exclude specific filesystem types
exclude_types:
- tmpfs
- devtmpfs
- squashfs
# --------------------------------------------------------------------------
# Network Monitor Plugin (MonitorPlugin)
# --------------------------------------------------------------------------
network_monitor:
enabled: true
interval: 300 # Collection interval in seconds (default: 300 = 5 minutes)
include_connections: true # Include connection statistics (default: true)
include_addresses: false # Include interface addresses (default: false)
# Optional: Monitor only specific interfaces
# interfaces:
# - eth0
# - wlan0
# --------------------------------------------------------------------------
# Filesystem Info Plugin (InfoPlugin - runs once at startup)
# --------------------------------------------------------------------------
filesystem_info:
enabled: true
include_pseudo: false # Include pseudo/virtual filesystems (default: false)
# When false (default), only reports physical mounted filesystems (ext4, zfs, xfs, etc.)
# When true, also includes pseudo filesystems (proc, sysfs, tmpfs, devtmpfs, etc.)
# Optional: Exclude additional specific filesystem types
# exclude_types:
# - squashfs
# - iso9660
# --------------------------------------------------------------------------
# Nagios Runner Plugin (MonitorPlugin)
# --------------------------------------------------------------------------
nagios_runner:
enabled: true
interval: 300 # Collection interval in seconds (default: 300 = 5 minutes)
timeout: 30 # Plugin execution timeout in seconds (default: 30)
# List of Nagios plugins to execute
# Each command is executed as-is, so provide full paths and arguments
commands:
# System load monitoring
- /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
# Disk space monitoring
- /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
- /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /home
# Process monitoring
- /usr/lib/nagios/plugins/check_procs -w 250 -c 400 -s RSZDT
# Swap usage
- /usr/lib/nagios/plugins/check_swap -w 20% -c 10%
# Custom script example
# - /usr/local/bin/check_my_app.sh
# ==============================================================================
# Advanced Options
# ==============================================================================
# These options control client behavior
# Compression: Enable zlib compression for heartbeat messages (default: true)
compress: true
# Hostname: Override the system hostname (default: auto-detect)
# hostname: myhost.example.com
# Message: Custom message included in heartbeat (optional)
# message: "Production web server"
# Logging
log_level: INFO # Log level: DEBUG, INFO, WARNING, ERROR (default: INFO)
# logfile: /var/log/hbc.log # Optional log file path
# ==============================================================================
# Example Profiles
# ==============================================================================
# Below are example configuration profiles for different use cases
# Minimal Configuration (default settings):
# -----------------------------------------
# server: hbd.example.com
# interval: 30
# Monitoring Server (comprehensive metrics):
# ------------------------------------------
# server: monitoring.example.com
# interval: 30
# plugins:
# cpu_monitor:
# enabled: true
# interval: 15
# per_core: true
# memory_monitor:
# enabled: true
# interval: 15
# disk_monitor:
# enabled: true
# interval: 60
# network_monitor:
# enabled: true
# interval: 30
# include_connections: true
# Nagios Integration (leverage existing plugins):
# -----------------------------------------------
# server: hbd.example.com
# plugins:
# nagios_runner:
# enabled: true
# interval: 300 # Check every 5 minutes
# commands:
# - /usr/lib/nagios/plugins/check_http -H localhost -p 80
# - /usr/lib/nagios/plugins/check_mysql -H localhost -u monitor -p password
# - /usr/lib/nagios/plugins/check_smtp -H mail.example.com
# ==============================================================================
# Threshold Configuration (for Heartbeat Daemon)
# ==============================================================================
# NOTE: Thresholds are configured on the SERVER side (hbd), not the client (hbc).
# This is just an example - see config_thresholds_example.yaml for comprehensive examples.
#
# Basic threshold example:
# thresholds:
# cpu_monitor:
# cpu_percent:
# warning: 80.0
# critical: 90.0
# memory_monitor:
# percent:
# warning: 85.0
# critical: 95.0
# disk_monitor:
# partitions:
# /:
# percent:
# warning: 80.0
# critical: 90.0
+111
View File
@@ -0,0 +1,111 @@
# Heartbeat Configuration Example with Nagios Plugin Runner
# This example shows how to configure the Nagios Runner plugin
# to execute existing Nagios-compatible monitoring plugins
# Basic server settings (existing config)
hb_port: 50003
hbd_port: 50004
interval: 20
grace: 2
# Plugin configuration
# Each plugin can have its own configuration section
# CPU Monitor Plugin
cpu_monitor:
interval: 300 # Collect every 5 minutes (default)
per_core: false # Set to true to get per-core CPU usage
# Nagios Runner Plugin
nagios_runner:
interval: 300 # Run Nagios plugins every 5 minutes (default)
timeout: 30 # Command execution timeout in seconds
shell: true # Execute commands via shell
# List of Nagios plugins to run
commands:
# Example 1: Check disk space
- name: check_disk_root
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
# Example 2: Check disk space for /home
- name: check_disk_home
command: /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /home
# Example 3: Check system load
- name: check_load
command: /usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
# Example 4: Check process count
- name: check_procs
command: /usr/lib/nagios/plugins/check_procs -w 250 -c 400
# Example 5: Check SSH service
- name: check_ssh
command: /usr/lib/nagios/plugins/check_ssh localhost
# Example 6: Check HTTP service
- name: check_http
command: /usr/lib/nagios/plugins/check_http -H localhost
# Example 7: Check swap usage
- name: check_swap
command: /usr/lib/nagios/plugins/check_swap -w 20% -c 10%
# Example 8: Custom script (Nagios plugin format)
- name: check_custom
command: /usr/local/bin/my_custom_check.sh
# Example 9: Check specific log file
- name: check_logs
command: /usr/lib/nagios/plugins/check_log -F /var/log/syslog -O /var/tmp/check_log.old -q "ERROR"
# Notes:
#
# 1. Nagios Plugin Output Format:
# - Single line: STATUS - Message | performance_data
# - Performance data format: 'label'=value[UOM];[warn];[crit];[min];[max]
#
# 2. Exit Codes:
# - 0 = OK
# - 1 = WARNING
# - 2 = CRITICAL
# - 3 = UNKNOWN
#
# 3. Performance Data:
# - Automatically parsed and included in heartbeat data
# - Metrics are stored as: {plugin_name}_{metric_name}
# - Example: check_disk_root_/ will contain the disk usage percentage
#
# 4. Overall Status:
# - The plugin reports the worst status from all commands
# - Useful for quick health checks
#
# 5. Plugin Paths:
# Common Nagios plugin directories:
# - Debian/Ubuntu: /usr/lib/nagios/plugins/
# - RHEL/CentOS: /usr/lib64/nagios/plugins/
# - Custom installs: /usr/local/nagios/libexec/
#
# 6. Installing Nagios Plugins:
# Debian/Ubuntu: sudo apt-get install nagios-plugins
# RHEL/CentOS: sudo yum install nagios-plugins-all
# Arch Linux: sudo pacman -S monitoring-plugins
#
# 7. Writing Custom Nagios Plugins:
# Any script can be a Nagios plugin if it:
# - Returns appropriate exit codes (0-3)
# - Prints status message to stdout
# - Optionally includes performance data after "|"
#
# Example custom plugin (save as /usr/local/bin/check_example.sh):
# #!/bin/bash
# if [ "$(who | wc -l)" -gt 50 ]; then
# echo "CRITICAL - Too many users | users=52;40;50;0"
# exit 2
# else
# echo "OK - Normal user count | users=25;40;50;0"
# exit 0
# fi
+254
View File
@@ -0,0 +1,254 @@
# ==============================================================================
# Heartbeat Daemon Threshold Configuration Example
# ==============================================================================
# This file demonstrates threshold configuration for the Heartbeat monitoring system.
# Thresholds can be defined for any metric collected by monitoring plugins.
#
# Threshold levels:
# - WARNING: First level of concern, typically for early notification
# - CRITICAL: Severe condition requiring immediate attention
#
# Alert notifications are sent when:
# - A metric crosses from OK to WARNING or CRITICAL
# - A metric crosses from WARNING to CRITICAL
# - A metric recovers (returns to a lower severity level)
#
# Re-notifications are sent for ongoing alerts based on threshold_renotify_interval.
# ==============================================================================
# Global threshold settings
threshold_renotify_interval: 3600 # Re-notify every hour for ongoing alerts (seconds)
# Threshold definitions per plugin
thresholds:
# ----------------------------------------------------------------------------
# CPU Monitor Thresholds
# ----------------------------------------------------------------------------
cpu_monitor:
# Overall CPU usage percentage (0-100)
cpu_percent:
warning: 80.0 # Warn when CPU usage exceeds 80%
critical: 90.0 # Critical when CPU usage exceeds 90%
operator: ">" # Alert when value is GREATER than threshold
hysteresis: 0.1 # 10% hysteresis to prevent flapping
enabled: true
# 1-minute load average
load_1min:
warning: 4.0 # Warn when 1-min load exceeds 4.0
critical: 8.0 # Critical when 1-min load exceeds 8.0
operator: ">"
hysteresis: 0.15 # 15% hysteresis
enabled: true
# 5-minute load average
load_5min:
warning: 3.0
critical: 6.0
operator: ">"
hysteresis: 0.15
enabled: true
# 15-minute load average
load_15min:
warning: 2.0
critical: 4.0
operator: ">"
hysteresis: 0.15
enabled: true
# ----------------------------------------------------------------------------
# Memory Monitor Thresholds
# ----------------------------------------------------------------------------
memory_monitor:
# Memory usage percentage
percent:
warning: 85.0 # Warn at 85% memory usage
critical: 95.0 # Critical at 95% memory usage
operator: ">"
hysteresis: 0.1
enabled: true
# Available memory in MB (inverse threshold - alert when LOW)
available_mb:
warning: 1000 # Warn when less than 1GB available
critical: 500 # Critical when less than 500MB available
operator: "<" # Alert when value is LESS than threshold
hysteresis: 0.1
enabled: true
# Swap usage percentage
swap_percent:
warning: 50.0 # Warn at 50% swap usage
critical: 80.0 # Critical at 80% swap usage
operator: ">"
hysteresis: 0.1
enabled: true
# ----------------------------------------------------------------------------
# Disk Monitor Thresholds
# ----------------------------------------------------------------------------
disk_monitor:
# Partition-specific thresholds
# Use the mount point as the key
partitions:
# Root filesystem
/:
percent:
warning: 80.0 # Warn at 80% disk usage
critical: 90.0 # Critical at 90% disk usage
operator: ">"
hysteresis: 0.05 # 5% hysteresis for disk (more stable)
enabled: true
free_gb:
warning: 10.0 # Warn when less than 10GB free
critical: 5.0 # Critical when less than 5GB free
operator: "<"
hysteresis: 0.1
enabled: true
# Home filesystem (if separate partition)
/home:
percent:
warning: 85.0
critical: 95.0
operator: ">"
hysteresis: 0.05
enabled: true
# Var filesystem (logs, etc.)
/var:
percent:
warning: 80.0
critical: 90.0
operator: ">"
hysteresis: 0.05
enabled: true
free_gb:
warning: 5.0 # Var needs space for logs
critical: 2.0
operator: "<"
hysteresis: 0.1
enabled: true
# ----------------------------------------------------------------------------
# Network Monitor Thresholds
# ----------------------------------------------------------------------------
network_monitor:
# Total error count across all interfaces
errors_total:
warning: 100 # Warn at 100 errors
critical: 1000 # Critical at 1000 errors
operator: ">"
hysteresis: 0.2 # 20% hysteresis for counters
enabled: true
# Total dropped packets
dropin_total:
warning: 50
critical: 200
operator: ">"
hysteresis: 0.2
enabled: true
dropout_total:
warning: 50
critical: 200
operator: ">"
hysteresis: 0.2
enabled: true
# TCP connections in TIME_WAIT state
connections_TIME_WAIT:
warning: 1000 # Warn at 1000 TIME_WAIT connections
critical: 5000 # Critical at 5000 TIME_WAIT connections
operator: ">"
hysteresis: 0.2
enabled: true
# Total established connections
connections_ESTABLISHED:
warning: 500
critical: 1000
operator: ">"
hysteresis: 0.1
enabled: true
# ----------------------------------------------------------------------------
# Nagios Plugin Thresholds (if using nagios_runner)
# ----------------------------------------------------------------------------
nagios_runner:
# Nagios plugins report exit codes:
# 0 = OK, 1 = WARNING, 2 = CRITICAL, 3 = UNKNOWN
# We can threshold on the exit_code directly
exit_code:
warning: 1 # Map Nagios WARNING to our WARNING
critical: 2 # Map Nagios CRITICAL to our CRITICAL
operator: ">=" # Alert when exit code >= threshold
hysteresis: 0.0 # No hysteresis for exit codes
enabled: true
# ==============================================================================
# Notification Configuration
# ==============================================================================
# Configure notification methods (email, pushover, etc.)
# These are used when threshold violations occur
# Email notifications
toemail:
- admin@example.com
- oncall@example.com
fromemail: heartbeat@example.com
smtpserver: smtp.example.com
smtpport: 587
smtpuser: heartbeat@example.com
smtppassword: your-password-here
# Pushover notifications (optional)
# pushover_token: your-pushover-app-token
# pushover_user: your-pushover-user-key
# Mattermost webhook (optional)
# mattermost_url: https://mattermost.example.com/hooks/your-webhook-id
# ==============================================================================
# Watched Hosts
# ==============================================================================
# Hosts in this list will trigger notifications for:
# - Heartbeat timeouts/overdue
# - Threshold violations
# - Boot messages
watchhosts:
- webserver01
- database01
- mailserver
- critical-app
# ==============================================================================
# Additional Server Settings
# ==============================================================================
hb_port: 50003 # UDP port for heartbeat messages
hbd_port: 50004 # HTTP port for web interface
grace: 10 # Grace period for overdue detection (seconds)
debug: 0 # Debug level (0-3)
verbose: false # Verbose output
# Journal settings (message logging)
journal_enabled: true
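journal_dir: /var/log/heartbeat # Directory holding the journal and its rotated backups
journal_file: messages.journal # Base filename of the active journal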
journal_max_size: 104857600 # 100MB before rotation
journal_max_backups: 10
# ==============================================================================
# Example: Production Configuration with Conservative Thresholds
# ==============================================================================
# For production systems, consider:
# - Higher warning thresholds to reduce alert fatigue
# - Appropriate hysteresis values (5-15% typical)
# - Re-notification intervals matching on-call rotation
# - Multiple escalation contacts
# - Integration with incident management systems
# ==============================================================================
View File
-82
View File
@@ -1,82 +0,0 @@
"""Message encoding/decoding utilities for hbd protocol."""
from typing import Dict, Any
import zlib
def dicttos(ID: str, d: Dict[str, Any], compress: bool = False):
    """Encode a dict as a protocol message and return it as bytes.

    Each item becomes ``key=value``; float values are rendered with five
    decimal places. Items are joined with ``;``.

    Args:
        ID: Message identifier placed in front of the payload.
        d: Mapping of field names to values.
        compress: When True the payload is zlib-compressed (level 6) and the
            message is ``!ID:`` followed by the compressed bytes. Otherwise
            the message is the plain text ``ID:key=value;...``.

    Returns:
        The encoded message as bytes.
    """
    fields = [
        f"{key}={value:0.5f}" if isinstance(value, float) else f"{key}={value}"
        for key, value in d.items()
    ]
    payload = ";".join(fields)

    if not compress:
        return (ID + ":" + payload).encode()

    header = ("!" + ID + ":").encode()
    return header + zlib.compress(payload.encode(), 6)
def stodict(msg: bytes):
    """Deserialize a protocol message into a dict.

    Two wire formats are accepted:

    * plain:      ``b"ID:key=value;key=value"``
    * compressed: ``b"!ID:" + zlib_compressed_payload``

    The header ends at the first ``:``, so IDs of any length round-trip
    (the classic 3-character IDs parse exactly as before).

    Values whose first character is a digit are converted with
    ``ast.literal_eval``, so only Python literals (ints, floats, ...) are
    produced. Anything that is not a valid literal stays a string. Messages
    arrive from the network, so values are never evaluated as code.

    Args:
        msg: Raw message bytes.

    Returns:
        A dict with key ``"ID"`` plus the parsed key/value pairs. A key
        without ``=`` maps to ``None``. An empty dict is returned when the
        message is malformed (no ``:`` separator, undecodable text, or a
        corrupt compressed payload).
    """
    # Function-scope import: only this function needs it.
    import ast

    d = {}
    if msg[:1] == b"!":
        # Compressed message: b'!' + ID + b':' + zlib payload.
        try:
            sep = msg.index(b":")
            msg_id = msg[1:sep].decode()
            pk = zlib.decompress(msg[sep + 1:]).decode()
        except (ValueError, zlib.error):
            # No separator, corrupt payload, or undecodable text
            # (UnicodeDecodeError is a ValueError).
            return {}
        d["ID"] = msg_id
    else:
        head, sep, tail = msg.partition(b":")
        if not sep:
            return {}
        try:
            pk = tail.decode()
            d["ID"] = head.decode()
        except UnicodeDecodeError:
            return {}

    if not pk:
        return d

    for item in pk.split(";"):
        if not item:
            continue
        key, eq, raw = item.partition("=")
        key = key.strip()
        if not eq:
            d[key] = None
            continue
        val = raw.strip()
        if val and val[0].isdigit():
            try:
                d[key] = ast.literal_eval(val)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                # Not a literal (e.g. an IP address or version string).
                d[key] = val
        else:
            d[key] = val
    return d
def oldmtodict(msg: bytes):
    """Parse an old-style message that carries no ID prefix.

    The payload is given the ``HTB:`` prefix and then handed to ``stodict``,
    so the result is whatever ``stodict`` returns for that prefixed message.
    """
    prefixed = b"HTB:" + msg
    return stodict(prefixed)
+3
View File
@@ -0,0 +1,3 @@
"""HeartBeat Daemon (hbd) - Server/daemon component."""
__version__ = "5.0.5"
+1 -1
View File
@@ -3,7 +3,7 @@
import argparse
from .config import load_config
from .server import run as run_server
from .main import run as run_server
PUSHSRVS = ["all", "pushover", "mattermost"]
+103
View File
@@ -0,0 +1,103 @@
"""Configuration loader and defaults for hbd (HeartBeat Daemon/Server)."""
import logging
import os
try:
import yaml
except Exception:
yaml = None
SERVER_DEFAULTS = {
# Network settings
"hb_port": 50003, # Port to listen for heartbeats
"hbd_port": 50004, # HTTP API port
"hbd_host": "", # Bind address (empty = all interfaces)
# Persistence
"pickfile": "/tmp/hb.pick",
# Logging
"logfile": "/var/log/heartbeat.log",
"logfmt": "text", # text or msg or json
# Notification settings
"pushsrv": "pushover", # pushover, mattermost, or all
"pushover_token": "",
"pushover_user": "",
# Monitoring settings
"interval": 20, # Expected heartbeat interval (for server checks)
"grace": 2, # Grace multiplier (interval * grace = timeout)
"threshold_renotify_interval": 3600, # Seconds between threshold re-notifications
# Host management
"watchhosts": [], # Hosts to monitor and notify about
"dyndnshosts": [], # Hosts with dynamic DNS
"drophosts": [], # Hosts to ignore
"dyndomains": ["wrede.org"],
# DNS updates
"nsupdate_bin": "/usr/bin/nsupdate",
# Email settings
"smtpserver": "smtp.fastmail.com",
"smtpuser": "andreas@wrede.ca",
"smtppassword": "pvtvefyp5gbhnch2",
"smtpport": 587,
"toemail": ["aew.hbd.notify@wrede.ca"],
"fromemail": "aew.hbd@wrede.ca",
# WebSocket settings
"ws_port": 50005,
"wss_port": None,
"cert_path": "/usr/local/etc/ssl/",
"wss_pem": "fullchain.pem",
"wss_key": "privkey.pem",
# Message journal configuration
"journal_enabled": True,
"journal_dir": "/var/log/heartbeat",
"journal_file": "messages.journal",
"journal_max_size": 100 * 1024 * 1024, # 100MB
"journal_max_backups": 10,
# Runtime flags
"foreground": False,
"verbose": False,
"debug": 0,
# Plugin/threshold configs (for clients reporting to this server)
"plugins": {},
"thresholds": {},
}
def load_config(path=None):
    """Load configuration from a YAML file and merge it over the defaults.

    The result starts as a deep copy of ``SERVER_DEFAULTS``, so callers may
    mutate nested lists/dicts (``watchhosts``, ``plugins``, ...) without
    altering the module-level defaults. Every top-level key found in the
    YAML file replaces the default of the same name; unknown keys are kept
    to support plugin configs and future extensions.

    Defaults are returned unchanged when the file does not exist, when it is
    empty, or when PyYAML is not installed (a warning is logged in that last
    case because an existing config file is being ignored).

    Args:
        path: Path to YAML config file (default: ~/.hb.yaml)

    Returns:
        Dictionary with configuration
    """
    # Function-scope import: only this function needs it.
    import copy

    cfg = copy.deepcopy(SERVER_DEFAULTS)

    if not path:
        # default path (~/.hb.yaml)
        path = os.path.join(os.path.expanduser("~"), ".hb.yaml")

    if not os.path.exists(path):
        return cfg

    if yaml is None:
        logging.getLogger(__name__).warning(
            "PyYAML is not installed; ignoring config file %s and using defaults",
            path,
        )
        return cfg

    with open(path, encoding="utf-8") as fh:
        # safe_load returns None for an empty document.
        data = yaml.safe_load(fh) or {}

    cfg.update(data.items())
    return cfg
View File
@@ -179,6 +179,11 @@ class Host:
self.cver = 0
self.connections = {}
self.hdwcounts = [[0, 0], [0, 0], [0, 0]]
# Plugin data storage: {plugin_name: [(timestamp, data), ...]}
self.plugin_data = {}
self.plugin_retention = 100 # Keep last N samples per plugin
# Alert state tracking: {metric_path: AlertState}
self.alert_states = {}
def statedict(self):
d = {}
@@ -272,8 +277,72 @@ class Host:
addr = addr[7:]
self.connections[c].addr = addr
# Add plugin_data if missing (for backward compatibility)
if not hasattr(self, "plugin_data"):
self.plugin_data = {}
if not hasattr(self, "plugin_retention"):
self.plugin_retention = 100
if not hasattr(self, "alert_states"):
self.alert_states = {}
pass
def add_plugin_data(self, plugin_name, data, timestamp=None):
    """Append one timestamped sample to a plugin's history.

    The sample is stored as a ``(timestamp, data)`` tuple at the end of
    ``self.plugin_data[plugin_name]``. When the history grows beyond
    ``self.plugin_retention`` entries, only the newest entries are kept.

    Args:
        plugin_name: Name of the plugin (e.g., "cpu_monitor")
        data: Dict of plugin data
        timestamp: Sample time; ``time.time()`` is used when omitted
    """
    when = time.time() if timestamp is None else timestamp

    history = self.plugin_data.setdefault(plugin_name, [])
    history.append((when, data))

    # Trim to the retention window (newest samples win).
    if len(history) > self.plugin_retention:
        self.plugin_data[plugin_name] = history[-self.plugin_retention:]
def get_plugin_data(self, plugin_name, limit=None):
    """Retrieve stored samples for a specific plugin.

    Args:
        plugin_name: Name of the plugin
        limit: Maximum number of most-recent samples to return. ``None``,
            zero, or a negative number means "no limit". A negative value
            used to fall through to ``data[-limit:]``, which silently
            dropped the oldest samples instead of limiting the result.

    Returns:
        List of (timestamp, data) tuples, most recent last. An empty list
        is returned when the plugin has no stored data.
    """
    data = self.plugin_data.get(plugin_name, [])
    if limit is not None and 0 < limit < len(data):
        return data[-limit:]
    return data
def get_latest_plugin_data(self, plugin_name):
    """Return the newest stored sample for a plugin.

    Args:
        plugin_name: Name of the plugin

    Returns:
        The last ``(timestamp, data)`` tuple in the plugin's history, or
        None when the plugin is unknown or has no samples.
    """
    history = self.plugin_data.get(plugin_name, [])
    if not history:
        return None
    return history[-1]
def get_all_plugin_data(self):
    """Return the complete plugin history of this host.

    Returns:
        The host's own ``plugin_data`` mapping (not a copy), shaped as
        {plugin_name: [(timestamp, data), ...]}
    """
    everything = self.plugin_data
    return everything
# def dispstate(self):
# if self.state in ["down", "overdue"]:
# state = "<b>%s</b>" % self.state
+193
View File
@@ -32,6 +32,7 @@ async def start(
verbose=False,
get_now=None,
VER="",
threshold_checker=None,
):
"""Start an aiohttp web server and block until cancelled.
@@ -183,17 +184,209 @@ async def start(
return web.Response(status=404, text="Not Found")
return web.FileResponse(path=target)
# -------------------------------------------------------------------------
# Plugin Data API Endpoints
# -------------------------------------------------------------------------
async def api_host_plugins(request):
    """Return the latest sample of every plugin for one host as JSON.

    The host name comes from the ``hostname`` route parameter. An unknown
    host yields a 404 JSON error. Plugins with an empty sample list are
    left out of the response.
    """
    hostname = request.match_info.get("hostname")
    known_hosts = hbdclass.Host.hosts
    if hostname not in known_hosts:
        return web.json_response(
            {"error": f"Host '{hostname}' not found"},
            status=404
        )

    latest_by_plugin = {}
    for name, history in known_hosts[hostname].plugin_data.items():
        if not history:
            continue
        # The newest sample is the last entry in the history.
        stamp, payload = history[-1]
        latest_by_plugin[name] = {
            "timestamp": stamp,
            "data": payload,
            "sample_count": len(history),
        }

    return web.json_response({
        "hostname": hostname,
        "plugins": latest_by_plugin,
    })
async def api_host_plugin_detail(request):
    """Return recent samples of one plugin on one host as JSON.

    Route parameters ``hostname`` and ``plugin_name`` select the data. The
    optional ``limit`` query parameter (default 10, also used when the value
    is not an integer) is passed to ``Host.get_plugin_data``. A 404 JSON
    error is returned for an unknown host or when no samples exist.
    """
    hostname = request.match_info.get("hostname")
    plugin_name = request.match_info.get("plugin_name")

    known_hosts = hbdclass.Host.hosts
    if hostname not in known_hosts:
        return web.json_response(
            {"error": f"Host '{hostname}' not found"},
            status=404
        )
    host = known_hosts[hostname]

    # Parse ?limit=N, falling back to 10 on non-numeric input.
    raw_limit = request.rel_url.query.get("limit", "10")
    try:
        limit = int(raw_limit)
    except ValueError:
        limit = 10

    history = host.get_plugin_data(plugin_name, limit=limit)
    if not history:
        return web.json_response(
            {"error": f"No data for plugin '{plugin_name}' on host '{hostname}'"},
            status=404
        )

    formatted_samples = []
    for stamp, payload in history:
        formatted_samples.append({"timestamp": stamp, "data": payload})

    return web.json_response({
        "hostname": hostname,
        "plugin": plugin_name,
        "samples": formatted_samples,
        "sample_count": len(formatted_samples),
    })
async def api_host_alerts(request):
    """Return every alert state of one host, plus a summary, as JSON.

    Each alert state is serialized through its ``to_dict()`` method. The
    summary comes from ``threshold_checker.get_alert_summary`` when a
    threshold checker was supplied; otherwise an all-zero summary is sent.
    An unknown host yields a 404 JSON error.
    """
    hostname = request.match_info.get("hostname")
    known_hosts = hbdclass.Host.hosts
    if hostname not in known_hosts:
        return web.json_response(
            {"error": f"Host '{hostname}' not found"},
            status=404
        )
    host = known_hosts[hostname]

    alerts = [state.to_dict() for state in host.alert_states.values()]

    if threshold_checker:
        summary = threshold_checker.get_alert_summary(host.alert_states)
    else:
        summary = {"ok": 0, "warning": 0, "critical": 0, "unknown": 0}

    return web.json_response({
        "hostname": hostname,
        "alerts": alerts,
        "summary": summary,
    })
async def api_all_alerts(request):
    """Return the active alerts of all hosts as one sorted JSON list.

    Active alerts come from ``threshold_checker.get_active_alerts`` when a
    checker is available; otherwise every alert state whose level is not
    ``AlertLevel.OK`` counts as active. Each alert dict is tagged with its
    hostname. The list is ordered by level (CRITICAL, WARNING, UNKNOWN, OK,
    anything else last), then hostname, then metric path. The summary
    counts alerts per lower-cased level and carries the overall total.
    """
    collected = []
    for hostname, host in hbdclass.Host.hosts.items():
        if not threshold_checker:
            # Fallback if no threshold checker; imported lazily so the
            # module is only needed on this path.
            from hbd.client.threshold import AlertLevel
            active = [
                state for state in host.alert_states.values()
                if state.level != AlertLevel.OK
            ]
        else:
            active = threshold_checker.get_active_alerts(host.alert_states)

        for state in active:
            record = state.to_dict()
            record["hostname"] = hostname
            collected.append(record)

    severity_rank = {"CRITICAL": 0, "WARNING": 1, "UNKNOWN": 2, "OK": 3}

    def _sort_key(record):
        # Most severe first; unknown level names sort after all known ones.
        return (
            severity_rank.get(record["level"], 99),
            record["hostname"],
            record["metric_path"],
        )

    collected.sort(key=_sort_key)

    summary = {"critical": 0, "warning": 0, "unknown": 0, "total": len(collected)}
    for record in collected:
        bucket = record["level"].lower()
        if bucket in summary:
            summary[bucket] += 1

    return web.json_response({
        "alerts": collected,
        "summary": summary,
        "host_count": len(hbdclass.Host.hosts),
    })
# -------------------------------------------------------------------------
# UI Pages
# -------------------------------------------------------------------------
async def plugins_page(request):
    """Render the plugin metrics visualization page.

    Templates are loaded from ``config["templates_dir"]`` or, when that key
    is absent, from the ``templates`` directory next to this module. The
    ``plugins.html`` template receives every host that has plugin data
    (sorted by name) together with the names of its plugins.

    Autoescaping is enabled for the template because host and plugin names
    are rendered into HTML and are not fixed strings controlled by this
    module.
    """
    pkg_dir = os.path.dirname(__file__)
    templates_dir = config.get("templates_dir", os.path.join(pkg_dir, "templates"))
    env = jinja2.Environment(
        loader=jinja2.FileSystemLoader(templates_dir),
        autoescape=jinja2.select_autoescape(["html", "htm", "xml"]),
    )

    # Collect all hosts with plugin data
    hosts_with_plugins = [
        {"name": hostname, "plugins": list(host.plugin_data)}
        for hostname, host in sorted(hbdclass.Host.hosts.items(), key=lambda kv: kv[0])
        if host.plugin_data
    ]

    tmpl = env.get_template("plugins.html")
    body = tmpl.render(
        title="Plugin Metrics - Heartbeat",
        header="Plugin Metrics",
        hosts=hosts_with_plugins,
    )
    return web.Response(text=body, content_type="text/html")
async def alerts_page(request):
    """Render the alerts dashboard page.

    Templates are loaded from ``config["templates_dir"]`` or, when that key
    is absent, from the ``templates`` directory next to this module. The
    ``alerts.html`` template is rendered with a fixed title and header.
    """
    default_templates = os.path.join(os.path.dirname(__file__), "templates")
    templates_dir = config.get("templates_dir", default_templates)
    loader = jinja2.FileSystemLoader(templates_dir)
    env = jinja2.Environment(loader=loader)

    page = env.get_template("alerts.html").render(
        title="Alerts Dashboard - Heartbeat",
        header="Alerts Dashboard",
    )
    return web.Response(text=page, content_type="text/html")
app = web.Application()
app.add_routes(
[
web.get("/", index),
web.get("/api/0/hosts", api_hosts),
web.get("/api/0/messages", api_messages),
web.get("/api/0/hosts/{hostname}/plugins", api_host_plugins),
web.get("/api/0/hosts/{hostname}/plugins/{plugin_name}", api_host_plugin_detail),
web.get("/api/0/hosts/{hostname}/alerts", api_host_alerts),
web.get("/api/0/alerts", api_all_alerts),
web.get("/c", cmd),
web.get("/d", drop),
web.get("/n", register),
web.get("/u", update),
web.get("/live", live),
web.get("/plugins", plugins_page),
web.get("/alerts", alerts_page),
web.get("/static/{path:.*}", static),
web.get("/favicon.ico", favicon),
]
+342
View File
@@ -0,0 +1,342 @@
"""
Journal logging for heartbeat messages.
Provides size-based rotating log files for all received heartbeat messages.
Messages are logged in JSON format for easy parsing and analysis.
"""
import json
import logging
import os
import asyncio
from datetime import datetime
from pathlib import Path
from typing import Dict, Any, Optional
logger = logging.getLogger(__name__)


class MessageJournal:
    """
    Journal logger for heartbeat messages with size-based rotation.

    Features:
    - Logs received messages (HTB heartbeats excluded) and threshold events
      as one JSON object per line
    - Rotates the file when the next entry would push it past the size limit
    - Keeps a configurable number of rotated logs
    - Writes are serialized with an ``asyncio.Lock``: safe across coroutines
      of one event loop, but the lock does not protect against other threads
    - Configurable log directory and file naming

    Configuration keys (read from the ``config`` dict):
        journal_dir: Directory for journal files (default: /var/log/heartbeat)
        journal_file: Base filename (default: messages.journal)
        journal_max_size: Maximum file size in bytes before rotation (default: 100MB)
        journal_max_backups: Number of backup files to keep (default: 10)
        journal_enabled: Enable/disable journaling (default: True)
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Initialize the message journal.

        No file is opened here; call ``initialize()`` before logging.

        Args:
            config: Configuration dictionary with journal settings
        """
        self.config = config or {}

        # Configuration options
        self.journal_dir = Path(self.config.get('journal_dir', '/var/log/heartbeat'))
        self.journal_file = self.config.get('journal_file', 'messages.journal')
        self.max_size = self.config.get('journal_max_size', 100 * 1024 * 1024)  # 100MB default
        self.max_backups = self.config.get('journal_max_backups', 10)
        self.enabled = self.config.get('journal_enabled', True)

        # Runtime state
        self._file_handle = None
        self._current_size = 0
        self._lock = asyncio.Lock()
        self._initialized = False

        # Full path to current journal file
        self.journal_path = self.journal_dir / self.journal_file

    @staticmethod
    def _resolve_timestamp(timestamp: Optional[float]) -> float:
        """Return ``timestamp``, or the current time when it is None."""
        if timestamp is not None:
            return timestamp
        # Function-scope import: only this helper needs it.
        import time
        return time.time()

    async def initialize(self) -> bool:
        """
        Initialize the journal.

        Creates journal directory if needed and opens the journal file in
        append mode. On failure the journal disables itself.

        Returns:
            True if initialization successful (or journaling is disabled),
            False otherwise
        """
        if not self.enabled:
            logger.info("Message journal disabled in configuration")
            return True

        try:
            # Create journal directory if it doesn't exist
            self.journal_dir.mkdir(parents=True, exist_ok=True)

            # A repeated initialize() must not leak the previous handle.
            if self._file_handle:
                self._file_handle.close()
                self._file_handle = None

            # Open journal file in append mode
            self._file_handle = open(self.journal_path, 'a', encoding='utf-8')

            # Get current file size
            try:
                self._current_size = os.path.getsize(self.journal_path)
            except OSError:
                self._current_size = 0

            self._initialized = True
            logger.info(f"Message journal initialized: {self.journal_path} "
                        f"(current size: {self._current_size:,} bytes, "
                        f"max: {self.max_size:,} bytes)")
            return True
        except Exception as e:
            logger.error(f"Failed to initialize message journal: {e}")
            self.enabled = False
            return False

    async def _write_line(self, line: str) -> bool:
        """
        Append one line to the journal, rotating first when needed.

        The caller must hold ``self._lock``. An empty journal is never
        rotated, so an entry larger than ``max_size`` does not produce
        zero-byte backups.

        Returns:
            True if the line was written, False if no file is open
        """
        size = len(line.encode('utf-8'))
        if self._current_size > 0 and self._current_size + size > self.max_size:
            await self._rotate()

        # Rotation may have failed to reopen the file.
        if not self._file_handle:
            return False

        self._file_handle.write(line)
        self._file_handle.flush()  # Ensure data is written
        self._current_size += size
        return True

    async def log_message(
        self,
        msg: Dict[str, Any],
        addr: tuple,
        timestamp: Optional[float] = None
    ):
        """
        Log a received message to the journal.

        Messages whose ``ID`` is ``HTB`` are skipped. Errors are logged and
        never propagated to the caller.

        Args:
            msg: Parsed message dictionary
            addr: Source address (ip, port) tuple
            timestamp: Message timestamp (defaults to current time)
        """
        if not self.enabled or not self._initialized:
            return

        # Skip HTB (heartbeat) messages - too verbose
        if msg.get('ID', '') == 'HTB':
            return

        async with self._lock:
            try:
                timestamp = self._resolve_timestamp(timestamp)
                is_sequence = isinstance(addr, (tuple, list))
                source_ip = addr[0] if is_sequence and addr else str(addr)
                source_port = addr[1] if is_sequence and len(addr) > 1 else None

                entry = {
                    'timestamp': timestamp,
                    'datetime': datetime.fromtimestamp(timestamp).isoformat(),
                    'source_ip': source_ip,
                    'source_port': source_port,
                    'message': msg
                }

                # Serialize to JSON (one line per entry)
                json_line = json.dumps(entry, separators=(',', ':')) + '\n'

                if await self._write_line(json_line):
                    logger.debug("Logged message from %s: %s",
                                 source_ip, msg.get('ID', 'UNKNOWN'))
            except Exception as e:
                logger.error(f"Error writing to journal: {e}")

    async def _rotate(self):
        """
        Rotate the journal file.

        Renames current file with timestamp, opens new file, and removes
        old backups exceeding max_backups limit. If anything fails, one
        attempt is made to reopen the journal; if that fails too, the
        journal disables itself.
        """
        try:
            # Close current file
            if self._file_handle:
                self._file_handle.close()
                self._file_handle = None

            # Generate backup filename with timestamp
            timestamp_str = datetime.now().strftime('%Y%m%d-%H%M%S')
            backup_name = f"{self.journal_file}.{timestamp_str}"
            backup_path = self.journal_dir / backup_name

            # Rename current file to backup
            if self.journal_path.exists():
                self.journal_path.rename(backup_path)
                logger.info(f"Rotated journal: {backup_path} "
                            f"(size: {self._current_size:,} bytes)")

            # Open new journal file
            self._file_handle = open(self.journal_path, 'a', encoding='utf-8')
            self._current_size = 0

            # Clean up old backups
            await self._cleanup_old_backups()
        except Exception as e:
            logger.error(f"Error rotating journal: {e}")
            # Try to reopen the file even if rotation failed
            try:
                self._file_handle = open(self.journal_path, 'a', encoding='utf-8')
            except Exception as e2:
                logger.error(f"Failed to reopen journal after rotation error: {e2}")
                self.enabled = False

    async def _cleanup_old_backups(self):
        """
        Remove old backup files exceeding max_backups limit.

        Keeps only the most recent backups based on filename (which includes timestamp).
        """
        try:
            # Find all backup files
            backup_pattern = f"{self.journal_file}.*"
            backup_files = sorted(self.journal_dir.glob(backup_pattern))

            # Remove oldest backups if we have too many
            excess = len(backup_files) - self.max_backups
            if excess > 0:
                for backup_file in backup_files[:excess]:
                    try:
                        backup_file.unlink()
                        logger.info(f"Removed old backup: {backup_file.name}")
                    except Exception as e:
                        logger.warning(f"Failed to remove old backup {backup_file}: {e}")
        except Exception as e:
            logger.error(f"Error cleaning up old backups: {e}")

    async def log_threshold_event(
        self,
        host_name: str,
        metric_path: str,
        old_level: str,
        new_level: str,
        value: Any,
        timestamp: Optional[float] = None
    ):
        """
        Log a threshold state change event.

        Uses the same rotation and write path as ``log_message``. Errors
        are logged and never propagated to the caller.

        Args:
            host_name: Name of the host
            metric_path: Full metric path (e.g., "cpu_monitor.cpu_percent")
            old_level: Previous alert level
            new_level: New alert level
            value: Current metric value
            timestamp: Event timestamp (default: current time)
        """
        if not self.enabled or not self._initialized:
            return

        try:
            timestamp = self._resolve_timestamp(timestamp)

            event = {
                'timestamp': timestamp,
                'iso_time': datetime.fromtimestamp(timestamp).isoformat(),
                'event_type': 'threshold',
                'host': host_name,
                'metric': metric_path,
                'old_level': old_level,
                'new_level': new_level,
                'value': value,
            }
            line = json.dumps(event) + '\n'

            async with self._lock:
                await self._write_line(line)
        except Exception as e:
            logger.error(f"Error logging threshold event: {e}")

    async def close(self):
        """
        Close the journal and release resources.

        Should be called during shutdown.
        """
        async with self._lock:
            if self._file_handle:
                try:
                    self._file_handle.close()
                    logger.info("Message journal closed")
                except Exception as e:
                    logger.error(f"Error closing journal: {e}")
                finally:
                    self._file_handle = None
                    self._initialized = False

    def get_stats(self) -> Dict[str, Any]:
        """
        Get journal statistics.

        Returns:
            Dictionary with journal stats. ``rotation_threshold`` is the
            current size as a percentage of ``max_size``, or ``"n/a"`` when
            ``max_size`` is not positive.
        """
        if self.max_size > 0:
            fill = f"{(self._current_size / self.max_size * 100):.1f}%"
        else:
            fill = "n/a"

        return {
            'enabled': self.enabled,
            'initialized': self._initialized,
            'current_file': str(self.journal_path),
            'current_size': self._current_size,
            'max_size': self.max_size,
            'max_backups': self.max_backups,
            'rotation_threshold': fill
        }
# Module-wide journal shared by all callers; created lazily by get_journal().
_journal_instance: Optional[MessageJournal] = None


def get_journal(config: Optional[Dict[str, Any]] = None) -> MessageJournal:
    """
    Return the shared journal, creating it on the first call.

    Args:
        config: Configuration dictionary; it is only consulted when the
            shared instance does not exist yet

    Returns:
        MessageJournal instance
    """
    global _journal_instance
    if _journal_instance is not None:
        return _journal_instance
    _journal_instance = MessageJournal(config)
    return _journal_instance
async def log_message(msg: Dict[str, Any], addr: tuple, timestamp: Optional[float] = None):
    """
    Log a message through the shared journal returned by ``get_journal()``.

    Args:
        msg: Parsed message dictionary
        addr: Source address (ip, port) tuple
        timestamp: Message timestamp (defaults to current time)
    """
    await get_journal().log_message(msg, addr, timestamp)
+27 -1
View File
@@ -83,8 +83,23 @@ async def _run_async(config):
from . import dns as dns_mod
from . import notify as notify_mod
from . import monitor as monitor_mod
from . import journal as journal_mod
from ..client import threshold as threshold_mod
notify_mod.setup(config)
# Initialize message journal
msg_journal = journal_mod.get_journal(config)
await msg_journal.initialize()
# Initialize threshold checker
threshold_checker = threshold_mod.ThresholdChecker(
config=config,
notification_callback=notify_mod.pushmsg_from_config,
renotify_interval=config.get("threshold_renotify_interval", 3600),
journal=msg_journal,
)
logger.info("Threshold checker initialized")
pushmsg = notify_mod.pushmsg_from_config
@@ -113,6 +128,8 @@ async def _run_async(config):
log=log,
pushmsg=pushmsg,
msg_to_websockets=msg_to_websockets,
msg_journal=msg_journal,
threshold_checker=threshold_checker,
DEBUG=config.get("debug", 0),
verbose=config.get("verbose", False),
)
@@ -135,6 +152,7 @@ async def _run_async(config):
log=log,
pushmsg=pushmsg,
msg_to_websockets=msg_to_websockets,
threshold_checker=threshold_checker,
tcss=None,
DEBUG=config.get("debug", 0),
verbose=config.get("verbose", False),
@@ -180,10 +198,12 @@ async def _run_async(config):
ssl_context = None
try:
ws_port = config.get("ws_port", 50005)
logger.info("Starting WebSocket server on port %s", ws_port)
ws_task = asyncio.create_task(
ws_mod.start(
host=config.get("hbd_host", ""),
ws_port=config.get("ws_port", None),
ws_port=ws_port,
wss_port=config.get("wss_port", None),
ssl_context=ssl_context,
get_hosts=lambda: [
@@ -248,6 +268,12 @@ async def _run_async(config):
logger.warning("Timeout waiting for tasks to cancel")
except Exception as e:
logger.debug("Exception during task cancellation: %s", e)
# Close message journal
try:
await msg_journal.close()
except Exception as e:
logger.warning("Error closing message journal: %s", e)
# Signal DNS worker to exit and await it
try:

Before

Width:  |  Height:  |  Size: 5.3 KiB

After

Width:  |  Height:  |  Size: 5.3 KiB

+466
View File
@@ -0,0 +1,466 @@
<!DOCTYPE html>
<html>
{% include 'head.html' %}
<style>
body {
margin: 20px;
background: #f5f5f5;
}
.nav {
background: #fff;
padding: 15px;
margin-bottom: 20px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
border-radius: 4px;
}
.nav a {
margin-right: 20px;
text-decoration: none;
color: #0066cc;
font-weight: 500;
}
.nav a:hover {
text-decoration: underline;
}
.nav a.active {
color: #333;
font-weight: bold;
}
.container {
max-width: 1400px;
margin: 0 auto;
}
h1 {
color: #333;
margin-bottom: 10px;
}
.subtitle {
color: #666;
margin-bottom: 30px;
}
.summary-cards {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 20px;
margin-bottom: 30px;
}
.summary-card {
background: white;
border-radius: 8px;
padding: 20px;
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
text-align: center;
}
.summary-card.critical {
border-left: 5px solid #f44336;
}
.summary-card.warning {
border-left: 5px solid #ff9800;
}
.summary-card.ok {
border-left: 5px solid #4caf50;
}
.summary-number {
font-size: 3em;
font-weight: bold;
margin: 10px 0;
}
.summary-number.critical {
color: #f44336;
}
.summary-number.warning {
color: #ff9800;
}
.summary-number.ok {
color: #4caf50;
}
.summary-label {
color: #666;
text-transform: uppercase;
font-size: 0.9em;
letter-spacing: 1px;
}
.filters {
background: white;
border-radius: 8px;
padding: 15px;
margin-bottom: 20px;
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
display: flex;
gap: 15px;
align-items: center;
}
.filter-label {
font-weight: bold;
color: #555;
}
.filter-button {
padding: 8px 16px;
border: 2px solid #ddd;
background: white;
border-radius: 20px;
cursor: pointer;
transition: all 0.2s;
font-size: 0.9em;
}
.filter-button:hover {
border-color: #2196f3;
}
.filter-button.active {
background: #2196f3;
color: white;
border-color: #2196f3;
}
.alerts-container {
background: white;
border-radius: 8px;
padding: 20px;
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
}
.alert-item {
border-left: 5px solid #ddd;
padding: 15px;
margin-bottom: 15px;
background: #fafafa;
border-radius: 4px;
display: flex;
justify-content: space-between;
align-items: center;
transition: all 0.2s;
}
.alert-item:hover {
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
transform: translateX(5px);
}
.alert-item.critical {
border-left-color: #f44336;
background: #ffebee;
}
.alert-item.warning {
border-left-color: #ff9800;
background: #fff3e0;
}
.alert-item.unknown {
border-left-color: #9e9e9e;
background: #f5f5f5;
}
.alert-main {
flex: 1;
}
.alert-header {
display: flex;
align-items: center;
gap: 15px;
margin-bottom: 8px;
}
.alert-level {
padding: 4px 12px;
border-radius: 12px;
font-size: 0.75em;
font-weight: bold;
text-transform: uppercase;
letter-spacing: 0.5px;
}
.alert-level.critical {
background: #f44336;
color: white;
}
.alert-level.warning {
background: #ff9800;
color: white;
}
.alert-level.unknown {
background: #9e9e9e;
color: white;
}
.alert-hostname {
font-weight: bold;
color: #333;
font-size: 1.1em;
}
.alert-metric {
color: #666;
font-family: 'Courier New', monospace;
font-size: 0.9em;
}
.alert-details {
display: flex;
gap: 20px;
color: #666;
font-size: 0.9em;
}
.alert-value {
font-weight: bold;
color: #333;
}
.alert-duration {
color: #999;
font-size: 0.85em;
}
.no-alerts {
text-align: center;
padding: 60px 20px;
color: #999;
}
.no-alerts-icon {
font-size: 4em;
margin-bottom: 20px;
}
.loading {
text-align: center;
padding: 40px;
color: #666;
}
.error {
background: #ffebee;
border-left: 4px solid #f44336;
padding: 20px;
margin: 20px 0;
border-radius: 4px;
color: #c62828;
}
.refresh-info {
text-align: center;
color: #999;
font-size: 0.85em;
margin-top: 20px;
padding-top: 20px;
border-top: 1px solid #e0e0e0;
}
.last-update {
color: #666;
font-size: 0.9em;
text-align: right;
margin-bottom: 15px;
}
</style>
<body>
<div class="nav">
<a href="/live">Live Dashboard</a>
<a href="/plugins">Plugin Metrics</a>
<a href="/alerts" class="active">Alerts</a>
</div>
<div class="container">
<h1>{{ header }}</h1>
<p class="subtitle">Real-time monitoring alerts and threshold violations</p>
<div class="summary-cards" id="summary-cards">
<div class="summary-card critical">
<div class="summary-label">Critical</div>
<div class="summary-number critical" id="critical-count">-</div>
</div>
<div class="summary-card warning">
<div class="summary-label">Warning</div>
<div class="summary-number warning" id="warning-count">-</div>
</div>
<div class="summary-card ok">
<div class="summary-label">Total Hosts</div>
<div class="summary-number ok" id="host-count">-</div>
</div>
</div>
<div class="filters">
<span class="filter-label">Show:</span>
<button class="filter-button active" onclick="filterAlerts('all')">All</button>
<button class="filter-button" onclick="filterAlerts('critical')">Critical Only</button>
<button class="filter-button" onclick="filterAlerts('warning')">Warning Only</button>
</div>
<div class="alerts-container">
<div class="last-update">Last updated: <span id="last-update-time">Never</span></div>
<div id="alerts-list">
<div class="loading">Loading alerts...</div>
</div>
<div class="refresh-info">
Auto-refreshing every 15 seconds
</div>
</div>
</div>
<script>
// Severity filter currently applied to the list; filterAlerts() sets it to
// 'all', 'critical' or 'warning' from the filter buttons.
let currentFilter = 'all';
// Alerts from the most recent successful fetch; kept so that changing the
// filter can re-render without another request.
let allAlerts = [];
// Fetch the current alerts from /api/0/alerts and refresh the page: the
// summary cards, the "last updated" stamp and the alert list.
//
// Any failure (network error, non-2xx status, invalid or malformed JSON)
// replaces the alert list with an error box. The payload is validated before
// anything is written, so a bad response never leaves the page half-updated.
async function loadAlerts() {
    try {
        const response = await fetch('/api/0/alerts');
        if (!response.ok) {
            throw new Error(`HTTP ${response.status}`);
        }
        const data = await response.json();
        if (!data || !Array.isArray(data.alerts)) {
            // Report a malformed payload instead of silently showing "no alerts".
            throw new Error('malformed response (no alert list)');
        }
        const summary = data.summary || {};
        allAlerts = data.alerts;
        // Update summary cards
        document.getElementById('critical-count').textContent = summary.critical || 0;
        document.getElementById('warning-count').textContent = summary.warning || 0;
        document.getElementById('host-count').textContent = data.host_count || 0;
        // Update last update time
        document.getElementById('last-update-time').textContent = new Date().toLocaleTimeString();
        // Render alerts
        renderAlerts(allAlerts);
    } catch (error) {
        // Build the error box with textContent so the message is never
        // interpreted as markup.
        const box = document.createElement('div');
        box.className = 'error';
        box.textContent = `Failed to load alerts: ${error.message}`;
        const list = document.getElementById('alerts-list');
        list.innerHTML = '';
        list.appendChild(box);
    }
}
// Render the given alerts into #alerts-list, honouring the currently selected
// severity filter (currentFilter).
//
// alerts: array of alert objects as returned by the alerts API.
// Shows an "All Systems Normal" panel when there are no alerts at all, and a
// short "No <filter> alerts" note when the filter hides every alert.
function renderAlerts(alerts) {
    const container = document.getElementById('alerts-list');
    // An alert without a level must not crash the filter; it simply matches
    // no severity-specific filter.
    const visible = currentFilter === 'all'
        ? alerts
        : alerts.filter(alert => String(alert.level || '').toLowerCase() === currentFilter);

    if (visible.length === 0) {
        if (currentFilter === 'all' && alerts.length === 0) {
            container.innerHTML = `
                <div class="no-alerts">
                    <div class="no-alerts-icon">✓</div>
                    <h2>All Systems Normal</h2>
                    <p>No active alerts at this time</p>
                </div>
            `;
        } else {
            container.innerHTML = `
                <div class="no-alerts">
                    <p>No ${currentFilter} alerts</p>
                </div>
            `;
        }
        return;
    }

    container.innerHTML = visible.map(alert => renderAlert(alert)).join('');
}
// Build the HTML for a single alert row.
//
// alert: object with level, hostname, metric_path, last_value and since.
// Returns an HTML string. Every server-supplied field is HTML-escaped because
// hostnames and metric names originate from monitored clients and the result
// is assigned to innerHTML.
function renderAlert(alert) {
    const esc = (text) => String(text).replace(/[&<>"']/g, (ch) => ({
        '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;'
    }[ch]));

    // A missing level falls back to "unknown", which has a matching style.
    const levelText = String(alert.level || 'unknown');
    // Restrict the CSS class to identifier characters so a crafted level
    // cannot break out of the class attribute.
    const levelClass = levelText.toLowerCase().replace(/[^a-z0-9_-]/g, '') || 'unknown';
    const duration = getDuration(alert.since);

    return `
        <div class="alert-item ${levelClass}">
            <div class="alert-main">
                <div class="alert-header">
                    <span class="alert-level ${levelClass}">${esc(levelText)}</span>
                    <span class="alert-hostname">${esc(alert.hostname)}</span>
                </div>
                <div class="alert-metric">${esc(alert.metric_path)}</div>
                <div class="alert-details">
                    <span>Value: <span class="alert-value">${esc(formatValue(alert.last_value))}</span></span>
                    <span class="alert-duration">Active for ${esc(duration)}</span>
                </div>
            </div>
        </div>
    `;
}
// Format an alert value for display.
// Numbers above 1000 use the locale's grouping; other numbers are shown with
// two decimals. Non-numeric values are returned unchanged.
function formatValue(value) {
    if (typeof value !== 'number') {
        return value;
    }
    return value > 1000 ? value.toLocaleString() : value.toFixed(2);
}
// Describe how long ago `timestamp` (Unix time in seconds) was, as a compact
// string: "42s", "5m", "3h 12m" or "2d 4h".
//
// Returns 'unknown' when the timestamp is missing or not numeric, rather than
// rendering "NaNd NaNh". Timestamps in the future (clock skew between server
// and browser) are clamped to "0s" instead of showing a negative duration.
function getDuration(timestamp) {
    const started = Number(timestamp);
    if (timestamp === null || timestamp === undefined || timestamp === '' || !Number.isFinite(started)) {
        return 'unknown';
    }
    const now = Date.now() / 1000;
    const seconds = Math.max(0, Math.floor(now - started));
    if (seconds < 60) {
        return `${seconds}s`;
    }
    if (seconds < 3600) {
        return `${Math.floor(seconds / 60)}m`;
    }
    if (seconds < 86400) {
        const hours = Math.floor(seconds / 3600);
        const minutes = Math.floor((seconds % 3600) / 60);
        return `${hours}h ${minutes}m`;
    }
    const days = Math.floor(seconds / 86400);
    const hours = Math.floor((seconds % 86400) / 3600);
    return `${days}d ${hours}h`;
}
// Switch the severity filter and re-render the cached alerts.
//
// filter: 'all', 'critical' or 'warning' (passed by the inline onclick handlers).
//
// The clicked button is taken from window.event when the call happens during
// an event dispatch. That global is deprecated and undefined otherwise, so the
// function falls back to finding the button whose inline handler names this
// filter. If no button can be identified the highlight is left unchanged.
function filterAlerts(filter) {
    currentFilter = filter;

    const buttons = Array.from(document.querySelectorAll('.filter-button'));
    const dispatched = window.event;
    let clicked = null;
    if (dispatched && dispatched.target && typeof dispatched.target.closest === 'function') {
        clicked = dispatched.target.closest('.filter-button');
    }
    if (!clicked) {
        clicked = buttons.find(btn =>
            (btn.getAttribute('onclick') || '').includes(`'${filter}'`)
        ) || null;
    }

    // Update active button
    if (clicked) {
        buttons.forEach(btn => {
            btn.classList.toggle('active', btn === clicked);
        });
    }

    // Re-render with new filter
    renderAlerts(allAlerts);
}
// Auto-refresh every 15 seconds (15000 ms); each tick re-fetches the alerts
// and re-renders the whole page state via loadAlerts().
setInterval(loadAlerts, 15000);
// Initial load: fetch once right away instead of waiting for the first tick.
loadAlerts();
</script>
</body>
</html>
@@ -1,5 +1,5 @@
<footer>
<div id="copyright">
&copy;2002-2021 <A HREF="mailto:andreas@wrede.ca">Andreas Wrede</A> All Rights Reserved.</p>
&copy;2002-2026 <A HREF="mailto:andreas@wrede.ca">Andreas Wrede</A> All Rights Reserved.
</div>
</footer>
@@ -3,6 +3,30 @@
{% include 'head.html' %}
<style>
.nav {
background: #fff;
padding: 15px;
margin-bottom: 20px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
border-radius: 4px;
}
.nav a {
margin-right: 20px;
text-decoration: none;
color: #0066cc;
font-weight: 500;
}
.nav a:hover {
text-decoration: underline;
}
.nav a.active {
color: #333;
font-weight: bold;
}
.content {
display: flex;
flex-direction: column;
@@ -235,6 +259,12 @@
WS_Connect();
</script>
<body>
<div class="nav">
<a href="/live" class="active">Live Dashboard</a>
<a href="/plugins">Plugin Metrics</a>
<a href="/alerts">Alerts</a>
</div>
{% include 'menu.html' %}
<div id="content" class="content" style="overflow: hidden">
@@ -255,7 +285,26 @@
<th style="text-align: right">Last State</th>
</tr>
</thead>
<tbody id="ntablebody"></tbody>
<tbody id="ntablebody">
{% for host in hosts %}
<tr>
<td>{{ host.name }}</td>
<td>{{ host.ver if host.ver else '' }}</td>
{% for conn in host.connections %}
<td>{{ conn.addr if conn.addr else '' }}</td>
<td>{{ conn.state if conn.state else '' }}</td>
<td style="text-align: right">{{ conn.latency if conn.latency else '' }}</td>
<td style="text-align: right">{{ conn.last_state_ts if conn.last_state_ts else '' }}</td>
{% endfor %}
{% if host.connections|length == 0 %}
<td></td><td></td><td></td><td></td>
<td></td><td></td><td></td><td></td>
{% elif host.connections|length == 1 %}
<td></td><td></td><td></td><td></td>
{% endif %}
</tr>
{% endfor %}
</tbody>
</table>
</div>
<div id="log" class="log" style="overflow: auto;">
+3
View File
@@ -0,0 +1,3 @@
<!-- <label for="drawer-toggle" id="drawer-toggle-label"></label>
s<header>{{ header }}</header> -->
+974
View File
@@ -0,0 +1,974 @@
<!DOCTYPE html>
<html>
{% include 'head.html' %}
<style>
body {
margin: 10px;
background: #f5f5f5;
overflow: hidden;
}
.nav {
background: #fff;
padding: 10px 15px;
margin-bottom: 10px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
border-radius: 4px;
}
.nav a {
margin-right: 20px;
text-decoration: none;
color: #0066cc;
font-weight: 500;
font-size: 0.9em;
}
.nav a:hover {
text-decoration: underline;
}
.nav a.active {
color: #333;
font-weight: bold;
}
.container {
max-width: 1400px;
margin: 0 auto;
max-height: calc(100vh - 120px);
overflow-y: auto;
padding-right: 10px;
}
h1 {
color: #333;
margin-bottom: 5px;
font-size: 1.5em;
}
.subtitle {
color: #666;
margin-bottom: 15px;
font-size: 0.9em;
}
.host-card {
background: white;
border-radius: 6px;
padding: 10px 15px;
margin-bottom: 10px;
box-shadow: 0 1px 4px rgba(0,0,0,0.1);
transition: all 0.2s;
}
.host-card.collapsed .host-body {
display: none;
}
.host-header {
display: flex;
justify-content: space-between;
align-items: center;
cursor: pointer;
user-select: none;
padding: 5px 0;
}
.host-header:hover {
background: #f9f9f9;
}
.host-title {
display: flex;
align-items: center;
gap: 10px;
}
.collapse-icon {
font-size: 1.2em;
color: #666;
transition: transform 0.2s;
min-width: 20px;
}
.host-card.collapsed .collapse-icon {
transform: rotate(-90deg);
}
.host-name {
font-size: 1.1em;
font-weight: bold;
color: #333;
}
.host-body {
padding-top: 10px;
}
.plugin-pills {
display: flex;
gap: 6px;
flex-wrap: wrap;
margin-bottom: 10px;
}
.plugin-pill {
padding: 4px 12px;
background: #e3f2fd;
border: 1px solid #90caf9;
border-radius: 15px;
cursor: pointer;
transition: all 0.2s;
font-size: 0.85em;
}
.plugin-pill:hover {
background: #90caf9;
color: white;
}
.plugin-pill.active {
background: #2196f3;
color: white;
border-color: #2196f3;
}
.plugin-content {
margin-top: 10px;
display: none;
}
.plugin-content.active {
display: block;
}
.metric-grid {
display: grid;
grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
gap: 10px;
margin-top: 10px;
}
.metric-card {
background: #fafafa;
border-left: 3px solid #2196f3;
padding: 8px 12px;
border-radius: 3px;
}
.metric-label {
font-size: 0.75em;
color: #666;
text-transform: uppercase;
letter-spacing: 0.3px;
margin-bottom: 3px;
}
.metric-value {
font-size: 1.4em;
font-weight: bold;
color: #333;
line-height: 1.2;
}
.metric-unit {
font-size: 0.6em;
color: #888;
font-weight: normal;
}
.timestamp {
color: #999;
font-size: 0.75em;
margin-top: 10px;
padding-top: 8px;
border-top: 1px solid #e0e0e0;
}
.no-data {
text-align: center;
padding: 20px;
color: #999;
font-style: italic;
font-size: 0.9em;
}
.loading {
text-align: center;
padding: 15px;
color: #666;
font-size: 0.9em;
}
.error {
background: #ffebee;
border-left: 3px solid #f44336;
padding: 10px;
margin: 10px 0;
border-radius: 3px;
color: #c62828;
font-size: 0.9em;
}
.nested-metrics {
margin-top: 8px;
padding-left: 12px;
border-left: 2px solid #ddd;
}
.nested-header {
font-weight: bold;
color: #555;
margin: 8px 0 5px 0;
font-size: 0.85em;
}
/* Scrollbar styling */
.container::-webkit-scrollbar {
width: 8px;
}
.container::-webkit-scrollbar-track {
background: #f1f1f1;
border-radius: 4px;
}
.container::-webkit-scrollbar-thumb {
background: #888;
border-radius: 4px;
}
.container::-webkit-scrollbar-thumb:hover {
background: #555;
}
/* Table styling for interface data */
.interface-table {
width: 100%;
border-collapse: collapse;
margin-top: 10px;
font-size: 0.85em;
background: #fff;
box-shadow: 0 1px 3px rgba(0,0,0,0.1);
border-radius: 4px;
overflow: hidden;
}
.interface-table thead {
background: #2196f3;
color: white;
}
.interface-table th {
padding: 8px 10px;
text-align: left;
font-weight: 600;
text-transform: uppercase;
font-size: 0.75em;
letter-spacing: 0.5px;
}
.interface-table th.number {
text-align: right;
}
.interface-table td {
padding: 6px 10px;
border-top: 1px solid #e0e0e0;
}
.interface-table td.number {
text-align: right;
font-family: 'Courier New', monospace;
}
.interface-table tbody tr:hover {
background: #f5f5f5;
}
.interface-table tbody tr:nth-child(even) {
background: #fafafa;
}
.interface-table tbody tr:nth-child(even):hover {
background: #f0f0f0;
}
.interface-name {
font-weight: bold;
color: #2196f3;
}
</style>
<body>
<div class="nav">
<a href="/live">Live Dashboard</a>
<a href="/plugins" class="active">Plugin Metrics</a>
<a href="/alerts">Alerts</a>
</div>
<div class="container">
<h1>{{ header }}</h1>
<p class="subtitle">Real-time system metrics from monitoring plugins</p>
{% if not hosts %}
<div class="no-data">
<p>No hosts with plugin data available</p>
<p style="font-size: 0.9em; margin-top: 10px;">Hosts will appear here once they start sending plugin metrics</p>
</div>
{% else %}
<div id="hosts-container">
{% for host in hosts %}
<div class="host-card" data-hostname="{{ host.name }}">
<div class="host-header" onclick="toggleHost('{{ host.name }}')">
<div class="host-title">
<span class="collapse-icon"></span>
<span class="host-name">{{ host.name }}</span>
</div>
</div>
<div class="host-body">
<div class="plugin-pills">
{% for plugin in host.plugins %}
<div class="plugin-pill" data-plugin="{{ plugin }}" onclick="event.stopPropagation(); showPlugin('{{ host.name }}', '{{ plugin }}')">
{{ plugin }}
</div>
{% endfor %}
</div>
{% for plugin in host.plugins %}
<div class="plugin-content" id="{{ host.name }}-{{ plugin }}" data-hostname="{{ host.name }}" data-plugin="{{ plugin }}">
<div class="loading">Loading {{ plugin }} data...</div>
</div>
{% endfor %}
</div>
</div>
{% endfor %}
</div>
{% endif %}
</div>
<script>
// Track selected plugins per host: maps hostname -> name of the plugin whose
// panel is currently shown. Written by showPlugin() and read by the periodic
// refresh timer to decide which panels to re-fetch.
const selectedPlugins = {};
// Collapse or expand the card of the given host.
//
// hostname: value of the card's data-hostname attribute.
// The selector is limited to .host-card because the plugin panels carry
// data-hostname too, and the hostname is escaped so that quotes or backslashes
// in a client-supplied name cannot break the selector. An unknown host is
// ignored.
function toggleHost(hostname) {
    const card = document.querySelector(
        `.host-card[data-hostname="${CSS.escape(hostname)}"]`
    );
    if (card) {
        card.classList.toggle('collapsed');
    }
}
// Make `pluginName` the visible plugin panel of host `hostname`: highlight
// its pill, show its panel, and fetch the data on first display.
//
// Pill and panel are found by comparing data-plugin values inside the host's
// own card rather than by interpolating names into selectors or ids, so
// unusual characters in names cannot break the lookup or match another host's
// elements. Unknown hosts or plugins are ignored, and selectedPlugins is only
// updated once the elements are known to exist.
function showPlugin(hostname, pluginName) {
    const hostCard = document.querySelector(
        `.host-card[data-hostname="${CSS.escape(hostname)}"]`
    );
    if (!hostCard) {
        return;
    }
    const panels = Array.from(hostCard.querySelectorAll('.plugin-content'));
    const contentDiv = panels.find(panel => panel.dataset.plugin === pluginName);
    if (!contentDiv) {
        return;
    }

    // Update selectedPlugins tracker
    selectedPlugins[hostname] = pluginName;

    // Update active pill
    hostCard.querySelectorAll('.plugin-pill').forEach(pill => {
        pill.classList.toggle('active', pill.dataset.plugin === pluginName);
    });

    // Show plugin content
    panels.forEach(panel => {
        panel.classList.toggle('active', panel === contentDiv);
    });

    // Load data if not already loaded (the placeholder is still present)
    if (contentDiv.querySelector('.loading')) {
        loadPluginData(hostname, pluginName);
    }
}
// Fetch the latest sample of one plugin for one host and render it into the
// panel with id "<hostname>-<pluginName>".
//
// Host and plugin names are URL-encoded before they are placed in the request
// path. Status and error messages are inserted with textContent so they are
// never interpreted as markup. A missing panel element is ignored.
async function loadPluginData(hostname, pluginName) {
    const contentDiv = document.getElementById(`${hostname}-${pluginName}`);
    if (!contentDiv) {
        return;
    }

    // Replace the panel content with a single plain-text message box.
    const showMessage = (cssClass, text) => {
        const box = document.createElement('div');
        box.className = cssClass;
        box.textContent = text;
        contentDiv.innerHTML = '';
        contentDiv.appendChild(box);
    };

    try {
        const url = `/api/0/hosts/${encodeURIComponent(hostname)}` +
            `/plugins/${encodeURIComponent(pluginName)}?limit=1`;
        const response = await fetch(url);
        if (!response.ok) {
            throw new Error(`HTTP ${response.status}`);
        }
        const data = await response.json();
        if (data.samples && data.samples.length > 0) {
            const sample = data.samples[0];
            contentDiv.innerHTML = renderPluginData(sample.data, sample.timestamp);
        } else {
            showMessage('no-data', 'No data available for this plugin');
        }
    } catch (error) {
        showMessage('error', `Failed to load plugin data: ${error.message}`);
    }
}
// Render one plugin sample as HTML.
//
// data:      object of metric name -> value; values may be scalars, arrays or
//            nested objects.
// timestamp: sample time in Unix seconds.
//
// Scalar metrics come first as a grid of cards. Each nested value follows,
// either as a specialised table (interfaces, interface stats, partitions,
// disk I/O, filesystems) or as a generic nested grid. Null entries inside
// nested structures are handled explicitly (typeof null is 'object'), and
// keys are HTML-escaped before being placed into markup.
function renderPluginData(data, timestamp) {
    const esc = (text) => String(text).replace(/[&<>"']/g, (ch) => ({
        '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;'
    }[ch]));
    const isNested = (v) => typeof v === 'object' && v !== null;
    const entries = Object.entries(data);

    // Scalar metrics (including null) go into the top grid.
    let html = '<div class="metric-grid">';
    for (const [key, value] of entries) {
        if (!isNested(value)) {
            html += renderMetric(key, value);
        }
    }
    html += '</div>';

    // Handle nested objects (like partitions in disk_monitor)
    for (const [key, value] of entries) {
        if (!isNested(value)) {
            continue;
        }
        if (isInterfaceData(key, value)) {
            html += renderInterfaceTable(key, value);
        } else if (isInterfaceStatsData(key, value)) {
            html += renderInterfaceStatsTable(key, value);
        } else if (isDiskPartitionData(key, value)) {
            html += renderPartitionTable(key, value);
        } else if (isDiskIOData(key, value)) {
            html += renderDiskIOTable(key, value);
        } else if (isFilesystemData(key, value)) {
            html += renderFilesystemTable(key, value);
        } else {
            // Regular nested metrics display
            html += `<div class="nested-metrics">`;
            html += `<div class="nested-header">📊 ${esc(formatLabel(key))}</div>`;
            html += '<div class="metric-grid">';
            if (Array.isArray(value)) {
                // Arrays: only object items are rendered; scalar and null
                // items are skipped.
                value.forEach((item, idx) => {
                    if (!isNested(item)) {
                        return;
                    }
                    // Add a compact separator between array items
                    if (idx > 0) {
                        html += `<div style="grid-column: 1/-1; border-top: 1px dashed #ddd; margin: 5px 0;"></div>`;
                    }
                    for (const [subKey, subValue] of Object.entries(item)) {
                        html += renderMetric(`${subKey}`, subValue);
                    }
                });
            } else {
                for (const [subKey, subValue] of Object.entries(value)) {
                    if (isNested(subValue)) {
                        // Another level of nesting - keep compact
                        html += `<div style="grid-column: 1/-1; margin-top: 5px; font-size: 0.85em; color: #666;"><strong>${esc(subKey)}:</strong></div>`;
                        for (const [deepKey, deepValue] of Object.entries(subValue)) {
                            html += renderMetric(deepKey, deepValue);
                        }
                    } else {
                        html += renderMetric(subKey, subValue);
                    }
                }
            }
            html += '</div></div>';
        }
    }

    const date = new Date(timestamp * 1000);
    html += `<div class="timestamp">Last updated: ${date.toLocaleString()}</div>`;
    return html;
}
// Decide whether `value` looks like per-interface network I/O counters.
//
// True when the key mentions "interface" (but not "interface_stats"), the
// value is a plain object, and its first entry is an object carrying at least
// one known byte/packet counter. Only the first entry is inspected. Null
// values and null first entries return false instead of throwing.
function isInterfaceData(key, value) {
    const lowered = key.toLowerCase();
    if (!lowered.includes('interface') || lowered.includes('interface_stats')) {
        return false;
    }
    if (typeof value !== 'object' || value === null || Array.isArray(value)) {
        return false;
    }
    const firstKey = Object.keys(value)[0];
    if (!firstKey) {
        return false;
    }
    const sample = value[firstKey];
    if (typeof sample !== 'object' || sample === null) {
        return false;
    }
    const counters = ['bytes_sent', 'bytes_recv', 'packets_sent', 'tx_bytes', 'rx_bytes'];
    return counters.some(name => Object.prototype.hasOwnProperty.call(sample, name));
}
// Decide whether `value` looks like per-interface link status data
// (status, speed, MTU, duplex).
//
// True when the key is "interface_stats" or contains "if_stats", the value is
// a plain object, and its first entry is an object with at least one of
// isup/speed/mtu/duplex. Only the first entry is inspected. Null values and
// null first entries return false instead of throwing.
function isInterfaceStatsData(key, value) {
    const lowered = key.toLowerCase();
    if (lowered !== 'interface_stats' && !lowered.includes('if_stats')) {
        return false;
    }
    if (typeof value !== 'object' || value === null || Array.isArray(value)) {
        return false;
    }
    const firstKey = Object.keys(value)[0];
    if (!firstKey) {
        return false;
    }
    const sample = value[firstKey];
    if (typeof sample !== 'object' || sample === null) {
        return false;
    }
    return ['isup', 'speed', 'mtu', 'duplex']
        .some(name => Object.prototype.hasOwnProperty.call(sample, name));
}
// Decide whether `value` looks like per-partition disk usage data.
//
// True when the key contains "partition", the value is a plain object, and
// its first entry is an object with all of total/used/free/percent. Only the
// first entry is inspected. Null values and null first entries return false
// instead of throwing.
function isDiskPartitionData(key, value) {
    if (!key.toLowerCase().includes('partition')) {
        return false;
    }
    if (typeof value !== 'object' || value === null || Array.isArray(value)) {
        return false;
    }
    const firstKey = Object.keys(value)[0];
    if (!firstKey) {
        return false;
    }
    const sample = value[firstKey];
    if (typeof sample !== 'object' || sample === null) {
        return false;
    }
    return ['total', 'used', 'free', 'percent']
        .every(name => Object.prototype.hasOwnProperty.call(sample, name));
}
// Decide whether `value` looks like per-disk I/O counter data.
//
// True when the key contains "io_counter" or "disk_io", the value is a plain
// object, and its first entry is an object with at least one read/write byte
// or count field. Only the first entry is inspected. Null values and null
// first entries return false instead of throwing.
function isDiskIOData(key, value) {
    const lowered = key.toLowerCase();
    if (!lowered.includes('io_counter') && !lowered.includes('disk_io')) {
        return false;
    }
    if (typeof value !== 'object' || value === null || Array.isArray(value)) {
        return false;
    }
    const firstKey = Object.keys(value)[0];
    if (!firstKey) {
        return false;
    }
    const sample = value[firstKey];
    if (typeof sample !== 'object' || sample === null) {
        return false;
    }
    return ['read_bytes', 'write_bytes', 'read_count', 'write_count']
        .some(name => Object.prototype.hasOwnProperty.call(sample, name));
}
// Decide whether `value` looks like the filesystem list of the
// filesystem_info plugin.
//
// True when the key is "filesystems" (case-insensitive), the value is a
// non-empty array, and its first item is an object with all of
// device/mountpoint/fstype. A null first item returns false instead of
// throwing.
function isFilesystemData(key, value) {
    if (key.toLowerCase() !== 'filesystems' || !Array.isArray(value)) {
        return false;
    }
    const sample = value[0];
    if (typeof sample !== 'object' || sample === null) {
        return false;
    }
    return ['device', 'mountpoint', 'fstype']
        .every(name => Object.prototype.hasOwnProperty.call(sample, name));
}
// Render per-interface I/O counters as an HTML table.
//
// key:        metric name, used for the section header.
// interfaces: object of interface name -> counter object.
//
// The column set (bytes, packets, errors, drops, deltas) is chosen from the
// fields of the first interface. Both naming schemes seen in the data are
// supported (bytes_sent/bytes_recv/errin/dropin and
// tx_bytes/rx_bytes/rx_errors/rx_dropped). Interface names and values are
// HTML-escaped, and a null entry is rendered as a row of zeros.
function renderInterfaceTable(key, interfaces) {
    const esc = (text) => String(text).replace(/[&<>"']/g, (ch) => ({
        '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;'
    }[ch]));
    const has = (obj, prop) => Object.prototype.hasOwnProperty.call(obj, prop);
    const numHeader = (label) => `<th class="number">${label}</th>`;
    const numCell = (text) => `<td class="number">${esc(text)}</td>`;

    // Determine columns based on available data
    const sampleInterface = Object.values(interfaces)[0] || {};
    const hasBytes = has(sampleInterface, 'bytes_sent') || has(sampleInterface, 'tx_bytes');
    const hasPackets = has(sampleInterface, 'packets_sent') || has(sampleInterface, 'tx_packets');
    const hasErrors = has(sampleInterface, 'errin') || has(sampleInterface, 'rx_errors');
    const hasDrops = has(sampleInterface, 'dropin') || has(sampleInterface, 'rx_dropped');
    const hasDelta = has(sampleInterface, 'bytes_sent_delta');

    let html = `<div class="nested-metrics">`;
    html += `<div class="nested-header">🌐 ${esc(formatLabel(key))}</div>`;
    html += '<table class="interface-table">';

    // Build table header
    html += '<thead><tr>';
    html += '<th>Interface</th>';
    if (hasBytes) {
        html += numHeader('Bytes Sent') + numHeader('Bytes Recv');
        if (hasDelta) {
            html += numHeader('Δ Sent') + numHeader('Δ Recv');
        }
    }
    if (hasPackets) {
        html += numHeader('Pkts Sent') + numHeader('Pkts Recv');
        if (hasDelta) {
            html += numHeader('Δ Pkts Sent') + numHeader('Δ Pkts Recv');
        }
    }
    if (hasErrors) {
        html += numHeader('Errors In') + numHeader('Errors Out');
    }
    if (hasDrops) {
        html += numHeader('Drops In') + numHeader('Drops Out');
    }
    html += '</tr></thead>';

    // Build table body
    html += '<tbody>';
    for (const [ifName, rawData] of Object.entries(interfaces)) {
        const ifData = rawData || {};
        html += '<tr>';
        html += `<td class="interface-name">${esc(ifName)}</td>`;
        if (hasBytes) {
            html += numCell(formatBytes(ifData.bytes_sent || ifData.tx_bytes || 0));
            html += numCell(formatBytes(ifData.bytes_recv || ifData.rx_bytes || 0));
            if (hasDelta) {
                html += numCell(formatBytes(ifData.bytes_sent_delta || 0));
                html += numCell(formatBytes(ifData.bytes_recv_delta || 0));
            }
        }
        if (hasPackets) {
            html += numCell((ifData.packets_sent || ifData.tx_packets || 0).toLocaleString());
            html += numCell((ifData.packets_recv || ifData.rx_packets || 0).toLocaleString());
            if (hasDelta) {
                html += numCell((ifData.packets_sent_delta || 0).toLocaleString());
                html += numCell((ifData.packets_recv_delta || 0).toLocaleString());
            }
        }
        if (hasErrors) {
            html += numCell(ifData.errin || ifData.rx_errors || 0);
            html += numCell(ifData.errout || ifData.tx_errors || 0);
        }
        if (hasDrops) {
            html += numCell(ifData.dropin || ifData.rx_dropped || 0);
            html += numCell(ifData.dropout || ifData.tx_dropped || 0);
        }
        html += '</tr>';
    }
    html += '</tbody>';
    html += '</table>';
    html += '</div>';
    return html;
}
// Render per-interface link status (up/down, speed, duplex, MTU) as an HTML
// table.
//
// key:        metric name, used for the section header.
// interfaces: object of interface name -> stats object.
//
// Duplex is normalised to a string first, so numeric codes (2/1/0) and
// "NicDuplex.X" strings are both handled; a numeric duplex previously threw
// on .includes(). Names and values are HTML-escaped, and a null entry is
// rendered as DOWN with placeholders.
function renderInterfaceStatsTable(key, interfaces) {
    const esc = (text) => String(text).replace(/[&<>"']/g, (ch) => ({
        '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;'
    }[ch]));
    const duplexNames = { '2': 'FULL', '1': 'HALF', '0': 'UNKNOWN' };

    let html = `<div class="nested-metrics">`;
    html += `<div class="nested-header">🔌 ${esc(formatLabel(key))}</div>`;
    html += '<table class="interface-table">';

    // Table header
    html += '<thead><tr>';
    html += '<th>Interface</th>';
    html += '<th>Status</th>';
    html += '<th class="number">Speed</th>';
    html += '<th>Duplex</th>';
    html += '<th class="number">MTU</th>';
    html += '</tr></thead>';

    // Table body
    html += '<tbody>';
    for (const [ifName, rawData] of Object.entries(interfaces)) {
        const ifData = rawData || {};
        html += '<tr>';
        html += `<td class="interface-name">${esc(ifName)}</td>`;

        // Status with color coding
        const isUp = ifData.isup;
        const statusColor = isUp ? '#4caf50' : '#f44336';
        const statusIcon = isUp ? '✓' : '✗';
        const statusText = isUp ? 'UP' : 'DOWN';
        html += `<td style="color: ${statusColor}; font-weight: bold;">${statusIcon} ${statusText}</td>`;

        // Speed: values of 1000 and above are shown in Gbps, others in Mbps
        const speed = ifData.speed || 0;
        let speedText = '-';
        if (speed > 0) {
            speedText = speed >= 1000
                ? (speed / 1000).toFixed(1) + ' Gbps'
                : speed + ' Mbps';
        }
        html += `<td class="number">${esc(speedText)}</td>`;

        // Duplex: accept a numeric code, a numeric string, or "NicDuplex.X"
        const rawDuplex = ifData.duplex;
        let duplexText = (rawDuplex === undefined || rawDuplex === null || rawDuplex === '')
            ? '-'
            : String(rawDuplex).replace('NicDuplex.', '');
        if (Object.prototype.hasOwnProperty.call(duplexNames, duplexText)) {
            duplexText = duplexNames[duplexText];
        }
        html += `<td>${esc(duplexText)}</td>`;

        // MTU
        html += `<td class="number">${esc(ifData.mtu || '-')}</td>`;
        html += '</tr>';
    }
    html += '</tbody>';
    html += '</table>';
    html += '</div>';
    return html;
}
// Render disk partition usage as an HTML table.
//
// key:        metric name, used for the section header.
// partitions: object of mount point -> { device, fstype, total, used, free,
//             percent }.
//
// The usage percentage is colour-coded: above 90 red, above 75 orange, above
// 50 yellow, otherwise green. It is coerced to a number first, so a string
// such as "42.5" no longer throws on toFixed(). Names and values are
// HTML-escaped, and a null entry is rendered as an empty row.
function renderPartitionTable(key, partitions) {
    const esc = (text) => String(text).replace(/[&<>"']/g, (ch) => ({
        '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;'
    }[ch]));
    const usageColor = (percent) => {
        if (percent > 90) return '#f44336'; // red
        if (percent > 75) return '#ff9800'; // orange
        if (percent > 50) return '#ffc107'; // yellow
        return '#4caf50';                   // green
    };

    let html = `<div class="nested-metrics">`;
    html += `<div class="nested-header">💾 ${esc(formatLabel(key))}</div>`;
    html += '<table class="interface-table">';

    // Table header
    html += '<thead><tr>';
    html += '<th>Mount Point</th>';
    html += '<th>Device</th>';
    html += '<th>Type</th>';
    html += '<th class="number">Total</th>';
    html += '<th class="number">Used</th>';
    html += '<th class="number">Free</th>';
    html += '<th class="number">Use %</th>';
    html += '</tr></thead>';

    // Table body
    html += '<tbody>';
    for (const [mountPoint, rawData] of Object.entries(partitions)) {
        const partData = rawData || {};
        const percent = Number(partData.percent) || 0;
        html += '<tr>';
        html += `<td class="interface-name">${esc(mountPoint)}</td>`;
        html += `<td>${esc(partData.device || '-')}</td>`;
        html += `<td>${esc(partData.fstype || '-')}</td>`;
        html += `<td class="number">${esc(formatBytes(partData.total || 0))}</td>`;
        html += `<td class="number">${esc(formatBytes(partData.used || 0))}</td>`;
        html += `<td class="number">${esc(formatBytes(partData.free || 0))}</td>`;
        html += `<td class="number" style="color: ${usageColor(percent)}; font-weight: bold;">${percent.toFixed(1)}%</td>`;
        html += '</tr>';
    }
    html += '</tbody>';
    html += '</table>';
    html += '</div>';
    return html;
}
// Render per-disk I/O counters as an HTML table.
//
// key:   metric name, used for the section header.
// disks: object of disk name -> counter object.
//
// Delta and time columns are shown only when the first disk carries
// read_bytes_delta / read_time. Disk names and values are HTML-escaped, and a
// null entry is rendered as a row of zeros.
function renderDiskIOTable(key, disks) {
    const esc = (text) => String(text).replace(/[&<>"']/g, (ch) => ({
        '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;'
    }[ch]));
    const has = (obj, prop) => Object.prototype.hasOwnProperty.call(obj, prop);
    const numHeader = (label) => `<th class="number">${label}</th>`;
    const numCell = (text) => `<td class="number">${esc(text)}</td>`;
    const count = (n) => (n || 0).toLocaleString();

    // Determine columns based on available data
    const sampleDisk = Object.values(disks)[0] || {};
    const hasDeltas = has(sampleDisk, 'read_bytes_delta');
    const hasTime = has(sampleDisk, 'read_time');

    let html = `<div class="nested-metrics">`;
    html += `<div class="nested-header">📈 ${esc(formatLabel(key))}</div>`;
    html += '<table class="interface-table">';

    // Table header
    html += '<thead><tr>';
    html += '<th>Disk</th>';
    html += numHeader('Read Bytes') + numHeader('Write Bytes');
    if (hasDeltas) {
        html += numHeader('Δ Read') + numHeader('Δ Write');
    }
    html += numHeader('Read Count') + numHeader('Write Count');
    if (hasDeltas) {
        html += numHeader('Δ Reads') + numHeader('Δ Writes');
    }
    if (hasTime) {
        html += numHeader('Read Time (ms)') + numHeader('Write Time (ms)');
    }
    html += '</tr></thead>';

    // Table body
    html += '<tbody>';
    for (const [diskName, rawData] of Object.entries(disks)) {
        const diskData = rawData || {};
        html += '<tr>';
        html += `<td class="interface-name">${esc(diskName)}</td>`;
        html += numCell(formatBytes(diskData.read_bytes || 0));
        html += numCell(formatBytes(diskData.write_bytes || 0));
        if (hasDeltas) {
            html += numCell(formatBytes(diskData.read_bytes_delta || 0));
            html += numCell(formatBytes(diskData.write_bytes_delta || 0));
        }
        html += numCell(count(diskData.read_count));
        html += numCell(count(diskData.write_count));
        if (hasDeltas) {
            html += numCell(count(diskData.read_count_delta));
            html += numCell(count(diskData.write_count_delta));
        }
        if (hasTime) {
            html += numCell(count(diskData.read_time));
            html += numCell(count(diskData.write_time));
        }
        html += '</tr>';
    }
    html += '</tbody>';
    html += '</table>';
    html += '</div>';
    return html;
}
// Render the filesystem list of the filesystem_info plugin as an HTML table.
//
// key:         metric name, used for the section header.
// filesystems: array of { device, mountpoint, fstype, opts, maxfile, maxpath }.
//
// Mount options longer than 40 characters are truncated to 37 plus "...".
// Options are converted to a string first (arrays are comma-joined), so a
// non-string value no longer throws on substring(). All values are
// HTML-escaped, and null array items are rendered as placeholder rows.
function renderFilesystemTable(key, filesystems) {
    const esc = (text) => String(text).replace(/[&<>"']/g, (ch) => ({
        '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;'
    }[ch]));

    let html = `<div class="nested-metrics">`;
    html += `<div class="nested-header">🗄️ ${esc(formatLabel(key))}</div>`;
    html += '<table class="interface-table">';

    // Table header
    html += '<thead><tr>';
    html += '<th>Device</th>';
    html += '<th>Mount Point</th>';
    html += '<th>Type</th>';
    html += '<th>Options</th>';
    html += '<th class="number">Max File</th>';
    html += '<th class="number">Max Path</th>';
    html += '</tr></thead>';

    // Table body
    html += '<tbody>';
    for (const item of filesystems) {
        const fs = item || {};
        html += '<tr>';
        html += `<td class="interface-name">${esc(fs.device || '-')}</td>`;
        html += `<td>${esc(fs.mountpoint || '-')}</td>`;
        html += `<td>${esc(fs.fstype || '-')}</td>`;

        // Format mount options - truncate if too long
        const rawOpts = fs.opts || '-';
        let opts = Array.isArray(rawOpts) ? rawOpts.join(',') : String(rawOpts);
        if (opts.length > 40) {
            opts = opts.substring(0, 37) + '...';
        }
        html += `<td style="font-size: 0.85em;">${esc(opts)}</td>`;
        html += `<td class="number">${esc(fs.maxfile || '-')}</td>`;
        html += `<td class="number">${esc(fs.maxpath || '-')}</td>`;
        html += '</tr>';
    }
    html += '</tbody>';
    html += '</table>';
    html += '</div>';
    return html;
}
// Format a byte count with a binary unit (1 KB = 1024 B).
//
// bytes: number of bytes; numeric strings are accepted.
// Returns e.g. "0 B", "512 B", "1.5 KB", "5.0 MB", "1.00 GB", "2.00 TB".
// Negative values (for example a delta after a counter reset) are scaled by
// magnitude and keep their sign, instead of always landing in the "B" branch.
// Values that are not finite numbers yield "-" rather than "NaN GB".
function formatBytes(bytes) {
    const amount = Number(bytes);
    if (bytes === null || bytes === undefined || !Number.isFinite(amount)) {
        return '-';
    }
    if (amount === 0) return '0 B';

    const sign = amount < 0 ? '-' : '';
    const size = Math.abs(amount);
    if (size < 1024) return sign + size + ' B';
    if (size < 1048576) return sign + (size / 1024).toFixed(1) + ' KB';
    if (size < 1073741824) return sign + (size / 1048576).toFixed(1) + ' MB';
    if (size < 1099511627776) return sign + (size / 1073741824).toFixed(2) + ' GB';
    return sign + (size / 1099511627776).toFixed(2) + ' TB';
}
// Build the HTML card for a single metric.
//
// key:   metric name; drives the label, value formatting and unit.
// value: metric value (number, string, boolean or null).
//
// Label, value and unit are HTML-escaped because metric names and string
// values originate from monitored clients and the result is assigned to
// innerHTML.
function renderMetric(key, value) {
    const esc = (text) => String(text).replace(/[&<>"']/g, (ch) => ({
        '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;'
    }[ch]));

    const label = formatLabel(key);
    const formattedValue = formatValue(key, value);
    const unit = getUnit(key);
    const unitHtml = unit ? `<span class="metric-unit">${esc(unit)}</span>` : '';

    return `
        <div class="metric-card">
            <div class="metric-label">${esc(label)}</div>
            <div class="metric-value">
                ${esc(formattedValue)}
                ${unitHtml}
            </div>
        </div>
    `;
}
function formatLabel(key) {
return key
.replace(/_/g, ' ')
.replace(/\b\w/g, l => l.toUpperCase());
}
function formatValue(key, value) {
if (typeof value === 'number') {
// Format percentages
if (key.includes('percent') || key.includes('usage')) {
return value.toFixed(1);
}
// Format bytes to MB/GB
if (key.includes('bytes') || key.includes('_mb') || key.includes('_gb')) {
if (value > 1073741824) {
return (value / 1073741824).toFixed(2);
} else if (value > 1048576) {
return (value / 1048576).toFixed(2);
}
}
// Default number formatting
if (value > 1000) {
return value.toLocaleString();
}
return value.toFixed(2);
}
return value;
}
function getUnit(key) {
if (key.includes('percent') || key.includes('usage')) return '%';
if (key.includes('_gb')) return 'GB';
if (key.includes('_mb')) return 'MB';
if (key.includes('bytes') && !key.includes('_mb') && !key.includes('_gb')) {
// Determine unit based on typical size
return 'bytes';
}
if (key.includes('count')) return '';
if (key.includes('mhz')) return 'MHz';
return '';
}
// Auto-refresh: every 30 seconds walk the hostname -> plugin pairs recorded
// in selectedPlugins and call loadPluginData() for each pair whose content
// element (id "<hostname>-<pluginName>") exists and carries the 'active' class.
setInterval(function refreshActivePlugins() {
    Object.entries(selectedPlugins).forEach(([hostname, pluginName]) => {
        const panel = document.getElementById(`${hostname}-${pluginName}`);
        if (!panel) {
            return;
        }
        if (panel.classList.contains('active')) {
            loadPluginData(hostname, pluginName);
        }
    });
}, 30000);
// Page initialisation: once the DOM is parsed, visit every .host-card in
// document order. For each card, pass its data-hostname and the data-plugin
// of its first .plugin-pill (if it has one) to showPlugin(); then add the
// 'collapsed' class to every card except the first.
document.addEventListener('DOMContentLoaded', () => {
    const cards = Array.from(document.querySelectorAll('.host-card'));
    for (const [position, hostCard] of cards.entries()) {
        const hostname = hostCard.dataset.hostname;
        const leadingPill = hostCard.querySelector('.plugin-pill');
        if (leadingPill) {
            showPlugin(hostname, leadingPill.dataset.plugin);
        }
        // Only the first host stays expanded.
        if (position > 0) {
            hostCard.classList.add('collapsed');
        }
    }
});
</script>
</body>
</html>
+53 -3
View File
@@ -4,8 +4,8 @@ import asyncio
import zlib
import logging
from .proto import stodict, oldmtodict
from hbd.utils import dur
from ..common.proto import stodict, oldmtodict
from ..common.utils import dur
logger = logging.getLogger(__name__)
@@ -72,11 +72,24 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
- log: callable(loghost, message)
- pushmsg: callable(message)
- msg_to_websockets: callable(typ, data)
- msg_journal: MessageJournal instance for logging all messages
- DEBUG, verbose
"""
if not msg:
return
now = __import__("time").time()
# Log message to journal
msg_journal = ctx.get("msg_journal")
if msg_journal:
# Create async task to log message (non-blocking)
import asyncio
try:
loop = asyncio.get_event_loop()
loop.create_task(msg_journal.log_message(msg, addr, now))
except Exception as e:
logger.debug(f"Failed to log message to journal: {e}")
cfg = ctx.get("config", {})
hbdcls = ctx.get("hbdclass")
log = ctx.get("log")
@@ -88,7 +101,7 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
# normalize addr (ip, port)
ip = addr[0] if isinstance(addr, (list, tuple)) else addr
name = msg.get("name", "unknown")
from hbd.utils import shortname
from ..common.utils import shortname
uname = shortname(name)
@@ -110,6 +123,43 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
if msg.get("ID") == "HTB":
host.doesack = msg.get("acks", -1)
elif msg.get("ID") == "PLG":
# Handle plugin data message
plugin_name = msg.get("plugin")
if plugin_name:
# Extract all fields except ID and plugin name
plugin_data = {k: v for k, v in msg.items() if k not in ["ID", "plugin"]}
# Store plugin data with timestamp
host.add_plugin_data(plugin_name, plugin_data, timestamp=now)
if DEBUG > 1:
print(f"Stored plugin data for {uname}: {plugin_name}")
# Check thresholds if checker is available
threshold_checker = ctx.get("threshold_checker")
if threshold_checker:
try:
state_changes = threshold_checker.check_plugin_data(
host_name=uname,
plugin_name=plugin_name,
data=plugin_data,
alert_states=host.alert_states,
)
if DEBUG > 1 and state_changes:
print(f"Threshold state changes for {uname}: {state_changes}")
except Exception as e:
logger.error(f"Error checking thresholds for {uname}.{plugin_name}: {e}")
# Notify websockets of plugin update
if msg_to_websockets:
try:
msg_to_websockets("plugin", {
"host": uname,
"plugin": plugin_name,
"data": plugin_data,
"timestamp": now
})
except Exception:
pass
host.setcver(msg.get("ver", 0))
try:
+23 -15
View File
@@ -25,19 +25,28 @@ async def _handler(websocket, path=None):
remote_address = websocket.remote_address
if path is None:
path = getattr(websocket, "path", None)
if _verbose:
logger.info("DBG ws_serve: %s: %s", remote_address, path)
logger.info("WebSocket connection from %s: %s", remote_address, path)
try:
# send initial hosts
if _get_hosts:
for h in _get_hosts():
jmsg = json.dumps({"type": "host", "data": h})
await websocket.send(jmsg)
try:
hosts = list(_get_hosts())
logger.debug("Sending %d hosts to new WebSocket client", len(hosts))
for h in hosts:
jmsg = json.dumps({"type": "host", "data": h})
await websocket.send(jmsg)
except Exception as e:
logger.error("Error sending initial hosts: %s", e, exc_info=True)
# send recent messages
if _get_msgs:
for m in list(_get_msgs())[-100:]:
jmsg = json.dumps({"type": "message", "data": m})
await websocket.send(jmsg)
try:
msgs = list(_get_msgs())[-100:]
logger.debug("Sending %d recent messages to new WebSocket client", len(msgs))
for m in msgs:
jmsg = json.dumps({"type": "message", "data": m})
await websocket.send(jmsg)
except Exception as e:
logger.error("Error sending initial messages: %s", e, exc_info=True)
# keep connection open until client disconnects
async for _ in websocket:
@@ -50,11 +59,11 @@ async def _handler(websocket, path=None):
websockets.exceptions.ConnectionClosedOK,
websockets.exceptions.ConnectionClosedError,
) as e:
if _verbose:
logger.info("ws closed: %r", e)
logger.info("WebSocket closed from %s: %r", remote_address, e)
except Exception as e:
logger.exception("ws handler exception: %s", e)
logger.exception("WebSocket handler exception from %s: %s", remote_address, e)
finally:
logger.debug("Removing WebSocket connection from %s", remote_address)
try:
_connections.remove(websocket)
except KeyError:
@@ -101,10 +110,9 @@ async def start(
for srv in servers:
await srv
if _verbose:
logger.info(
"WebSocket server(s) started on port %s (wss %s)", ws_port, wss_port
)
logger.info(
"WebSocket server(s) started on port %s (wss %s)", ws_port, wss_port
)
# block forever (until loop is stopped or cancelled)
await asyncio.Future()
-3
View File
@@ -1,3 +0,0 @@
<label for="drawer-toggle" id="drawer-toggle-label"></label>
<header>{{ header }}</header>