From c5770006f71adc8f6a75409ef17b300cc0a4915f Mon Sep 17 00:00:00 2001 From: Andreas Wrede Date: Thu, 2 Apr 2026 07:17:00 -0400 Subject: [PATCH] hbc proper termination, hbd config reloadable --- .hb.yaml | 2 +- docs/CONFIG_RELOAD.md | 292 +++++++++++++++++++++++++++++++++++++++ docs/HTTP_API.md | 1 - docs/MESSAGE_JOURNAL.md | 2 +- hbd/client/main.py | 96 ++++++++++--- hbd/server/cli.py | 3 +- hbd/server/config.py | 86 ++++++++++++ hbd/server/hbdclass.py | 7 - hbd/server/http.py | 2 +- hbd/server/main.py | 122 ++++++++++++++-- hbd/server/notify.py | 14 ++ hbd/server/threshold.py | 25 ++++ hbd/server/udp.py | 19 +-- scripts/demo_http_api.py | 1 - test_journal.py | 2 - 15 files changed, 612 insertions(+), 62 deletions(-) create mode 100644 docs/CONFIG_RELOAD.md diff --git a/.hb.yaml b/.hb.yaml index 4551661..61ef9b6 100644 --- a/.hb.yaml +++ b/.hb.yaml @@ -248,7 +248,7 @@ threshold_configs: critical: 2 operator: ">=" rtt: - warning: 50 + warning: 100 critical: 250.0 diff --git a/docs/CONFIG_RELOAD.md b/docs/CONFIG_RELOAD.md new file mode 100644 index 0000000..ded5709 --- /dev/null +++ b/docs/CONFIG_RELOAD.md @@ -0,0 +1,292 @@ +# Configuration Reload + +The heartbeat daemon (hbd) supports runtime configuration reloading without requiring a full restart. This allows you to update certain configuration settings while the service continues running. 
+ +## How to Reload Configuration + +Send a SIGHUP signal to the running hbd process: + +```bash +# Find the process ID +ps aux | grep hbd + +# Or use pidof/pgrep +pidof hbd +pgrep -f hbd + +# Send SIGHUP signal +kill -HUP $(pgrep -f hbd) + +# Or if using systemd +systemctl reload heartbeat +``` + +## What Can Be Reloaded + +The following configuration sections can be reloaded without restarting: + +### ✅ Fully Reloadable + +- **Notification Channels** (`notification_channels`) + - Add, remove, or modify notification channel definitions + - Update tokens, API keys, SMTP credentials + - Change recipient lists + +- **Threshold Configurations** (`threshold_configs`) + - Modify warning and critical thresholds + - Add or remove threshold rules + - Change operators and hysteresis values + - Update display formats + +- **Host Configuration** (`hosts`) + - Change watch status + - Update notification channel assignments + - Modify threshold config assignments + - Change dyndns status + +- **Host Lists** + - `watchhosts` - hosts to monitor + - `dyndnshosts` - hosts with dynamic DNS + - `drophosts` - hosts to ignore + +- **Runtime Settings** + - `grace` - grace period multiplier + - `interval` - expected heartbeat interval + - `threshold_renotify_interval` - re-notification interval + - `debug` - debug level + - `verbose` - verbose output + +- **DNS Settings** + - `dyndomains` - dynamic DNS domains + - `nsupdate_bin` - nsupdate binary path + - `rndc_key` - RNDC key path + +### ⚠️ Requires Restart + +The following settings **cannot** be reloaded and require a service restart: + +- **Network Ports** + - `hb_port` - UDP heartbeat port + - `hbd_port` - HTTP API port + - `ws_port` - WebSocket port + - `wss_port` - Secure WebSocket port + +- **SSL/TLS Settings** + - `cert_path` - SSL certificate path + - `wss_pem` - SSL certificate file + - `wss_key` - SSL key file + +- **Persistence** + - `pickfile` - Pickle file path + +- **Logging** + - `logfile` - Log file path + - `logfmt` - Log format + +- 
**Journal Settings** + - `journal_enabled` - Enable/disable journaling + - `journal_dir` - Journal directory + - `journal_file` - Journal filename + - `journal_max_size` - Maximum journal size + - `journal_max_backups` - Number of backup files + +## Reload Process + +When a SIGHUP signal is received: + +1. **Configuration File Loading** + - The config file is re-read from disk + - YAML parsing is performed + - Validation checks are run + +2. **Component Updates** + - Notification system is updated with new channel definitions + - Threshold checker reloads all threshold configurations + - Alert states are preserved to maintain hysteresis + +3. **Error Handling** + - If reload fails, the previous configuration is kept + - Error messages are logged + - Service continues running with old configuration + +4. **Logging** + - Reload start and completion are logged + - Each component reports its reload status + - Total number of thresholds is reported + +## Example Reload Session + +```bash +# Terminal 1: Watch the logs +tail -f /var/log/heartbeat.log + +# Terminal 2: Edit configuration +vim /path/to/.hb.yaml + +# Make changes to notification channels or thresholds +# Save the file + +# Terminal 3: Trigger reload +kill -HUP $(pgrep -f hbd) + +# Terminal 1: See reload messages +2026-04-01 12:34:56 INFO: Received SIGHUP, initiating config reload... +2026-04-01 12:34:56 INFO: ============================================================ +2026-04-01 12:34:56 INFO: Starting configuration reload... +2026-04-01 12:34:56 INFO: ============================================================ +2026-04-01 12:34:56 INFO: Configuration reloaded from /path/to/.hb.yaml +2026-04-01 12:34:56 INFO: Notification configuration reloaded +2026-04-01 12:34:56 INFO: Reloading threshold configuration... 
+2026-04-01 12:34:56 INFO: Threshold configuration reloaded: 42 total thresholds +2026-04-01 12:34:56 INFO: ============================================================ +2026-04-01 12:34:56 INFO: Configuration reload completed successfully +2026-04-01 12:34:56 INFO: ============================================================ +``` + +## Common Use Cases + +### 1. Update Notification Credentials + +If you need to rotate API keys or update SMTP passwords: + +```yaml +notification_channels: + pushover_standard: + type: pushover + token: new-token-here # Updated + user: new-user-key-here # Updated +``` + +Just edit the config file and send SIGHUP - no restart needed. + +### 2. Adjust Threshold Values + +Fine-tune alerting thresholds based on observed behavior: + +```yaml +threshold_configs: + default: + thresholds: + cpu_monitor: + cpu_percent: + warning: 85.0 # Increased from 80.0 + critical: 95.0 # Increased from 90.0 +``` + +Send SIGHUP to apply the new thresholds immediately. + +### 3. Add New Notification Channels + +Add a new notification destination: + +```yaml +notification_channels: + email_oncall: + type: email + recipients: [oncall@example.com] + sender: alerts@example.com + smtp_server: smtp.example.com + +hosts: + critical_server: + threshold_config: default + watch: true + notification_channels: [pushover_standard, email_oncall] # Added +``` + +The new channel becomes active immediately after SIGHUP. + +### 4. Update Watch List + +Start or stop monitoring hosts without restart: + +```yaml +hosts: + new_server: + threshold_config: default + watch: true # Start watching + notification_channels: [pushover_standard] +``` + +## Best Practices + +1. **Test Configuration Before Reload** + - Validate YAML syntax before sending SIGHUP + - Check for typos in channel names + - Verify threshold values are reasonable + +2. 
**Monitor Reload Logs** + - Always check logs after reload to confirm success + - Look for error messages if reload fails + - Verify expected number of thresholds loaded + +3. **Backup Before Changes** + - Keep a backup of working configuration + - Use version control (git) for config files + - Document why changes were made + +4. **Gradual Rollout** + - Test changes on development server first + - Apply to one production server at a time + - Verify behavior before applying everywhere + +5. **Plan for Restart-Required Changes** + - Schedule downtime for port or SSL changes + - Use blue-green deployment if possible + - Keep service downtime minimal + +## Troubleshooting + +### Reload Doesn't Apply Changes + +**Check:** +- Is the config file path correct? +- Did you save the file after editing? +- Are there YAML syntax errors? +- Check the logs for error messages + +**Solution:** +```bash +# Validate YAML syntax +python -c "import yaml; yaml.safe_load(open('.hb.yaml'))" + +# Check file modification time +ls -l .hb.yaml + +# View logs +journalctl -u heartbeat -f +``` + +### Partial Configuration Applied + +**Cause:** Some sections reloaded, others didn't. + +**Solution:** Check logs to see which components failed. Common issues: +- Invalid channel type +- Missing required threshold fields +- Invalid host references + +### Service Becomes Unresponsive + +**Cause:** Malformed configuration caused an exception. + +**Solution:** +1. Revert to backup configuration +2. Send SIGHUP again to reload the good config +3. 
If service is completely stuck, restart it + +## Implementation Details + +The reload mechanism uses: + +- **Signal Handling**: SIGHUP triggers reload event +- **Async-Safe Reloading**: Configuration is loaded asynchronously +- **Component Coordination**: All affected components are updated atomically +- **State Preservation**: Alert states and hysteresis information are maintained +- **Error Recovery**: Failed reloads don't affect running configuration + +## See Also + +- [NOTIFICATIONS.md](NOTIFICATIONS.md) - Notification channel configuration +- [THRESHOLD_ALERTING.md](THRESHOLD_ALERTING.md) - Threshold configuration details +- Configuration examples in `hbd/config_*.yaml` diff --git a/docs/HTTP_API.md b/docs/HTTP_API.md index d247377..8579efb 100644 --- a/docs/HTTP_API.md +++ b/docs/HTTP_API.md @@ -28,7 +28,6 @@ Get list of all monitored hosts with their state information. { "name": "webserver01", "dyn": false, - "ver": 6, "connections": [...] } ] diff --git a/docs/MESSAGE_JOURNAL.md b/docs/MESSAGE_JOURNAL.md index 74d56d5..8af6261 100644 --- a/docs/MESSAGE_JOURNAL.md +++ b/docs/MESSAGE_JOURNAL.md @@ -48,7 +48,7 @@ journal_max_backups: 10 # Number of backup files to keep Messages are logged in JSONL (JSON Lines) format - one JSON object per line: ```json -{"timestamp":1711234567.123,"datetime":"2026-03-28T12:34:56","source_ip":"192.168.1.100","source_port":50003,"message":{"ID":"HTB","name":"webserver1","interval":30,"ver":1}} +{"timestamp":1711234567.123,"datetime":"2026-03-28T12:34:56","source_ip":"192.168.1.100","source_port":50003,"message":{"ID":"HTB","name":"webserver1","interval":30}} {"timestamp":1711234597.456,"datetime":"2026-03-28T12:35:37","source_ip":"192.168.1.101","source_port":50003,"message":{"ID":"PLG","plugin":"cpu_monitor","cpu_percent":45.2,"load_1min":1.5}} ``` diff --git a/hbd/client/main.py b/hbd/client/main.py index 1356bb3..6c23221 100644 --- a/hbd/client/main.py +++ b/hbd/client/main.py @@ -28,12 +28,13 @@ from .plugin import 
PluginRegistry, PluginLoader, InfoPlugin, MonitorPlugin # Constants PORT = 50003 INTERVAL = 10 -VER = 6 MAXRECV = 32767 # Global state running = True dorestart = False +shutdown_event: Optional[asyncio.Event] = None +active_tasks: List[asyncio.Task] = [] class AsyncConnection: @@ -101,7 +102,6 @@ class AsyncConnection: # Add standard fields msg["name"] = shortname(self.name) msg["id"] = self.conn_id - msg["ver"] = VER msg["time"] = time.time() # Encode message @@ -278,9 +278,25 @@ async def heartbeat_sender(conn: AsyncConnection, interval: int): except Exception as e: logger.error(f"Error sending heartbeat: {e}", exc_info=True) + except asyncio.CancelledError: + logger.debug("Heartbeat sender cancelled") + raise - # Wait for next interval - await asyncio.sleep(interval) + # Wait for next interval or shutdown event + try: + if shutdown_event: + await asyncio.wait_for( + shutdown_event.wait(), + timeout=interval + ) + break + else: + await asyncio.sleep(interval) + except asyncio.TimeoutError: + pass # Normal timeout, continue loop + except asyncio.CancelledError: + logger.debug("Heartbeat sender cancelled during sleep") + raise async def plugin_collector(conn: AsyncConnection, registry: PluginRegistry): @@ -324,7 +340,14 @@ async def plugin_collector(conn: AsyncConnection, registry: PluginRegistry): # Wait for all tasks if tasks: - await asyncio.gather(*tasks) + try: + await asyncio.gather(*tasks, return_exceptions=True) + except asyncio.CancelledError: + logger.debug("Plugin collector cancelled, cancelling sub-tasks") + for task in tasks: + if not task.done(): + task.cancel() + raise async def plugin_collector_interval( @@ -350,13 +373,30 @@ async def plugin_collector_interval( plugin_msg = {"plugin": plugin.name, **data} await conn.sendto(plugin_msg, "PLG") logger.debug(f"Sent {plugin.name} data") + except asyncio.CancelledError: + logger.debug("Plugin collector cancelled") + raise except Exception as e: logger.error( f"Error collecting {plugin.name}: {e}", 
exc_info=True ) - await asyncio.sleep(interval) + # Wait for next interval or shutdown event + try: + if shutdown_event: + await asyncio.wait_for( + shutdown_event.wait(), + timeout=interval + ) + break + else: + await asyncio.sleep(interval) + except asyncio.TimeoutError: + pass # Normal timeout, continue loop + except asyncio.CancelledError: + logger.debug("Plugin collector cancelled during sleep") + raise def shortname(name: str) -> str: @@ -368,6 +408,15 @@ def stop(): """Stop the event loop.""" global running running = False + + # Set shutdown event to wake up sleeping tasks + if shutdown_event: + shutdown_event.set() + + # Cancel all active tasks + for task in active_tasks: + if not task.done(): + task.cancel() async def cleanup(connections: List[AsyncConnection]): @@ -393,7 +442,11 @@ async def cleanup(connections: List[AsyncConnection]): async def async_main(args, config): """Async main function.""" - global running + global running, shutdown_event, active_tasks + + # Create shutdown event + shutdown_event = asyncio.Event() + active_tasks = [] logger = logging.getLogger("hbc.main") @@ -464,31 +517,30 @@ async def async_main(args, config): else: logger.warning(f"Plugin directory not found: {plugin_dir}") - # Start async tasks - tasks = [] - - # Heartbeat senders (one per connection) - for conn in connections: - task = asyncio.create_task(heartbeat_sender(conn, interval)) - tasks.append(task) - - # Plugin collector (uses all connections, but we'll use first one) - if connections and registry.get_enabled(): - task = asyncio.create_task(plugin_collector(connections[0], registry)) - tasks.append(task) - # Setup signal handlers loop = asyncio.get_event_loop() for sig in (signal.SIGTERM, signal.SIGINT): loop.add_signal_handler(sig, stop) + # Start async tasks + # Heartbeat senders (one per connection) + for conn in connections: + task = asyncio.create_task(heartbeat_sender(conn, interval)) + active_tasks.append(task) + + # Plugin collector (uses all connections, 
but we'll use first one) + if connections and registry.get_enabled(): + task = asyncio.create_task(plugin_collector(connections[0], registry)) + active_tasks.append(task) + # Wait for stop or tasks to complete try: - await asyncio.gather(*tasks) + await asyncio.gather(*active_tasks, return_exceptions=True) except asyncio.CancelledError: - pass + logger.info("Tasks cancelled") # Cleanup + logger.info("Shutting down...") await cleanup(connections) await loader.unload_all() diff --git a/hbd/server/cli.py b/hbd/server/cli.py index 0a4f0a6..8f8ccad 100644 --- a/hbd/server/cli.py +++ b/hbd/server/cli.py @@ -46,7 +46,8 @@ def main(argv=None): if args.debug > 0: config["debug"] = args.debug - run_server(config) + # Pass config_path for reloading support + run_server(config, config_path=args.configfile) if __name__ == "__main__": diff --git a/hbd/server/config.py b/hbd/server/config.py index 1c945a6..4f91b0b 100644 --- a/hbd/server/config.py +++ b/hbd/server/config.py @@ -1,5 +1,6 @@ """Configuration loader and defaults for hbd (HeartBeat Daemon/Server).""" +import asyncio import logging import os @@ -95,6 +96,91 @@ def load_config(path=None): return cfg +class ReloadableConfig: + """Thread-safe/async-safe configuration wrapper that supports runtime reloading. + + This class wraps the configuration dictionary and provides: + - Thread-safe config reloading via SIGHUP + - Backward-compatible dict-like access + - Async lock to prevent concurrent reloads + """ + + def __init__(self, initial_config, config_path=None): + """Initialize with initial configuration. + + Args: + initial_config: Initial configuration dictionary + config_path: Path to config file for reloading (optional) + """ + self._config = initial_config + self._config_path = config_path + self._lock = asyncio.Lock() + self._logger = logging.getLogger(__name__) + + async def reload(self, config_path=None): + """Reload configuration from file. 
+ + Args: + config_path: Path to config file (uses stored path if not provided) + + Returns: + New configuration dictionary + + Raises: + Exception if reload fails (keeps existing config) + """ + path = config_path or self._config_path + if not path: + raise ValueError("No config path specified for reload") + + async with self._lock: + try: + # Load new config + new_config = load_config(path) + + # Store old config for rollback if needed + old_config = self._config + + # Update config + self._config = new_config + self._logger.info(f"Configuration reloaded from {path}") + + return new_config + except Exception as e: + self._logger.error(f"Failed to reload config from {path}: {e}", exc_info=True) + # Keep existing config on error + raise + + def get(self, key, default=None): + """Get a config value (dict-compatible).""" + return self._config.get(key, default) + + def __getitem__(self, key): + """Get a config value via subscript (dict-compatible).""" + return self._config[key] + + def __contains__(self, key): + """Check if key exists (dict-compatible).""" + return key in self._config + + def keys(self): + """Return config keys (dict-compatible).""" + return self._config.keys() + + def items(self): + """Return config items (dict-compatible).""" + return self._config.items() + + def values(self): + """Return config values (dict-compatible).""" + return self._config.values() + + @property + def config(self): + """Get the underlying config dict (for components that need full dict).""" + return self._config + + def get_watchhosts(config): """Extract watchhosts from config, supporting both new and legacy formats. 
diff --git a/hbd/server/hbdclass.py b/hbd/server/hbdclass.py index 92ebc83..b8e45df 100644 --- a/hbd/server/hbdclass.py +++ b/hbd/server/hbdclass.py @@ -291,7 +291,6 @@ class Host: self.interval = 0 self.doesack = -1 self.cmds = [] - self.cver = 0 self.connections = {} # Plugin data storage: {plugin_name: [(timestamp, data), ...]} self.plugin_data = {} @@ -307,7 +306,6 @@ class Host: if self.watched: d["name"] = "%s" % d["name"] d["dyn"] = str(self.dyn) - d["ver"] = str(self.cver) d["num"] = self.num for c in ["IPv4", "IPv6"]: if c in self.connections: @@ -323,7 +321,6 @@ class Host: d = {} d["name"] = "Name" d["dyn"] = "Dyn" - d["ver"] = "Ver" d["num"] = "??" for c in ["IPv4", "IPv6"]: cs = ubConnection.headerdict(c) @@ -371,9 +368,6 @@ class Host: def jsons(self): return json.dumps(self.stateinfo()) - def setcver(self, cver): - self.cver = cver - def isDynDns(self): return self.dyn @@ -483,7 +477,6 @@ class Host: "IPv6.state", ("IPv6.rtt", 'style="text-align: right;"'), ("IPv6.statetime", 'style="text-align: right;"'), - "ver", ] hostfields_short = [ diff --git a/hbd/server/http.py b/hbd/server/http.py index 2a3c4ca..20b8678 100644 --- a/hbd/server/http.py +++ b/hbd/server/http.py @@ -115,7 +115,7 @@ async def start( if uname != "All": names = [uname] else: - names = [n for n in hbdclass.Host.hosts if hbdclass.Host.hosts[n].cver >= 2] + names = [n for n in hbdclass.Host.hosts] out = [] for n in names: err = None diff --git a/hbd/server/main.py b/hbd/server/main.py index 5bd5fe2..2af91ce 100644 --- a/hbd/server/main.py +++ b/hbd/server/main.py @@ -50,19 +50,78 @@ def cleanup_function(config, hbdclass): logger.info("Cleanup complete.") -async def _run_async(config): +async def reload_configuration(config_obj, config_path, components): + """Reload configuration and update all components. 
+ + Args: + config_obj: ReloadableConfig instance + config_path: Path to config file + components: Dict with threshold_checker and other components + + Returns: + True if reload succeeded, False otherwise + """ + try: + logger.info("=" * 60) + logger.info("Starting configuration reload...") + logger.info("=" * 60) + + # Reload config file + new_config = await config_obj.reload(config_path) + + # Update notify module + notify_mod.reload_config(new_config) + + # Reload threshold checker + if 'threshold_checker' in components: + components['threshold_checker'].reload(new_config) + + # Note: Changes to the following require restart: + # - hb_port, hbd_port, ws_port (already bound) + # - SSL certificates (already loaded) + # - pickfile (already opened) + # - journal settings (journal already initialized) + + # These are reloadable and effective immediately: + # - notification_channels + # - threshold_configs + # - hosts (watchhosts, dyndnshosts, notification_channels) + # - grace period (used on next heartbeat) + # - debug/verbose flags (used on next message) + + logger.info("=" * 60) + logger.info("Configuration reload completed successfully") + logger.info("=" * 60) + return True + + except Exception as e: + logger.error("=" * 60) + logger.error(f"Failed to reload configuration: {e}", exc_info=True) + logger.error("Keeping previous configuration") + logger.error("=" * 60) + return False + + +async def _run_async(config, config_path=None): loop = asyncio.get_running_loop() shutdown_event = asyncio.Event() + reload_event = asyncio.Event() - # Signal handlers for graceful shutdown + # Signal handlers for graceful shutdown and reload def signal_handler(signum, frame): sig_name = signal.Signals(signum).name if hasattr(signal, "Signals") else signum logger.info(f"Received {sig_name}, initiating shutdown...") loop.call_soon_threadsafe(shutdown_event.set) + + def reload_handler(signum, frame): + sig_name = signal.Signals(signum).name if hasattr(signal, "Signals") else signum 
+ logger.info(f"Received {sig_name}, initiating config reload...") + loop.call_soon_threadsafe(reload_event.set) # Register signal handlers loop.add_signal_handler(signal.SIGINT, signal_handler, signal.SIGINT, None) loop.add_signal_handler(signal.SIGTERM, signal_handler, signal.SIGTERM, None) + loop.add_signal_handler(signal.SIGHUP, reload_handler, signal.SIGHUP, None) from . import http as http_mod from . import dns as dns_mod @@ -83,6 +142,12 @@ async def _run_async(config): journal=msg_journal, ) logger.info("Threshold checker initialized") + + # Components dict for reload orchestration + components = { + 'threshold_checker': threshold_checker, + 'msg_journal': msg_journal, + } sock = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM) # Disable IPV6_V6ONLY option to enable dual-stack (listen on IPv4 as well) @@ -128,7 +193,6 @@ async def _run_async(config): port=config.get("hbd_port", 50004), config=config, hbdclass=hbdclass, - threshold_checker=threshold_checker, tcss=None, verbose=config.get("verbose", False), get_now=lambda: time.time(), @@ -193,10 +257,43 @@ async def _run_async(config): except Exception as e: logger.exception("websocket server failed to start: %s", e) + # Main event loop - monitor shutdown and reload events try: - # run forever until shutdown event is set - await shutdown_event.wait() - logger.info("Shutdown signal received, stopping services...") + while True: + # Wait for either shutdown or reload event + done, pending = await asyncio.wait( + [ + asyncio.create_task(shutdown_event.wait()), + asyncio.create_task(reload_event.wait()), + ], + return_when=asyncio.FIRST_COMPLETED + ) + + # Check which event was triggered + if shutdown_event.is_set(): + logger.info("Shutdown signal received, stopping services...") + # Cancel pending wait tasks + for task in pending: + task.cancel() + break + + if reload_event.is_set(): + # Clear the event for next reload + reload_event.clear() + + # Cancel pending wait tasks + for task in pending: + 
task.cancel() + + # Perform reload if config_path is available + if config_path: + await reload_configuration(config, config_path, components) + else: + logger.warning("Cannot reload: no config path available") + + # Continue main loop + continue + except Exception as e: logger.exception("Error in main loop: %s", e) finally: @@ -298,10 +395,14 @@ def load_pickled_hosts(config, hbdclass): logger.info("no pickled data") -def run(config): +def run(config, config_path=None): """Start the hbd service (blocking). Manually manages the event loop to ensure clean shutdown. + + Args: + config: Configuration dictionary + config_path: Path to config file (for reload support) """ import os @@ -312,13 +413,18 @@ def run(config): notify_mod.initlog(logfile=config.get("logfile", "messages.log")) eventlog(None, "INFO", f"hbd version {__version__} starting up") + + if config_path: + logger.info(f"Config file: {config_path} (reload with SIGHUP)") + else: + logger.warning("No config path provided - reload via SIGHUP disabled") # Create and set the event loop manually loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) try: - loop.run_until_complete(_run_async(config)) + loop.run_until_complete(_run_async(config, config_path=config_path)) except KeyboardInterrupt: logger.info("Received KeyboardInterrupt, shutting down...") except Exception as e: diff --git a/hbd/server/notify.py b/hbd/server/notify.py index 0c897c0..895d8f4 100644 --- a/hbd/server/notify.py +++ b/hbd/server/notify.py @@ -62,6 +62,20 @@ def setup(cfg: dict): _config = dict(cfg) +def reload_config(cfg: dict): + """Reload notification configuration. + + This function updates the module-level notification configuration + during runtime config reloads. + + Args: + cfg: New configuration dictionary + """ + global _config + _config = dict(cfg) + logger.info("Notification configuration reloaded") + + def send_email(toaddrs, smtpserver, sender, subject, body, debug=0): """Send a plain email via SMTP. 
Returns True on success.""" try: diff --git a/hbd/server/threshold.py b/hbd/server/threshold.py index 1217814..521e3c8 100644 --- a/hbd/server/threshold.py +++ b/hbd/server/threshold.py @@ -316,6 +316,31 @@ class ThresholdChecker: total_thresholds ) + def reload(self, config: Dict[str, Any]): + """Reload threshold configuration from new config dict. + + This clears all existing thresholds and re-parses from the new configuration. + Alert states are preserved to maintain hysteresis across reloads. + + Args: + config: New configuration dictionary + """ + logger.info("Reloading threshold configuration...") + + # Clear old configuration + self.threshold_configs.clear() + self.thresholds.clear() + self.host_config_mapping.clear() + + # Parse new configuration + self._parse_config(config) + + total_thresholds = sum(len(cfg) for cfg in self.threshold_configs.values()) + if total_thresholds == 0 and len(self.thresholds) > 0: + total_thresholds = len(self.thresholds) + + logger.info("Threshold configuration reloaded: %d total thresholds", total_thresholds) + def _parse_config(self, config: Dict[str, Any]): """Parse threshold configuration from YAML structure. 
diff --git a/hbd/server/udp.py b/hbd/server/udp.py index 4f26bf7..743a009 100644 --- a/hbd/server/udp.py +++ b/hbd/server/udp.py @@ -127,10 +127,7 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict): host.doesack = msg.get("acks", -1) # send ACK back rmsg = {"time": __import__("time").time()} - if host.cver < 1: - opkt = b"ACK" - else: - opkt = dicttos("ACK", rmsg) + opkt = dicttos("ACK", rmsg) try: transport.sendto(opkt, addr) except Exception as e: @@ -174,7 +171,6 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict): }) except Exception: pass - host.setcver(msg.get("ver", 0)) try: conn, res = host.conndata(cid, ip, rtt, now) @@ -301,22 +297,11 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict): del host.cmds[0] if log: log(uname, "command sent") - if host.cver < 1: - rmsg = rmsg["cmd"] elif op == "UPD": del host.cmds[0] if log: log(uname, "update initiated") - if host.cver < 1: - if log: - log(uname, " ver 0 does not support UPD") - continue - if host.cver < 1: - opkt = rmsg if isinstance(rmsg, (bytes, str)) else str(rmsg) - if isinstance(opkt, str): - opkt = opkt.encode() - else: - opkt = dicttos(op, rmsg) + opkt = dicttos(op, rmsg) try: transport.sendto(opkt, addr) except Exception as e: diff --git a/scripts/demo_http_api.py b/scripts/demo_http_api.py index 7ea57f5..2f51ba5 100644 --- a/scripts/demo_http_api.py +++ b/scripts/demo_http_api.py @@ -51,7 +51,6 @@ def test_hosts_api(): print(f"Found {len(hosts)} hosts:\n") for host in hosts: name = host.get('name', 'unknown') - ver = host.get('ver', 0) dyn = host.get('dyn', False) conn_count = len(host.get('connections', [])) diff --git a/test_journal.py b/test_journal.py index b836d98..46c9d64 100644 --- a/test_journal.py +++ b/test_journal.py @@ -52,7 +52,6 @@ async def test_basic_logging(): 'ID': 'HTB', 'name': 'testhost1', 'interval': 30, - 'ver': 1 }, { 'ID': 'PLG', @@ -146,7 +145,6 @@ async def test_rotation(): 'ID': 'HTB', 'name': f'testhost{i}', 'interval': 30, - 'ver': 1, 
'data': 'x' * 50 # Add some padding } await journal.log_message(msg, ('192.168.1.100', 50000 + i), 1000.0 + i)