From 917d6a401b2536979d29d76991a25e4bdd87e9f9 Mon Sep 17 00:00:00 2001 From: Andreas Wrede Date: Sat, 2 May 2026 10:35:23 -0400 Subject: [PATCH] feat: composable threshold_config list for per-host threshold layering threshold_config in the hosts section now accepts a list of named configs applied left-to-right on top of the defaults, so focused override profiles can be mixed without duplication. Single-string and legacy host_threshold_mapping forms are unchanged. - Add threshold_raw_configs to store per-config overrides separately - Normalise threshold_config to list on parse (string or list) - get_thresholds_for_host folds the list over the default base - Update README and docs/THRESHOLD_ALERTING.md with examples Co-Authored-By: Claude Sonnet 4.6 (1M context) --- README.md | 35 ++++++ docs/THRESHOLD_ALERTING.md | 244 +++++++++++++++++++++++++++++-------- hbd/server/threshold.py | 115 ++++++++++------- 3 files changed, 299 insertions(+), 95 deletions(-) diff --git a/README.md b/README.md index 97e0db0..1194aa2 100644 --- a/README.md +++ b/README.md @@ -267,6 +267,41 @@ All plugin metrics can be thresholded: - **Network**: errors_total, dropped packets, connection counts - **Nagios**: exit_code mapping (0=OK, 1=WARNING, 2=CRITICAL) +### Per-Host Threshold Profiles + +Named threshold configurations let different hosts use different limits. 
A host's `threshold_config` can be a single name or a **list** — lists are applied left-to-right so profiles compose without duplication: + +```yaml +threshold_configs: + default: + thresholds: + cpu_monitor: + cpu_percent: {warning: 80, critical: 90} + memory_monitor: + memory_percent: {warning: 85, critical: 95} + + tight_cpu: # override CPU limits only + thresholds: + cpu_monitor: + cpu_percent: {warning: 60, critical: 75} + + db_disk: # add a database partition check + thresholds: + disk_monitor: + partitions: + /var/lib/postgresql: + percent: {warning: 75, critical: 88} + +hosts: + web-01: + threshold_config: default # single profile + + db-01: + threshold_config: [tight_cpu, db_disk] # layered: CPU override + extra disk check +``` + +Each named config's overrides are applied in order on top of the defaults. Metrics not mentioned in a profile are inherited unchanged. + See [docs/THRESHOLD_ALERTING.md](docs/THRESHOLD_ALERTING.md) for comprehensive documentation including best practices, troubleshooting, and advanced configuration. --- diff --git a/docs/THRESHOLD_ALERTING.md b/docs/THRESHOLD_ALERTING.md index a3eed90..48bf66c 100644 --- a/docs/THRESHOLD_ALERTING.md +++ b/docs/THRESHOLD_ALERTING.md @@ -814,42 +814,39 @@ Planned features: ## Multi-Threshold Configuration -**New in version 2.0**: Support for multiple named threshold configurations with per-host mapping. +Support for multiple named threshold configurations with per-host mapping and composable layering. 
### Overview The multi-threshold feature allows you to: -- Define multiple sets of threshold configurations -- Map different hosts to different threshold sets +- Define multiple named threshold configurations +- Assign one or more configurations to each host +- Compose configurations by layering — each named config's overrides are applied in order on top of the defaults - Use different sensitivity levels for different environments -- Maintain a default configuration for unmapped hosts ### Configuration Structure +Named configurations are defined under `threshold_configs`. Each host selects which ones to use via `threshold_config` in the `hosts` section (a string for a single config, or a list to layer multiple): + ```yaml -# Optional: Set the default configuration name (defaults to "default") +# Optional: set the default configuration name (defaults to "default") default_threshold_config: "default" -# Define multiple named threshold configurations threshold_configs: - # Configuration name 1 default: thresholds: - # Standard threshold definitions cpu_monitor: cpu_percent: warning: 80.0 critical: 90.0 - - # Configuration name 2 + high_sensitivity: thresholds: cpu_monitor: cpu_percent: warning: 60.0 critical: 75.0 - - # Configuration name 3 + low_sensitivity: thresholds: cpu_monitor: @@ -857,14 +854,77 @@ threshold_configs: warning: 90.0 critical: 95.0 -# Map specific hosts to specific configurations -host_threshold_mapping: - prod-web-01: high_sensitivity - prod-web-02: high_sensitivity - dev-server-01: low_sensitivity - # Unmapped hosts use default_threshold_config +hosts: + prod-web-01: + threshold_config: high_sensitivity # single config + + dev-server-01: + threshold_config: low_sensitivity + + # Hosts with no threshold_config use default_threshold_config ``` +### Composable Configurations (list form) + +`threshold_config` can be a list. Configs are applied **left to right**: the defaults are the base, then each named config's overrides are layered on top. 
Later entries in the list win on any metric they define. + +```yaml +threshold_configs: + default: + thresholds: + cpu_monitor: + cpu_percent: {warning: 80, critical: 90} + memory_monitor: + memory_percent: {warning: 85, critical: 95} + disk_monitor: + partitions: + /: + percent: {warning: 80, critical: 90} + + # Tighter CPU limits for busy servers + high_cpu_load: + thresholds: + cpu_monitor: + cpu_percent: {warning: 60, critical: 75} + + # Tighter disk limits for data-heavy servers + busy_disk: + thresholds: + disk_monitor: + partitions: + /: + percent: {warning: 70, critical: 85} + +hosts: + # Gets default thresholds only + web-01: + threshold_config: default + + # Gets tighter CPU limits, default memory and disk + build-server: + threshold_config: high_cpu_load + + # Layers both: tighter CPU AND tighter disk, default memory + db-01: + threshold_config: [high_cpu_load, busy_disk] + + # Three layers: busy_disk overrides high_cpu_load if they conflict + storage-01: + threshold_config: [default, high_cpu_load, busy_disk] +``` + +**How layering works:** + +Starting from the `default` thresholds: + +| Layer | Applied config | Effect | +|-------|---------------|--------| +| Base | `default` | all default thresholds | +| +1 | `high_cpu_load` | cpu_percent overridden to 60/75 | +| +2 | `busy_disk` | disk percent overridden to 70/85; cpu_percent stays at 60/75 | + +Each named config only overrides the metrics it explicitly defines. Metrics not mentioned in a config inherit from the layers beneath. + ### Use Cases #### 1. 
Environment-Based Thresholds @@ -879,7 +939,7 @@ threshold_configs: cpu_percent: warning: 70.0 # Alert earlier in production critical: 85.0 - + development: thresholds: cpu_monitor: @@ -887,11 +947,15 @@ threshold_configs: warning: 90.0 # More relaxed for dev critical: 98.0 -host_threshold_mapping: - prod-web-01: production - prod-web-02: production - dev-web-01: development - dev-web-02: development +hosts: + prod-web-01: + threshold_config: production + prod-web-02: + threshold_config: production + dev-web-01: + threshold_config: development + dev-web-02: + threshold_config: development ``` #### 2. Server Role-Based Thresholds @@ -906,7 +970,7 @@ threshold_configs: cpu_percent: warning: 80.0 critical: 90.0 - + database: thresholds: cpu_monitor: @@ -914,7 +978,7 @@ threshold_configs: warning: 70.0 critical: 85.0 memory_monitor: - percent: + memory_percent: warning: 90.0 # Databases can use high memory critical: 97.0 disk_monitor: @@ -923,21 +987,27 @@ threshold_configs: percent: warning: 75.0 critical: 85.0 - + cache: thresholds: memory_monitor: - percent: + memory_percent: warning: 95.0 # Redis/Memcached can use very high memory critical: 99.0 -host_threshold_mapping: - web-01: webserver - web-02: webserver - db-01: database - db-02: database - redis-01: cache - memcached-01: cache +hosts: + web-01: + threshold_config: webserver + web-02: + threshold_config: webserver + db-01: + threshold_config: database + db-02: + threshold_config: database + redis-01: + threshold_config: cache + memcached-01: + threshold_config: cache ``` #### 3. 
Sensitivity Levels @@ -952,10 +1022,10 @@ threshold_configs: partitions: /: percent: - warning: 70.0 # Very sensitive + warning: 70.0 critical: 80.0 hysteresis: 0.15 - + standard: thresholds: disk_monitor: @@ -965,7 +1035,7 @@ threshold_configs: warning: 85.0 critical: 95.0 hysteresis: 0.1 - + relaxed: thresholds: disk_monitor: @@ -976,12 +1046,69 @@ threshold_configs: critical: 98.0 hysteresis: 0.05 -host_threshold_mapping: - payment-gateway: critical - auth-server: critical - web-01: standard - web-02: standard - test-server: relaxed +hosts: + payment-gateway: + threshold_config: critical + auth-server: + threshold_config: critical + web-01: + threshold_config: standard + web-02: + threshold_config: standard + test-server: + threshold_config: relaxed +``` + +#### 4. Composable Profiles + +Build host-specific thresholds by combining small, focused configs: + +```yaml +threshold_configs: + # Baseline — everything at default levels + default: + thresholds: + cpu_monitor: + cpu_percent: {warning: 80, critical: 90} + memory_monitor: + memory_percent: {warning: 85, critical: 95} + + # Overlay: tighter CPU only + tight_cpu: + thresholds: + cpu_monitor: + cpu_percent: {warning: 60, critical: 75} + + # Overlay: tighter memory only + tight_memory: + thresholds: + memory_monitor: + memory_percent: {warning: 70, critical: 85} + + # Overlay: extra disk partition for database servers + db_disk: + thresholds: + disk_monitor: + partitions: + /var/lib/postgresql: + percent: {warning: 75, critical: 88} + +hosts: + # Plain web server + web-01: + threshold_config: default + + # Build server: tight CPU, default memory and disk + build-01: + threshold_config: tight_cpu + + # Database: tight CPU + tight memory + extra disk partition + db-01: + threshold_config: [tight_cpu, tight_memory, db_disk] + + # Replica database: tight memory + extra disk, normal CPU + db-02: + threshold_config: [tight_memory, db_disk] ``` ### Backward Compatibility @@ -1012,16 +1139,25 @@ threshold_configs: ### 
Configuration Priority -1. **Host-specific mapping**: If host is in `host_threshold_mapping`, use that config -2. **Default config**: Use `default_threshold_config` -3. **First alphabetically**: If default not found, use first config alphabetically -4. **Legacy fallback**: If `threshold_configs` not present, use `thresholds` +1. **Host `threshold_config` (list)**: Layer each named config's overrides left-to-right on top of the defaults +2. **Host `threshold_config` (string)**: Use that single named config directly +3. **`host_threshold_mapping`** (legacy): Resolved like the host `threshold_config` forms above (string values); if a host appears both here and under `hosts`, this mapping's entry wins +4. **`default_threshold_config`**: Used for hosts with no mapping +5. **First alphabetically**: If the default config is not found, use the first config alphabetically +6. **Legacy `thresholds` section**: Used when `threshold_configs` is absent entirely -### Example: Complete Multi-Threshold Setup +### Legacy Configuration Keys -See `hbd/config_multi_threshold_example.yaml` for a complete example with: -- 4 named configurations (default, high_sensitivity, low_sensitivity, database) -- Host-to-config mappings for production, development, and test systems -- Specialized database server thresholds -- Custom display messages with plugin data +The legacy `host_threshold_mapping` top-level key and the flat `thresholds` section are still fully supported: + +```yaml +# Still works — equivalent to hosts: {prod-web-01: {threshold_config: high_sensitivity}} +host_threshold_mapping: + prod-web-01: high_sensitivity + +# Still works — equivalent to threshold_configs: {default: {thresholds: ...}} +thresholds: + cpu_monitor: + cpu_percent: {warning: 80, critical: 90} +``` diff --git a/hbd/server/threshold.py b/hbd/server/threshold.py index 3e25060..6588651 100644 --- a/hbd/server/threshold.py +++ b/hbd/server/threshold.py @@ -328,15 +328,18 @@ class ThresholdChecker: renotify_interval: Seconds between repeat notifications (default: 1 hour) journal: Optional MessageJournal instance for logging threshold 
events """ - # Named threshold configurations: {config_name: {metric_path: ThresholdConfig}} + # Named threshold configurations (pre-merged: defaults + overrides): {config_name: {metric_path: ThresholdConfig}} self.threshold_configs = {} - + + # Raw overrides only for each named config (no defaults baked in): {config_name: {metric_path: ThresholdConfig}} + self.threshold_raw_configs: Dict[str, Dict[str, ThresholdConfig]] = {} + # Single threshold set for backward compatibility: {metric_path: ThresholdConfig} self.thresholds = {} - - # Host to config name mapping: {host_name: config_name} - self.host_config_mapping = {} - + + # Host to ordered list of config names: {host_name: [config_name, ...]} + self.host_config_mapping: Dict[str, List[str]] = {} + # Default config name to use when no mapping exists self.default_config = "default" @@ -372,6 +375,7 @@ class ThresholdChecker: # Clear old configuration self.threshold_configs.clear() + self.threshold_raw_configs.clear() self.thresholds.clear() self.host_config_mapping.clear() self.grace_seconds = float(config.get("grace", 2)) @@ -424,9 +428,10 @@ class ThresholdChecker: self._parse_plugin_thresholds(plugin_name, plugin_thresholds, target_dict=effective_defaults) self.threshold_configs["default"] = dict(effective_defaults) + self.threshold_raw_configs["default"] = {} logger.info("Registered 'default' threshold config with %d metrics", len(effective_defaults)) - # Parse each named configuration, seeding it with effective_defaults first + # Parse each named configuration for config_name, config_data in threshold_configs.items(): if config_name == "default": continue # already handled above @@ -440,33 +445,41 @@ class ThresholdChecker: continue logger.info("Parsing threshold configuration: %s", config_name) - self.threshold_configs[config_name] = dict(effective_defaults) + # Raw overrides only (used for multi-config layering) + raw_overrides: Dict[str, ThresholdConfig] = {} thresholds_config = config_data["thresholds"] 
for plugin_name, plugin_thresholds in thresholds_config.items(): - if not isinstance(plugin_thresholds, dict): - continue + if isinstance(plugin_thresholds, dict): + self._parse_plugin_thresholds(plugin_name, plugin_thresholds, target_dict=raw_overrides) + self.threshold_raw_configs[config_name] = raw_overrides - self._parse_plugin_thresholds( - plugin_name, - plugin_thresholds, - target_dict=self.threshold_configs[config_name] - ) - - # Parse host to config mapping from two possible sources - # 1. New format: hosts section with threshold_config attribute + # Pre-merged version (defaults + overrides) for single-config fast path + self.threshold_configs[config_name] = dict(effective_defaults) + self.threshold_configs[config_name].update(raw_overrides) + + # Parse host → config list mapping from two possible sources + + def _normalise(value) -> List[str]: + """Accept a string or list; always return a list.""" + if isinstance(value, list): + return [str(v) for v in value] + return [str(value)] + + # 1. hosts section with threshold_config attribute (string or list) if "hosts" in config: hosts_config = config["hosts"] if isinstance(hosts_config, dict): for host_name, host_attrs in hosts_config.items(): if isinstance(host_attrs, dict) and "threshold_config" in host_attrs: - self.host_config_mapping[host_name] = host_attrs["threshold_config"] - - # 2. Legacy format: host_threshold_mapping section (for backward compatibility) + self.host_config_mapping[host_name] = _normalise(host_attrs["threshold_config"]) + + # 2. 
Legacy host_threshold_mapping section (string values only) if "host_threshold_mapping" in config: legacy_mapping = config.get("host_threshold_mapping", {}) if isinstance(legacy_mapping, dict): - self.host_config_mapping.update(legacy_mapping) + for host_name, value in legacy_mapping.items(): + self.host_config_mapping[host_name] = _normalise(value) # Set default config (first one alphabetically or explicitly set) self.default_config = config.get("default_threshold_config", "default") @@ -664,35 +677,55 @@ class ThresholdChecker: ) def get_thresholds_for_host(self, host_name: str) -> Dict[str, ThresholdConfig]: - """Get the appropriate threshold configuration for a host. - + """Get the effective threshold configuration for a host. + + When threshold_config is a list, configs are applied left-to-right on top + of the default thresholds so earlier entries can be overridden by later ones. + Args: host_name: Name of the host - + Returns: Dictionary of thresholds for this host """ # Legacy mode: single threshold set for all hosts if self.thresholds and not self.threshold_configs: return self.thresholds - - # Multi-config mode: look up host-specific configuration - if self.threshold_configs: - config_name = self.host_config_mapping.get(host_name, self.default_config) - - if config_name in self.threshold_configs: - return self.threshold_configs[config_name] - else: + + if not self.threshold_configs: + return {} + + config_names = self.host_config_mapping.get(host_name) + + # No host-specific mapping → return pre-merged default + if not config_names: + return self.threshold_configs.get(self.default_config, {}) + + # Single config → fast path using pre-merged copy + if len(config_names) == 1: + name = config_names[0] + if name in self.threshold_configs: + return self.threshold_configs[name] + logger.warning( + "Threshold config '%s' not found for host '%s', using default '%s'", + name, host_name, self.default_config, + ) + return 
self.threshold_configs.get(self.default_config, {}) + + # Multiple configs → start from defaults, layer raw overrides in order + result = dict(self.threshold_configs.get(self.default_config, {})) + for name in config_names: + if name == self.default_config: + continue # defaults already the base + raw = self.threshold_raw_configs.get(name) + if raw is None: logger.warning( - "Threshold config '%s' not found for host '%s', using default '%s'", - config_name, - host_name, - self.default_config + "Threshold config '%s' not found for host '%s', skipping", + name, host_name, ) - return self.threshold_configs.get(self.default_config, {}) - - # No thresholds configured - return {} + else: + result.update(raw) + return result def check_value( self,