From 090d341244b68573ccf9c3582bf2ba12f6eda55e Mon Sep 17 00:00:00 2001 From: Andreas Wrede Date: Wed, 1 Apr 2026 15:22:42 -0400 Subject: [PATCH] per-client threshold config --- .hb.yaml | 156 +++++++--- .hb.yaml.swp | Bin 12288 -> 20480 bytes docs/THRESHOLD_ALERTING.md | 217 ++++++++++++++ hbd/config_multi_threshold_example.yaml | 202 +++++++++++++ hbd/server/templates/alerts.html | 6 +- hbd/server/threshold.py | 365 +++++++++++++++++++++--- nagios_bad.sh | 4 + 7 files changed, 873 insertions(+), 77 deletions(-) create mode 100644 hbd/config_multi_threshold_example.yaml create mode 100755 nagios_bad.sh diff --git a/.hb.yaml b/.hb.yaml index 02591e2..ec54df5 100644 --- a/.hb.yaml +++ b/.hb.yaml @@ -50,43 +50,119 @@ journal_max_size: 104857600 # Max size (100MB default) journal_max_backups: 10 # Number of backups to keep thresholds: - cpu_monitor: - cpu_percent: - warning: 80.0 - critical: 90.0 - memory_monitor: - percent: - warning: 3.0 - critical: 95.0 - disk_monitor: - partitions: - /: - percent: - warning: 85.0 - critical: 90.0 - nagios_runner: - overall_status_code: - warning: 1 - critical: 2 - operator: ">=" - load_status: - warning: WARNING - critical: CRITICAL - operator: "==" - UPS_load: - warning: 70 - critical: 80 - operator: ">=" - UPS_status_code: - warning: 1 - critical: 2 - operator: ">=" - nextcloud_apps_status: - display: "{nextcloud_apps_output}" - warning: 1 - critical: 2 - operator: ">=" - rtt: - y: - warning: 30 - critical: 250.0 + default: + cpu_monitor: + cpu_percent: + warning: 80.0 + critical: 90.0 + memory_monitor: + percent: + warning: 3.0 + critical: 95.0 + disk_monitor: + partitions: + /: + percent: + warning: 85.0 + critical: 90.0 + rtt: + y: + warning: 30 + critical: 250.0 + + + freebsd_server: + cpu_monitor: + cpu_percent: + warning: 80.0 + critical: 90.0 + memory_monitor: + percent: + warning: 3.0 + critical: 95.0 + disk_monitor: + partitions: + /: + percent: + warning: 85.0 + critical: 90.0 + nagios_runner: + # overall_status_code: + # warning: 1 + # critical: 2 + # operator: ">=" + load_status: + warning: WARNING + critical: CRITICAL + operator: "==" + UPS_load: + display: "{ups_output}" + warning: 70 + critical: 80 + operator: ">=" + UPS_status_code: + display: "{ups_output}" + warning: 1 + critical: 2 + operator: ">=" + nextcloud_apps_status_code: + display: "{nextcloud_apps_output}" + warning: 1 + critical: 2 + operator: ">=" + rtt: + y: + warning: 30 + critical: 250.0 + + truenas_server: + cpu_monitor: + cpu_percent: + warning: 80.0 + critical: 90.0 + memory_monitor: + percent: + warning: 3.0 + critical: 95.0 + disk_monitor: + partitions: + /: + percent: + warning: 85.0 + critical: 90.0 + nagios_runner: + # overall_status_code: + # warning: 1 + # critical: 2 + # operator: ">=" + load_status: + warning: WARNING + critical: CRITICAL + operator: "==" + UPS_load: + display: "{ups_output}" + warning: 70 + critical: 80 + operator: ">=" + UPS_status_code: + display: "{ups_output}" + warning: 1 + critical: 2 + operator: ">=" + nextcloud_apps_status_code: + display: "{nextcloud_apps_output}" + warning: 1 + critical: 2 + operator: ">=" + rtt: + y: + warning: 30 + critical: 250.0 + + +host_threshold_mapping: + # Critical production servers + + wally: freebsd_server + eris: truenas_server + diff --git a/.hb.yaml.swp b/.hb.yaml.swp index 6dc360126d7829af182f10486444e38fa7372be6..10da5a262ec771698a93c2f5e7500da252e900e1 100644 GIT binary patch literal 20480 zcmeI2dyHL09mmJkM_c@Mxn?b>#CyM1VG&OK-Dy{Grw zGl!Wud-s+g36Ox1fPaXU7(ruV_`{GU)*uP7QL7=1CYTVhiH#xn2ZmUU{82zj{La1S z+{d=r+WH4o=Omw;IrI3<%=es`Gp|x+|MYG;mdX-b*AOztf8(8d`jg}*x%I>jT)}Lq zLsq>SUZ9v*HA*Y+E3v_J#Z&1jvxO=#TcuOgQmSd!{Yx7#F^mF6fh$uWQsvw8o5;{$ zHlqy;SU1v7e0<@`Y?)L>0i%FXz$jo8FbWt2i~>dhqriWq0xDcf9ze<0#HF8(?;luk z|6u&QE51*y2wxjNUmM@Qwjw-!G&iGwQNSo*6fg=H1&jhl0i%FXz$jo8FbWt2-UkJ2 zmyoR=A!K14GROXZGXH;T3n8z8SHO2b1*E}^;5x7YytkQ{f%p5F33(O#4!j7CfhITz?gZC^O<*H<@1uk~2kr%50+V1HxCU$i zr#BGtFxUmwfm7=V`2~0${1AK_JPHIj40eD#7yui=yU^eba2A{f$H9}}G4K%JU=i#G zTCaKVF0^|c{1%)9C&2f>Bftj-!8q6kt_A=6Fd=^iFM}V0r@`YO0EfU7NP+9X`418D zC-5?O0UQH9I0&Xd3fv4n1=fMTVUgyq;4JtxcmaGLJO-A52j;;zxD9Lto54S@c=H`f^TnXnZXNcZ{iBsQgv@RaA+eu3sQowyBK1YS@n&|KH= z(%Jglb}yRW_(k)Z*f%{tJ+Xb}!dw@(@0*>T-4O?9vvHMReSo&FmDNf7yY(6DM()%n zoA~i1sNwq;N)^}@kISNnf`Ex`ZFHtx(_@)bR~Q}C$sj+{7Xr;47O1YBbm9zgNC(RE zK(>7#Z>?D8fu~;)(KhwflVz{rPIg0`-1>5=vpN^eRn7I*pkprI^>xCi*k@CF^psDD zhy}JRN+uv>fsj^t`-(g`(lys3(NH2?vt3W2dh}WOT4SqS9q?A;t5a&c)cET8Mgz92 z9G^#S(GD+cB`ZR&tik-dMKP_k)9IL2+S4Cgek;XDBKg#NTErDJGi{}7!E|TTWo0|^ z(GjVtV6w`67yU9h!Xpvbez9&Z7fZHNLrX5uY$kaJo%;Ztjp`*PC@<4E9#x#y7z_Js zNbe}q28Qz^xzS9&odLSrUZy$*{cJXq*?m`1rfnS)WzR?QI+N0_Fqw)=*C_?X&+#r> zGOFVRcAa$+x*l3!y2|S;otzZ)c|~@Qigs)a(0kf-rbxpa{EV6%E3jGenOmT`@9(7O zaSYI@R+KdQVI7smCB#qyV?o-GvRGq!;1UXN39eMg>Q)>8Cj!cm?>kl73zDFTseSXs zJ=^DZ(s62~(uzry)=3;DR57$w6^SnDnU17L`y3;iG*eF6XG*fb5zSDgeQ$}St0gzp z5X@yM$8K5Fr3<-G1v-++WJa}^6TDu>-KS!s?tHKwQ+NX2Z# zC{xLtNHDrMH%A)1wnhk%brr_O^SSnuEoFlXsA-9OD1|-RI5Jvs>{>0fS`m<$?l$B1 z^@XIY?ZEi2OD%0URko$7+n%3tIP@$Fyl4HWQ?mvR0lPh^6_3LXCxNMeE3e$_#`!XF zn}N$=0)rEJpViKqi;8tWAz>t|QY^u?cIjc1)$-G>VqE5Evbu9;elB`SswQ8u9qzm$S8De z#fl|QNAG#J@JJ`27I6g+t6Zv9v7?|C@J3*Lj#>@QYaYvvwB16XBD#tJDiBAm+R~}| zT*_7yyQy1o%Zcx7iTi%r%}`udI~}enD35lErNre58zUy|$Al-5N76K;a8PRLRnV7o zN8WAO3-q?^aCS65I5?WkjkS_k?PK3ocNlmH1H)t4k?i2m@W^N`ADd|YN2we5>hW=^ zDy+1A*_{d}&t1J};?-(wT2Lfv*vhHK;*!=O_9Z>VzzNxkupM+_!x@4es9Z;Lv+A;f zCaoV^{hO6uz~?PHJyjT$e9tLXRozb~eP0r2wXDR+R_WimJzWa<^yk)>Z1v30l#04m z1$xUZq+06fFNW~0$$;*3QK3d=&?0*N{~6579|wB=pX}fN1@rqe;OF4GAOa;Y3vwU@ zZU!F%?;!2l;5_&vI0=3VegK{TE!*$I+J?!}C}0#Y3K#{90!9I&fKk9GU=%P47zM6c z1t`^v6799S?!FDy8yeUK!dhqkvJsDDc0gKr)2w yIZtE`@t8wAr2Sp=9gDmCLp<%1mFB43<(y45NA1jk*cFQkV^`UM*!~5S0{Iumk*CZ6 delta 493 zcmZY3O-mb56b9fE^QjnSk}#Q$*<2EnQ3Or9C{#)j>?Wq^bYm0=*fb3o&`d}sL{~{^ z|3Gt!?WT~8AK*$-!A166DFiHTOA%Zt;zAdN;yYbPp$8tg_ndnVZ>!^Iy^g#|W>XpQ zWbz@QWTJ25Mf6`5K z=)nXVFchcF-9e(y@E*#L0|TCd4k~!zkB{gAen1cQpbhW9ge6G9BhbJFH(sJM=tB>7 zU=u!o35J(ql)ym&CtM=hIsAlUu%jM%x(?Tbn|moA-;S!>7?X!Krb&pZWla+&;}LGe z{QN58=7mgvt+>u#<57MbSN`cs{41sY?d?QEtk22(DIxQALSZ$g@$5^*ziQ^+F4wHQ zRj>WGRpRHm%F(pO>$=8^=`e>4DKuq=5TcT=S|zJwmhFGfj4)>ng}" hysteresis: 0.1 + display: "display format" enabled: true ``` @@ -82,6 +83,8 @@ Note: At least one of `warning` or `critical` must be specified. - Range: 0.0 to 1.0 - Prevents rapid state transitions when value hovers near threshold +- **display**: f-string to hold the display format for alert messages + - defaults to "(threshold: {op_symbol} {threshold_value})" - **enabled**: Whether this threshold is active (default: `true`) ### Comparison Operators @@ -740,3 +743,217 @@ Planned features: - [Message Journal Documentation](MESSAGE_JOURNAL.md) - Configuration examples: `hbd/config_thresholds_example.yaml` - Test suite: `test_threshold.py` + +## Multi-Threshold Configuration + +**New in version 2.0**: Support for multiple named threshold configurations with per-host mapping. + +### Overview + +The multi-threshold feature allows you to: +- Define multiple sets of threshold configurations +- Map different hosts to different threshold sets +- Use different sensitivity levels for different environments +- Maintain a default configuration for unmapped hosts + +### Configuration Structure + +```yaml +# Optional: Set the default configuration name (defaults to "default") +default_threshold_config: "default" + +# Define multiple named threshold configurations +threshold_configs: + # Configuration name 1 + default: + thresholds: + # Standard threshold definitions + cpu_monitor: + cpu_percent: + warning: 80.0 + critical: 90.0 + + # Configuration name 2 + high_sensitivity: + thresholds: + cpu_monitor: + cpu_percent: + warning: 60.0 + critical: 75.0 + + # Configuration name 3 + low_sensitivity: + thresholds: + cpu_monitor: + cpu_percent: + warning: 90.0 + critical: 95.0 + +# Map specific hosts to specific configurations +host_threshold_mapping: + prod-web-01: high_sensitivity + prod-web-02: high_sensitivity + dev-server-01: low_sensitivity + # Unmapped hosts use default_threshold_config +``` + +### Use Cases + +#### 1. Environment-Based Thresholds + +Different thresholds for production vs. development: + +```yaml +threshold_configs: + production: + thresholds: + cpu_monitor: + cpu_percent: + warning: 70.0 # Alert earlier in production + critical: 85.0 + + development: + thresholds: + cpu_monitor: + cpu_percent: + warning: 90.0 # More relaxed for dev + critical: 98.0 + +host_threshold_mapping: + prod-web-01: production + prod-web-02: production + dev-web-01: development + dev-web-02: development +``` + +#### 2. Server Role-Based Thresholds + +Different thresholds based on server function: + +```yaml +threshold_configs: + webserver: + thresholds: + cpu_monitor: + cpu_percent: + warning: 80.0 + critical: 90.0 + + database: + thresholds: + cpu_monitor: + cpu_percent: + warning: 70.0 + critical: 85.0 + memory_monitor: + percent: + warning: 90.0 # Databases can use high memory + critical: 97.0 + disk_monitor: + partitions: + /var/lib/mysql: + percent: + warning: 75.0 + critical: 85.0 + + cache: + thresholds: + memory_monitor: + percent: + warning: 95.0 # Redis/Memcached can use very high memory + critical: 99.0 + +host_threshold_mapping: + web-01: webserver + web-02: webserver + db-01: database + db-02: database + redis-01: cache + memcached-01: cache +``` + +#### 3. Sensitivity Levels + +Different sensitivity for critical vs. non-critical systems: + +```yaml +threshold_configs: + critical: + thresholds: + disk_monitor: + partitions: + /: + percent: + warning: 70.0 # Very sensitive + critical: 80.0 + hysteresis: 0.15 + + standard: + thresholds: + disk_monitor: + partitions: + /: + percent: + warning: 85.0 + critical: 95.0 + hysteresis: 0.1 + + relaxed: + thresholds: + disk_monitor: + partitions: + /: + percent: + warning: 90.0 + critical: 98.0 + hysteresis: 0.05 + +host_threshold_mapping: + payment-gateway: critical + auth-server: critical + web-01: standard + web-02: standard + test-server: relaxed +``` + +### Backward Compatibility + +The legacy single threshold configuration is fully supported: + +```yaml +# Old format - still works +thresholds: + cpu_monitor: + cpu_percent: + warning: 80.0 + critical: 90.0 +``` + +This is equivalent to: + +```yaml +# New format +threshold_configs: + default: + thresholds: + cpu_monitor: + cpu_percent: + warning: 80.0 + critical: 90.0 +``` + +### Configuration Priority + +1. **Host-specific mapping**: If host is in `host_threshold_mapping`, use that config +2. **Default config**: Use `default_threshold_config` +3. **First alphabetically**: If default not found, use first config alphabetically +4. **Legacy fallback**: If `threshold_configs` not present, use `thresholds` + +### Example: Complete Multi-Threshold Setup + +See `hbd/config_multi_threshold_example.yaml` for a complete example with: +- 4 named configurations (default, high_sensitivity, low_sensitivity, database) +- Host-to-config mappings for production, development, and test systems +- Specialized database server thresholds +- Custom display messages with plugin data + diff --git a/hbd/config_multi_threshold_example.yaml b/hbd/config_multi_threshold_example.yaml new file mode 100644 index 0000000..c14839c --- /dev/null +++ b/hbd/config_multi_threshold_example.yaml @@ -0,0 +1,202 @@ +# ============================================================================== +# Heartbeat Daemon Multi-Threshold Configuration Example +# ============================================================================== +# This file demonstrates the new multi-threshold configuration feature that allows +# different threshold settings for different hosts/clients. +# +# Features: +# - Define multiple named threshold configurations +# - Map specific hosts to specific threshold configurations +# - Set a default configuration for unmapped hosts +# - Backward compatible with single threshold configuration +# ============================================================================== + +# Global threshold settings +threshold_renotify_interval: 3600 # Re-notify every hour for ongoing alerts (seconds) + +# Optional: Set default threshold config (defaults to "default" if not specified) +default_threshold_config: "default" + +# ---------------------------------------------------------------------------- +# Multiple Named Threshold Configurations +# ---------------------------------------------------------------------------- +# Define multiple threshold configurations with different sensitivity levels +threshold_configs: + + # Default configuration - moderate thresholds for most servers + default: + thresholds: + cpu_monitor: + cpu_percent: + warning: 80.0 + critical: 90.0 + operator: ">" + load_1min: + warning: 4.0 + critical: 8.0 + operator: ">" + + memory_monitor: + percent: + warning: 85.0 + critical: 95.0 + operator: ">" + + disk_monitor: + partitions: + /: + percent: + warning: 85.0 + critical: 95.0 + operator: ">" + + rtt: + # RTT thresholds per remote host + router: + warning: 50.0 # ms + critical: 200.0 + server1: + warning: 100.0 + critical: 500.0 + + # High sensitivity configuration - lower thresholds for critical systems + high_sensitivity: + thresholds: + cpu_monitor: + cpu_percent: + warning: 60.0 # Alert earlier + critical: 75.0 + operator: ">" + hysteresis: 0.15 # More hysteresis to reduce flapping + load_1min: + warning: 2.0 + critical: 4.0 + operator: ">" + + memory_monitor: + percent: + warning: 75.0 # Alert at lower memory usage + critical: 85.0 + operator: ">" + display: "(threshold: {op_symbol} {threshold_value}%, total: {total_gb} GB)" + + disk_monitor: + partitions: + /: + percent: + warning: 75.0 + critical: 85.0 + operator: ">" + /var: + percent: + warning: 80.0 + critical: 90.0 + operator: ">" + + rtt: + router: + warning: 30.0 + critical: 100.0 + server1: + warning: 50.0 + critical: 200.0 + + # Low sensitivity configuration - higher thresholds for development/test systems + low_sensitivity: + thresholds: + cpu_monitor: + cpu_percent: + warning: 90.0 # Only alert at very high usage + critical: 95.0 + operator: ">" + + memory_monitor: + percent: + warning: 90.0 + critical: 98.0 + operator: ">" + + disk_monitor: + partitions: + /: + percent: + warning: 90.0 + critical: 95.0 + operator: ">" + + rtt: + router: + warning: 100.0 + critical: 500.0 + + # Production database servers - specialized thresholds + database: + thresholds: + cpu_monitor: + cpu_percent: + warning: 70.0 + critical: 85.0 + operator: ">" + + memory_monitor: + percent: + warning: 90.0 # Databases can use high memory + critical: 97.0 + operator: ">" + display: "(threshold: {op_symbol} {threshold_value}%, total: {total_gb} GB, available: {available_gb} GB)" + + disk_monitor: + partitions: + /: + percent: + warning: 80.0 + critical: 90.0 + operator: ">" + /var/lib/mysql: # Database data partition + percent: + warning: 75.0 # Alert earlier for DB partition + critical: 85.0 + operator: ">" + + rtt: + router: + warning: 20.0 # Stricter latency requirements + critical: 50.0 + +# ---------------------------------------------------------------------------- +# Host to Threshold Configuration Mapping +# ---------------------------------------------------------------------------- +# Map specific hosts to specific threshold configurations +# Hosts not listed here will use the default_threshold_config +host_threshold_mapping: + # Critical production servers + prod-web-01: high_sensitivity + prod-web-02: high_sensitivity + prod-api-01: high_sensitivity + + # Database servers + prod-db-01: database + prod-db-02: database + prod-db-replica: database + + # Development and test systems + dev-server-01: low_sensitivity + dev-server-02: low_sensitivity + test-server-01: low_sensitivity + test-server-02: low_sensitivity + + # Everything else uses 'default' (no need to list explicitly) + +# ---------------------------------------------------------------------------- +# Backward Compatibility Example +# ---------------------------------------------------------------------------- +# The old single threshold format is still supported: +# Just use 'thresholds:' directly without 'threshold_configs:' +# +# thresholds: +# cpu_monitor: +# cpu_percent: +# warning: 80.0 +# critical: 90.0 +# +# This will apply the same thresholds to all hosts. diff --git a/hbd/server/templates/alerts.html b/hbd/server/templates/alerts.html index 34a6744..1209480 100644 --- a/hbd/server/templates/alerts.html +++ b/hbd/server/templates/alerts.html @@ -397,9 +397,11 @@ const level = alert.level.toLowerCase(); const duration = getDuration(alert.since); - // Format value with threshold info if available + // Use formatted message if available, otherwise build from individual fields let valueText = `Value: ${formatValue(alert.last_value)}`; - if (alert.threshold_value !== undefined && alert.threshold_value !== null && alert.operator) { + if (alert.formatted_message) { + valueText += ` ${alert.formatted_message}`; + } else if (alert.threshold_value !== undefined && alert.threshold_value !== null && alert.operator) { valueText += ` (threshold: ${alert.operator} ${formatValue(alert.threshold_value)})`; } diff --git a/hbd/server/threshold.py b/hbd/server/threshold.py index f89881a..2c88b72 100644 --- a/hbd/server/threshold.py +++ b/hbd/server/threshold.py @@ -55,6 +55,7 @@ class AlertState: self.last_notification = None self.threshold_value = None # The threshold value that triggered alert self.operator = None # The comparison operator (>, <, >=, etc.) + self.formatted_message = None # Formatted display message for UI def update( self, @@ -120,6 +121,8 @@ class AlertState: result["threshold_value"] = self.threshold_value if self.operator is not None: result["operator"] = self.operator + if self.formatted_message is not None: + result["formatted_message"] = self.formatted_message return result @@ -285,7 +288,18 @@ class ThresholdChecker: renotify_interval: Seconds between repeat notifications (default: 1 hour) journal: Optional MessageJournal instance for logging threshold events """ - self.thresholds = {} # {metric_path: ThresholdConfig} + # Named threshold configurations: {config_name: {metric_path: ThresholdConfig}} + self.threshold_configs = {} + + # Single threshold set for backward compatibility: {metric_path: ThresholdConfig} + self.thresholds = {} + + # Host to config name mapping: {host_name: config_name} + self.host_config_mapping = {} + + # Default config name to use when no mapping exists + self.default_config = "default" + self.notification_callback = notification_callback self.renotify_interval = renotify_interval self.journal = journal @@ -293,10 +307,84 @@ class ThresholdChecker: # Parse configuration self._parse_config(config) - logger.info("ThresholdChecker initialized with %d thresholds", len(self.thresholds)) + total_thresholds = sum(len(cfg) for cfg in self.threshold_configs.values()) + if total_thresholds == 0 and len(self.thresholds) > 0: + # Backward compatibility: using single threshold set + total_thresholds = len(self.thresholds) + logger.info("ThresholdChecker initialized with %d thresholds (legacy format)", total_thresholds) + else: + logger.info( + "ThresholdChecker initialized with %d named configurations (%d total thresholds)", + len(self.threshold_configs), + total_thresholds + ) def _parse_config(self, config: Dict[str, Any]): - """Parse threshold configuration from YAML structure.""" + """Parse threshold configuration from YAML structure. + + Supports two formats: + 1. Legacy format with direct 'thresholds' section + 2. New format with 'threshold_configs' and 'host_threshold_mapping' + """ + # Check for new multi-config format + if "threshold_configs" in config: + self._parse_multi_config(config) + elif "thresholds" in config: + # Legacy single threshold configuration + self._parse_legacy_config(config) + else: + logger.info("No thresholds configured") + + def _parse_multi_config(self, config: Dict[str, Any]): + """Parse multiple named threshold configurations.""" + threshold_configs = config.get("threshold_configs", {}) + + if not threshold_configs: + logger.info("No threshold configurations defined") + return + + # Parse each named configuration + for config_name, config_data in threshold_configs.items(): + if not isinstance(config_data, dict): + logger.warning("Invalid threshold config '%s', skipping", config_name) + continue + + if "thresholds" not in config_data: + logger.warning("No thresholds in config '%s', skipping", config_name) + continue + + logger.info("Parsing threshold configuration: %s", config_name) + self.threshold_configs[config_name] = {} + + thresholds_config = config_data["thresholds"] + for plugin_name, plugin_thresholds in thresholds_config.items(): + if not isinstance(plugin_thresholds, dict): + continue + + self._parse_plugin_thresholds( + plugin_name, + plugin_thresholds, + target_dict=self.threshold_configs[config_name] + ) + + # Parse host to config mapping + self.host_config_mapping = config.get("host_threshold_mapping", {}) + + # Set default config (first one alphabetically or explicitly set) + self.default_config = config.get("default_threshold_config", "default") + if self.default_config not in self.threshold_configs and self.threshold_configs: + # Use first available config as default + self.default_config = sorted(self.threshold_configs.keys())[0] + logger.info("Using '%s' as default threshold config", self.default_config) + + logger.info( + "Loaded %d threshold configurations with %d host mappings", + len(self.threshold_configs), + len(self.host_config_mapping) + ) + + def _parse_legacy_config(self, config: Dict[str, Any]): + """Parse legacy single threshold configuration for backward compatibility.""" if not config or "thresholds" not in config: logger.info("No thresholds configured") return @@ -307,13 +395,27 @@ class ThresholdChecker: if not isinstance(plugin_thresholds, dict): continue - self._parse_plugin_thresholds(plugin_name, plugin_thresholds) + self._parse_plugin_thresholds(plugin_name, plugin_thresholds, target_dict=self.thresholds) - def _parse_plugin_thresholds(self, plugin_name: str, thresholds: Dict[str, Any]): - """Parse thresholds for a specific plugin.""" + def _parse_plugin_thresholds( + self, + plugin_name: str, + thresholds: Dict[str, Any], + target_dict: Optional[Dict[str, ThresholdConfig]] = None + ): + """Parse thresholds for a specific plugin. + + Args: + plugin_name: Name of the plugin + thresholds: Threshold configuration dictionary + target_dict: Dictionary to store parsed thresholds (defaults to self.thresholds) + """ + if target_dict is None: + target_dict = self.thresholds + # Special handling for RTT thresholds (per-host) if plugin_name == "rtt": - self._parse_rtt_thresholds(thresholds) + self._parse_rtt_thresholds(thresholds, target_dict) return for metric_name, threshold_config in thresholds.items(): @@ -322,7 +424,7 @@ class ThresholdChecker: # Handle nested metrics (e.g., partitions./.percent) if metric_name == "partitions": - self._parse_partition_thresholds(plugin_name, threshold_config) + self._parse_partition_thresholds(plugin_name, threshold_config, target_dict) continue metric_path = f"{plugin_name}.{metric_name}" @@ -331,7 +433,7 @@ class ThresholdChecker: warning = threshold_config.get("warning") critical = threshold_config.get("critical") operator = threshold_config.get("operator", ">") - display = threshold_config.get("display") + display = threshold_config.get("display", "(threshold: {op_symbol} {threshold_value})") hysteresis = threshold_config.get("hysteresis", 0.1) # 10% default enabled = threshold_config.get("enabled", True) @@ -349,7 +451,7 @@ class ThresholdChecker: display=display ) - self.thresholds[metric_path] = threshold + target_dict[metric_path] = threshold logger.debug( "Registered threshold for %s: warn=%s, crit=%s, op=%s", metric_path, @@ -358,8 +460,22 @@ class ThresholdChecker: operator ) - def _parse_partition_thresholds(self, plugin_name: str, partitions: Dict[str, Any]): - """Parse partition-specific thresholds for disk monitoring.""" + def _parse_partition_thresholds( + self, + plugin_name: str, + partitions: Dict[str, Any], + target_dict: Optional[Dict[str, ThresholdConfig]] = None + ): + """Parse partition-specific thresholds for disk monitoring. + + Args: + plugin_name: Name of the plugin + partitions: Partition threshold configuration + target_dict: Dictionary to store parsed thresholds + """ + if target_dict is None: + target_dict = self.thresholds + for partition, metrics in partitions.items(): if not isinstance(metrics, dict): continue @@ -390,9 +506,13 @@ class ThresholdChecker: display=display ) - self.thresholds[metric_path] = threshold + target_dict[metric_path] = threshold - def _parse_rtt_thresholds(self, rtt_thresholds: Dict[str, Any]): + def _parse_rtt_thresholds( + self, + rtt_thresholds: Dict[str, Any], + target_dict: Optional[Dict[str, ThresholdConfig]] = None + ): """Parse RTT thresholds (per-host network latency thresholds). RTT thresholds are configured as: @@ -401,7 +521,14 @@ class ThresholdChecker: hostname1: warning: 100.0 # ms critical: 500.0 # ms + + Args: + rtt_thresholds: RTT threshold configuration + target_dict: Dictionary to store parsed thresholds """ + if target_dict is None: + target_dict = self.thresholds + for hostname, threshold_config in rtt_thresholds.items(): if not isinstance(threshold_config, dict): continue @@ -430,7 +557,7 @@ class ThresholdChecker: display=display ) - self.thresholds[metric_path] = threshold + target_dict[metric_path] = threshold logger.debug( "Registered RTT threshold for %s: warn=%s ms, crit=%s ms", hostname, @@ -438,6 +565,37 @@ class ThresholdChecker: critical ) + def get_thresholds_for_host(self, host_name: str) -> Dict[str, ThresholdConfig]: + """Get the appropriate threshold configuration for a host. + + Args: + host_name: Name of the host + + Returns: + Dictionary of thresholds for this host + """ + # Legacy mode: single threshold set for all hosts + if self.thresholds and not self.threshold_configs: + return self.thresholds + + # Multi-config mode: look up host-specific configuration + if self.threshold_configs: + config_name = self.host_config_mapping.get(host_name, self.default_config) + + if config_name in self.threshold_configs: + return self.threshold_configs[config_name] + else: + logger.warning( + "Threshold config '%s' not found for host '%s', using default '%s'", + config_name, + host_name, + self.default_config + ) + return self.threshold_configs.get(self.default_config, {}) + + # No thresholds configured + return {} + def check_value( self, host_name: str, @@ -457,10 +615,13 @@ class ThresholdChecker: Returns: Tuple of (old_level, new_level) if state changed, None otherwise """ - if metric_path not in self.thresholds: + # Get host-specific thresholds + thresholds = self.get_thresholds_for_host(host_name) + + if metric_path not in thresholds: return None - threshold = self.thresholds[metric_path] + threshold = thresholds[metric_path] # Get or create alert state if metric_path not in alert_states: @@ -484,14 +645,17 @@ class ThresholdChecker: # Update state and check for changes old_level = alert_state.level if alert_state.update(new_level, value, threshold_value, threshold.operator.value): - self._trigger_notification(host_name, metric_path, old_level, new_level, value, threshold) + # For check_value, we don't have full plugin data, pass None + lvl, message, formatted_msg = self._trigger_notification(host_name, metric_path, old_level, new_level, value, threshold, None) + # Update alert state with formatted message + alert_state.formatted_message = formatted_msg + self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value) return (old_level, new_level) elif new_level != AlertLevel.OK: # Check if we should re-notify - self._check_renotify(host_name, alert_state, metric_path, value, threshold) + self._check_renotify(host_name, alert_state, metric_path, value, threshold, None) return None - def check_plugin_data( self, host_name: str, @@ -513,14 +677,17 @@ class ThresholdChecker: """ state_changes = [] + # Get host-specific thresholds + thresholds = self.get_thresholds_for_host(host_name) + # Check flat metrics for metric_name, value in data.items(): metric_path = f"{plugin_name}.{metric_name}" - if metric_path not in self.thresholds: + if metric_path not in thresholds: continue - threshold = self.thresholds[metric_path] + threshold = thresholds[metric_path] # Get or create alert state if metric_path not in alert_states: @@ -545,10 +712,13 @@ class ThresholdChecker: old_level = alert_state.level if alert_state.update(new_level, value, threshold_value, threshold.operator.value): state_changes.append((metric_path, old_level, new_level, value)) - self._trigger_notification(host_name, metric_path, old_level, new_level, value, threshold) + lvl, message, formatted_msg = self._trigger_notification(host_name, metric_path, old_level, new_level, value, threshold, data) + # Update alert state with formatted message + alert_state.formatted_message = formatted_msg + self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value) elif new_level != AlertLevel.OK: # Check if we should re-notify - self._check_renotify(host_name, alert_state, metric_path, value, threshold) + self._check_renotify(host_name, alert_state, metric_path, value, threshold, data) # Check nested metrics (e.g., partition data in disk_monitor) self._check_nested_metrics( @@ -570,6 +740,9 @@ class ThresholdChecker: state_changes: list, ): """Check nested metrics like partition-specific thresholds.""" + # Get host-specific thresholds + thresholds = self.get_thresholds_for_host(host_name) + # Look for partition data in disk_monitor if plugin_name == "disk_monitor" and "partitions" in data: partitions = data["partitions"] @@ -583,10 +756,10 @@ class ThresholdChecker: for metric_name, value in metrics.items(): metric_path = f"{plugin_name}.{partition}.{metric_name}" - if metric_path not in self.thresholds: + if metric_path not in thresholds: continue - threshold = self.thresholds[metric_path] + threshold = thresholds[metric_path] if metric_path not in alert_states: alert_states[metric_path] = AlertState(metric_path) @@ -608,16 +781,20 @@ class ThresholdChecker: old_level = alert_state.level if alert_state.update(new_level, value, threshold_value, threshold.operator.value): state_changes.append((metric_path, old_level, new_level, value)) - self._trigger_notification( + lvl, message, formatted_msg = self._trigger_notification( host_name, metric_path, old_level, new_level, value, - threshold + threshold, + data # Pass full plugin data for format string ) + # Update alert state with formatted message + alert_state.formatted_message = formatted_msg + self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value) elif new_level != AlertLevel.OK: - self._check_renotify(host_name, alert_state, metric_path, value, threshold) + self._check_renotify(host_name, alert_state, metric_path, value, threshold, data) def _trigger_notification( self, @@ -627,8 +804,19 @@ class ThresholdChecker: new_level: AlertLevel, value: Any, threshold: ThresholdConfig, + plugin_data: Optional[Dict[str, Any]] = None, ): - """Trigger a notification for an alert state change.""" + """Trigger a notification for an alert state change. + + Args: + host_name: Name of the host + metric_path: Full metric path + old_level: Previous alert level + new_level: New alert level + value: Current metric value + threshold: Threshold configuration + plugin_data: Optional dictionary of all plugin data fields for format string + """ # Determine which threshold was exceeded threshold_value = None if new_level == AlertLevel.CRITICAL and threshold.critical is not None: @@ -646,20 +834,59 @@ class ThresholdChecker: elif new_level == AlertLevel.WARNING: lvl = "WARNING" if threshold_value is not None: - message = f"{metric_path} = {value} (threshold: {op_symbol} {threshold_value})" + # Use display format string + threshold_info = self._format_display( + threshold.display, + value=value, + threshold_value=threshold_value, + op_symbol=op_symbol, + plugin_data=plugin_data + ) + message = f"{metric_path} = {value} {threshold_info}" else: message = f"{metric_path} = {value}" elif new_level == AlertLevel.CRITICAL: lvl = "CRITICAL" if threshold_value is not None: - message = f"{metric_path} = {value} (threshold: {op_symbol} {threshold_value})" + # Use display format string + threshold_info = self._format_display( + threshold.display, + value=value, + threshold_value=threshold_value, + op_symbol=op_symbol, + plugin_data=plugin_data + ) + message = f"{metric_path} = {value} {threshold_info}" else: message = f"{metric_path} = {value}" else: lvl = "UNKNOWN" message = f"{metric_path} = {value}" - # Send notification + # Return the formatted threshold info for storing in AlertState + formatted_threshold_msg = None + if threshold_value is not None and new_level != AlertLevel.OK: + formatted_threshold_msg = self._format_display( + threshold.display, + value=value, + threshold_value=threshold_value, + op_symbol=op_symbol, + plugin_data=plugin_data + ) + + return lvl, message, formatted_threshold_msg + + def _send_notification( + self, + host_name: str, + lvl: str, + message: str, + metric_path: str, + old_level: AlertLevel, + new_level: AlertLevel, + value: Any, + ): + """Send notification and log to journal/eventlog.""" if self.notification_callback is not None: try: self.notification_callback(f"{lvl}: {host_name} - {message}") @@ -684,6 +911,56 @@ class ThresholdChecker: # Log to eventlog as well eventlog(host_name, lvl, message, service="threshold") + def _format_display( + self, + display_format: str, + value: Any, + threshold_value: float, + op_symbol: str, + plugin_data: Optional[Dict[str, Any]] = None, + ) -> str: + """Format the display string using available data. + + Args: + display_format: Format string from threshold config + value: Current metric value + threshold_value: Threshold value that was exceeded + op_symbol: Comparison operator symbol + plugin_data: Optional dictionary of plugin data fields + + Returns: + Formatted display string + """ + # Build format context with standard variables + format_context = { + 'value': value, + 'threshold_value': threshold_value, + 'op_symbol': op_symbol, + } + + # Add all plugin data fields if available + if plugin_data: + format_context.update(plugin_data) + + try: + # Format the display string + return display_format.format(**format_context) + except KeyError as e: + logger.warning( + "Missing format variable in display string '%s': %s", + display_format, + e + ) + # Fallback to default format + return f"(threshold: {op_symbol} {threshold_value})" + except Exception as e: + logger.error( + "Error formatting display string '%s': %s", + display_format, + e + ) + return f"(threshold: {op_symbol} {threshold_value})" + def _check_renotify( self, host_name: str, @@ -691,8 +968,18 @@ class ThresholdChecker: metric_path: str, value: Any, threshold: ThresholdConfig, + plugin_data: Optional[Dict[str, Any]] = None, ): - """Check if we should send a repeat notification.""" + """Check if we should send a repeat notification. + + Args: + host_name: Name of the host + alert_state: Current alert state + metric_path: Full metric path + value: Current metric value + threshold: Threshold configuration + plugin_data: Optional dictionary of all plugin data fields + """ if alert_state.level == AlertLevel.OK: return @@ -718,7 +1005,15 @@ class ThresholdChecker: # Time to re-notify if threshold_value is not None: - message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (threshold: {op_symbol} {threshold_value}, ongoing for {int(now - alert_state.since)}s)" + # Use display format string + threshold_info = self._format_display( + threshold.display, + value=value, + threshold_value=threshold_value, + op_symbol=op_symbol, + plugin_data=plugin_data + ) + message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} {threshold_info}, ongoing for {int(now - alert_state.since)}s" else: message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (ongoing for {int(now - alert_state.since)}s)" diff --git a/nagios_bad.sh b/nagios_bad.sh new file mode 100755 index 0000000..2dc1341 --- /dev/null +++ b/nagios_bad.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +#echo "OK - all is well" +echo "WARNING - 12 apps require update: calendar->6.2.2 ,call_summary_bot->3.3.0 ,collectives->4.2.0 ,contacts->8.3.7 ,forum->0.36.0 ,mail->5.7.6 ,news->28.1.0 ,notes->4.13.1 ,notify_push->1.3.1 ,ownershiptransfer->1.4.0 ,richdocuments->9.0.5 ,spreed->22.0.10"