per-client threshold config

2026-04-01 15:22:42 -04:00
parent 079e84f729
commit 090d341244
7 changed files with 873 additions and 77 deletions
@@ -50,43 +50,119 @@ journal_max_size: 104857600             # Max size (100MB default)
 journal_max_backups: 10                 # Number of backups to keep

 thresholds:
-  cpu_monitor:
-    cpu_percent:
-      warning: 80.0
-      critical: 90.0
-  memory_monitor:
-    percent:
-      warning: 3.0
-      critical: 95.0
-  disk_monitor:
-    partitions:
-      /:
-        percent:
-          warning: 85.0
-          critical: 90.0
-  nagios_runner:
-    overall_status_code:
-      warning: 1
-      critical: 2
-      operator: ">="
-    load_status:
-      warning: WARNING
-      critical: CRITICAL
-      operator: "=="
-    UPS_load:
-      warning: 70
-      critical: 80
-      operator: ">="
-    UPS_status_code:
-      warning: 1
-      critical: 2
-      operator: ">="
-    nextcloud_apps_status:
-      display: "{nextcloud_apps_output}"
-      warning: 1
-      critical: 2
-      operator: ">="
-  rtt:
-    y:
-      warning: 30
-      critical: 250.0
+  default:
+    cpu_monitor:
+      cpu_percent:
+        warning: 80.0
+        critical: 90.0
+    memory_monitor:
+      percent:
+        warning: 3.0
+        critical: 95.0
+    disk_monitor:
+      partitions:
+        /:
+          percent:
+            warning: 85.0
+            critical: 90.0
+    rtt:
+      y:
+        warning: 30
+        critical: 250.0
+
+
+  freebsd_server:
+    cpu_monitor:
+      cpu_percent:
+        warning: 80.0
+        critical: 90.0
+    memory_monitor:
+      percent:
+        warning: 3.0
+        critical: 95.0
+    disk_monitor:
+      partitions:
+        /:
+          percent:
+            warning: 85.0
+            critical: 90.0
+    nagios_runner:
+ #     overall_status_code:
+ #       warning: 1
+ #       critical: 2
+ #       operator: ">="
+      load_status:
+        warning: WARNING
+        critical: CRITICAL
+        operator: "=="
+      UPS_load:
+        display: "{ups_output}"
+        warning: 70
+        critical: 80
+        operator: ">="
+      UPS_status_code:
+        display: "{ups_output}"
+        warning: 1
+        critical: 2
+        operator: ">="
+      nextcloud_apps_status_code:
+        display: "{nextcloud_apps_output}"
+        warning: 1
+        critical: 2
+        operator: ">="
+    rtt:
+      y:
+        warning: 30
+        critical: 250.0
+
+  truenas_server:
+    cpu_monitor:
+      cpu_percent:
+        warning: 80.0
+        critical: 90.0
+    memory_monitor:
+      percent:
+        warning: 3.0
+        critical: 95.0
+    disk_monitor:
+      partitions:
+        /:
+          percent:
+            warning: 85.0
+            critical: 90.0
+    nagios_runner:
+ #     overall_status_code:
+ #       warning: 1
+ #       critical: 2
+ #       operator: ">="
+      load_status:
+        warning: WARNING
+        critical: CRITICAL
+        operator: "=="
+      UPS_load:
+        display: "{ups_output}"
+        warning: 70
+        critical: 80
+        operator: ">="
+      UPS_status_code:
+        display: "{ups_output}"
+        warning: 1
+        critical: 2
+        operator: ">="
+      nextcloud_apps_status_code:
+        display: "{nextcloud_apps_output}"
+        warning: 1
+        critical: 2
+        operator: ">="
+    rtt:
+      y:
+        warning: 30
+        critical: 250.0
+
+
+host_threshold_mapping:
+  # Critical production servers
+
+  wally: freebsd_server
+  eris: truenas_server
+
@@ -56,6 +56,7 @@ thresholds:
      critical: 90.0
      operator: ">"
      hysteresis: 0.1
+      display: "(threshold: {op_symbol} {threshold_value})"
      enabled: true
 ```

@@ -82,6 +83,8 @@ Note: At least one of `warning` or `critical` must be specified.
  - Range: 0.0 to 1.0
  - Prevents rapid state transitions when value hovers near threshold

+- **display**: Format string (with `{placeholder}` fields) used to render the threshold details in alert messages
+  - Default: `"(threshold: {op_symbol} {threshold_value})"`
 - **enabled**: Whether this threshold is active (default: `true`)

 ### Comparison Operators
@@ -740,3 +743,217 @@ Planned features:
 - [Message Journal Documentation](MESSAGE_JOURNAL.md)
 - Configuration examples: `hbd/config_thresholds_example.yaml`
 - Test suite: `test_threshold.py`
+
+## Multi-Threshold Configuration
+
+**New in version 2.0**: Support for multiple named threshold configurations with per-host mapping.
+
+### Overview
+
+The multi-threshold feature allows you to:
+- Define multiple sets of threshold configurations
+- Map different hosts to different threshold sets
+- Use different sensitivity levels for different environments
+- Maintain a default configuration for unmapped hosts
+
+### Configuration Structure
+
+```yaml
+# Optional: Set the default configuration name (defaults to "default")
+default_threshold_config: "default"
+
+# Define multiple named threshold configurations
+threshold_configs:
+  # Configuration name 1
+  default:
+    thresholds:
+      # Standard threshold definitions
+      cpu_monitor:
+        cpu_percent:
+          warning: 80.0
+          critical: 90.0
+  
+  # Configuration name 2
+  high_sensitivity:
+    thresholds:
+      cpu_monitor:
+        cpu_percent:
+          warning: 60.0
+          critical: 75.0
+  
+  # Configuration name 3
+  low_sensitivity:
+    thresholds:
+      cpu_monitor:
+        cpu_percent:
+          warning: 90.0
+          critical: 95.0
+
+# Map specific hosts to specific configurations
+host_threshold_mapping:
+  prod-web-01: high_sensitivity
+  prod-web-02: high_sensitivity
+  dev-server-01: low_sensitivity
+  # Unmapped hosts use default_threshold_config
+```
+
+### Use Cases
+
+#### 1. Environment-Based Thresholds
+
+Different thresholds for production vs. development:
+
+```yaml
+threshold_configs:
+  production:
+    thresholds:
+      cpu_monitor:
+        cpu_percent:
+          warning: 70.0   # Alert earlier in production
+          critical: 85.0
+  
+  development:
+    thresholds:
+      cpu_monitor:
+        cpu_percent:
+          warning: 90.0   # More relaxed for dev
+          critical: 98.0
+
+host_threshold_mapping:
+  prod-web-01: production
+  prod-web-02: production
+  dev-web-01: development
+  dev-web-02: development
+```
+
+#### 2. Server Role-Based Thresholds
+
+Different thresholds based on server function:
+
+```yaml
+threshold_configs:
+  webserver:
+    thresholds:
+      cpu_monitor:
+        cpu_percent:
+          warning: 80.0
+          critical: 90.0
+  
+  database:
+    thresholds:
+      cpu_monitor:
+        cpu_percent:
+          warning: 70.0
+          critical: 85.0
+      memory_monitor:
+        percent:
+          warning: 90.0   # Databases can use high memory
+          critical: 97.0
+      disk_monitor:
+        partitions:
+          /var/lib/mysql:
+            percent:
+              warning: 75.0
+              critical: 85.0
+  
+  cache:
+    thresholds:
+      memory_monitor:
+        percent:
+          warning: 95.0   # Redis/Memcached can use very high memory
+          critical: 99.0
+
+host_threshold_mapping:
+  web-01: webserver
+  web-02: webserver
+  db-01: database
+  db-02: database
+  redis-01: cache
+  memcached-01: cache
+```
+
+#### 3. Sensitivity Levels
+
+Different sensitivity for critical vs. non-critical systems:
+
+```yaml
+threshold_configs:
+  critical:
+    thresholds:
+      disk_monitor:
+        partitions:
+          /:
+            percent:
+              warning: 70.0    # Very sensitive
+              critical: 80.0
+              hysteresis: 0.15
+  
+  standard:
+    thresholds:
+      disk_monitor:
+        partitions:
+          /:
+            percent:
+              warning: 85.0
+              critical: 95.0
+              hysteresis: 0.1
+  
+  relaxed:
+    thresholds:
+      disk_monitor:
+        partitions:
+          /:
+            percent:
+              warning: 90.0
+              critical: 98.0
+              hysteresis: 0.05
+
+host_threshold_mapping:
+  payment-gateway: critical
+  auth-server: critical
+  web-01: standard
+  web-02: standard
+  test-server: relaxed
+```
+
+### Backward Compatibility
+
+The legacy single threshold configuration is fully supported:
+
+```yaml
+# Old format - still works
+thresholds:
+  cpu_monitor:
+    cpu_percent:
+      warning: 80.0
+      critical: 90.0
+```
+
+This is equivalent to:
+
+```yaml
+# New format
+threshold_configs:
+  default:
+    thresholds:
+      cpu_monitor:
+        cpu_percent:
+          warning: 80.0
+          critical: 90.0
+```
+
+### Configuration Priority
+
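+1. **Host-specific mapping**: If the host is listed in `host_threshold_mapping`, use the config it names (if that config does not exist, a warning is logged and the default config is used)
+2. **Default config**: Otherwise, use the config named by `default_threshold_config` (defaults to `"default"`)
+3. **First alphabetically**: If the default config is not defined, use the first config name in alphabetical order
+4. **Legacy fallback**: If `threshold_configs` is not present, use the top-level `thresholds` section for all hosts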
+
+### Example: Complete Multi-Threshold Setup
+
+See `hbd/config_multi_threshold_example.yaml` for a complete example with:
+- 4 named configurations (default, high_sensitivity, low_sensitivity, database)
+- Host-to-config mappings for production, development, and test systems
+- Specialized database server thresholds
+- Custom display messages with plugin data
+
@@ -0,0 +1,202 @@
+# ==============================================================================
+# Heartbeat Daemon Multi-Threshold Configuration Example
+# ==============================================================================
+# This file demonstrates the new multi-threshold configuration feature that allows
+# different threshold settings for different hosts/clients.
+#
+# Features:
+#   - Define multiple named threshold configurations
+#   - Map specific hosts to specific threshold configurations
+#   - Set a default configuration for unmapped hosts
+#   - Backward compatible with single threshold configuration
+# ==============================================================================
+
+# Global threshold settings
+threshold_renotify_interval: 3600  # Re-notify every hour for ongoing alerts (seconds)
+
+# Optional: Set default threshold config (defaults to "default" if not specified)
+default_threshold_config: "default"
+
+# ----------------------------------------------------------------------------
+# Multiple Named Threshold Configurations
+# ----------------------------------------------------------------------------
+# Define multiple threshold configurations with different sensitivity levels
+threshold_configs:
+  
+  # Default configuration - moderate thresholds for most servers
+  default:
+    thresholds:
+      cpu_monitor:
+        cpu_percent:
+          warning: 80.0
+          critical: 90.0
+          operator: ">"
+        load_1min:
+          warning: 4.0
+          critical: 8.0
+          operator: ">"
+      
+      memory_monitor:
+        percent:
+          warning: 85.0
+          critical: 95.0
+          operator: ">"
+      
+      disk_monitor:
+        partitions:
+          /:
+            percent:
+              warning: 85.0
+              critical: 95.0
+              operator: ">"
+      
+      rtt:
+        # RTT thresholds per remote host
+        router:
+          warning: 50.0   # ms
+          critical: 200.0
+        server1:
+          warning: 100.0
+          critical: 500.0
+  
+  # High sensitivity configuration - lower thresholds for critical systems
+  high_sensitivity:
+    thresholds:
+      cpu_monitor:
+        cpu_percent:
+          warning: 60.0      # Alert earlier
+          critical: 75.0
+          operator: ">"
+          hysteresis: 0.15   # More hysteresis to reduce flapping
+        load_1min:
+          warning: 2.0
+          critical: 4.0
+          operator: ">"
+      
+      memory_monitor:
+        percent:
+          warning: 75.0      # Alert at lower memory usage
+          critical: 85.0
+          operator: ">"
+          display: "(threshold: {op_symbol} {threshold_value}%, total: {total_gb} GB)"
+      
+      disk_monitor:
+        partitions:
+          /:
+            percent:
+              warning: 75.0
+              critical: 85.0
+              operator: ">"
+          /var:
+            percent:
+              warning: 80.0
+              critical: 90.0
+              operator: ">"
+      
+      rtt:
+        router:
+          warning: 30.0
+          critical: 100.0
+        server1:
+          warning: 50.0
+          critical: 200.0
+  
+  # Low sensitivity configuration - higher thresholds for development/test systems
+  low_sensitivity:
+    thresholds:
+      cpu_monitor:
+        cpu_percent:
+          warning: 90.0      # Only alert at very high usage
+          critical: 95.0
+          operator: ">"
+      
+      memory_monitor:
+        percent:
+          warning: 90.0
+          critical: 98.0
+          operator: ">"
+      
+      disk_monitor:
+        partitions:
+          /:
+            percent:
+              warning: 90.0
+              critical: 95.0
+              operator: ">"
+      
+      rtt:
+        router:
+          warning: 100.0
+          critical: 500.0
+  
+  # Production database servers - specialized thresholds
+  database:
+    thresholds:
+      cpu_monitor:
+        cpu_percent:
+          warning: 70.0
+          critical: 85.0
+          operator: ">"
+      
+      memory_monitor:
+        percent:
+          warning: 90.0      # Databases can use high memory
+          critical: 97.0
+          operator: ">"
+          display: "(threshold: {op_symbol} {threshold_value}%, total: {total_gb} GB, available: {available_gb} GB)"
+      
+      disk_monitor:
+        partitions:
+          /:
+            percent:
+              warning: 80.0
+              critical: 90.0
+              operator: ">"
+          /var/lib/mysql:    # Database data partition
+            percent:
+              warning: 75.0  # Alert earlier for DB partition
+              critical: 85.0
+              operator: ">"
+      
+      rtt:
+        router:
+          warning: 20.0     # Stricter latency requirements
+          critical: 50.0
+
+# ----------------------------------------------------------------------------
+# Host to Threshold Configuration Mapping
+# ----------------------------------------------------------------------------
+# Map specific hosts to specific threshold configurations
+# Hosts not listed here will use the default_threshold_config
+host_threshold_mapping:
+  # Critical production servers
+  prod-web-01: high_sensitivity
+  prod-web-02: high_sensitivity
+  prod-api-01: high_sensitivity
+  
+  # Database servers
+  prod-db-01: database
+  prod-db-02: database
+  prod-db-replica: database
+  
+  # Development and test systems
+  dev-server-01: low_sensitivity
+  dev-server-02: low_sensitivity
+  test-server-01: low_sensitivity
+  test-server-02: low_sensitivity
+  
+  # Everything else uses 'default' (no need to list explicitly)
+
+# ----------------------------------------------------------------------------
+# Backward Compatibility Example
+# ----------------------------------------------------------------------------
+# The old single threshold format is still supported:
+# Just use 'thresholds:' directly without 'threshold_configs:'
+#
+# thresholds:
+#   cpu_monitor:
+#     cpu_percent:
+#       warning: 80.0
+#       critical: 90.0
+#
+# This will apply the same thresholds to all hosts.
@@ -397,9 +397,11 @@
        const level = alert.level.toLowerCase();
        const duration = getDuration(alert.since);
        
-        // Format value with threshold info if available
+        // Use formatted message if available, otherwise build from individual fields
        let valueText = `Value: <span class="alert-value">${formatValue(alert.last_value)}</span>`;
-        if (alert.threshold_value !== undefined && alert.threshold_value !== null && alert.operator) {
+        if (alert.formatted_message) {
+          valueText += ` <span class="threshold-info">${alert.formatted_message}</span>`;
+        } else if (alert.threshold_value !== undefined && alert.threshold_value !== null && alert.operator) {
          valueText += ` <span class="threshold-info">(threshold: ${alert.operator} ${formatValue(alert.threshold_value)})</span>`;
        }
        
@@ -55,6 +55,7 @@ class AlertState:
        self.last_notification = None
        self.threshold_value = None  # The threshold value that triggered alert
        self.operator = None  # The comparison operator (>, <, >=, etc.)
+        self.formatted_message = None  # Formatted display message for UI
    
    def update(
        self, 
@@ -120,6 +121,8 @@ class AlertState:
            result["threshold_value"] = self.threshold_value
        if self.operator is not None:
            result["operator"] = self.operator
+        if self.formatted_message is not None:
+            result["formatted_message"] = self.formatted_message
        
        return result

@@ -285,7 +288,18 @@ class ThresholdChecker:
            renotify_interval: Seconds between repeat notifications (default: 1 hour)
            journal: Optional MessageJournal instance for logging threshold events
        """
-        self.thresholds = {}  # {metric_path: ThresholdConfig}
+        # Named threshold configurations: {config_name: {metric_path: ThresholdConfig}}
+        self.threshold_configs = {}
+        
+        # Single threshold set for backward compatibility: {metric_path: ThresholdConfig}
+        self.thresholds = {}
+        
+        # Host to config name mapping: {host_name: config_name}
+        self.host_config_mapping = {}
+        
+        # Default config name to use when no mapping exists
+        self.default_config = "default"
+        
        self.notification_callback = notification_callback
        self.renotify_interval = renotify_interval
        self.journal = journal
@@ -293,10 +307,84 @@ class ThresholdChecker:
        # Parse configuration
        self._parse_config(config)
        
-        logger.info("ThresholdChecker initialized with %d thresholds", len(self.thresholds))
+        total_thresholds = sum(len(cfg) for cfg in self.threshold_configs.values())
+        if total_thresholds == 0 and len(self.thresholds) > 0:
+            # Backward compatibility: using single threshold set
+            total_thresholds = len(self.thresholds)
+            logger.info("ThresholdChecker initialized with %d thresholds (legacy format)", total_thresholds)
+        else:
+            logger.info(
+                "ThresholdChecker initialized with %d named configurations (%d total thresholds)",
+                len(self.threshold_configs),
+                total_thresholds
+            )
    
    def _parse_config(self, config: Dict[str, Any]):
-        """Parse threshold configuration from YAML structure."""
+        """Parse threshold configuration from YAML structure.
+        
+        Supports two formats:
+        1. Legacy format with direct 'thresholds' section
+        2. New format with 'threshold_configs' and 'host_threshold_mapping'
+        """
+        # Check for new multi-config format
+        if "threshold_configs" in config:
+            self._parse_multi_config(config)
+        elif "thresholds" in config:
+            # Legacy single threshold configuration
+            self._parse_legacy_config(config)
+        else:
+            logger.info("No thresholds configured")
+    
+    def _parse_multi_config(self, config: Dict[str, Any]):
+        """Parse multiple named threshold configurations."""
+        threshold_configs = config.get("threshold_configs", {})
+        
+        if not threshold_configs:
+            logger.info("No threshold configurations defined")
+            return
+        
+        # Parse each named configuration
+        for config_name, config_data in threshold_configs.items():
+            if not isinstance(config_data, dict):
+                logger.warning("Invalid threshold config '%s', skipping", config_name)
+                continue
+            
+            if "thresholds" not in config_data:
+                logger.warning("No thresholds in config '%s', skipping", config_name)
+                continue
+            
+            logger.info("Parsing threshold configuration: %s", config_name)
+            self.threshold_configs[config_name] = {}
+            
+            thresholds_config = config_data["thresholds"]
+            for plugin_name, plugin_thresholds in thresholds_config.items():
+                if not isinstance(plugin_thresholds, dict):
+                    continue
+                
+                self._parse_plugin_thresholds(
+                    plugin_name,
+                    plugin_thresholds,
+                    target_dict=self.threshold_configs[config_name]
+                )
+        
+        # Parse host to config mapping
+        self.host_config_mapping = config.get("host_threshold_mapping", {})
+        
+        # Set default config (first one alphabetically or explicitly set)
+        self.default_config = config.get("default_threshold_config", "default")
+        if self.default_config not in self.threshold_configs and self.threshold_configs:
+            # Use first available config as default
+            self.default_config = sorted(self.threshold_configs.keys())[0]
+            logger.info("Using '%s' as default threshold config", self.default_config)
+        
+        logger.info(
+            "Loaded %d threshold configurations with %d host mappings",
+            len(self.threshold_configs),
+            len(self.host_config_mapping)
+        )
+    
+    def _parse_legacy_config(self, config: Dict[str, Any]):
+        """Parse legacy single threshold configuration for backward compatibility."""
        if not config or "thresholds" not in config:
            logger.info("No thresholds configured")
            return
@@ -307,13 +395,27 @@ class ThresholdChecker:
            if not isinstance(plugin_thresholds, dict):
                continue
            
-            self._parse_plugin_thresholds(plugin_name, plugin_thresholds)
+            self._parse_plugin_thresholds(plugin_name, plugin_thresholds, target_dict=self.thresholds)
+    
+    def _parse_plugin_thresholds(
+        self,
+        plugin_name: str,
+        thresholds: Dict[str, Any],
+        target_dict: Optional[Dict[str, ThresholdConfig]] = None
+    ):
+        """Parse thresholds for a specific plugin.
+        
+        Args:
+            plugin_name: Name of the plugin
+            thresholds: Threshold configuration dictionary
+            target_dict: Dictionary to store parsed thresholds (defaults to self.thresholds)
+        """
+        if target_dict is None:
+            target_dict = self.thresholds
        
-    def _parse_plugin_thresholds(self, plugin_name: str, thresholds: Dict[str, Any]):
-        """Parse thresholds for a specific plugin."""
        # Special handling for RTT thresholds (per-host)
        if plugin_name == "rtt":
-            self._parse_rtt_thresholds(thresholds)
+            self._parse_rtt_thresholds(thresholds, target_dict)
            return
        
        for metric_name, threshold_config in thresholds.items():
@@ -322,7 +424,7 @@ class ThresholdChecker:
            
            # Handle nested metrics (e.g., partitions./.percent)
            if metric_name == "partitions":
-                self._parse_partition_thresholds(plugin_name, threshold_config)
+                self._parse_partition_thresholds(plugin_name, threshold_config, target_dict)
                continue
            
            metric_path = f"{plugin_name}.{metric_name}"
@@ -331,7 +433,7 @@ class ThresholdChecker:
            warning = threshold_config.get("warning")
            critical = threshold_config.get("critical")
            operator = threshold_config.get("operator", ">")
-            display = threshold_config.get("display")
+            display = threshold_config.get("display", "(threshold: {op_symbol} {threshold_value})")
            hysteresis = threshold_config.get("hysteresis", 0.1)  # 10% default
            enabled = threshold_config.get("enabled", True)
            
@@ -349,7 +451,7 @@ class ThresholdChecker:
                display=display
            )
            
-            self.thresholds[metric_path] = threshold
+            target_dict[metric_path] = threshold
            logger.debug(
                "Registered threshold for %s: warn=%s, crit=%s, op=%s",
                metric_path,
@@ -358,8 +460,22 @@ class ThresholdChecker:
                operator
            )
    
-    def _parse_partition_thresholds(self, plugin_name: str, partitions: Dict[str, Any]):
-        """Parse partition-specific thresholds for disk monitoring."""
+    def _parse_partition_thresholds(
+        self,
+        plugin_name: str,
+        partitions: Dict[str, Any],
+        target_dict: Optional[Dict[str, ThresholdConfig]] = None
+    ):
+        """Parse partition-specific thresholds for disk monitoring.
+        
+        Args:
+            plugin_name: Name of the plugin
+            partitions: Partition threshold configuration
+            target_dict: Dictionary to store parsed thresholds
+        """
+        if target_dict is None:
+            target_dict = self.thresholds
+        
        for partition, metrics in partitions.items():
            if not isinstance(metrics, dict):
                continue
@@ -390,9 +506,13 @@ class ThresholdChecker:
                    display=display 
                )
                
-                self.thresholds[metric_path] = threshold
+                target_dict[metric_path] = threshold
    
-    def _parse_rtt_thresholds(self, rtt_thresholds: Dict[str, Any]):
+    def _parse_rtt_thresholds(
+        self,
+        rtt_thresholds: Dict[str, Any],
+        target_dict: Optional[Dict[str, ThresholdConfig]] = None
+    ):
        """Parse RTT thresholds (per-host network latency thresholds).
        
        RTT thresholds are configured as:
@@ -401,7 +521,14 @@ class ThresholdChecker:
            hostname1:
              warning: 100.0   # ms
              critical: 500.0  # ms
+        
+        Args:
+            rtt_thresholds: RTT threshold configuration
+            target_dict: Dictionary to store parsed thresholds
        """
+        if target_dict is None:
+            target_dict = self.thresholds
+        
        for hostname, threshold_config in rtt_thresholds.items():
            if not isinstance(threshold_config, dict):
                continue
@@ -430,7 +557,7 @@ class ThresholdChecker:
                display=display
            )
            
-            self.thresholds[metric_path] = threshold
+            target_dict[metric_path] = threshold
            logger.debug(
                "Registered RTT threshold for %s: warn=%s ms, crit=%s ms",
                hostname,
@@ -438,6 +565,37 @@ class ThresholdChecker:
                critical
            )
    
+    def get_thresholds_for_host(self, host_name: str) -> Dict[str, ThresholdConfig]:
+        """Get the appropriate threshold configuration for a host.
+        
+        Args:
+            host_name: Name of the host
+            
+        Returns:
+            Dictionary of thresholds for this host
+        """
+        # Legacy mode: single threshold set for all hosts
+        if self.thresholds and not self.threshold_configs:
+            return self.thresholds
+        
+        # Multi-config mode: look up host-specific configuration
+        if self.threshold_configs:
+            config_name = self.host_config_mapping.get(host_name, self.default_config)
+            
+            if config_name in self.threshold_configs:
+                return self.threshold_configs[config_name]
+            else:
+                logger.warning(
+                    "Threshold config '%s' not found for host '%s', using default '%s'",
+                    config_name,
+                    host_name,
+                    self.default_config
+                )
+                return self.threshold_configs.get(self.default_config, {})
+        
+        # No thresholds configured
+        return {}
+    
    def check_value(
        self,
        host_name: str,
@@ -457,10 +615,13 @@ class ThresholdChecker:
        Returns:
            Tuple of (old_level, new_level) if state changed, None otherwise
        """
-        if metric_path not in self.thresholds:
+        # Get host-specific thresholds
+        thresholds = self.get_thresholds_for_host(host_name)
+        
+        if metric_path not in thresholds:
            return None
        
-        threshold = self.thresholds[metric_path]
+        threshold = thresholds[metric_path]
        
        # Get or create alert state
        if metric_path not in alert_states:
@@ -484,14 +645,17 @@ class ThresholdChecker:
        # Update state and check for changes
        old_level = alert_state.level
        if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
-            self._trigger_notification(host_name, metric_path, old_level, new_level, value, threshold)
+            # For check_value, we don't have full plugin data, pass None
+            lvl, message, formatted_msg = self._trigger_notification(host_name, metric_path, old_level, new_level, value, threshold, None)
+            # Update alert state with formatted message
+            alert_state.formatted_message = formatted_msg
+            self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value)
            return (old_level, new_level)
        elif new_level != AlertLevel.OK:
            # Check if we should re-notify
-            self._check_renotify(host_name, alert_state, metric_path, value, threshold)
+            self._check_renotify(host_name, alert_state, metric_path, value, threshold, None)
        
        return None
-    
    def check_plugin_data(
        self,
        host_name: str,
@@ -513,14 +677,17 @@ class ThresholdChecker:
        """
        state_changes = []
        
+        # Get host-specific thresholds
+        thresholds = self.get_thresholds_for_host(host_name)
+        
        # Check flat metrics
        for metric_name, value in data.items():
            metric_path = f"{plugin_name}.{metric_name}"
            
-            if metric_path not in self.thresholds:
+            if metric_path not in thresholds:
                continue
            
-            threshold = self.thresholds[metric_path]
+            threshold = thresholds[metric_path]
            
            # Get or create alert state
            if metric_path not in alert_states:
@@ -545,10 +712,13 @@ class ThresholdChecker:
            old_level = alert_state.level
            if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
                state_changes.append((metric_path, old_level, new_level, value))
-                self._trigger_notification(host_name, metric_path, old_level, new_level, value, threshold)
+                lvl, message, formatted_msg = self._trigger_notification(host_name, metric_path, old_level, new_level, value, threshold, data)
+                # Update alert state with formatted message
+                alert_state.formatted_message = formatted_msg
+                self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value)
            elif new_level != AlertLevel.OK:
                # Check if we should re-notify
-                self._check_renotify(host_name, alert_state, metric_path, value, threshold)
+                self._check_renotify(host_name, alert_state, metric_path, value, threshold, data)
        
        # Check nested metrics (e.g., partition data in disk_monitor)
        self._check_nested_metrics(
@@ -570,6 +740,9 @@ class ThresholdChecker:
        state_changes: list,
    ):
        """Check nested metrics like partition-specific thresholds."""
+        # Get host-specific thresholds
+        thresholds = self.get_thresholds_for_host(host_name)
+        
        # Look for partition data in disk_monitor
        if plugin_name == "disk_monitor" and "partitions" in data:
            partitions = data["partitions"]
@@ -583,10 +756,10 @@ class ThresholdChecker:
                for metric_name, value in metrics.items():
                    metric_path = f"{plugin_name}.{partition}.{metric_name}"
                    
-                    if metric_path not in self.thresholds:
+                    if metric_path not in thresholds:
                        continue
                    
-                    threshold = self.thresholds[metric_path]
+                    threshold = thresholds[metric_path]
                    
                    if metric_path not in alert_states:
                        alert_states[metric_path] = AlertState(metric_path)
@@ -608,16 +781,20 @@ class ThresholdChecker:
                    old_level = alert_state.level
                    if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
                        state_changes.append((metric_path, old_level, new_level, value))
-                        self._trigger_notification(
+                        lvl, message, formatted_msg = self._trigger_notification(
                            host_name,
                            metric_path,
                            old_level,
                            new_level,
                            value,
-                            threshold
+                            threshold,
+                            data  # Pass full plugin data for format string
                        )
+                        # Update alert state with formatted message
+                        alert_state.formatted_message = formatted_msg
+                        self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value)
                    elif new_level != AlertLevel.OK:
-                        self._check_renotify(host_name, alert_state, metric_path, value, threshold)
+                        self._check_renotify(host_name, alert_state, metric_path, value, threshold, data)
    
    def _trigger_notification(
        self,
@@ -627,8 +804,19 @@ class ThresholdChecker:
        new_level: AlertLevel,
        value: Any,
        threshold: ThresholdConfig,
+        plugin_data: Optional[Dict[str, Any]] = None,
    ):
-        """Trigger a notification for an alert state change."""
+        """Trigger a notification for an alert state change.
+        
+        Args:
+            host_name: Name of the host
+            metric_path: Full metric path
+            old_level: Previous alert level
+            new_level: New alert level
+            value: Current metric value
+            threshold: Threshold configuration
+            plugin_data: Optional dictionary of all plugin data fields for format string
+        """
        # Determine which threshold was exceeded
        threshold_value = None
        if new_level == AlertLevel.CRITICAL and threshold.critical is not None:
@@ -646,20 +834,59 @@ class ThresholdChecker:
        elif new_level == AlertLevel.WARNING:
            lvl = "WARNING"
            if threshold_value is not None:
-                message = f"{metric_path} = {value} (threshold: {op_symbol} {threshold_value})"
+                # Use display format string
+                threshold_info = self._format_display(
+                    threshold.display,
+                    value=value,
+                    threshold_value=threshold_value,
+                    op_symbol=op_symbol,
+                    plugin_data=plugin_data
+                )
+                message = f"{metric_path} = {value} {threshold_info}"
            else:
                message = f"{metric_path} = {value}"
        elif new_level == AlertLevel.CRITICAL:
            lvl = "CRITICAL"
            if threshold_value is not None:
-                message = f"{metric_path} = {value} (threshold: {op_symbol} {threshold_value})"
+                # Use display format string
+                threshold_info = self._format_display(
+                    threshold.display,
+                    value=value,
+                    threshold_value=threshold_value,
+                    op_symbol=op_symbol,
+                    plugin_data=plugin_data
+                )
+                message = f"{metric_path} = {value} {threshold_info}"
            else:
                message = f"{metric_path} = {value}"
        else:
            lvl = "UNKNOWN"
            message = f"{metric_path} = {value}"
        
-        # Send notification
+        # Return the formatted threshold info for storing in AlertState
+        formatted_threshold_msg = None
+        if threshold_value is not None and new_level != AlertLevel.OK:
+            formatted_threshold_msg = self._format_display(
+                threshold.display,
+                value=value,
+                threshold_value=threshold_value,
+                op_symbol=op_symbol,
+                plugin_data=plugin_data
+            )
+        
+        return lvl, message, formatted_threshold_msg
+    
+    def _send_notification(
+        self,
+        host_name: str,
+        lvl: str,
+        message: str,
+        metric_path: str,
+        old_level: AlertLevel,
+        new_level: AlertLevel,
+        value: Any,
+    ):
+        """Send notification and log to journal/eventlog."""
        if self.notification_callback is not None:
            try:
                self.notification_callback(f"{lvl}: {host_name} - {message}")
@@ -684,6 +911,56 @@ class ThresholdChecker:
        # Log to eventlog as well
        eventlog(host_name, lvl,  message, service="threshold")
    
+    def _format_display(
+        self,
+        display_format: str,
+        value: Any,
+        threshold_value: float,
+        op_symbol: str,
+        plugin_data: Optional[Dict[str, Any]] = None,
+    ) -> str:
+        """Render a threshold's display template into message text.
+
+        The template is a ``str.format``-style string. It may reference the
+        standard variables ``value``, ``threshold_value`` and ``op_symbol``,
+        plus any field of ``plugin_data`` by name. The standard variables take
+        precedence: a plugin field that happens to share one of those names
+        cannot replace the metric value or threshold shown in the message.
+
+        Args:
+            display_format: Format string from threshold config
+            value: Current metric value
+            threshold_value: Threshold value that was exceeded
+            op_symbol: Comparison operator symbol
+            plugin_data: Optional dictionary of plugin data fields
+
+        Returns:
+            The rendered template, or ``"(threshold: <op_symbol> <threshold_value>)"``
+            when the template cannot be rendered. A template that names an
+            unknown variable is logged as a warning; any other rendering
+            failure is logged as an error. Rendering problems never propagate
+            to the caller.
+        """
+        # Single definition of the text used whenever rendering fails.
+        fallback = f"(threshold: {op_symbol} {threshold_value})"
+
+        # Plugin fields go in first so the standard variables, written
+        # afterwards, always win on a name collision.
+        format_context: Dict[str, Any] = dict(plugin_data) if plugin_data else {}
+        format_context.update(
+            value=value,
+            threshold_value=threshold_value,
+            op_symbol=op_symbol,
+        )
+
+        try:
+            # format_map reads the mapping directly, so plugin fields with
+            # non-string keys do not break rendering the way ** unpacking would.
+            return display_format.format_map(format_context)
+        except KeyError as e:
+            # The template names a variable that neither the standard set nor
+            # the plugin data provides: a config problem, not a crash.
+            logger.warning(
+                "Missing format variable in display string '%s': %s",
+                display_format,
+                e
+            )
+        except Exception as e:
+            # Deliberately broad: a malformed template (bad spec, positional
+            # field, non-string template, ...) must not stop the notification.
+            logger.error(
+                "Error formatting display string '%s': %s",
+                display_format,
+                e
+            )
+        return fallback
+
+    
    def _check_renotify(
        self,
        host_name: str,
@@ -691,8 +968,18 @@ class ThresholdChecker:
        metric_path: str,
        value: Any,
        threshold: ThresholdConfig,
+        plugin_data: Optional[Dict[str, Any]] = None,
    ):
-        """Check if we should send a repeat notification."""
+        """Check if we should send a repeat notification.
+        
+        Args:
+            host_name: Name of the host
+            alert_state: Current alert state
+            metric_path: Full metric path
+            value: Current metric value
+            threshold: Threshold configuration
+            plugin_data: Optional dictionary of all plugin data fields
+        """
        if alert_state.level == AlertLevel.OK:
            return
        
@@ -718,7 +1005,15 @@ class ThresholdChecker:
            
            # Time to re-notify
            if threshold_value is not None:
-                message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (threshold: {op_symbol} {threshold_value}, ongoing for {int(now - alert_state.since)}s)"
+                # Use display format string
+                threshold_info = self._format_display(
+                    threshold.display,
+                    value=value,
+                    threshold_value=threshold_value,
+                    op_symbol=op_symbol,
+                    plugin_data=plugin_data
+                )
+                message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} {threshold_info}, ongoing for {int(now - alert_state.since)}s"
            else:
                message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (ongoing for {int(now - alert_state.since)}s)"
            
@@ -0,0 +1,4 @@
+#!/bin/sh
+#
+# Prints one fixed Nagios-style status line on stdout. Swap which `echo`
+# below is commented out to switch between the OK and the WARNING output.
+# NOTE(review): presumably a canned stand-in for a real Nextcloud app-update
+# check, used to exercise threshold display formatting - confirm.
+# NOTE(review): the script's exit status is that of `echo` (normally 0),
+# even for the WARNING line. If the caller derives the status code from the
+# exit status rather than from the text, an explicit `exit 1` is needed -
+# verify against the runner.
+
+#echo "OK - all is well"
+echo "WARNING - 12 apps require update: calendar->6.2.2 ,call_summary_bot->3.3.0 ,collectives->4.2.0 ,contacts->8.3.7 ,forum->0.36.0 ,mail->5.7.6 ,news->28.1.0 ,notes->4.13.1 ,notify_push->1.3.1 ,ownershiptransfer->1.4.0 ,richdocuments->9.0.5 ,spreed->22.0.10"