diff --git a/.hb.yaml b/.hb.yaml
index 02591e2..ec54df5 100644
--- a/.hb.yaml
+++ b/.hb.yaml
@@ -50,43 +50,119 @@ journal_max_size: 104857600 # Max size (100MB default)
journal_max_backups: 10 # Number of backups to keep
thresholds:
- cpu_monitor:
- cpu_percent:
- warning: 80.0
- critical: 90.0
- memory_monitor:
- percent:
- warning: 3.0
- critical: 95.0
- disk_monitor:
- partitions:
- /:
- percent:
- warning: 85.0
- critical: 90.0
- nagios_runner:
- overall_status_code:
- warning: 1
- critical: 2
- operator: ">="
- load_status:
- warning: WARNING
- critical: CRITICAL
- operator: "=="
- UPS_load:
- warning: 70
- critical: 80
- operator: ">="
- UPS_status_code:
- warning: 1
- critical: 2
- operator: ">="
- nextcloud_apps_status:
- display: "{nextcloud_apps_output}"
- warning: 1
- critical: 2
- operator: ">="
- rtt:
- y:
- warning: 30
- critical: 250.0
+ default:
+ cpu_monitor:
+ cpu_percent:
+ warning: 80.0
+ critical: 90.0
+ memory_monitor:
+ percent:
+ warning: 3.0
+ critical: 95.0
+ disk_monitor:
+ partitions:
+ /:
+ percent:
+ warning: 85.0
+ critical: 90.0
+ rtt:
+ y:
+ warning: 30
+ critical: 250.0
+
+
+ freebsd_server:
+ cpu_monitor:
+ cpu_percent:
+ warning: 80.0
+ critical: 90.0
+ memory_monitor:
+ percent:
+ warning: 3.0
+ critical: 95.0
+ disk_monitor:
+ partitions:
+ /:
+ percent:
+ warning: 85.0
+ critical: 90.0
+ nagios_runner:
+ # overall_status_code:
+ # warning: 1
+ # critical: 2
+ # operator: ">="
+ load_status:
+ warning: WARNING
+ critical: CRITICAL
+ operator: "=="
+ UPS_load:
+ display: "{ups_output}"
+ warning: 70
+ critical: 80
+ operator: ">="
+ UPS_status_code:
+ display: "{ups_output}"
+ warning: 1
+ critical: 2
+ operator: ">="
+ nextcloud_apps_status_code:
+ display: "{nextcloud_apps_output}"
+ warning: 1
+ critical: 2
+ operator: ">="
+ rtt:
+ y:
+ warning: 30
+ critical: 250.0
+
+ truenas_server:
+ cpu_monitor:
+ cpu_percent:
+ warning: 80.0
+ critical: 90.0
+ memory_monitor:
+ percent:
+ warning: 3.0
+ critical: 95.0
+ disk_monitor:
+ partitions:
+ /:
+ percent:
+ warning: 85.0
+ critical: 90.0
+ nagios_runner:
+ # overall_status_code:
+ # warning: 1
+ # critical: 2
+ # operator: ">="
+ load_status:
+ warning: WARNING
+ critical: CRITICAL
+ operator: "=="
+ UPS_load:
+ display: "{ups_output}"
+ warning: 70
+ critical: 80
+ operator: ">="
+ UPS_status_code:
+ display: "{ups_output}"
+ warning: 1
+ critical: 2
+ operator: ">="
+ nextcloud_apps_status_code:
+ display: "{nextcloud_apps_output}"
+ warning: 1
+ critical: 2
+ operator: ">="
+ rtt:
+ y:
+ warning: 30
+ critical: 250.0
+
+
+host_threshold_mapping:
+ # Critical production servers
+
+ wally: freebsd_server
+ eris: truenas_server
+
diff --git a/.hb.yaml.swp b/.hb.yaml.swp
index 6dc3601..10da5a2 100644
Binary files a/.hb.yaml.swp and b/.hb.yaml.swp differ
diff --git a/docs/THRESHOLD_ALERTING.md b/docs/THRESHOLD_ALERTING.md
index 0ff0ef6..49b5be5 100644
--- a/docs/THRESHOLD_ALERTING.md
+++ b/docs/THRESHOLD_ALERTING.md
@@ -56,6 +56,7 @@ thresholds:
critical: 90.0
operator: ">"
hysteresis: 0.1
+ display: "(threshold: {op_symbol} {threshold_value})"
enabled: true
```
@@ -82,6 +83,8 @@ Note: At least one of `warning` or `critical` must be specified.
- Range: 0.0 to 1.0
- Prevents rapid state transitions when value hovers near threshold
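+- **display**: format string in Python `str.format` syntax (not an f-string) appended to alert messages; it may reference `{value}`, `{threshold_value}`, `{op_symbol}` and, when available, fields from the plugin's reported data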
+ - defaults to "(threshold: {op_symbol} {threshold_value})"
- **enabled**: Whether this threshold is active (default: `true`)
### Comparison Operators
@@ -740,3 +743,217 @@ Planned features:
- [Message Journal Documentation](MESSAGE_JOURNAL.md)
- Configuration examples: `hbd/config_thresholds_example.yaml`
- Test suite: `test_threshold.py`
+
+## Multi-Threshold Configuration
+
+**New in version 2.0**: Support for multiple named threshold configurations with per-host mapping.
+
+### Overview
+
+The multi-threshold feature allows you to:
+- Define multiple sets of threshold configurations
+- Map different hosts to different threshold sets
+- Use different sensitivity levels for different environments
+- Maintain a default configuration for unmapped hosts
+
+### Configuration Structure
+
+```yaml
+# Optional: Set the default configuration name (defaults to "default")
+default_threshold_config: "default"
+
+# Define multiple named threshold configurations
+threshold_configs:
+ # Configuration name 1
+ default:
+ thresholds:
+ # Standard threshold definitions
+ cpu_monitor:
+ cpu_percent:
+ warning: 80.0
+ critical: 90.0
+
+ # Configuration name 2
+ high_sensitivity:
+ thresholds:
+ cpu_monitor:
+ cpu_percent:
+ warning: 60.0
+ critical: 75.0
+
+ # Configuration name 3
+ low_sensitivity:
+ thresholds:
+ cpu_monitor:
+ cpu_percent:
+ warning: 90.0
+ critical: 95.0
+
+# Map specific hosts to specific configurations
+host_threshold_mapping:
+ prod-web-01: high_sensitivity
+ prod-web-02: high_sensitivity
+ dev-server-01: low_sensitivity
+ # Unmapped hosts use default_threshold_config
+```
+
+### Use Cases
+
+#### 1. Environment-Based Thresholds
+
+Different thresholds for production vs. development:
+
+```yaml
+threshold_configs:
+ production:
+ thresholds:
+ cpu_monitor:
+ cpu_percent:
+ warning: 70.0 # Alert earlier in production
+ critical: 85.0
+
+ development:
+ thresholds:
+ cpu_monitor:
+ cpu_percent:
+ warning: 90.0 # More relaxed for dev
+ critical: 98.0
+
+host_threshold_mapping:
+ prod-web-01: production
+ prod-web-02: production
+ dev-web-01: development
+ dev-web-02: development
+```
+
+#### 2. Server Role-Based Thresholds
+
+Different thresholds based on server function:
+
+```yaml
+threshold_configs:
+ webserver:
+ thresholds:
+ cpu_monitor:
+ cpu_percent:
+ warning: 80.0
+ critical: 90.0
+
+ database:
+ thresholds:
+ cpu_monitor:
+ cpu_percent:
+ warning: 70.0
+ critical: 85.0
+ memory_monitor:
+ percent:
+ warning: 90.0 # Databases can use high memory
+ critical: 97.0
+ disk_monitor:
+ partitions:
+ /var/lib/mysql:
+ percent:
+ warning: 75.0
+ critical: 85.0
+
+ cache:
+ thresholds:
+ memory_monitor:
+ percent:
+ warning: 95.0 # Redis/Memcached can use very high memory
+ critical: 99.0
+
+host_threshold_mapping:
+ web-01: webserver
+ web-02: webserver
+ db-01: database
+ db-02: database
+ redis-01: cache
+ memcached-01: cache
+```
+
+#### 3. Sensitivity Levels
+
+Different sensitivity for critical vs. non-critical systems:
+
+```yaml
+threshold_configs:
+ critical:
+ thresholds:
+ disk_monitor:
+ partitions:
+ /:
+ percent:
+ warning: 70.0 # Very sensitive
+ critical: 80.0
+ hysteresis: 0.15
+
+ standard:
+ thresholds:
+ disk_monitor:
+ partitions:
+ /:
+ percent:
+ warning: 85.0
+ critical: 95.0
+ hysteresis: 0.1
+
+ relaxed:
+ thresholds:
+ disk_monitor:
+ partitions:
+ /:
+ percent:
+ warning: 90.0
+ critical: 98.0
+ hysteresis: 0.05
+
+host_threshold_mapping:
+ payment-gateway: critical
+ auth-server: critical
+ web-01: standard
+ web-02: standard
+ test-server: relaxed
+```
+
+### Backward Compatibility
+
+The legacy single threshold configuration is fully supported:
+
+```yaml
+# Old format - still works
+thresholds:
+ cpu_monitor:
+ cpu_percent:
+ warning: 80.0
+ critical: 90.0
+```
+
+This is equivalent to:
+
+```yaml
+# New format
+threshold_configs:
+ default:
+ thresholds:
+ cpu_monitor:
+ cpu_percent:
+ warning: 80.0
+ critical: 90.0
+```
+
+### Configuration Priority
+
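+1. **Host-specific mapping**: If host is in `host_threshold_mapping`, use that config (if the mapped config name is not defined, a warning is logged and the default config is used instead)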
+2. **Default config**: Use `default_threshold_config`
+3. **First alphabetically**: If default not found, use first config alphabetically
+4. **Legacy fallback**: If `threshold_configs` not present, use `thresholds`
+
+### Example: Complete Multi-Threshold Setup
+
+See `hbd/config_multi_threshold_example.yaml` for a complete example with:
+- 4 named configurations (default, high_sensitivity, low_sensitivity, database)
+- Host-to-config mappings for production, development, and test systems
+- Specialized database server thresholds
+- Custom display messages with plugin data
+
diff --git a/hbd/config_multi_threshold_example.yaml b/hbd/config_multi_threshold_example.yaml
new file mode 100644
index 0000000..c14839c
--- /dev/null
+++ b/hbd/config_multi_threshold_example.yaml
@@ -0,0 +1,202 @@
+# ==============================================================================
+# Heartbeat Daemon Multi-Threshold Configuration Example
+# ==============================================================================
+# This file demonstrates the new multi-threshold configuration feature that allows
+# different threshold settings for different hosts/clients.
+#
+# Features:
+# - Define multiple named threshold configurations
+# - Map specific hosts to specific threshold configurations
+# - Set a default configuration for unmapped hosts
+# - Backward compatible with single threshold configuration
+# ==============================================================================
+
+# Global threshold settings
+threshold_renotify_interval: 3600 # Re-notify every hour for ongoing alerts (seconds)
+
+# Optional: Set default threshold config (defaults to "default" if not specified)
+default_threshold_config: "default"
+
+# ----------------------------------------------------------------------------
+# Multiple Named Threshold Configurations
+# ----------------------------------------------------------------------------
+# Define multiple threshold configurations with different sensitivity levels
+threshold_configs:
+
+ # Default configuration - moderate thresholds for most servers
+ default:
+ thresholds:
+ cpu_monitor:
+ cpu_percent:
+ warning: 80.0
+ critical: 90.0
+ operator: ">"
+ load_1min:
+ warning: 4.0
+ critical: 8.0
+ operator: ">"
+
+ memory_monitor:
+ percent:
+ warning: 85.0
+ critical: 95.0
+ operator: ">"
+
+ disk_monitor:
+ partitions:
+ /:
+ percent:
+ warning: 85.0
+ critical: 95.0
+ operator: ">"
+
+ rtt:
+ # RTT thresholds per remote host
+ router:
+ warning: 50.0 # ms
+ critical: 200.0
+ server1:
+ warning: 100.0
+ critical: 500.0
+
+ # High sensitivity configuration - lower thresholds for critical systems
+ high_sensitivity:
+ thresholds:
+ cpu_monitor:
+ cpu_percent:
+ warning: 60.0 # Alert earlier
+ critical: 75.0
+ operator: ">"
+ hysteresis: 0.15 # More hysteresis to reduce flapping
+ load_1min:
+ warning: 2.0
+ critical: 4.0
+ operator: ">"
+
+ memory_monitor:
+ percent:
+ warning: 75.0 # Alert at lower memory usage
+ critical: 85.0
+ operator: ">"
+ display: "(threshold: {op_symbol} {threshold_value}%, total: {total_gb} GB)"
+
+ disk_monitor:
+ partitions:
+ /:
+ percent:
+ warning: 75.0
+ critical: 85.0
+ operator: ">"
+ /var:
+ percent:
+ warning: 80.0
+ critical: 90.0
+ operator: ">"
+
+ rtt:
+ router:
+ warning: 30.0
+ critical: 100.0
+ server1:
+ warning: 50.0
+ critical: 200.0
+
+ # Low sensitivity configuration - higher thresholds for development/test systems
+ low_sensitivity:
+ thresholds:
+ cpu_monitor:
+ cpu_percent:
+ warning: 90.0 # Only alert at very high usage
+ critical: 95.0
+ operator: ">"
+
+ memory_monitor:
+ percent:
+ warning: 90.0
+ critical: 98.0
+ operator: ">"
+
+ disk_monitor:
+ partitions:
+ /:
+ percent:
+ warning: 90.0
+ critical: 95.0
+ operator: ">"
+
+ rtt:
+ router:
+ warning: 100.0
+ critical: 500.0
+
+ # Production database servers - specialized thresholds
+ database:
+ thresholds:
+ cpu_monitor:
+ cpu_percent:
+ warning: 70.0
+ critical: 85.0
+ operator: ">"
+
+ memory_monitor:
+ percent:
+ warning: 90.0 # Databases can use high memory
+ critical: 97.0
+ operator: ">"
+ display: "(threshold: {op_symbol} {threshold_value}%, total: {total_gb} GB, available: {available_gb} GB)"
+
+ disk_monitor:
+ partitions:
+ /:
+ percent:
+ warning: 80.0
+ critical: 90.0
+ operator: ">"
+ /var/lib/mysql: # Database data partition
+ percent:
+ warning: 75.0 # Alert earlier for DB partition
+ critical: 85.0
+ operator: ">"
+
+ rtt:
+ router:
+ warning: 20.0 # Stricter latency requirements
+ critical: 50.0
+
+# ----------------------------------------------------------------------------
+# Host to Threshold Configuration Mapping
+# ----------------------------------------------------------------------------
+# Map specific hosts to specific threshold configurations
+# Hosts not listed here will use the default_threshold_config
+host_threshold_mapping:
+ # Critical production servers
+ prod-web-01: high_sensitivity
+ prod-web-02: high_sensitivity
+ prod-api-01: high_sensitivity
+
+ # Database servers
+ prod-db-01: database
+ prod-db-02: database
+ prod-db-replica: database
+
+ # Development and test systems
+ dev-server-01: low_sensitivity
+ dev-server-02: low_sensitivity
+ test-server-01: low_sensitivity
+ test-server-02: low_sensitivity
+
+ # Everything else uses 'default' (no need to list explicitly)
+
+# ----------------------------------------------------------------------------
+# Backward Compatibility Example
+# ----------------------------------------------------------------------------
+# The old single threshold format is still supported:
+# Just use 'thresholds:' directly without 'threshold_configs:'
+#
+# thresholds:
+# cpu_monitor:
+# cpu_percent:
+# warning: 80.0
+# critical: 90.0
+#
+# This will apply the same thresholds to all hosts.
diff --git a/hbd/server/templates/alerts.html b/hbd/server/templates/alerts.html
index 34a6744..1209480 100644
--- a/hbd/server/templates/alerts.html
+++ b/hbd/server/templates/alerts.html
@@ -397,9 +397,11 @@
const level = alert.level.toLowerCase();
const duration = getDuration(alert.since);
- // Format value with threshold info if available
+ // Use formatted message if available, otherwise build from individual fields
let valueText = `Value: ${formatValue(alert.last_value)}`;
- if (alert.threshold_value !== undefined && alert.threshold_value !== null && alert.operator) {
+ if (alert.formatted_message) {
+ valueText += ` ${alert.formatted_message}`;
+ } else if (alert.threshold_value !== undefined && alert.threshold_value !== null && alert.operator) {
valueText += ` (threshold: ${alert.operator} ${formatValue(alert.threshold_value)})`;
}
diff --git a/hbd/server/threshold.py b/hbd/server/threshold.py
index f89881a..2c88b72 100644
--- a/hbd/server/threshold.py
+++ b/hbd/server/threshold.py
@@ -55,6 +55,7 @@ class AlertState:
self.last_notification = None
self.threshold_value = None # The threshold value that triggered alert
self.operator = None # The comparison operator (>, <, >=, etc.)
+ self.formatted_message = None # Formatted display message for UI
def update(
self,
@@ -120,6 +121,8 @@ class AlertState:
result["threshold_value"] = self.threshold_value
if self.operator is not None:
result["operator"] = self.operator
+ if self.formatted_message is not None:
+ result["formatted_message"] = self.formatted_message
return result
@@ -285,7 +288,18 @@ class ThresholdChecker:
renotify_interval: Seconds between repeat notifications (default: 1 hour)
journal: Optional MessageJournal instance for logging threshold events
"""
- self.thresholds = {} # {metric_path: ThresholdConfig}
+ # Named threshold configurations: {config_name: {metric_path: ThresholdConfig}}
+ self.threshold_configs = {}
+
+ # Single threshold set for backward compatibility: {metric_path: ThresholdConfig}
+ self.thresholds = {}
+
+ # Host to config name mapping: {host_name: config_name}
+ self.host_config_mapping = {}
+
+ # Default config name to use when no mapping exists
+ self.default_config = "default"
+
self.notification_callback = notification_callback
self.renotify_interval = renotify_interval
self.journal = journal
@@ -293,10 +307,84 @@ class ThresholdChecker:
# Parse configuration
self._parse_config(config)
- logger.info("ThresholdChecker initialized with %d thresholds", len(self.thresholds))
+ total_thresholds = sum(len(cfg) for cfg in self.threshold_configs.values())
+ if total_thresholds == 0 and len(self.thresholds) > 0:
+ # Backward compatibility: using single threshold set
+ total_thresholds = len(self.thresholds)
+ logger.info("ThresholdChecker initialized with %d thresholds (legacy format)", total_thresholds)
+ else:
+ logger.info(
+ "ThresholdChecker initialized with %d named configurations (%d total thresholds)",
+ len(self.threshold_configs),
+ total_thresholds
+ )
def _parse_config(self, config: Dict[str, Any]):
- """Parse threshold configuration from YAML structure."""
+ """Parse threshold configuration from YAML structure.
+
+ Supports two formats:
+ 1. Legacy format with direct 'thresholds' section
+ 2. New format with 'threshold_configs' and 'host_threshold_mapping'
+ """
+ # Check for new multi-config format
+ if "threshold_configs" in config:
+ self._parse_multi_config(config)
+ elif "thresholds" in config:
+ # Legacy single threshold configuration
+ self._parse_legacy_config(config)
+ else:
+ logger.info("No thresholds configured")
+
+ def _parse_multi_config(self, config: Dict[str, Any]):
+ """Parse multiple named threshold configurations."""
+ threshold_configs = config.get("threshold_configs", {})
+
+ if not threshold_configs:
+ logger.info("No threshold configurations defined")
+ return
+
+ # Parse each named configuration
+ for config_name, config_data in threshold_configs.items():
+ if not isinstance(config_data, dict):
+ logger.warning("Invalid threshold config '%s', skipping", config_name)
+ continue
+
+ if "thresholds" not in config_data:
+ logger.warning("No thresholds in config '%s', skipping", config_name)
+ continue
+
+ logger.info("Parsing threshold configuration: %s", config_name)
+ self.threshold_configs[config_name] = {}
+
+ thresholds_config = config_data["thresholds"]
+ for plugin_name, plugin_thresholds in thresholds_config.items():
+ if not isinstance(plugin_thresholds, dict):
+ continue
+
+ self._parse_plugin_thresholds(
+ plugin_name,
+ plugin_thresholds,
+ target_dict=self.threshold_configs[config_name]
+ )
+
+ # Parse host to config mapping
+ self.host_config_mapping = config.get("host_threshold_mapping", {})
+
+ # Set default config (first one alphabetically or explicitly set)
+ self.default_config = config.get("default_threshold_config", "default")
+ if self.default_config not in self.threshold_configs and self.threshold_configs:
+ # Use first available config as default
+ self.default_config = sorted(self.threshold_configs.keys())[0]
+ logger.info("Using '%s' as default threshold config", self.default_config)
+
+ logger.info(
+ "Loaded %d threshold configurations with %d host mappings",
+ len(self.threshold_configs),
+ len(self.host_config_mapping)
+ )
+
+ def _parse_legacy_config(self, config: Dict[str, Any]):
+ """Parse legacy single threshold configuration for backward compatibility."""
if not config or "thresholds" not in config:
logger.info("No thresholds configured")
return
@@ -307,13 +395,27 @@ class ThresholdChecker:
if not isinstance(plugin_thresholds, dict):
continue
- self._parse_plugin_thresholds(plugin_name, plugin_thresholds)
+ self._parse_plugin_thresholds(plugin_name, plugin_thresholds, target_dict=self.thresholds)
- def _parse_plugin_thresholds(self, plugin_name: str, thresholds: Dict[str, Any]):
- """Parse thresholds for a specific plugin."""
+ def _parse_plugin_thresholds(
+ self,
+ plugin_name: str,
+ thresholds: Dict[str, Any],
+ target_dict: Optional[Dict[str, ThresholdConfig]] = None
+ ):
+ """Parse thresholds for a specific plugin.
+
+ Args:
+ plugin_name: Name of the plugin
+ thresholds: Threshold configuration dictionary
+ target_dict: Dictionary to store parsed thresholds (defaults to self.thresholds)
+ """
+ if target_dict is None:
+ target_dict = self.thresholds
+
# Special handling for RTT thresholds (per-host)
if plugin_name == "rtt":
- self._parse_rtt_thresholds(thresholds)
+ self._parse_rtt_thresholds(thresholds, target_dict)
return
for metric_name, threshold_config in thresholds.items():
@@ -322,7 +424,7 @@ class ThresholdChecker:
# Handle nested metrics (e.g., partitions./.percent)
if metric_name == "partitions":
- self._parse_partition_thresholds(plugin_name, threshold_config)
+ self._parse_partition_thresholds(plugin_name, threshold_config, target_dict)
continue
metric_path = f"{plugin_name}.{metric_name}"
@@ -331,7 +433,7 @@ class ThresholdChecker:
warning = threshold_config.get("warning")
critical = threshold_config.get("critical")
operator = threshold_config.get("operator", ">")
- display = threshold_config.get("display")
+ display = threshold_config.get("display", "(threshold: {op_symbol} {threshold_value})")
hysteresis = threshold_config.get("hysteresis", 0.1) # 10% default
enabled = threshold_config.get("enabled", True)
@@ -349,7 +451,7 @@ class ThresholdChecker:
display=display
)
- self.thresholds[metric_path] = threshold
+ target_dict[metric_path] = threshold
logger.debug(
"Registered threshold for %s: warn=%s, crit=%s, op=%s",
metric_path,
@@ -358,8 +460,22 @@ class ThresholdChecker:
operator
)
- def _parse_partition_thresholds(self, plugin_name: str, partitions: Dict[str, Any]):
- """Parse partition-specific thresholds for disk monitoring."""
+ def _parse_partition_thresholds(
+ self,
+ plugin_name: str,
+ partitions: Dict[str, Any],
+ target_dict: Optional[Dict[str, ThresholdConfig]] = None
+ ):
+ """Parse partition-specific thresholds for disk monitoring.
+
+ Args:
+ plugin_name: Name of the plugin
+ partitions: Partition threshold configuration
+ target_dict: Dictionary to store parsed thresholds
+ """
+ if target_dict is None:
+ target_dict = self.thresholds
+
for partition, metrics in partitions.items():
if not isinstance(metrics, dict):
continue
@@ -390,9 +506,13 @@ class ThresholdChecker:
display=display
)
- self.thresholds[metric_path] = threshold
+ target_dict[metric_path] = threshold
- def _parse_rtt_thresholds(self, rtt_thresholds: Dict[str, Any]):
+ def _parse_rtt_thresholds(
+ self,
+ rtt_thresholds: Dict[str, Any],
+ target_dict: Optional[Dict[str, ThresholdConfig]] = None
+ ):
"""Parse RTT thresholds (per-host network latency thresholds).
RTT thresholds are configured as:
@@ -401,7 +521,14 @@ class ThresholdChecker:
hostname1:
warning: 100.0 # ms
critical: 500.0 # ms
+
+ Args:
+ rtt_thresholds: RTT threshold configuration
+ target_dict: Dictionary to store parsed thresholds
"""
+ if target_dict is None:
+ target_dict = self.thresholds
+
for hostname, threshold_config in rtt_thresholds.items():
if not isinstance(threshold_config, dict):
continue
@@ -430,7 +557,7 @@ class ThresholdChecker:
display=display
)
- self.thresholds[metric_path] = threshold
+ target_dict[metric_path] = threshold
logger.debug(
"Registered RTT threshold for %s: warn=%s ms, crit=%s ms",
hostname,
@@ -438,6 +565,37 @@ class ThresholdChecker:
critical
)
+ def get_thresholds_for_host(self, host_name: str) -> Dict[str, ThresholdConfig]:
+ """Get the appropriate threshold configuration for a host.
+
+ Args:
+ host_name: Name of the host
+
+ Returns:
+ Dictionary of thresholds for this host
+ """
+ # Legacy mode: single threshold set for all hosts
+ if self.thresholds and not self.threshold_configs:
+ return self.thresholds
+
+ # Multi-config mode: look up host-specific configuration
+ if self.threshold_configs:
+ config_name = self.host_config_mapping.get(host_name, self.default_config)
+
+ if config_name in self.threshold_configs:
+ return self.threshold_configs[config_name]
+ else:
+ logger.warning(
+ "Threshold config '%s' not found for host '%s', using default '%s'",
+ config_name,
+ host_name,
+ self.default_config
+ )
+ return self.threshold_configs.get(self.default_config, {})
+
+ # No thresholds configured
+ return {}
+
def check_value(
self,
host_name: str,
@@ -457,10 +615,13 @@ class ThresholdChecker:
Returns:
Tuple of (old_level, new_level) if state changed, None otherwise
"""
- if metric_path not in self.thresholds:
+ # Get host-specific thresholds
+ thresholds = self.get_thresholds_for_host(host_name)
+
+ if metric_path not in thresholds:
return None
- threshold = self.thresholds[metric_path]
+ threshold = thresholds[metric_path]
# Get or create alert state
if metric_path not in alert_states:
@@ -484,14 +645,17 @@ class ThresholdChecker:
# Update state and check for changes
old_level = alert_state.level
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
- self._trigger_notification(host_name, metric_path, old_level, new_level, value, threshold)
+ # For check_value, we don't have full plugin data, pass None
+ lvl, message, formatted_msg = self._trigger_notification(host_name, metric_path, old_level, new_level, value, threshold, None)
+ # Update alert state with formatted message
+ alert_state.formatted_message = formatted_msg
+ self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value)
return (old_level, new_level)
elif new_level != AlertLevel.OK:
# Check if we should re-notify
- self._check_renotify(host_name, alert_state, metric_path, value, threshold)
+ self._check_renotify(host_name, alert_state, metric_path, value, threshold, None)
return None
-
def check_plugin_data(
self,
host_name: str,
@@ -513,14 +677,17 @@ class ThresholdChecker:
"""
state_changes = []
+ # Get host-specific thresholds
+ thresholds = self.get_thresholds_for_host(host_name)
+
# Check flat metrics
for metric_name, value in data.items():
metric_path = f"{plugin_name}.{metric_name}"
- if metric_path not in self.thresholds:
+ if metric_path not in thresholds:
continue
- threshold = self.thresholds[metric_path]
+ threshold = thresholds[metric_path]
# Get or create alert state
if metric_path not in alert_states:
@@ -545,10 +712,13 @@ class ThresholdChecker:
old_level = alert_state.level
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
state_changes.append((metric_path, old_level, new_level, value))
- self._trigger_notification(host_name, metric_path, old_level, new_level, value, threshold)
+ lvl, message, formatted_msg = self._trigger_notification(host_name, metric_path, old_level, new_level, value, threshold, data)
+ # Update alert state with formatted message
+ alert_state.formatted_message = formatted_msg
+ self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value)
elif new_level != AlertLevel.OK:
# Check if we should re-notify
- self._check_renotify(host_name, alert_state, metric_path, value, threshold)
+ self._check_renotify(host_name, alert_state, metric_path, value, threshold, data)
# Check nested metrics (e.g., partition data in disk_monitor)
self._check_nested_metrics(
@@ -570,6 +740,9 @@ class ThresholdChecker:
state_changes: list,
):
"""Check nested metrics like partition-specific thresholds."""
+ # Get host-specific thresholds
+ thresholds = self.get_thresholds_for_host(host_name)
+
# Look for partition data in disk_monitor
if plugin_name == "disk_monitor" and "partitions" in data:
partitions = data["partitions"]
@@ -583,10 +756,10 @@ class ThresholdChecker:
for metric_name, value in metrics.items():
metric_path = f"{plugin_name}.{partition}.{metric_name}"
- if metric_path not in self.thresholds:
+ if metric_path not in thresholds:
continue
- threshold = self.thresholds[metric_path]
+ threshold = thresholds[metric_path]
if metric_path not in alert_states:
alert_states[metric_path] = AlertState(metric_path)
@@ -608,16 +781,20 @@ class ThresholdChecker:
old_level = alert_state.level
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
state_changes.append((metric_path, old_level, new_level, value))
- self._trigger_notification(
+ lvl, message, formatted_msg = self._trigger_notification(
host_name,
metric_path,
old_level,
new_level,
value,
- threshold
+ threshold,
+ data # Pass full plugin data for format string
)
+ # Update alert state with formatted message
+ alert_state.formatted_message = formatted_msg
+ self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value)
elif new_level != AlertLevel.OK:
- self._check_renotify(host_name, alert_state, metric_path, value, threshold)
+ self._check_renotify(host_name, alert_state, metric_path, value, threshold, data)
def _trigger_notification(
self,
@@ -627,8 +804,19 @@ class ThresholdChecker:
new_level: AlertLevel,
value: Any,
threshold: ThresholdConfig,
+ plugin_data: Optional[Dict[str, Any]] = None,
):
- """Trigger a notification for an alert state change."""
+ """Trigger a notification for an alert state change.
+
+ Args:
+ host_name: Name of the host
+ metric_path: Full metric path
+ old_level: Previous alert level
+ new_level: New alert level
+ value: Current metric value
+ threshold: Threshold configuration
+ plugin_data: Optional dictionary of all plugin data fields for format string
+ """
# Determine which threshold was exceeded
threshold_value = None
if new_level == AlertLevel.CRITICAL and threshold.critical is not None:
@@ -646,20 +834,59 @@ class ThresholdChecker:
elif new_level == AlertLevel.WARNING:
lvl = "WARNING"
if threshold_value is not None:
- message = f"{metric_path} = {value} (threshold: {op_symbol} {threshold_value})"
+ # Use display format string
+ threshold_info = self._format_display(
+ threshold.display,
+ value=value,
+ threshold_value=threshold_value,
+ op_symbol=op_symbol,
+ plugin_data=plugin_data
+ )
+ message = f"{metric_path} = {value} {threshold_info}"
else:
message = f"{metric_path} = {value}"
elif new_level == AlertLevel.CRITICAL:
lvl = "CRITICAL"
if threshold_value is not None:
- message = f"{metric_path} = {value} (threshold: {op_symbol} {threshold_value})"
+ # Use display format string
+ threshold_info = self._format_display(
+ threshold.display,
+ value=value,
+ threshold_value=threshold_value,
+ op_symbol=op_symbol,
+ plugin_data=plugin_data
+ )
+ message = f"{metric_path} = {value} {threshold_info}"
else:
message = f"{metric_path} = {value}"
else:
lvl = "UNKNOWN"
message = f"{metric_path} = {value}"
- # Send notification
+ # Return the formatted threshold info for storing in AlertState
+ formatted_threshold_msg = None
+ if threshold_value is not None and new_level != AlertLevel.OK:
+ formatted_threshold_msg = self._format_display(
+ threshold.display,
+ value=value,
+ threshold_value=threshold_value,
+ op_symbol=op_symbol,
+ plugin_data=plugin_data
+ )
+
+ return lvl, message, formatted_threshold_msg
+
+ def _send_notification(
+ self,
+ host_name: str,
+ lvl: str,
+ message: str,
+ metric_path: str,
+ old_level: AlertLevel,
+ new_level: AlertLevel,
+ value: Any,
+ ):
+ """Send notification and log to journal/eventlog."""
if self.notification_callback is not None:
try:
self.notification_callback(f"{lvl}: {host_name} - {message}")
@@ -684,6 +911,56 @@ class ThresholdChecker:
# Log to eventlog as well
eventlog(host_name, lvl, message, service="threshold")
+ def _format_display(
+ self,
+ display_format: str,
+ value: Any,
+ threshold_value: float,
+ op_symbol: str,
+ plugin_data: Optional[Dict[str, Any]] = None,
+ ) -> str:
+ """Format the display string using available data.
+
+ Args:
+ display_format: Format string from threshold config
+ value: Current metric value
+ threshold_value: Threshold value that was exceeded
+ op_symbol: Comparison operator symbol
+ plugin_data: Optional dictionary of plugin data fields
+
+ Returns:
+ Formatted display string
+ """
+ # Build format context with standard variables
+ format_context = {
+ 'value': value,
+ 'threshold_value': threshold_value,
+ 'op_symbol': op_symbol,
+ }
+
+ # Add all plugin data fields if available
+ if plugin_data:
+ format_context.update(plugin_data)
+
+ try:
+ # Format the display string
+ return display_format.format(**format_context)
+ except KeyError as e:
+ logger.warning(
+ "Missing format variable in display string '%s': %s",
+ display_format,
+ e
+ )
+ # Fallback to default format
+ return f"(threshold: {op_symbol} {threshold_value})"
+ except Exception as e:
+ logger.error(
+ "Error formatting display string '%s': %s",
+ display_format,
+ e
+ )
+ return f"(threshold: {op_symbol} {threshold_value})"
+
def _check_renotify(
self,
host_name: str,
@@ -691,8 +968,18 @@ class ThresholdChecker:
metric_path: str,
value: Any,
threshold: ThresholdConfig,
+ plugin_data: Optional[Dict[str, Any]] = None,
):
- """Check if we should send a repeat notification."""
+ """Check if we should send a repeat notification.
+
+ Args:
+ host_name: Name of the host
+ alert_state: Current alert state
+ metric_path: Full metric path
+ value: Current metric value
+ threshold: Threshold configuration
+ plugin_data: Optional dictionary of all plugin data fields
+ """
if alert_state.level == AlertLevel.OK:
return
@@ -718,7 +1005,15 @@ class ThresholdChecker:
# Time to re-notify
if threshold_value is not None:
- message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (threshold: {op_symbol} {threshold_value}, ongoing for {int(now - alert_state.since)}s)"
+ # Use display format string
+ threshold_info = self._format_display(
+ threshold.display,
+ value=value,
+ threshold_value=threshold_value,
+ op_symbol=op_symbol,
+ plugin_data=plugin_data
+ )
+ message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} {threshold_info}, ongoing for {int(now - alert_state.since)}s"
else:
message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (ongoing for {int(now - alert_state.since)}s)"
diff --git a/nagios_bad.sh b/nagios_bad.sh
new file mode 100755
index 0000000..2dc1341
--- /dev/null
+++ b/nagios_bad.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+
+#echo "OK - all is well"
+echo "WARNING - 12 apps require update: calendar->6.2.2 ,call_summary_bot->3.3.0 ,collectives->4.2.0 ,contacts->8.3.7 ,forum->0.36.0 ,mail->5.7.6 ,news->28.1.0 ,notes->4.13.1 ,notify_push->1.3.1 ,ownershiptransfer->1.4.0 ,richdocuments->9.0.5 ,spreed->22.0.10"