per-client threshold config
This commit is contained in:
@@ -50,6 +50,28 @@ journal_max_size: 104857600 # Max size (100MB default)
|
|||||||
journal_max_backups: 10 # Number of backups to keep
|
journal_max_backups: 10 # Number of backups to keep
|
||||||
|
|
||||||
thresholds:
|
thresholds:
|
||||||
|
default:
|
||||||
|
cpu_monitor:
|
||||||
|
cpu_percent:
|
||||||
|
warning: 80.0
|
||||||
|
critical: 90.0
|
||||||
|
memory_monitor:
|
||||||
|
percent:
|
||||||
|
warning: 3.0
|
||||||
|
critical: 95.0
|
||||||
|
disk_monitor:
|
||||||
|
partitions:
|
||||||
|
/:
|
||||||
|
percent:
|
||||||
|
warning: 85.0
|
||||||
|
critical: 90.0
|
||||||
|
rtt:
|
||||||
|
y:
|
||||||
|
warning: 30
|
||||||
|
critical: 250.0
|
||||||
|
|
||||||
|
|
||||||
|
freebsd_server:
|
||||||
cpu_monitor:
|
cpu_monitor:
|
||||||
cpu_percent:
|
cpu_percent:
|
||||||
warning: 80.0
|
warning: 80.0
|
||||||
@@ -65,23 +87,25 @@ thresholds:
|
|||||||
warning: 85.0
|
warning: 85.0
|
||||||
critical: 90.0
|
critical: 90.0
|
||||||
nagios_runner:
|
nagios_runner:
|
||||||
overall_status_code:
|
# overall_status_code:
|
||||||
warning: 1
|
# warning: 1
|
||||||
critical: 2
|
# critical: 2
|
||||||
operator: ">="
|
# operator: ">="
|
||||||
load_status:
|
load_status:
|
||||||
warning: WARNING
|
warning: WARNING
|
||||||
critical: CRITICAL
|
critical: CRITICAL
|
||||||
operator: "=="
|
operator: "=="
|
||||||
UPS_load:
|
UPS_load:
|
||||||
|
display: "{ups_output}"
|
||||||
warning: 70
|
warning: 70
|
||||||
critical: 80
|
critical: 80
|
||||||
operator: ">="
|
operator: ">="
|
||||||
UPS_status_code:
|
UPS_status_code:
|
||||||
|
display: "{ups_output}"
|
||||||
warning: 1
|
warning: 1
|
||||||
critical: 2
|
critical: 2
|
||||||
operator: ">="
|
operator: ">="
|
||||||
nextcloud_apps_status:
|
nextcloud_apps_status_code:
|
||||||
display: "{nextcloud_apps_output}"
|
display: "{nextcloud_apps_output}"
|
||||||
warning: 1
|
warning: 1
|
||||||
critical: 2
|
critical: 2
|
||||||
@@ -90,3 +114,55 @@ thresholds:
|
|||||||
y:
|
y:
|
||||||
warning: 30
|
warning: 30
|
||||||
critical: 250.0
|
critical: 250.0
|
||||||
|
|
||||||
|
truenas_server:
|
||||||
|
cpu_monitor:
|
||||||
|
cpu_percent:
|
||||||
|
warning: 80.0
|
||||||
|
critical: 90.0
|
||||||
|
memory_monitor:
|
||||||
|
percent:
|
||||||
|
warning: 3.0
|
||||||
|
critical: 95.0
|
||||||
|
disk_monitor:
|
||||||
|
partitions:
|
||||||
|
/:
|
||||||
|
percent:
|
||||||
|
warning: 85.0
|
||||||
|
critical: 90.0
|
||||||
|
nagios_runner:
|
||||||
|
# overall_status_code:
|
||||||
|
# warning: 1
|
||||||
|
# critical: 2
|
||||||
|
# operator: ">="
|
||||||
|
load_status:
|
||||||
|
warning: WARNING
|
||||||
|
critical: CRITICAL
|
||||||
|
operator: "=="
|
||||||
|
UPS_load:
|
||||||
|
display: "{ups_output}"
|
||||||
|
warning: 70
|
||||||
|
critical: 80
|
||||||
|
operator: ">="
|
||||||
|
UPS_status_code:
|
||||||
|
display: "{ups_output}"
|
||||||
|
warning: 1
|
||||||
|
critical: 2
|
||||||
|
operator: ">="
|
||||||
|
nextcloud_apps_status_code:
|
||||||
|
display: "{nextcloud_apps_output}"
|
||||||
|
warning: 1
|
||||||
|
critical: 2
|
||||||
|
operator: ">="
|
||||||
|
rtt:
|
||||||
|
y:
|
||||||
|
warning: 30
|
||||||
|
critical: 250.0
|
||||||
|
|
||||||
|
|
||||||
|
host_threshold_mapping:
|
||||||
|
# Critical production servers
|
||||||
|
|
||||||
|
wally: freebsd_server
|
||||||
|
eris: truenas_server
|
||||||
|
|
||||||
|
|||||||
Binary file not shown.
@@ -56,6 +56,7 @@ thresholds:
|
|||||||
critical: 90.0
|
critical: 90.0
|
||||||
operator: ">"
|
operator: ">"
|
||||||
hysteresis: 0.1
|
hysteresis: 0.1
|
||||||
|
display: "display format"
|
||||||
enabled: true
|
enabled: true
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -82,6 +83,8 @@ Note: At least one of `warning` or `critical` must be specified.
|
|||||||
- Range: 0.0 to 1.0
|
- Range: 0.0 to 1.0
|
||||||
- Prevents rapid state transitions when value hovers near threshold
|
- Prevents rapid state transitions when value hovers near threshold
|
||||||
|
|
||||||
|
- **display**: Format-string template (with `{placeholder}` fields such as `{op_symbol}` and `{threshold_value}`) used to render the threshold details in alert messages
|
||||||
|
- Default: `"(threshold: {op_symbol} {threshold_value})"`
|
||||||
- **enabled**: Whether this threshold is active (default: `true`)
|
- **enabled**: Whether this threshold is active (default: `true`)
|
||||||
|
|
||||||
### Comparison Operators
|
### Comparison Operators
|
||||||
@@ -740,3 +743,217 @@ Planned features:
|
|||||||
- [Message Journal Documentation](MESSAGE_JOURNAL.md)
|
- [Message Journal Documentation](MESSAGE_JOURNAL.md)
|
||||||
- Configuration examples: `hbd/config_thresholds_example.yaml`
|
- Configuration examples: `hbd/config_thresholds_example.yaml`
|
||||||
- Test suite: `test_threshold.py`
|
- Test suite: `test_threshold.py`
|
||||||
|
|
||||||
|
## Multi-Threshold Configuration
|
||||||
|
|
||||||
|
**New in version 2.0**: Support for multiple named threshold configurations with per-host mapping.
|
||||||
|
|
||||||
|
### Overview
|
||||||
|
|
||||||
|
The multi-threshold feature allows you to:
|
||||||
|
- Define multiple sets of threshold configurations
|
||||||
|
- Map different hosts to different threshold sets
|
||||||
|
- Use different sensitivity levels for different environments
|
||||||
|
- Maintain a default configuration for unmapped hosts
|
||||||
|
|
||||||
|
### Configuration Structure
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# Optional: Set the default configuration name (defaults to "default")
|
||||||
|
default_threshold_config: "default"
|
||||||
|
|
||||||
|
# Define multiple named threshold configurations
|
||||||
|
threshold_configs:
|
||||||
|
# Configuration name 1
|
||||||
|
default:
|
||||||
|
thresholds:
|
||||||
|
# Standard threshold definitions
|
||||||
|
cpu_monitor:
|
||||||
|
cpu_percent:
|
||||||
|
warning: 80.0
|
||||||
|
critical: 90.0
|
||||||
|
|
||||||
|
# Configuration name 2
|
||||||
|
high_sensitivity:
|
||||||
|
thresholds:
|
||||||
|
cpu_monitor:
|
||||||
|
cpu_percent:
|
||||||
|
warning: 60.0
|
||||||
|
critical: 75.0
|
||||||
|
|
||||||
|
# Configuration name 3
|
||||||
|
low_sensitivity:
|
||||||
|
thresholds:
|
||||||
|
cpu_monitor:
|
||||||
|
cpu_percent:
|
||||||
|
warning: 90.0
|
||||||
|
critical: 95.0
|
||||||
|
|
||||||
|
# Map specific hosts to specific configurations
|
||||||
|
host_threshold_mapping:
|
||||||
|
prod-web-01: high_sensitivity
|
||||||
|
prod-web-02: high_sensitivity
|
||||||
|
dev-server-01: low_sensitivity
|
||||||
|
# Unmapped hosts use default_threshold_config
|
||||||
|
```
|
||||||
|
|
||||||
|
### Use Cases
|
||||||
|
|
||||||
|
#### 1. Environment-Based Thresholds
|
||||||
|
|
||||||
|
Different thresholds for production vs. development:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
threshold_configs:
|
||||||
|
production:
|
||||||
|
thresholds:
|
||||||
|
cpu_monitor:
|
||||||
|
cpu_percent:
|
||||||
|
warning: 70.0 # Alert earlier in production
|
||||||
|
critical: 85.0
|
||||||
|
|
||||||
|
development:
|
||||||
|
thresholds:
|
||||||
|
cpu_monitor:
|
||||||
|
cpu_percent:
|
||||||
|
warning: 90.0 # More relaxed for dev
|
||||||
|
critical: 98.0
|
||||||
|
|
||||||
|
host_threshold_mapping:
|
||||||
|
prod-web-01: production
|
||||||
|
prod-web-02: production
|
||||||
|
dev-web-01: development
|
||||||
|
dev-web-02: development
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 2. Server Role-Based Thresholds
|
||||||
|
|
||||||
|
Different thresholds based on server function:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
threshold_configs:
|
||||||
|
webserver:
|
||||||
|
thresholds:
|
||||||
|
cpu_monitor:
|
||||||
|
cpu_percent:
|
||||||
|
warning: 80.0
|
||||||
|
critical: 90.0
|
||||||
|
|
||||||
|
database:
|
||||||
|
thresholds:
|
||||||
|
cpu_monitor:
|
||||||
|
cpu_percent:
|
||||||
|
warning: 70.0
|
||||||
|
critical: 85.0
|
||||||
|
memory_monitor:
|
||||||
|
percent:
|
||||||
|
warning: 90.0 # Databases can use high memory
|
||||||
|
critical: 97.0
|
||||||
|
disk_monitor:
|
||||||
|
partitions:
|
||||||
|
/var/lib/mysql:
|
||||||
|
percent:
|
||||||
|
warning: 75.0
|
||||||
|
critical: 85.0
|
||||||
|
|
||||||
|
cache:
|
||||||
|
thresholds:
|
||||||
|
memory_monitor:
|
||||||
|
percent:
|
||||||
|
warning: 95.0 # Redis/Memcached can use very high memory
|
||||||
|
critical: 99.0
|
||||||
|
|
||||||
|
host_threshold_mapping:
|
||||||
|
web-01: webserver
|
||||||
|
web-02: webserver
|
||||||
|
db-01: database
|
||||||
|
db-02: database
|
||||||
|
redis-01: cache
|
||||||
|
memcached-01: cache
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 3. Sensitivity Levels
|
||||||
|
|
||||||
|
Different sensitivity for critical vs. non-critical systems:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
threshold_configs:
|
||||||
|
critical:
|
||||||
|
thresholds:
|
||||||
|
disk_monitor:
|
||||||
|
partitions:
|
||||||
|
/:
|
||||||
|
percent:
|
||||||
|
warning: 70.0 # Very sensitive
|
||||||
|
critical: 80.0
|
||||||
|
hysteresis: 0.15
|
||||||
|
|
||||||
|
standard:
|
||||||
|
thresholds:
|
||||||
|
disk_monitor:
|
||||||
|
partitions:
|
||||||
|
/:
|
||||||
|
percent:
|
||||||
|
warning: 85.0
|
||||||
|
critical: 95.0
|
||||||
|
hysteresis: 0.1
|
||||||
|
|
||||||
|
relaxed:
|
||||||
|
thresholds:
|
||||||
|
disk_monitor:
|
||||||
|
partitions:
|
||||||
|
/:
|
||||||
|
percent:
|
||||||
|
warning: 90.0
|
||||||
|
critical: 98.0
|
||||||
|
hysteresis: 0.05
|
||||||
|
|
||||||
|
host_threshold_mapping:
|
||||||
|
payment-gateway: critical
|
||||||
|
auth-server: critical
|
||||||
|
web-01: standard
|
||||||
|
web-02: standard
|
||||||
|
test-server: relaxed
|
||||||
|
```
|
||||||
|
|
||||||
|
### Backward Compatibility
|
||||||
|
|
||||||
|
The legacy single threshold configuration is fully supported:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# Old format - still works
|
||||||
|
thresholds:
|
||||||
|
cpu_monitor:
|
||||||
|
cpu_percent:
|
||||||
|
warning: 80.0
|
||||||
|
critical: 90.0
|
||||||
|
```
|
||||||
|
|
||||||
|
This is equivalent to:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# New format
|
||||||
|
threshold_configs:
|
||||||
|
default:
|
||||||
|
thresholds:
|
||||||
|
cpu_monitor:
|
||||||
|
cpu_percent:
|
||||||
|
warning: 80.0
|
||||||
|
critical: 90.0
|
||||||
|
```
|
||||||
|
|
||||||
|
### Configuration Priority
|
||||||
|
|
||||||
|
1. **Host-specific mapping**: If the host is listed in `host_threshold_mapping`, use the config it is mapped to
|
||||||
|
2. **Default config**: Use `default_threshold_config`
|
||||||
|
3. **First alphabetically**: If the default config is not defined, use the first config name in alphabetical order
|
||||||
|
4. **Legacy fallback**: If `threshold_configs` is not present, use the top-level `thresholds` section for all hosts
|
||||||
|
|
||||||
|
### Example: Complete Multi-Threshold Setup
|
||||||
|
|
||||||
|
See `hbd/config_multi_threshold_example.yaml` for a complete example with:
|
||||||
|
- 4 named configurations (default, high_sensitivity, low_sensitivity, database)
|
||||||
|
- Host-to-config mappings for production, development, and test systems
|
||||||
|
- Specialized database server thresholds
|
||||||
|
- Custom display messages with plugin data
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,202 @@
|
|||||||
|
# ==============================================================================
|
||||||
|
# Heartbeat Daemon Multi-Threshold Configuration Example
|
||||||
|
# ==============================================================================
|
||||||
|
# This file demonstrates the new multi-threshold configuration feature that allows
|
||||||
|
# different threshold settings for different hosts/clients.
|
||||||
|
#
|
||||||
|
# Features:
|
||||||
|
# - Define multiple named threshold configurations
|
||||||
|
# - Map specific hosts to specific threshold configurations
|
||||||
|
# - Set a default configuration for unmapped hosts
|
||||||
|
# - Backward compatible with single threshold configuration
|
||||||
|
# ==============================================================================
|
||||||
|
|
||||||
|
# Global threshold settings
|
||||||
|
threshold_renotify_interval: 3600 # Re-notify every hour for ongoing alerts (seconds)
|
||||||
|
|
||||||
|
# Optional: Set default threshold config (defaults to "default" if not specified)
|
||||||
|
default_threshold_config: "default"
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
# Multiple Named Threshold Configurations
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
# Define multiple threshold configurations with different sensitivity levels
|
||||||
|
threshold_configs:
|
||||||
|
|
||||||
|
# Default configuration - moderate thresholds for most servers
|
||||||
|
default:
|
||||||
|
thresholds:
|
||||||
|
cpu_monitor:
|
||||||
|
cpu_percent:
|
||||||
|
warning: 80.0
|
||||||
|
critical: 90.0
|
||||||
|
operator: ">"
|
||||||
|
load_1min:
|
||||||
|
warning: 4.0
|
||||||
|
critical: 8.0
|
||||||
|
operator: ">"
|
||||||
|
|
||||||
|
memory_monitor:
|
||||||
|
percent:
|
||||||
|
warning: 85.0
|
||||||
|
critical: 95.0
|
||||||
|
operator: ">"
|
||||||
|
|
||||||
|
disk_monitor:
|
||||||
|
partitions:
|
||||||
|
/:
|
||||||
|
percent:
|
||||||
|
warning: 85.0
|
||||||
|
critical: 95.0
|
||||||
|
operator: ">"
|
||||||
|
|
||||||
|
rtt:
|
||||||
|
# RTT thresholds per remote host
|
||||||
|
router:
|
||||||
|
warning: 50.0 # ms
|
||||||
|
critical: 200.0
|
||||||
|
server1:
|
||||||
|
warning: 100.0
|
||||||
|
critical: 500.0
|
||||||
|
|
||||||
|
# High sensitivity configuration - lower thresholds for critical systems
|
||||||
|
high_sensitivity:
|
||||||
|
thresholds:
|
||||||
|
cpu_monitor:
|
||||||
|
cpu_percent:
|
||||||
|
warning: 60.0 # Alert earlier
|
||||||
|
critical: 75.0
|
||||||
|
operator: ">"
|
||||||
|
hysteresis: 0.15 # More hysteresis to reduce flapping
|
||||||
|
load_1min:
|
||||||
|
warning: 2.0
|
||||||
|
critical: 4.0
|
||||||
|
operator: ">"
|
||||||
|
|
||||||
|
memory_monitor:
|
||||||
|
percent:
|
||||||
|
warning: 75.0 # Alert at lower memory usage
|
||||||
|
critical: 85.0
|
||||||
|
operator: ">"
|
||||||
|
display: "(threshold: {op_symbol} {threshold_value}%, total: {total_gb} GB)"
|
||||||
|
|
||||||
|
disk_monitor:
|
||||||
|
partitions:
|
||||||
|
/:
|
||||||
|
percent:
|
||||||
|
warning: 75.0
|
||||||
|
critical: 85.0
|
||||||
|
operator: ">"
|
||||||
|
/var:
|
||||||
|
percent:
|
||||||
|
warning: 80.0
|
||||||
|
critical: 90.0
|
||||||
|
operator: ">"
|
||||||
|
|
||||||
|
rtt:
|
||||||
|
router:
|
||||||
|
warning: 30.0
|
||||||
|
critical: 100.0
|
||||||
|
server1:
|
||||||
|
warning: 50.0
|
||||||
|
critical: 200.0
|
||||||
|
|
||||||
|
# Low sensitivity configuration - higher thresholds for development/test systems
|
||||||
|
low_sensitivity:
|
||||||
|
thresholds:
|
||||||
|
cpu_monitor:
|
||||||
|
cpu_percent:
|
||||||
|
warning: 90.0 # Only alert at very high usage
|
||||||
|
critical: 95.0
|
||||||
|
operator: ">"
|
||||||
|
|
||||||
|
memory_monitor:
|
||||||
|
percent:
|
||||||
|
warning: 90.0
|
||||||
|
critical: 98.0
|
||||||
|
operator: ">"
|
||||||
|
|
||||||
|
disk_monitor:
|
||||||
|
partitions:
|
||||||
|
/:
|
||||||
|
percent:
|
||||||
|
warning: 90.0
|
||||||
|
critical: 95.0
|
||||||
|
operator: ">"
|
||||||
|
|
||||||
|
rtt:
|
||||||
|
router:
|
||||||
|
warning: 100.0
|
||||||
|
critical: 500.0
|
||||||
|
|
||||||
|
# Production database servers - specialized thresholds
|
||||||
|
database:
|
||||||
|
thresholds:
|
||||||
|
cpu_monitor:
|
||||||
|
cpu_percent:
|
||||||
|
warning: 70.0
|
||||||
|
critical: 85.0
|
||||||
|
operator: ">"
|
||||||
|
|
||||||
|
memory_monitor:
|
||||||
|
percent:
|
||||||
|
warning: 90.0 # Databases can use high memory
|
||||||
|
critical: 97.0
|
||||||
|
operator: ">"
|
||||||
|
display: "(threshold: {op_symbol} {threshold_value}%, total: {total_gb} GB, available: {available_gb} GB)"
|
||||||
|
|
||||||
|
disk_monitor:
|
||||||
|
partitions:
|
||||||
|
/:
|
||||||
|
percent:
|
||||||
|
warning: 80.0
|
||||||
|
critical: 90.0
|
||||||
|
operator: ">"
|
||||||
|
/var/lib/mysql: # Database data partition
|
||||||
|
percent:
|
||||||
|
warning: 75.0 # Alert earlier for DB partition
|
||||||
|
critical: 85.0
|
||||||
|
operator: ">"
|
||||||
|
|
||||||
|
rtt:
|
||||||
|
router:
|
||||||
|
warning: 20.0 # Stricter latency requirements
|
||||||
|
critical: 50.0
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
# Host to Threshold Configuration Mapping
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
# Map specific hosts to specific threshold configurations
|
||||||
|
# Hosts not listed here will use the default_threshold_config
|
||||||
|
host_threshold_mapping:
|
||||||
|
# Critical production servers
|
||||||
|
prod-web-01: high_sensitivity
|
||||||
|
prod-web-02: high_sensitivity
|
||||||
|
prod-api-01: high_sensitivity
|
||||||
|
|
||||||
|
# Database servers
|
||||||
|
prod-db-01: database
|
||||||
|
prod-db-02: database
|
||||||
|
prod-db-replica: database
|
||||||
|
|
||||||
|
# Development and test systems
|
||||||
|
dev-server-01: low_sensitivity
|
||||||
|
dev-server-02: low_sensitivity
|
||||||
|
test-server-01: low_sensitivity
|
||||||
|
test-server-02: low_sensitivity
|
||||||
|
|
||||||
|
# Everything else uses 'default' (no need to list explicitly)
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
# Backward Compatibility Example
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
# The old single threshold format is still supported:
|
||||||
|
# Just use 'thresholds:' directly without 'threshold_configs:'
|
||||||
|
#
|
||||||
|
# thresholds:
|
||||||
|
# cpu_monitor:
|
||||||
|
# cpu_percent:
|
||||||
|
# warning: 80.0
|
||||||
|
# critical: 90.0
|
||||||
|
#
|
||||||
|
# This will apply the same thresholds to all hosts.
|
||||||
@@ -397,9 +397,11 @@
|
|||||||
const level = alert.level.toLowerCase();
|
const level = alert.level.toLowerCase();
|
||||||
const duration = getDuration(alert.since);
|
const duration = getDuration(alert.since);
|
||||||
|
|
||||||
// Format value with threshold info if available
|
// Use formatted message if available, otherwise build from individual fields
|
||||||
let valueText = `Value: <span class="alert-value">${formatValue(alert.last_value)}</span>`;
|
let valueText = `Value: <span class="alert-value">${formatValue(alert.last_value)}</span>`;
|
||||||
if (alert.threshold_value !== undefined && alert.threshold_value !== null && alert.operator) {
|
if (alert.formatted_message) {
|
||||||
|
valueText += ` <span class="threshold-info">${alert.formatted_message}</span>`;
|
||||||
|
} else if (alert.threshold_value !== undefined && alert.threshold_value !== null && alert.operator) {
|
||||||
valueText += ` <span class="threshold-info">(threshold: ${alert.operator} ${formatValue(alert.threshold_value)})</span>`;
|
valueText += ` <span class="threshold-info">(threshold: ${alert.operator} ${formatValue(alert.threshold_value)})</span>`;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
+330
-35
@@ -55,6 +55,7 @@ class AlertState:
|
|||||||
self.last_notification = None
|
self.last_notification = None
|
||||||
self.threshold_value = None # The threshold value that triggered alert
|
self.threshold_value = None # The threshold value that triggered alert
|
||||||
self.operator = None # The comparison operator (>, <, >=, etc.)
|
self.operator = None # The comparison operator (>, <, >=, etc.)
|
||||||
|
self.formatted_message = None # Formatted display message for UI
|
||||||
|
|
||||||
def update(
|
def update(
|
||||||
self,
|
self,
|
||||||
@@ -120,6 +121,8 @@ class AlertState:
|
|||||||
result["threshold_value"] = self.threshold_value
|
result["threshold_value"] = self.threshold_value
|
||||||
if self.operator is not None:
|
if self.operator is not None:
|
||||||
result["operator"] = self.operator
|
result["operator"] = self.operator
|
||||||
|
if self.formatted_message is not None:
|
||||||
|
result["formatted_message"] = self.formatted_message
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
@@ -285,7 +288,18 @@ class ThresholdChecker:
|
|||||||
renotify_interval: Seconds between repeat notifications (default: 1 hour)
|
renotify_interval: Seconds between repeat notifications (default: 1 hour)
|
||||||
journal: Optional MessageJournal instance for logging threshold events
|
journal: Optional MessageJournal instance for logging threshold events
|
||||||
"""
|
"""
|
||||||
self.thresholds = {} # {metric_path: ThresholdConfig}
|
# Named threshold configurations: {config_name: {metric_path: ThresholdConfig}}
|
||||||
|
self.threshold_configs = {}
|
||||||
|
|
||||||
|
# Single threshold set for backward compatibility: {metric_path: ThresholdConfig}
|
||||||
|
self.thresholds = {}
|
||||||
|
|
||||||
|
# Host to config name mapping: {host_name: config_name}
|
||||||
|
self.host_config_mapping = {}
|
||||||
|
|
||||||
|
# Default config name to use when no mapping exists
|
||||||
|
self.default_config = "default"
|
||||||
|
|
||||||
self.notification_callback = notification_callback
|
self.notification_callback = notification_callback
|
||||||
self.renotify_interval = renotify_interval
|
self.renotify_interval = renotify_interval
|
||||||
self.journal = journal
|
self.journal = journal
|
||||||
@@ -293,10 +307,84 @@ class ThresholdChecker:
|
|||||||
# Parse configuration
|
# Parse configuration
|
||||||
self._parse_config(config)
|
self._parse_config(config)
|
||||||
|
|
||||||
logger.info("ThresholdChecker initialized with %d thresholds", len(self.thresholds))
|
total_thresholds = sum(len(cfg) for cfg in self.threshold_configs.values())
|
||||||
|
if total_thresholds == 0 and len(self.thresholds) > 0:
|
||||||
|
# Backward compatibility: using single threshold set
|
||||||
|
total_thresholds = len(self.thresholds)
|
||||||
|
logger.info("ThresholdChecker initialized with %d thresholds (legacy format)", total_thresholds)
|
||||||
|
else:
|
||||||
|
logger.info(
|
||||||
|
"ThresholdChecker initialized with %d named configurations (%d total thresholds)",
|
||||||
|
len(self.threshold_configs),
|
||||||
|
total_thresholds
|
||||||
|
)
|
||||||
|
|
||||||
def _parse_config(self, config: Dict[str, Any]):
    """Parse threshold configuration from YAML structure.

    Supports two formats:
    1. Legacy format with direct 'thresholds' section
    2. New format with 'threshold_configs' and 'host_threshold_mapping'

    When both sections are present, 'threshold_configs' wins and the
    legacy 'thresholds' section is ignored.

    Args:
        config: Parsed configuration dictionary. ``None`` or an empty
            dictionary is treated as "no thresholds configured".
    """
    # Guard first: the membership tests below raise TypeError on None,
    # which the pre-multi-config parser tolerated.
    if not config:
        logger.info("No thresholds configured")
        return

    if "threshold_configs" in config:
        # New multi-config format
        self._parse_multi_config(config)
    elif "thresholds" in config:
        # Legacy single threshold configuration
        self._parse_legacy_config(config)
    else:
        logger.info("No thresholds configured")
|
||||||
|
|
||||||
|
def _parse_multi_config(self, config: Dict[str, Any]):
    """Parse multiple named threshold configurations.

    Reads three top-level keys from ``config``:

    * ``threshold_configs``: ``{config_name: {"thresholds": {...}}}``.
      Each valid entry is parsed into ``self.threshold_configs[config_name]``.
    * ``host_threshold_mapping``: ``{host_name: config_name}``, stored in
      ``self.host_config_mapping``.
    * ``default_threshold_config``: name of the config used for unmapped
      hosts. Falls back to ``"default"``, and then to the alphabetically
      first parsed config if that name was not defined.

    A key that is present but has no value (``None``) is treated the same
    as a missing key, so a section that contains only comments does not
    crash the parser.

    Args:
        config: Parsed configuration dictionary.
    """
    # `or {}` rather than a .get() default: the default is only used when
    # the key is absent, not when its value is None.
    named_configs = config.get("threshold_configs") or {}

    if not isinstance(named_configs, dict) or not named_configs:
        logger.info("No threshold configurations defined")
        return

    # Parse each named configuration
    for config_name, config_data in named_configs.items():
        if not isinstance(config_data, dict):
            logger.warning("Invalid threshold config '%s', skipping", config_name)
            continue

        if "thresholds" not in config_data:
            logger.warning("No thresholds in config '%s', skipping", config_name)
            continue

        logger.info("Parsing threshold configuration: %s", config_name)
        parsed = {}
        self.threshold_configs[config_name] = parsed

        plugin_sections = config_data["thresholds"]
        if not isinstance(plugin_sections, dict):
            # Empty or malformed `thresholds:` section: keep the name
            # registered (with no thresholds) so host mappings that point
            # at it still resolve.
            logger.warning(
                "Threshold config '%s' has an empty or invalid 'thresholds' section",
                config_name,
            )
            continue

        for plugin_name, plugin_thresholds in plugin_sections.items():
            if not isinstance(plugin_thresholds, dict):
                continue

            self._parse_plugin_thresholds(
                plugin_name,
                plugin_thresholds,
                target_dict=parsed,
            )

    # Parse host to config mapping
    host_mapping = config.get("host_threshold_mapping") or {}
    if not isinstance(host_mapping, dict):
        logger.warning("Ignoring invalid host_threshold_mapping (expected a mapping)")
        host_mapping = {}
    self.host_config_mapping = host_mapping

    # Set default config (explicitly set, else "default", else first alphabetically)
    self.default_config = config.get("default_threshold_config") or "default"
    if self.default_config not in self.threshold_configs and self.threshold_configs:
        # Use first available config as default
        self.default_config = sorted(self.threshold_configs.keys())[0]
        logger.info("Using '%s' as default threshold config", self.default_config)

    logger.info(
        "Loaded %d threshold configurations with %d host mappings",
        len(self.threshold_configs),
        len(self.host_config_mapping),
    )
|
||||||
|
|
||||||
|
def _parse_legacy_config(self, config: Dict[str, Any]):
|
||||||
|
"""Parse legacy single threshold configuration for backward compatibility."""
|
||||||
if not config or "thresholds" not in config:
|
if not config or "thresholds" not in config:
|
||||||
logger.info("No thresholds configured")
|
logger.info("No thresholds configured")
|
||||||
return
|
return
|
||||||
@@ -307,13 +395,27 @@ class ThresholdChecker:
|
|||||||
if not isinstance(plugin_thresholds, dict):
|
if not isinstance(plugin_thresholds, dict):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
self._parse_plugin_thresholds(plugin_name, plugin_thresholds)
|
self._parse_plugin_thresholds(plugin_name, plugin_thresholds, target_dict=self.thresholds)
|
||||||
|
|
||||||
|
def _parse_plugin_thresholds(
|
||||||
|
self,
|
||||||
|
plugin_name: str,
|
||||||
|
thresholds: Dict[str, Any],
|
||||||
|
target_dict: Optional[Dict[str, ThresholdConfig]] = None
|
||||||
|
):
|
||||||
|
"""Parse thresholds for a specific plugin.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
plugin_name: Name of the plugin
|
||||||
|
thresholds: Threshold configuration dictionary
|
||||||
|
target_dict: Dictionary to store parsed thresholds (defaults to self.thresholds)
|
||||||
|
"""
|
||||||
|
if target_dict is None:
|
||||||
|
target_dict = self.thresholds
|
||||||
|
|
||||||
def _parse_plugin_thresholds(self, plugin_name: str, thresholds: Dict[str, Any]):
|
|
||||||
"""Parse thresholds for a specific plugin."""
|
|
||||||
# Special handling for RTT thresholds (per-host)
|
# Special handling for RTT thresholds (per-host)
|
||||||
if plugin_name == "rtt":
|
if plugin_name == "rtt":
|
||||||
self._parse_rtt_thresholds(thresholds)
|
self._parse_rtt_thresholds(thresholds, target_dict)
|
||||||
return
|
return
|
||||||
|
|
||||||
for metric_name, threshold_config in thresholds.items():
|
for metric_name, threshold_config in thresholds.items():
|
||||||
@@ -322,7 +424,7 @@ class ThresholdChecker:
|
|||||||
|
|
||||||
# Handle nested metrics (e.g., partitions./.percent)
|
# Handle nested metrics (e.g., partitions./.percent)
|
||||||
if metric_name == "partitions":
|
if metric_name == "partitions":
|
||||||
self._parse_partition_thresholds(plugin_name, threshold_config)
|
self._parse_partition_thresholds(plugin_name, threshold_config, target_dict)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
metric_path = f"{plugin_name}.{metric_name}"
|
metric_path = f"{plugin_name}.{metric_name}"
|
||||||
@@ -331,7 +433,7 @@ class ThresholdChecker:
|
|||||||
warning = threshold_config.get("warning")
|
warning = threshold_config.get("warning")
|
||||||
critical = threshold_config.get("critical")
|
critical = threshold_config.get("critical")
|
||||||
operator = threshold_config.get("operator", ">")
|
operator = threshold_config.get("operator", ">")
|
||||||
display = threshold_config.get("display")
|
display = threshold_config.get("display", "(threshold: {op_symbol} {threshold_value})")
|
||||||
hysteresis = threshold_config.get("hysteresis", 0.1) # 10% default
|
hysteresis = threshold_config.get("hysteresis", 0.1) # 10% default
|
||||||
enabled = threshold_config.get("enabled", True)
|
enabled = threshold_config.get("enabled", True)
|
||||||
|
|
||||||
@@ -349,7 +451,7 @@ class ThresholdChecker:
|
|||||||
display=display
|
display=display
|
||||||
)
|
)
|
||||||
|
|
||||||
self.thresholds[metric_path] = threshold
|
target_dict[metric_path] = threshold
|
||||||
logger.debug(
|
logger.debug(
|
||||||
"Registered threshold for %s: warn=%s, crit=%s, op=%s",
|
"Registered threshold for %s: warn=%s, crit=%s, op=%s",
|
||||||
metric_path,
|
metric_path,
|
||||||
@@ -358,8 +460,22 @@ class ThresholdChecker:
|
|||||||
operator
|
operator
|
||||||
)
|
)
|
||||||
|
|
||||||
def _parse_partition_thresholds(self, plugin_name: str, partitions: Dict[str, Any]):
|
def _parse_partition_thresholds(
|
||||||
"""Parse partition-specific thresholds for disk monitoring."""
|
self,
|
||||||
|
plugin_name: str,
|
||||||
|
partitions: Dict[str, Any],
|
||||||
|
target_dict: Optional[Dict[str, ThresholdConfig]] = None
|
||||||
|
):
|
||||||
|
"""Parse partition-specific thresholds for disk monitoring.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
plugin_name: Name of the plugin
|
||||||
|
partitions: Partition threshold configuration
|
||||||
|
target_dict: Dictionary to store parsed thresholds
|
||||||
|
"""
|
||||||
|
if target_dict is None:
|
||||||
|
target_dict = self.thresholds
|
||||||
|
|
||||||
for partition, metrics in partitions.items():
|
for partition, metrics in partitions.items():
|
||||||
if not isinstance(metrics, dict):
|
if not isinstance(metrics, dict):
|
||||||
continue
|
continue
|
||||||
@@ -390,9 +506,13 @@ class ThresholdChecker:
|
|||||||
display=display
|
display=display
|
||||||
)
|
)
|
||||||
|
|
||||||
self.thresholds[metric_path] = threshold
|
target_dict[metric_path] = threshold
|
||||||
|
|
||||||
def _parse_rtt_thresholds(self, rtt_thresholds: Dict[str, Any]):
|
def _parse_rtt_thresholds(
|
||||||
|
self,
|
||||||
|
rtt_thresholds: Dict[str, Any],
|
||||||
|
target_dict: Optional[Dict[str, ThresholdConfig]] = None
|
||||||
|
):
|
||||||
"""Parse RTT thresholds (per-host network latency thresholds).
|
"""Parse RTT thresholds (per-host network latency thresholds).
|
||||||
|
|
||||||
RTT thresholds are configured as:
|
RTT thresholds are configured as:
|
||||||
@@ -401,7 +521,14 @@ class ThresholdChecker:
|
|||||||
hostname1:
|
hostname1:
|
||||||
warning: 100.0 # ms
|
warning: 100.0 # ms
|
||||||
critical: 500.0 # ms
|
critical: 500.0 # ms
|
||||||
|
|
||||||
|
Args:
|
||||||
|
rtt_thresholds: RTT threshold configuration
|
||||||
|
target_dict: Dictionary to store parsed thresholds
|
||||||
"""
|
"""
|
||||||
|
if target_dict is None:
|
||||||
|
target_dict = self.thresholds
|
||||||
|
|
||||||
for hostname, threshold_config in rtt_thresholds.items():
|
for hostname, threshold_config in rtt_thresholds.items():
|
||||||
if not isinstance(threshold_config, dict):
|
if not isinstance(threshold_config, dict):
|
||||||
continue
|
continue
|
||||||
@@ -430,7 +557,7 @@ class ThresholdChecker:
|
|||||||
display=display
|
display=display
|
||||||
)
|
)
|
||||||
|
|
||||||
self.thresholds[metric_path] = threshold
|
target_dict[metric_path] = threshold
|
||||||
logger.debug(
|
logger.debug(
|
||||||
"Registered RTT threshold for %s: warn=%s ms, crit=%s ms",
|
"Registered RTT threshold for %s: warn=%s ms, crit=%s ms",
|
||||||
hostname,
|
hostname,
|
||||||
@@ -438,6 +565,37 @@ class ThresholdChecker:
|
|||||||
critical
|
critical
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def get_thresholds_for_host(self, host_name: str) -> Dict[str, ThresholdConfig]:
|
||||||
|
"""Get the appropriate threshold configuration for a host.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
host_name: Name of the host
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dictionary of thresholds for this host
|
||||||
|
"""
|
||||||
|
# Legacy mode: single threshold set for all hosts
|
||||||
|
if self.thresholds and not self.threshold_configs:
|
||||||
|
return self.thresholds
|
||||||
|
|
||||||
|
# Multi-config mode: look up host-specific configuration
|
||||||
|
if self.threshold_configs:
|
||||||
|
config_name = self.host_config_mapping.get(host_name, self.default_config)
|
||||||
|
|
||||||
|
if config_name in self.threshold_configs:
|
||||||
|
return self.threshold_configs[config_name]
|
||||||
|
else:
|
||||||
|
logger.warning(
|
||||||
|
"Threshold config '%s' not found for host '%s', using default '%s'",
|
||||||
|
config_name,
|
||||||
|
host_name,
|
||||||
|
self.default_config
|
||||||
|
)
|
||||||
|
return self.threshold_configs.get(self.default_config, {})
|
||||||
|
|
||||||
|
# No thresholds configured
|
||||||
|
return {}
|
||||||
|
|
||||||
def check_value(
|
def check_value(
|
||||||
self,
|
self,
|
||||||
host_name: str,
|
host_name: str,
|
||||||
@@ -457,10 +615,13 @@ class ThresholdChecker:
|
|||||||
Returns:
|
Returns:
|
||||||
Tuple of (old_level, new_level) if state changed, None otherwise
|
Tuple of (old_level, new_level) if state changed, None otherwise
|
||||||
"""
|
"""
|
||||||
if metric_path not in self.thresholds:
|
# Get host-specific thresholds
|
||||||
|
thresholds = self.get_thresholds_for_host(host_name)
|
||||||
|
|
||||||
|
if metric_path not in thresholds:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
threshold = self.thresholds[metric_path]
|
threshold = thresholds[metric_path]
|
||||||
|
|
||||||
# Get or create alert state
|
# Get or create alert state
|
||||||
if metric_path not in alert_states:
|
if metric_path not in alert_states:
|
||||||
@@ -484,14 +645,17 @@ class ThresholdChecker:
|
|||||||
# Update state and check for changes
|
# Update state and check for changes
|
||||||
old_level = alert_state.level
|
old_level = alert_state.level
|
||||||
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
||||||
self._trigger_notification(host_name, metric_path, old_level, new_level, value, threshold)
|
# For check_value, we don't have full plugin data, pass None
|
||||||
|
lvl, message, formatted_msg = self._trigger_notification(host_name, metric_path, old_level, new_level, value, threshold, None)
|
||||||
|
# Update alert state with formatted message
|
||||||
|
alert_state.formatted_message = formatted_msg
|
||||||
|
self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value)
|
||||||
return (old_level, new_level)
|
return (old_level, new_level)
|
||||||
elif new_level != AlertLevel.OK:
|
elif new_level != AlertLevel.OK:
|
||||||
# Check if we should re-notify
|
# Check if we should re-notify
|
||||||
self._check_renotify(host_name, alert_state, metric_path, value, threshold)
|
self._check_renotify(host_name, alert_state, metric_path, value, threshold, None)
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def check_plugin_data(
|
def check_plugin_data(
|
||||||
self,
|
self,
|
||||||
host_name: str,
|
host_name: str,
|
||||||
@@ -513,14 +677,17 @@ class ThresholdChecker:
|
|||||||
"""
|
"""
|
||||||
state_changes = []
|
state_changes = []
|
||||||
|
|
||||||
|
# Get host-specific thresholds
|
||||||
|
thresholds = self.get_thresholds_for_host(host_name)
|
||||||
|
|
||||||
# Check flat metrics
|
# Check flat metrics
|
||||||
for metric_name, value in data.items():
|
for metric_name, value in data.items():
|
||||||
metric_path = f"{plugin_name}.{metric_name}"
|
metric_path = f"{plugin_name}.{metric_name}"
|
||||||
|
|
||||||
if metric_path not in self.thresholds:
|
if metric_path not in thresholds:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
threshold = self.thresholds[metric_path]
|
threshold = thresholds[metric_path]
|
||||||
|
|
||||||
# Get or create alert state
|
# Get or create alert state
|
||||||
if metric_path not in alert_states:
|
if metric_path not in alert_states:
|
||||||
@@ -545,10 +712,13 @@ class ThresholdChecker:
|
|||||||
old_level = alert_state.level
|
old_level = alert_state.level
|
||||||
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
||||||
state_changes.append((metric_path, old_level, new_level, value))
|
state_changes.append((metric_path, old_level, new_level, value))
|
||||||
self._trigger_notification(host_name, metric_path, old_level, new_level, value, threshold)
|
lvl, message, formatted_msg = self._trigger_notification(host_name, metric_path, old_level, new_level, value, threshold, data)
|
||||||
|
# Update alert state with formatted message
|
||||||
|
alert_state.formatted_message = formatted_msg
|
||||||
|
self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value)
|
||||||
elif new_level != AlertLevel.OK:
|
elif new_level != AlertLevel.OK:
|
||||||
# Check if we should re-notify
|
# Check if we should re-notify
|
||||||
self._check_renotify(host_name, alert_state, metric_path, value, threshold)
|
self._check_renotify(host_name, alert_state, metric_path, value, threshold, data)
|
||||||
|
|
||||||
# Check nested metrics (e.g., partition data in disk_monitor)
|
# Check nested metrics (e.g., partition data in disk_monitor)
|
||||||
self._check_nested_metrics(
|
self._check_nested_metrics(
|
||||||
@@ -570,6 +740,9 @@ class ThresholdChecker:
|
|||||||
state_changes: list,
|
state_changes: list,
|
||||||
):
|
):
|
||||||
"""Check nested metrics like partition-specific thresholds."""
|
"""Check nested metrics like partition-specific thresholds."""
|
||||||
|
# Get host-specific thresholds
|
||||||
|
thresholds = self.get_thresholds_for_host(host_name)
|
||||||
|
|
||||||
# Look for partition data in disk_monitor
|
# Look for partition data in disk_monitor
|
||||||
if plugin_name == "disk_monitor" and "partitions" in data:
|
if plugin_name == "disk_monitor" and "partitions" in data:
|
||||||
partitions = data["partitions"]
|
partitions = data["partitions"]
|
||||||
@@ -583,10 +756,10 @@ class ThresholdChecker:
|
|||||||
for metric_name, value in metrics.items():
|
for metric_name, value in metrics.items():
|
||||||
metric_path = f"{plugin_name}.{partition}.{metric_name}"
|
metric_path = f"{plugin_name}.{partition}.{metric_name}"
|
||||||
|
|
||||||
if metric_path not in self.thresholds:
|
if metric_path not in thresholds:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
threshold = self.thresholds[metric_path]
|
threshold = thresholds[metric_path]
|
||||||
|
|
||||||
if metric_path not in alert_states:
|
if metric_path not in alert_states:
|
||||||
alert_states[metric_path] = AlertState(metric_path)
|
alert_states[metric_path] = AlertState(metric_path)
|
||||||
@@ -608,16 +781,20 @@ class ThresholdChecker:
|
|||||||
old_level = alert_state.level
|
old_level = alert_state.level
|
||||||
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
||||||
state_changes.append((metric_path, old_level, new_level, value))
|
state_changes.append((metric_path, old_level, new_level, value))
|
||||||
self._trigger_notification(
|
lvl, message, formatted_msg = self._trigger_notification(
|
||||||
host_name,
|
host_name,
|
||||||
metric_path,
|
metric_path,
|
||||||
old_level,
|
old_level,
|
||||||
new_level,
|
new_level,
|
||||||
value,
|
value,
|
||||||
threshold
|
threshold,
|
||||||
|
data # Pass full plugin data for format string
|
||||||
)
|
)
|
||||||
|
# Update alert state with formatted message
|
||||||
|
alert_state.formatted_message = formatted_msg
|
||||||
|
self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value)
|
||||||
elif new_level != AlertLevel.OK:
|
elif new_level != AlertLevel.OK:
|
||||||
self._check_renotify(host_name, alert_state, metric_path, value, threshold)
|
self._check_renotify(host_name, alert_state, metric_path, value, threshold, data)
|
||||||
|
|
||||||
def _trigger_notification(
|
def _trigger_notification(
|
||||||
self,
|
self,
|
||||||
@@ -627,8 +804,19 @@ class ThresholdChecker:
|
|||||||
new_level: AlertLevel,
|
new_level: AlertLevel,
|
||||||
value: Any,
|
value: Any,
|
||||||
threshold: ThresholdConfig,
|
threshold: ThresholdConfig,
|
||||||
|
plugin_data: Optional[Dict[str, Any]] = None,
|
||||||
):
|
):
|
||||||
"""Trigger a notification for an alert state change."""
|
"""Trigger a notification for an alert state change.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
host_name: Name of the host
|
||||||
|
metric_path: Full metric path
|
||||||
|
old_level: Previous alert level
|
||||||
|
new_level: New alert level
|
||||||
|
value: Current metric value
|
||||||
|
threshold: Threshold configuration
|
||||||
|
plugin_data: Optional dictionary of all plugin data fields for format string
|
||||||
|
"""
|
||||||
# Determine which threshold was exceeded
|
# Determine which threshold was exceeded
|
||||||
threshold_value = None
|
threshold_value = None
|
||||||
if new_level == AlertLevel.CRITICAL and threshold.critical is not None:
|
if new_level == AlertLevel.CRITICAL and threshold.critical is not None:
|
||||||
@@ -646,20 +834,59 @@ class ThresholdChecker:
|
|||||||
elif new_level == AlertLevel.WARNING:
|
elif new_level == AlertLevel.WARNING:
|
||||||
lvl = "WARNING"
|
lvl = "WARNING"
|
||||||
if threshold_value is not None:
|
if threshold_value is not None:
|
||||||
message = f"{metric_path} = {value} (threshold: {op_symbol} {threshold_value})"
|
# Use display format string
|
||||||
|
threshold_info = self._format_display(
|
||||||
|
threshold.display,
|
||||||
|
value=value,
|
||||||
|
threshold_value=threshold_value,
|
||||||
|
op_symbol=op_symbol,
|
||||||
|
plugin_data=plugin_data
|
||||||
|
)
|
||||||
|
message = f"{metric_path} = {value} {threshold_info}"
|
||||||
else:
|
else:
|
||||||
message = f"{metric_path} = {value}"
|
message = f"{metric_path} = {value}"
|
||||||
elif new_level == AlertLevel.CRITICAL:
|
elif new_level == AlertLevel.CRITICAL:
|
||||||
lvl = "CRITICAL"
|
lvl = "CRITICAL"
|
||||||
if threshold_value is not None:
|
if threshold_value is not None:
|
||||||
message = f"{metric_path} = {value} (threshold: {op_symbol} {threshold_value})"
|
# Use display format string
|
||||||
|
threshold_info = self._format_display(
|
||||||
|
threshold.display,
|
||||||
|
value=value,
|
||||||
|
threshold_value=threshold_value,
|
||||||
|
op_symbol=op_symbol,
|
||||||
|
plugin_data=plugin_data
|
||||||
|
)
|
||||||
|
message = f"{metric_path} = {value} {threshold_info}"
|
||||||
else:
|
else:
|
||||||
message = f"{metric_path} = {value}"
|
message = f"{metric_path} = {value}"
|
||||||
else:
|
else:
|
||||||
lvl = "UNKNOWN"
|
lvl = "UNKNOWN"
|
||||||
message = f"{metric_path} = {value}"
|
message = f"{metric_path} = {value}"
|
||||||
|
|
||||||
# Send notification
|
# Return the formatted threshold info for storing in AlertState
|
||||||
|
formatted_threshold_msg = None
|
||||||
|
if threshold_value is not None and new_level != AlertLevel.OK:
|
||||||
|
formatted_threshold_msg = self._format_display(
|
||||||
|
threshold.display,
|
||||||
|
value=value,
|
||||||
|
threshold_value=threshold_value,
|
||||||
|
op_symbol=op_symbol,
|
||||||
|
plugin_data=plugin_data
|
||||||
|
)
|
||||||
|
|
||||||
|
return lvl, message, formatted_threshold_msg
|
||||||
|
|
||||||
|
def _send_notification(
|
||||||
|
self,
|
||||||
|
host_name: str,
|
||||||
|
lvl: str,
|
||||||
|
message: str,
|
||||||
|
metric_path: str,
|
||||||
|
old_level: AlertLevel,
|
||||||
|
new_level: AlertLevel,
|
||||||
|
value: Any,
|
||||||
|
):
|
||||||
|
"""Send notification and log to journal/eventlog."""
|
||||||
if self.notification_callback is not None:
|
if self.notification_callback is not None:
|
||||||
try:
|
try:
|
||||||
self.notification_callback(f"{lvl}: {host_name} - {message}")
|
self.notification_callback(f"{lvl}: {host_name} - {message}")
|
||||||
@@ -684,6 +911,56 @@ class ThresholdChecker:
|
|||||||
# Log to eventlog as well
|
# Log to eventlog as well
|
||||||
eventlog(host_name, lvl, message, service="threshold")
|
eventlog(host_name, lvl, message, service="threshold")
|
||||||
|
|
||||||
|
def _format_display(
|
||||||
|
self,
|
||||||
|
display_format: str,
|
||||||
|
value: Any,
|
||||||
|
threshold_value: float,
|
||||||
|
op_symbol: str,
|
||||||
|
plugin_data: Optional[Dict[str, Any]] = None,
|
||||||
|
) -> str:
|
||||||
|
"""Format the display string using available data.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
display_format: Format string from threshold config
|
||||||
|
value: Current metric value
|
||||||
|
threshold_value: Threshold value that was exceeded
|
||||||
|
op_symbol: Comparison operator symbol
|
||||||
|
plugin_data: Optional dictionary of plugin data fields
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Formatted display string
|
||||||
|
"""
|
||||||
|
# Build format context with standard variables
|
||||||
|
format_context = {
|
||||||
|
'value': value,
|
||||||
|
'threshold_value': threshold_value,
|
||||||
|
'op_symbol': op_symbol,
|
||||||
|
}
|
||||||
|
|
||||||
|
# Add all plugin data fields if available
|
||||||
|
if plugin_data:
|
||||||
|
format_context.update(plugin_data)
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Format the display string
|
||||||
|
return display_format.format(**format_context)
|
||||||
|
except KeyError as e:
|
||||||
|
logger.warning(
|
||||||
|
"Missing format variable in display string '%s': %s",
|
||||||
|
display_format,
|
||||||
|
e
|
||||||
|
)
|
||||||
|
# Fallback to default format
|
||||||
|
return f"(threshold: {op_symbol} {threshold_value})"
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(
|
||||||
|
"Error formatting display string '%s': %s",
|
||||||
|
display_format,
|
||||||
|
e
|
||||||
|
)
|
||||||
|
return f"(threshold: {op_symbol} {threshold_value})"
|
||||||
|
|
||||||
def _check_renotify(
|
def _check_renotify(
|
||||||
self,
|
self,
|
||||||
host_name: str,
|
host_name: str,
|
||||||
@@ -691,8 +968,18 @@ class ThresholdChecker:
|
|||||||
metric_path: str,
|
metric_path: str,
|
||||||
value: Any,
|
value: Any,
|
||||||
threshold: ThresholdConfig,
|
threshold: ThresholdConfig,
|
||||||
|
plugin_data: Optional[Dict[str, Any]] = None,
|
||||||
):
|
):
|
||||||
"""Check if we should send a repeat notification."""
|
"""Check if we should send a repeat notification.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
host_name: Name of the host
|
||||||
|
alert_state: Current alert state
|
||||||
|
metric_path: Full metric path
|
||||||
|
value: Current metric value
|
||||||
|
threshold: Threshold configuration
|
||||||
|
plugin_data: Optional dictionary of all plugin data fields
|
||||||
|
"""
|
||||||
if alert_state.level == AlertLevel.OK:
|
if alert_state.level == AlertLevel.OK:
|
||||||
return
|
return
|
||||||
|
|
||||||
@@ -718,7 +1005,15 @@ class ThresholdChecker:
|
|||||||
|
|
||||||
# Time to re-notify
|
# Time to re-notify
|
||||||
if threshold_value is not None:
|
if threshold_value is not None:
|
||||||
message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (threshold: {op_symbol} {threshold_value}, ongoing for {int(now - alert_state.since)}s)"
|
# Use display format string
|
||||||
|
threshold_info = self._format_display(
|
||||||
|
threshold.display,
|
||||||
|
value=value,
|
||||||
|
threshold_value=threshold_value,
|
||||||
|
op_symbol=op_symbol,
|
||||||
|
plugin_data=plugin_data
|
||||||
|
)
|
||||||
|
message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} {threshold_info}, ongoing for {int(now - alert_state.since)}s"
|
||||||
else:
|
else:
|
||||||
message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (ongoing for {int(now - alert_state.since)}s)"
|
message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (ongoing for {int(now - alert_state.since)}s)"
|
||||||
|
|
||||||
|
|||||||
Executable
+4
@@ -0,0 +1,4 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
|
||||||
|
#echo "OK - all is well"
|
||||||
|
echo "WARNING - 12 apps require update: calendar->6.2.2 ,call_summary_bot->3.3.0 ,collectives->4.2.0 ,contacts->8.3.7 ,forum->0.36.0 ,mail->5.7.6 ,news->28.1.0 ,notes->4.13.1 ,notify_push->1.3.1 ,ownershiptransfer->1.4.0 ,richdocuments->9.0.5 ,spreed->22.0.10"
|
||||||
Reference in New Issue
Block a user