0543266c92
- Restructuring of the project directory into client and server components - Renaming of modules and classes to better reflect their purpose and functionality - Moving common utilities and configurations to a shared location - Updating import statements to reflect the new structure - Adding new documentation files for better clarity on various aspects of the project - Removing deprecated or unused code to streamline the codebase - Ensuring that all existing functionality is preserved and that the codebase remains functional after the refactoring.
321 lines
10 KiB
Python
321 lines
10 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Demonstration of the threshold alerting system.
|
|
|
|
This script shows how thresholds work by simulating plugin data
|
|
with values that cross various threshold boundaries.
|
|
"""
|
|
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
from hbd.threshold import ThresholdChecker, AlertLevel
|
|
|
|
|
|
def demo_basic_thresholds():
    """Run a single CPU metric through a scripted series of readings.

    A ``ThresholdChecker`` is configured for ``cpu_monitor.cpu_percent``
    (warning 80.0, critical 90.0, operator ">", hysteresis 0.1). Each
    scripted reading is passed to ``check_plugin_data``; the function then
    prints the state stored under ``"cpu_monitor.cpu_percent"`` (when one
    exists) and every state change the checker reported. Notifications
    delivered to the callback are echoed as they arrive and counted at the
    end.
    """
    rule = "=" * 70
    print(rule)
    print("DEMO 1: Basic Threshold Checking")
    print(rule)

    cpu_rule = {
        "warning": 80.0,
        "critical": 90.0,
        "operator": ">",
        "hysteresis": 0.1,
    }
    config = {"thresholds": {"cpu_monitor": {"cpu_percent": cpu_rule}}}

    sent = []

    def notifier(msg):
        # Record first, then echo, so the final count matches what was shown.
        sent.append(msg)
        print(f" 📧 NOTIFICATION: {msg}")

    checker = ThresholdChecker(config, notification_callback=notifier)
    states = {}

    # (reading, label) pairs simulating CPU usage over time.
    samples = [
        (50.0, "Normal operation"),
        (85.0, "Crosses WARNING threshold"),
        (87.0, "Still in WARNING"),
        (95.0, "Escalates to CRITICAL"),
        (92.0, "Still CRITICAL (in hysteresis)"),
        (85.0, "Still CRITICAL (above recovery threshold of 81)"),
        (79.0, "Recovers to OK"),
        (50.0, "Back to normal"),
    ]

    print("\nSimulating CPU usage over time:")
    print("-" * 70)

    for reading, label in samples:
        print(f"\n📊 CPU: {reading}% - {label}")

        transitions = checker.check_plugin_data(
            host_name="testhost",
            plugin_name="cpu_monitor",
            data={"cpu_percent": reading},
            alert_states=states,
        )

        tracked = states.get("cpu_monitor.cpu_percent")
        if tracked:
            print(f" Current state: {tracked.level.name}")

        # An empty/None result simply yields no iterations.
        for _metric, previous, current, _value in transitions or ():
            print(f" ⚠️ State change: {previous.name} → {current.name}")

    print(f"\n📈 Summary: {len(sent)} notifications sent")
    print(rule)
|
|
|
|
|
|
def demo_multiple_metrics():
    """Check several metrics from two plugins across scripted scenarios.

    Thresholds are configured for ``cpu_monitor`` (``cpu_percent``,
    ``load_1min``) and ``memory_monitor`` (``percent``, and
    ``available_mb`` with operator "<"). For every scenario the CPU data is
    checked first, then the memory data, after which the checker's alert
    summary and list of active alerts are printed. The number of
    notifications delivered to the callback is printed at the end.
    """
    rule = "=" * 70
    print("\n\n" + rule)
    print("DEMO 2: Multiple Metrics and Alert Summary")
    print(rule)

    cpu_rules = {
        "cpu_percent": {"warning": 80.0, "critical": 90.0},
        "load_1min": {"warning": 4.0, "critical": 8.0},
    }
    memory_rules = {
        "percent": {"warning": 85.0, "critical": 95.0},
        "available_mb": {
            "warning": 1000,
            "critical": 500,
            "operator": "<",
        },
    }
    config = {
        "thresholds": {
            "cpu_monitor": cpu_rules,
            "memory_monitor": memory_rules,
        }
    }

    sent = []

    def record(message):
        sent.append(message)

    checker = ThresholdChecker(config, notification_callback=record)
    states = {}

    print("\nSimulating a system under load:")
    print("-" * 70)

    # (title, cpu_monitor data, memory_monitor data) for each step.
    scenarios = [
        (
            "Initial state - all OK",
            {"cpu_percent": 50.0, "load_1min": 2.0},
            {"percent": 60.0, "available_mb": 2000},
        ),
        (
            "CPU spikes to WARNING",
            {"cpu_percent": 85.0, "load_1min": 2.0},
            {"percent": 60.0, "available_mb": 2000},
        ),
        (
            "Memory also reaches WARNING",
            {"cpu_percent": 85.0, "load_1min": 2.0},
            {"percent": 88.0, "available_mb": 800},
        ),
        (
            "CPU escalates to CRITICAL",
            {"cpu_percent": 95.0, "load_1min": 5.0},
            {"percent": 88.0, "available_mb": 800},
        ),
        (
            "System recovering",
            {"cpu_percent": 70.0, "load_1min": 2.0},
            {"percent": 65.0, "available_mb": 1500},
        ),
    ]

    for title, cpu_data, memory_data in scenarios:
        print(f"\n📍 {title}")

        # CPU metrics are checked before memory metrics.
        per_plugin = (
            ("cpu_monitor", cpu_data),
            ("memory_monitor", memory_data),
        )
        for plugin, payload in per_plugin:
            checker.check_plugin_data("testhost", plugin, payload, states)

        summary = checker.get_alert_summary(states)
        print(f" Alerts: OK={summary['ok']}, WARNING={summary['warning']}, CRITICAL={summary['critical']}")

        active = checker.get_active_alerts(states)
        if not active:
            continue
        print(" Active alerts:")
        for alert in active:
            print(f" - {alert.metric_path}: {alert.level.name} (value={alert.last_value})")

    print(f"\n📈 Total notifications sent: {len(sent)}")
    print(rule)
|
|
|
|
|
|
def demo_hysteresis():
    """Show a CPU metric fluctuating around its CRITICAL threshold.

    A ``ThresholdChecker`` is configured for ``cpu_monitor.cpu_percent``
    with warning 80.0, critical 90.0 and hysteresis 0.1 (no explicit
    operator). A scripted series of readings is fed to
    ``check_plugin_data``; after each one the stored state is printed along
    with whether the checker reported a state change. The number of
    notifications delivered to the callback is printed at the end.

    The state lookup is guarded: if the checker has not stored a state for
    the metric, the "State:" line is skipped instead of raising
    ``AttributeError`` on ``None``.
    """
    rule = "=" * 70
    print("\n\n" + rule)
    print("DEMO 3: Hysteresis Prevents Flapping")
    print(rule)

    config = {
        "thresholds": {
            "cpu_monitor": {
                "cpu_percent": {
                    "warning": 80.0,
                    "critical": 90.0,
                    "hysteresis": 0.1,  # 10% hysteresis
                }
            }
        }
    }

    notifications = []
    checker = ThresholdChecker(config, notification_callback=notifications.append)
    alert_states = {}

    print("\nCritical threshold: 90%")
    print("Hysteresis: 10%")
    print("Recovery threshold: 81% (90 - 10% of 90)")
    print("\nSimulating CPU fluctuating near CRITICAL threshold:")
    print("-" * 70)

    # (reading, label) pairs. The labels are descriptive text only; the
    # actual state printed below comes from the checker.
    # NOTE(review): 88.0 is above the configured warning threshold (80.0),
    # so the checker may report WARNING rather than OK for the
    # "Rises again ... (stays OK)" sample — confirm against ThresholdChecker.
    test_values = [
        (75.0, "Normal"),
        (92.0, "Crosses CRITICAL"),
        (88.0, "Drops but still above 81% (stays CRITICAL)"),
        (86.0, "Still above 81% (stays CRITICAL)"),
        (83.0, "Still above 81% (stays CRITICAL)"),
        (80.0, "Below 81% - recovers to OK"),
        (88.0, "Rises again but below 90% (stays OK)"),
        (91.0, "Crosses CRITICAL again"),
    ]

    for value, description in test_values:
        print(f"\n📊 CPU: {value:5.1f}% - {description}")

        state_changes = checker.check_plugin_data(
            "testhost",
            "cpu_monitor",
            {"cpu_percent": value},
            alert_states,
        )

        # dict.get returns None when the checker stored nothing for this
        # metric; skip the line rather than dereferencing None.
        current_state = alert_states.get("cpu_monitor.cpu_percent")
        if current_state is not None:
            print(f" State: {current_state.level.name}")

        if state_changes:
            print(" 📧 Notification sent (state changed)")
        else:
            print(" ✓ No notification (state unchanged - hysteresis working)")

    print(f"\n📈 Notifications sent: {len(notifications)} (without hysteresis would be ≥6)")
    print(rule)
|
|
|
|
|
|
def demo_inverse_threshold():
    """Show a threshold that alerts when a value drops too low.

    A ``ThresholdChecker`` is configured for
    ``memory_monitor.available_mb`` with operator "<", warning 1000,
    critical 500 and hysteresis 0.1. A scripted series of readings is fed
    to ``check_plugin_data``; after each one the stored state and every
    reported state change are printed. The number of notifications
    delivered to the callback is printed at the end.

    The state lookup is guarded: if the checker has not stored a state for
    the metric, the "State:" line is skipped instead of raising
    ``AttributeError`` on ``None``.
    """
    rule = "=" * 70
    print("\n\n" + rule)
    print("DEMO 4: Inverse Thresholds (Alert When Low)")
    print(rule)

    config = {
        "thresholds": {
            "memory_monitor": {
                "available_mb": {
                    "warning": 1000,  # Warn when < 1000 MB
                    "critical": 500,  # Critical when < 500 MB
                    "operator": "<",
                    "hysteresis": 0.1,
                }
            }
        }
    }

    notifications = []
    checker = ThresholdChecker(config, notification_callback=notifications.append)
    alert_states = {}

    print("\nMonitoring available memory (alert when LOW):")
    print("WARNING when < 1000 MB, CRITICAL when < 500 MB")
    print("-" * 70)

    # (reading in MB, label) pairs; labels are descriptive text only.
    test_values = [
        (2000, "Plenty of memory"),
        (800, "Drops below 1000 MB - WARNING"),
        (450, "Drops below 500 MB - CRITICAL"),
        (520, "Rises but still in hysteresis zone - stays CRITICAL"),
        (600, "Enough recovery - back to WARNING"),
        (1200, "Fully recovered - OK"),
    ]

    for value, description in test_values:
        print(f"\n💾 Available: {value} MB - {description}")

        state_changes = checker.check_plugin_data(
            "testhost",
            "memory_monitor",
            {"available_mb": value},
            alert_states,
        )

        # dict.get returns None when the checker stored nothing for this
        # metric; skip the line rather than dereferencing None.
        current_state = alert_states.get("memory_monitor.available_mb")
        if current_state is not None:
            print(f" State: {current_state.level.name}")

        # Only the old and new levels are displayed; the metric name and
        # value in each tuple are unused here.
        for _metric, old_level, new_level, _val in state_changes or ():
            print(f" 📧 {old_level.name} → {new_level.name}")

    print(f"\n📈 Notifications sent: {len(notifications)}")
    print(rule)
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: print a boxed title, run the four demos in order,
    # then print a closing summary.
    box_edge = "═" * 68
    rule = "=" * 70

    print("\n")
    print("╔" + box_edge + "╗")
    print("║" + " " * 15 + "THRESHOLD ALERTING DEMONSTRATION" + " " * 21 + "║")
    print("╚" + box_edge + "╝")

    for run_demo in (
        demo_basic_thresholds,
        demo_multiple_metrics,
        demo_hysteresis,
        demo_inverse_threshold,
    ):
        run_demo()

    print("\n\n" + rule)
    print("DEMONSTRATION COMPLETE")
    print(rule)
    print("\nKey takeaways:")
    for takeaway in (
        " • Thresholds detect when metrics exceed configured limits",
        " • Notifications sent only on state changes, not every check",
        " • Hysteresis prevents alert flapping",
        " • Supports both 'greater than' and 'less than' thresholds",
        " • Multiple metrics can be monitored simultaneously",
    ):
        print(takeaway)
    print("\nFor full documentation, see docs/THRESHOLD_ALERTING.md")
    print(rule)
    print()
|