Files
heartbeat/scripts/demo_threshold.py
Andreas Wrede fed71d97d6 chore: clean up dev scratch files from project root
- Remove rndc-key from tracking, add to .gitignore
- Move async_sms_send.py, demo_threshold.py, nagios_bad.sh to scripts/

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-12 23:54:27 -04:00

321 lines
10 KiB
Python

#!/usr/bin/env python3
"""
Demonstration of the threshold alerting system.
This script shows how thresholds work by simulating plugin data
with values that cross various threshold boundaries.
"""
import sys
import time
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
from hbd.threshold import ThresholdChecker, AlertLevel
def demo_basic_thresholds():
    """Demonstrate basic threshold checking on a single CPU metric.

    Builds a ``ThresholdChecker`` configured with WARNING/CRITICAL limits for
    ``cpu_monitor.cpu_percent``, feeds it a fixed sequence of readings, and
    prints the alert state stored after each reading, every state change the
    checker returns, and every message delivered to the notification callback.

    The description paired with each reading is narrative text only: it is
    printed verbatim and never compared with what the checker reports.
    """
    print("=" * 70)
    print("DEMO 1: Basic Threshold Checking")
    print("=" * 70)

    config = {
        "thresholds": {
            "cpu_monitor": {
                "cpu_percent": {
                    "warning": 80.0,
                    "critical": 90.0,
                    "operator": ">",
                    "hysteresis": 0.1,
                }
            }
        }
    }

    notifications = []

    def notifier(msg):
        # Keep every message so the summary can report a count, and echo it.
        notifications.append(msg)
        print(f" 📧 NOTIFICATION: {msg}")

    checker = ThresholdChecker(config, notification_callback=notifier)
    # Handed to every check call and read back afterwards; presumably the
    # checker updates it in place (confirm against hbd.threshold).
    alert_states = {}

    # Simulate CPU values over time
    test_values = [
        (50.0, "Normal operation"),
        (85.0, "Crosses WARNING threshold"),
        (87.0, "Still in WARNING"),
        (95.0, "Escalates to CRITICAL"),
        (92.0, "Still CRITICAL (in hysteresis)"),
        (85.0, "Still CRITICAL (above recovery threshold of 81)"),
        (79.0, "Recovers to OK"),
        (50.0, "Back to normal"),
    ]

    print("\nSimulating CPU usage over time:")
    print("-" * 70)

    for value, description in test_values:
        print(f"\n📊 CPU: {value}% - {description}")
        plugin_data = {"cpu_percent": value}
        state_changes = checker.check_plugin_data(
            host_name="testhost",
            plugin_name="cpu_monitor",
            data=plugin_data,
            alert_states=alert_states,
        )

        current_state = alert_states.get("cpu_monitor.cpu_percent")
        if current_state:
            print(f" Current state: {current_state.level.name}")

        # Each entry unpacks as (metric, old_level, new_level, value); only
        # the two levels are displayed.
        for _metric, old_level, new_level, _val in state_changes or ():
            # The separator keeps the two level names from running together
            # (previously rendered as e.g. "OKWARNING").
            print(f" ⚠️ State change: {old_level.name} → {new_level.name}")

    print(f"\n📈 Summary: {len(notifications)} notifications sent")
    print("=" * 70)
def demo_multiple_metrics():
    """Demonstrate several metrics from two plugins being checked together.

    Runs five scenarios through one ``ThresholdChecker``. For each scenario the
    CPU sample is checked first, then the memory sample, after which the
    checker's alert summary and its list of active alerts are printed. The
    total number of notification callbacks is printed at the end.
    """
    rule = "=" * 70
    print("\n\n" + rule)
    print("DEMO 2: Multiple Metrics and Alert Summary")
    print(rule)

    cpu_limits = {
        "cpu_percent": {"warning": 80.0, "critical": 90.0},
        "load_1min": {"warning": 4.0, "critical": 8.0},
    }
    memory_limits = {
        "percent": {"warning": 85.0, "critical": 95.0},
        # "<" operator: this metric is configured to alert on low values.
        "available_mb": {"warning": 1000, "critical": 500, "operator": "<"},
    }
    config = {
        "thresholds": {
            "cpu_monitor": cpu_limits,
            "memory_monitor": memory_limits,
        }
    }

    notifications = []

    def record(message):
        # Only the count is reported, but keep the messages themselves.
        notifications.append(message)

    checker = ThresholdChecker(config, notification_callback=record)
    alert_states = {}

    # Simulate problematic system state
    print("\nSimulating a system under load:")
    print("-" * 70)

    # (title, cpu_monitor sample, memory_monitor sample)
    scenarios = [
        (
            "Initial state - all OK",
            {"cpu_percent": 50.0, "load_1min": 2.0},
            {"percent": 60.0, "available_mb": 2000},
        ),
        (
            "CPU spikes to WARNING",
            {"cpu_percent": 85.0, "load_1min": 2.0},
            {"percent": 60.0, "available_mb": 2000},
        ),
        (
            "Memory also reaches WARNING",
            {"cpu_percent": 85.0, "load_1min": 2.0},
            {"percent": 88.0, "available_mb": 800},
        ),
        (
            "CPU escalates to CRITICAL",
            {"cpu_percent": 95.0, "load_1min": 5.0},
            {"percent": 88.0, "available_mb": 800},
        ),
        (
            "System recovering",
            {"cpu_percent": 70.0, "load_1min": 2.0},
            {"percent": 65.0, "available_mb": 1500},
        ),
    ]

    for title, cpu_sample, memory_sample in scenarios:
        print(f"\n📍 {title}")

        # CPU metrics are checked before memory metrics in every scenario.
        samples = (("cpu_monitor", cpu_sample), ("memory_monitor", memory_sample))
        for plugin_name, sample in samples:
            checker.check_plugin_data("testhost", plugin_name, sample, alert_states)

        # Show alert summary
        summary = checker.get_alert_summary(alert_states)
        ok_count = summary["ok"]
        warning_count = summary["warning"]
        critical_count = summary["critical"]
        print(f" Alerts: OK={ok_count}, WARNING={warning_count}, CRITICAL={critical_count}")

        # Show active alerts, if there are any
        active = checker.get_active_alerts(alert_states)
        if not active:
            continue
        print(" Active alerts:")
        for alert in active:
            level_name = alert.level.name
            print(f" - {alert.metric_path}: {level_name} (value={alert.last_value})")

    total_sent = len(notifications)
    print(f"\n📈 Total notifications sent: {total_sent}")
    print(rule)
def demo_hysteresis():
    """Demonstrate the effect of the ``hysteresis`` threshold setting.

    Feeds a sequence of CPU readings that hover around the CRITICAL limit into
    a ``ThresholdChecker`` configured with ``hysteresis: 0.1``. After each
    reading it prints the stored alert state and whether the checker returned
    any state changes, then prints how many notifications the callback
    received in total.

    The per-reading descriptions and the printed recovery threshold are
    narrative text; they are not computed from, or verified against, the
    checker's actual behavior.
    """
    print("\n\n" + "=" * 70)
    print("DEMO 3: Hysteresis Prevents Flapping")
    print("=" * 70)

    config = {
        "thresholds": {
            "cpu_monitor": {
                "cpu_percent": {
                    "warning": 80.0,
                    "critical": 90.0,
                    "hysteresis": 0.1,  # 10% hysteresis
                }
            }
        }
    }

    notifications = []
    checker = ThresholdChecker(config, notification_callback=lambda m: notifications.append(m))
    alert_states = {}

    print("\nCritical threshold: 90%")
    print("Hysteresis: 10%")
    print("Recovery threshold: 81% (90 - 10% of 90)")
    print("\nSimulating CPU fluctuating near CRITICAL threshold:")
    print("-" * 70)

    # Simulate fluctuating values
    # NOTE(review): several descriptions say "stays OK" / "recovers to OK" for
    # readings at or above the 80.0 warning limit; confirm against the
    # checker's real output before relying on this narrative.
    test_values = [
        (75.0, "Normal"),
        (92.0, "Crosses CRITICAL"),
        (88.0, "Drops but still above 81% (stays CRITICAL)"),
        (86.0, "Still above 81% (stays CRITICAL)"),
        (83.0, "Still above 81% (stays CRITICAL)"),
        (80.0, "Below 81% - recovers to OK"),
        (88.0, "Rises again but below 90% (stays OK)"),
        (91.0, "Crosses CRITICAL again"),
    ]

    for value, description in test_values:
        print(f"\n📊 CPU: {value:5.1f}% - {description}")
        plugin_data = {"cpu_percent": value}
        state_changes = checker.check_plugin_data(
            "testhost",
            "cpu_monitor",
            plugin_data,
            alert_states,
        )

        # dict.get returns None when no state is stored for the metric; guard
        # the attribute access instead of failing with AttributeError.
        current_state = alert_states.get("cpu_monitor.cpu_percent")
        if current_state is not None:
            print(f" State: {current_state.level.name}")

        if state_changes:
            print(" 📧 Notification sent (state changed)")
        else:
            print(" ✓ No notification (state unchanged - hysteresis working)")

    print(f"\n📈 Notifications sent: {len(notifications)} (without hysteresis would be ≥6)")
    print("=" * 70)
def demo_inverse_threshold():
    """Demonstrate thresholds configured with the ``"<"`` operator.

    Configures ``memory_monitor.available_mb`` with ``operator: "<"`` so that
    low values are the alerting direction, feeds a sequence of readings to a
    ``ThresholdChecker``, and prints the stored alert state plus each state
    change the checker returns. The total number of notification callbacks is
    printed at the end.

    The per-reading descriptions are narrative text only and are not verified
    against the checker's output.
    """
    print("\n\n" + "=" * 70)
    print("DEMO 4: Inverse Thresholds (Alert When Low)")
    print("=" * 70)

    config = {
        "thresholds": {
            "memory_monitor": {
                "available_mb": {
                    "warning": 1000,  # Warn when < 1000 MB
                    "critical": 500,  # Critical when < 500 MB
                    "operator": "<",
                    "hysteresis": 0.1,
                }
            }
        }
    }

    notifications = []
    checker = ThresholdChecker(config, notification_callback=lambda m: notifications.append(m))
    alert_states = {}

    print("\nMonitoring available memory (alert when LOW):")
    print("WARNING when < 1000 MB, CRITICAL when < 500 MB")
    print("-" * 70)

    test_values = [
        (2000, "Plenty of memory"),
        (800, "Drops below 1000 MB - WARNING"),
        (450, "Drops below 500 MB - CRITICAL"),
        (520, "Rises but still in hysteresis zone - stays CRITICAL"),
        (600, "Enough recovery - back to WARNING"),
        (1200, "Fully recovered - OK"),
    ]

    for value, description in test_values:
        print(f"\n💾 Available: {value} MB - {description}")
        plugin_data = {"available_mb": value}
        state_changes = checker.check_plugin_data(
            "testhost",
            "memory_monitor",
            plugin_data,
            alert_states,
        )

        # dict.get returns None when no state is stored for the metric; guard
        # the attribute access instead of failing with AttributeError.
        current_state = alert_states.get("memory_monitor.available_mb")
        if current_state is not None:
            print(f" State: {current_state.level.name}")

        # Each entry unpacks as (metric, old_level, new_level, value); only
        # the two levels are displayed.
        for _metric, old_level, new_level, _val in state_changes or ():
            # The separator keeps the two level names from running together
            # (previously rendered as e.g. "WARNINGCRITICAL").
            print(f" 📧 {old_level.name} → {new_level.name}")

    print(f"\n📈 Notifications sent: {len(notifications)}")
    print("=" * 70)
if __name__ == "__main__":
    # Script entry point: print a title banner, run the four demos in order,
    # then print a closing summary.
    print("\n")
    # Boxed title. The title row is 15 + 32 + 21 = 68 columns between the
    # borders, so the box is 70 columns wide, the same as the "=" rules used
    # throughout. The border glyphs had been lost, which left blank lines and
    # an unframed title.
    print("╔" + "═" * 68 + "╗")
    print("║" + " " * 15 + "THRESHOLD ALERTING DEMONSTRATION" + " " * 21 + "║")
    print("╚" + "═" * 68 + "╝")

    demo_basic_thresholds()
    demo_multiple_metrics()
    demo_hysteresis()
    demo_inverse_threshold()

    print("\n\n" + "=" * 70)
    print("DEMONSTRATION COMPLETE")
    print("=" * 70)
    print("\nKey takeaways:")
    print(" • Thresholds detect when metrics exceed configured limits")
    print(" • Notifications sent only on state changes, not every check")
    print(" • Hysteresis prevents alert flapping")
    print(" • Supports both 'greater than' and 'less than' thresholds")
    print(" • Multiple metrics can be monitored simultaneously")
    print("\nFor full documentation, see docs/THRESHOLD_ALERTING.md")
    print("=" * 70)
    print()