Files
heartbeat/tests/test_threshold.py
Andreas Wrede ba96da9622 refactor: move loose test files out of project root
- tests/test_threshold.py: has proper pytest test functions
- scripts/test_*.py: manual run scripts with no test functions

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-12 23:52:34 -04:00

496 lines
15 KiB
Python

#!/usr/bin/env python3
"""
Test suite for the threshold checking and alerting system.
Tests cover:
- Threshold configuration parsing
- Threshold evaluation (all operators)
- Hysteresis functionality
- Alert state tracking
- State change detection
- Notification triggering
- Re-notification logic
"""
import sys
import time
from pathlib import Path
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))
from hbd.threshold import (
ThresholdChecker,
ThresholdConfig,
AlertLevel,
AlertState,
ComparisonOperator,
)
def test_threshold_config_basic():
    """Check that a ">" threshold maps readings to OK / WARNING / CRITICAL.

    The threshold under test warns at 80 and goes critical at 90.  A reading
    that sits exactly on the warning limit is expected to still be OK.
    """
    print("Test 1: Basic threshold configuration...")

    cpu_threshold = ThresholdConfig(
        metric_path="cpu_monitor.cpu_percent",
        warning=80.0,
        critical=90.0,
        operator=">",
    )

    # (reading, expected level, label used in the failure message)
    cases = (
        (50.0, AlertLevel.OK, "OK"),                # below warning
        (80.0, AlertLevel.OK, "OK at boundary"),    # exactly at warning
        (85.0, AlertLevel.WARNING, "WARNING"),      # between warning and critical
        (95.0, AlertLevel.CRITICAL, "CRITICAL"),    # above critical
    )
    for reading, expected, label in cases:
        observed = cpu_threshold.evaluate(reading)
        assert observed == expected, f"Expected {label}, got {observed}"

    print(" ✓ Basic threshold configuration works")
def test_threshold_operators():
    """Exercise the ">", "<", ">=" and "<=" operators of ThresholdConfig."""
    print("\nTest 2: Comparison operators...")

    # ">" : alert once the reading is above the limit.
    rising = ThresholdConfig(
        metric_path="test.metric",
        warning=80.0,
        critical=90.0,
        operator=">",
    )
    assert rising.evaluate(85.0) == AlertLevel.WARNING
    assert rising.evaluate(75.0) == AlertLevel.OK

    # "<" : inverted threshold, e.g. available memory, where low is bad.
    falling = ThresholdConfig(
        metric_path="memory.available_mb",
        warning=1000,
        critical=500,
        operator="<",
    )
    for reading, expected, reason in (
        (800, AlertLevel.WARNING, "Should warn when below 1000"),
        (400, AlertLevel.CRITICAL, "Should be critical when below 500"),
        (1500, AlertLevel.OK, "Should be OK when above 1000"),
    ):
        assert falling.evaluate(reading) == expected, reason

    # ">=" : the limit itself already counts as a violation.
    at_or_above = ThresholdConfig(
        metric_path="test.metric",
        warning=80.0,
        operator=">=",
    )
    assert at_or_above.evaluate(80.0) == AlertLevel.WARNING
    assert at_or_above.evaluate(79.9) == AlertLevel.OK

    # "<=" : same, for the inverted direction.
    at_or_below = ThresholdConfig(
        metric_path="test.metric",
        warning=20.0,
        operator="<=",
    )
    assert at_or_below.evaluate(20.0) == AlertLevel.WARNING
    assert at_or_below.evaluate(20.1) == AlertLevel.OK

    print(" ✓ All comparison operators work correctly")
def test_hysteresis():
    """Walk a ">" threshold with 10% hysteresis through a series of transitions.

    Every step hands ``evaluate_with_hysteresis`` a reading plus the level the
    alert is taken to be in already, and checks the level that comes back.
    """
    print("\nTest 3: Hysteresis...")

    threshold = ThresholdConfig(
        metric_path="cpu_monitor.cpu_percent",
        warning=80.0,
        critical=90.0,
        operator=">",
        hysteresis=0.1,  # 10% hysteresis
    )

    ok = AlertLevel.OK
    warning = AlertLevel.WARNING
    critical = AlertLevel.CRITICAL

    # Recovery cut-offs the expectations below are written against:
    #   leaving WARNING  requires a reading under 80 - 8 = 72
    #   leaving CRITICAL requires a reading under 90 - 9 = 81
    scenario = (
        # (reading, level before, level expected, failure message)
        # OK -> WARNING once the warning limit is exceeded.
        (85.0, ok, warning, "Should enter WARNING state"),
        # 77 is under 80 but not under 72, so the alert must not clear yet.
        (77.0, warning, warning, "Should stay in WARNING (hysteresis)"),
        # 70 is under 72: recovered.
        (70.0, warning, ok, "Should recover to OK"),
        # WARNING -> CRITICAL once the critical limit is exceeded.
        (95.0, warning, critical, "Should escalate to CRITICAL"),
        # 85 is under 90 but not under 81, so CRITICAL is held.
        (85.0, critical, critical, "Should stay CRITICAL (hysteresis, still above 81)"),
        # 75 is under 81 and also not above the warning limit: straight to OK.
        (75.0, critical, ok, "Should drop to OK (below warning threshold)"),
        # Second pass: jump from OK directly to CRITICAL ...
        (95.0, ok, critical, "Should go to CRITICAL"),
        # ... 82 is still inside the critical hysteresis band ...
        (82.0, critical, critical, "Should stay CRITICAL (in hysteresis)"),
        # ... and 80.5 is under 81 yet above the warning limit of 80.
        (80.5, critical, warning, "Should drop to WARNING when below critical hysteresis"),
    )
    for reading, before, expected, message in scenario:
        outcome = threshold.evaluate_with_hysteresis(reading, before)
        assert outcome == expected, message

    print(" ✓ Hysteresis prevents flapping")
def test_alert_state():
    """Check that AlertState records level and value and reports level changes.

    ``AlertState.update(level, value)`` is expected to return a truthy value
    when the alert level changed and a falsy one when it did not.  The return
    value is asserted by truthiness rather than compared to ``True``/``False``
    with ``==``, which is the form PEP 8 recommends for boolean results.
    """
    print("\nTest 4: Alert state tracking...")
    alert = AlertState("cpu_monitor.cpu_percent")

    # A fresh state starts out OK with nothing notified yet.
    assert alert.level == AlertLevel.OK
    assert alert.notification_count == 0

    # OK -> WARNING is a level change and must be reported.
    changed = alert.update(AlertLevel.WARNING, 85.0)
    assert changed, "State change should return True"
    assert alert.level == AlertLevel.WARNING
    assert alert.last_value == 85.0

    # Same level again: no change reported, but the latest value is stored.
    changed = alert.update(AlertLevel.WARNING, 86.0)
    assert not changed, "No state change should return False"
    assert alert.last_value == 86.0

    # WARNING -> CRITICAL escalation.
    changed = alert.update(AlertLevel.CRITICAL, 95.0)
    assert changed, "Escalation should trigger notification"
    assert alert.level == AlertLevel.CRITICAL

    # CRITICAL -> OK recovery is a change as well.
    changed = alert.update(AlertLevel.OK, 50.0)
    assert changed, "Recovery should trigger notification"
    assert alert.level == AlertLevel.OK

    print(" ✓ Alert state tracking works correctly")
def test_threshold_checker_parsing():
    """Check that ThresholdChecker turns a nested config dict into thresholds.

    The config mirrors the YAML layout: plugin -> metric -> settings, with
    disk partitions nested one level deeper.  The test expects each entry to
    show up in ``checker.thresholds`` under a dotted path, and an explicit
    ``operator`` string to be parsed into a ``ComparisonOperator`` member.
    """
    print("\nTest 5: Configuration parsing...")

    cpu_section = {
        "cpu_percent": {
            "warning": 80.0,
            "critical": 90.0,
            "operator": ">",
            "hysteresis": 0.1,
        },
        "load_1min": {"warning": 4.0, "critical": 8.0},
    }
    memory_section = {
        "percent": {"warning": 85.0, "critical": 95.0},
        "available_mb": {"warning": 1000, "critical": 500, "operator": "<"},
    }
    disk_section = {
        "partitions": {
            "/": {"percent": {"warning": 80.0, "critical": 90.0}},
            "/home": {"percent": {"warning": 85.0, "critical": 95.0}},
        },
    }
    checker = ThresholdChecker(
        {
            "thresholds": {
                "cpu_monitor": cpu_section,
                "memory_monitor": memory_section,
                "disk_monitor": disk_section,
            }
        }
    )

    # Every configured metric must be present under its dotted path.
    expected_paths = (
        "cpu_monitor.cpu_percent",
        "cpu_monitor.load_1min",
        "memory_monitor.percent",
        "memory_monitor.available_mb",
        "disk_monitor./.percent",
        "disk_monitor./home.percent",
    )
    for path in expected_paths:
        assert path in checker.thresholds

    # Operator strings must have been converted to enum members.
    expected_operators = {
        "cpu_monitor.cpu_percent": ComparisonOperator.GT,
        "memory_monitor.available_mb": ComparisonOperator.LT,
    }
    for path, operator in expected_operators.items():
        assert checker.thresholds[path].operator == operator

    print(f" ✓ Parsed {len(checker.thresholds)} thresholds correctly")
def test_check_plugin_data():
    """Drive cpu_monitor data through OK -> WARNING -> CRITICAL -> recovery.

    The same ``alert_states`` dict is passed to every ``check_plugin_data``
    call so state carries over between rounds.  After each round the test
    checks both the returned state changes and the messages handed to the
    notification callback.
    """
    print("\nTest 6: Plugin data checking...")

    sent = []
    checker = ThresholdChecker(
        {
            "thresholds": {
                "cpu_monitor": {
                    "cpu_percent": {"warning": 80.0, "critical": 90.0},
                    "load_1min": {"warning": 4.0, "critical": 8.0},
                },
            }
        },
        notification_callback=lambda msg: sent.append(msg),
    )
    states = {}

    def submit(cpu_percent, load_1min):
        """Feed one cpu_monitor sample for testhost and return the state changes."""
        return checker.check_plugin_data(
            host_name="testhost",
            plugin_name="cpu_monitor",
            data={"cpu_percent": cpu_percent, "load_1min": load_1min},
            alert_states=states,
        )

    # Round 1: everything under its limits.
    changes = submit(50.0, 2.0)
    assert len(changes) == 0, "No thresholds violated, no state changes"
    assert len(sent) == 0, "No notifications should be sent"

    # Round 2: cpu_percent crosses its warning limit, load stays fine.
    changes = submit(85.0, 2.0)
    assert len(changes) == 1, "One metric should change state"
    first_change = changes[0]
    assert first_change[0] == "cpu_monitor.cpu_percent"
    assert first_change[2] == AlertLevel.WARNING
    assert len(sent) == 1, "One notification should be sent"
    assert "WARNING" in sent[0]
    assert "testhost" in sent[0]

    # Round 3: both metrics exceed their critical limits.
    sent.clear()
    changes = submit(95.0, 9.0)
    assert len(changes) == 2, "Two metrics should change state"
    assert len(sent) == 2, "Two notifications should be sent"

    # Round 4: both metrics fall back under their limits.
    sent.clear()
    changes = submit(50.0, 1.0)
    assert len(changes) == 2, "Two metrics should recover"
    assert len(sent) == 2, "Two recovery notifications"
    assert any("RECOVERED" in text for text in sent), "Should have recovery notification"

    print(" ✓ Plugin data checking and notifications work")
def test_nested_metrics():
    """Check thresholds on metrics nested under disk partitions.

    Two partitions are configured; only "/home" reports a usage above its
    warning limit, so exactly one state change and one notification are
    expected.
    """
    print("\nTest 7: Nested metrics (partitions)...")

    partition_limits = {
        "/": {"percent": {"warning": 80.0, "critical": 90.0}},
        "/home": {"percent": {"warning": 85.0, "critical": 95.0}},
    }
    messages = []

    def record(m):
        messages.append(m)

    checker = ThresholdChecker(
        {"thresholds": {"disk_monitor": {"partitions": partition_limits}}},
        notification_callback=record,
    )
    states = {}

    sample = {
        "partitions": {
            "/": {"percent": 75.0, "free_gb": 50.0},
            # 88% is above the 85% warning limit for /home.
            "/home": {"percent": 88.0, "free_gb": 100.0},
        },
    }
    changes = checker.check_plugin_data(
        host_name="testhost",
        plugin_name="disk_monitor",
        data=sample,
        alert_states=states,
    )

    assert len(changes) == 1, "One partition should trigger alert"
    only_change = changes[0]
    assert "/home" in only_change[0], "Should be /home partition"
    assert only_change[2] == AlertLevel.WARNING
    assert len(messages) == 1

    print(" ✓ Nested metric checking works")
def test_alert_summary():
    """Check the per-level summary and the active-alert list.

    Three metrics are pushed over their limits (one WARNING, two CRITICAL)
    and the checker's summary helpers are then queried against the shared
    alert-state dict.
    """
    print("\nTest 8: Alert summaries...")

    checker = ThresholdChecker(
        {
            "thresholds": {
                "cpu_monitor": {
                    "cpu_percent": {"warning": 80.0, "critical": 90.0},
                    "load_1min": {"warning": 4.0, "critical": 8.0},
                },
                "memory_monitor": {
                    "percent": {"warning": 85.0, "critical": 95.0},
                },
            }
        }
    )
    states = {}

    # Populate the alert states, one plugin at a time.
    samples = (
        ("cpu_monitor", {
            "cpu_percent": 85.0,  # WARNING
            "load_1min": 9.0,     # CRITICAL
        }),
        ("memory_monitor", {
            "percent": 96.0,      # CRITICAL
        }),
    )
    for plugin, readings in samples:
        checker.check_plugin_data("testhost", plugin, readings, states)

    # Counts per level.
    summary = checker.get_alert_summary(states)
    assert summary["warning"] == 1, "Should have 1 warning"
    assert summary["critical"] == 2, "Should have 2 critical"

    # Every non-OK metric should be listed as active.
    active = checker.get_active_alerts(states)
    assert len(active) == 3, "Should have 3 active alerts"

    print(" ✓ Alert summaries work correctly")
def run_all_tests():
    """Run every test function in order as a plain script.

    Returns:
        0 when all tests pass, 1 as soon as one raises.  An AssertionError is
        reported as a test failure, any other exception as an unexpected
        error; in both cases the traceback is printed.
    """
    import traceback

    banner = "=" * 70
    print(banner)
    print("THRESHOLD SYSTEM TEST SUITE")
    print(banner)

    try:
        suite = (
            test_threshold_config_basic,
            test_threshold_operators,
            test_hysteresis,
            test_alert_state,
            test_threshold_checker_parsing,
            test_check_plugin_data,
            test_nested_metrics,
            test_alert_summary,
        )
        for test in suite:
            test()
        print("\n" + banner)
        print("✓ ALL TESTS PASSED")
        print(banner)
        return 0
    except AssertionError as failure:
        print(f"\n✗ TEST FAILED: {failure}")
        traceback.print_exc()
        return 1
    except Exception as error:
        # Anything that is not an assertion failure (e.g. a crash in the
        # code under test) is reported separately.
        print(f"\n✗ UNEXPECTED ERROR: {error}")
        traceback.print_exc()
        return 1
if __name__ == "__main__":
    # Script entry point: the suite's return value becomes the exit status.
    raise SystemExit(run_all_tests())