0543266c92
- Restructuring of the project directory into client and server components - Renaming of modules and classes to better reflect their purpose and functionality - Moving common utilities and configurations to a shared location - Updating import statements to reflect the new structure - Adding new documentation files for better clarity on various aspects of the project - Removing deprecated or unused code to streamline the codebase - Ensuring that all existing functionality is preserved and that the codebase remains functional after the refactoring.
496 lines
15 KiB
Python
496 lines
15 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Test suite for the threshold checking and alerting system.
|
|
|
|
Tests cover:
|
|
- Threshold configuration parsing
|
|
- Threshold evaluation (all operators)
|
|
- Hysteresis functionality
|
|
- Alert state tracking
|
|
- State change detection
|
|
- Notification triggering
|
|
- Re-notification logic
|
|
"""
|
|
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
|
|
# Add parent directory to path for imports
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
from hbd.threshold import (
|
|
ThresholdChecker,
|
|
ThresholdConfig,
|
|
AlertLevel,
|
|
AlertState,
|
|
ComparisonOperator,
|
|
)
|
|
|
|
|
|
def test_threshold_config_basic():
    """Check that a ``>`` threshold maps sample values to the expected level.

    Four samples are evaluated against warning=80 / critical=90: one below
    the warning limit, one exactly on it, one between the two limits and one
    above the critical limit.
    """
    print("Test 1: Basic threshold configuration...")

    cpu_threshold = ThresholdConfig(
        metric_path="cpu_monitor.cpu_percent",
        warning=80.0,
        critical=90.0,
        operator=">",
    )

    # Comfortably below the warning limit.
    level = cpu_threshold.evaluate(50.0)
    assert level == AlertLevel.OK, f"Expected OK, got {level}"

    # Exactly on the warning limit: the operator is a strict ">", so the
    # test expects the boundary value itself to still be OK.
    level = cpu_threshold.evaluate(80.0)
    assert level == AlertLevel.OK, f"Expected OK at boundary, got {level}"

    # Between the warning and critical limits.
    level = cpu_threshold.evaluate(85.0)
    assert level == AlertLevel.WARNING, f"Expected WARNING, got {level}"

    # Past the critical limit.
    level = cpu_threshold.evaluate(95.0)
    assert level == AlertLevel.CRITICAL, f"Expected CRITICAL, got {level}"

    print(" ✓ Basic threshold configuration works")
|
|
|
|
|
|
def test_threshold_operators():
    """Exercise each comparison operator string: ``>``, ``<``, ``>=``, ``<=``."""
    print("\nTest 2: Comparison operators...")

    # ">" : alert when the value rises above the limit.
    greater = ThresholdConfig(
        metric_path="test.metric", warning=80.0, critical=90.0, operator=">"
    )
    assert greater.evaluate(85.0) == AlertLevel.WARNING
    assert greater.evaluate(75.0) == AlertLevel.OK

    # "<" : inverse threshold (e.g. available memory), so the critical limit
    # is numerically lower than the warning limit.
    lower = ThresholdConfig(
        metric_path="memory.available_mb", warning=1000, critical=500, operator="<"
    )
    assert lower.evaluate(800) == AlertLevel.WARNING, "Should warn when below 1000"
    assert lower.evaluate(400) == AlertLevel.CRITICAL, "Should be critical when below 500"
    assert lower.evaluate(1500) == AlertLevel.OK, "Should be OK when above 1000"

    # ">=" : the boundary value itself triggers; only a warning limit is given.
    at_least = ThresholdConfig(metric_path="test.metric", warning=80.0, operator=">=")
    assert at_least.evaluate(80.0) == AlertLevel.WARNING
    assert at_least.evaluate(79.9) == AlertLevel.OK

    # "<=" : the boundary value itself triggers; only a warning limit is given.
    at_most = ThresholdConfig(metric_path="test.metric", warning=20.0, operator="<=")
    assert at_most.evaluate(20.0) == AlertLevel.WARNING
    assert at_most.evaluate(20.1) == AlertLevel.OK

    print(" ✓ All comparison operators work correctly")
|
|
|
|
|
|
def test_hysteresis():
    """Check that hysteresis keeps an alert from flapping around its limits.

    Uses warning=80, critical=90 and hysteresis=0.1 (10%). Each scenario row
    is ``(value, current_level, expected_level, failure_message)`` and the
    rows are evaluated in the order listed.
    """
    print("\nTest 3: Hysteresis...")

    config = ThresholdConfig(
        metric_path="cpu_monitor.cpu_percent",
        warning=80.0,
        critical=90.0,
        operator=">",
        hysteresis=0.1,  # 10% hysteresis
    )

    scenarios = [
        # Entering WARNING from OK.
        (85.0, AlertLevel.OK, AlertLevel.WARNING, "Should enter WARNING state"),
        # Leaving WARNING requires dropping below 80 - 8 = 72; 77 is not enough.
        (77.0, AlertLevel.WARNING, AlertLevel.WARNING,
         "Should stay in WARNING (hysteresis)"),
        # 70 is below 72, so the alert clears.
        (70.0, AlertLevel.WARNING, AlertLevel.OK, "Should recover to OK"),
        # Escalation from WARNING to CRITICAL.
        (95.0, AlertLevel.WARNING, AlertLevel.CRITICAL, "Should escalate to CRITICAL"),
        # Leaving CRITICAL requires dropping below 90 - 9 = 81; 85 is not enough.
        (85.0, AlertLevel.CRITICAL, AlertLevel.CRITICAL,
         "Should stay CRITICAL (hysteresis, still above 81)"),
        # 75 is below 81 and also below the warning limit of 80.
        (75.0, AlertLevel.CRITICAL, AlertLevel.OK,
         "Should drop to OK (below warning threshold)"),
        # Jump straight from OK to CRITICAL.
        (95.0, AlertLevel.OK, AlertLevel.CRITICAL, "Should go to CRITICAL"),
        # 82 is under the critical limit but still above 81.
        (82.0, AlertLevel.CRITICAL, AlertLevel.CRITICAL,
         "Should stay CRITICAL (in hysteresis)"),
        # 80.5 is below 81 yet still above the warning limit of 80.
        (80.5, AlertLevel.CRITICAL, AlertLevel.WARNING,
         "Should drop to WARNING when below critical hysteresis"),
    ]

    for value, current_level, expected_level, failure_message in scenarios:
        result = config.evaluate_with_hysteresis(value, current_level)
        assert result == expected_level, failure_message

    print(" ✓ Hysteresis prevents flapping")
|
|
|
|
|
|
def test_alert_state():
    """Check that ``AlertState`` tracks its level and reports state changes.

    Walks one alert through OK -> WARNING -> WARNING -> CRITICAL -> OK and
    checks the value returned by ``update()`` at each step, along with the
    ``level`` and ``last_value`` attributes.

    The return value is compared with ``is True`` / ``is False`` rather than
    ``== True`` / ``== False``: the assertion messages state that ``update()``
    should return True or False, and identity comparison avoids equality
    checks against the bool singletons.
    """
    print("\nTest 4: Alert state tracking...")

    alert = AlertState("cpu_monitor.cpu_percent")

    # A freshly created state starts at OK with no notifications counted.
    assert alert.level == AlertLevel.OK
    assert alert.notification_count == 0

    # OK -> WARNING is a level change.
    changed = alert.update(AlertLevel.WARNING, 85.0)
    assert changed is True, "State change should return True"
    assert alert.level == AlertLevel.WARNING
    assert alert.last_value == 85.0

    # Same level again: no change reported, but the latest value is recorded.
    changed = alert.update(AlertLevel.WARNING, 86.0)
    assert changed is False, "No state change should return False"
    assert alert.last_value == 86.0

    # WARNING -> CRITICAL escalation.
    changed = alert.update(AlertLevel.CRITICAL, 95.0)
    assert changed is True, "Escalation should trigger notification"
    assert alert.level == AlertLevel.CRITICAL

    # CRITICAL -> OK recovery.
    changed = alert.update(AlertLevel.OK, 50.0)
    assert changed is True, "Recovery should trigger notification"
    assert alert.level == AlertLevel.OK

    print(" ✓ Alert state tracking works correctly")
|
|
|
|
|
|
def test_threshold_checker_parsing():
    """Check that ``ThresholdChecker`` flattens a nested config into metric keys.

    The config mirrors the YAML structure: plugin -> metric -> limits, with
    disk partitions nested one level deeper under ``partitions``.
    """
    print("\nTest 5: Configuration parsing...")

    cpu_section = {
        "cpu_percent": {
            "warning": 80.0,
            "critical": 90.0,
            "operator": ">",
            "hysteresis": 0.1,
        },
        "load_1min": {"warning": 4.0, "critical": 8.0},
    }
    memory_section = {
        "percent": {"warning": 85.0, "critical": 95.0},
        "available_mb": {"warning": 1000, "critical": 500, "operator": "<"},
    }
    disk_section = {
        "partitions": {
            "/": {"percent": {"warning": 80.0, "critical": 90.0}},
            "/home": {"percent": {"warning": 85.0, "critical": 95.0}},
        },
    }
    config = {
        "thresholds": {
            "cpu_monitor": cpu_section,
            "memory_monitor": memory_section,
            "disk_monitor": disk_section,
        }
    }

    checker = ThresholdChecker(config)

    # Every configured metric must appear under its dotted path; partition
    # entries are keyed as "<plugin>.<mount point>.<metric>".
    expected_paths = (
        "cpu_monitor.cpu_percent",
        "cpu_monitor.load_1min",
        "memory_monitor.percent",
        "memory_monitor.available_mb",
        "disk_monitor./.percent",
        "disk_monitor./home.percent",
    )
    for metric_path in expected_paths:
        assert metric_path in checker.thresholds

    # Operator strings must have been converted to their enum members.
    cpu_operator = checker.thresholds["cpu_monitor.cpu_percent"].operator
    assert cpu_operator == ComparisonOperator.GT
    memory_operator = checker.thresholds["memory_monitor.available_mb"].operator
    assert memory_operator == ComparisonOperator.LT

    print(f" ✓ Parsed {len(checker.thresholds)} thresholds correctly")
|
|
|
|
|
|
def test_check_plugin_data():
    """Check state changes and notifications across four successive samples.

    The same ``alert_states`` dict is passed to every call, so each phase
    builds on the state left by the previous one:
    OK -> WARNING -> CRITICAL -> recovery.
    """
    print("\nTest 6: Plugin data checking...")

    config = {
        "thresholds": {
            "cpu_monitor": {
                "cpu_percent": {"warning": 80.0, "critical": 90.0},
                "load_1min": {"warning": 4.0, "critical": 8.0},
            },
        }
    }

    notifications = []

    def notification_callback(msg):
        # Collect every message the checker emits so it can be inspected.
        notifications.append(msg)

    checker = ThresholdChecker(config, notification_callback=notification_callback)
    alert_states = {}

    def run_check(sample):
        # Every phase uses the same host, plugin and shared alert_states.
        return checker.check_plugin_data(
            host_name="testhost",
            plugin_name="cpu_monitor",
            data=sample,
            alert_states=alert_states,
        )

    # Phase 1: both metrics are under their warning limits.
    state_changes = run_check({"cpu_percent": 50.0, "load_1min": 2.0})
    assert len(state_changes) == 0, "No thresholds violated, no state changes"
    assert len(notifications) == 0, "No notifications should be sent"

    # Phase 2: cpu_percent crosses its warning limit, load_1min does not.
    state_changes = run_check({"cpu_percent": 85.0, "load_1min": 2.0})
    assert len(state_changes) == 1, "One metric should change state"
    first_change = state_changes[0]
    assert first_change[0] == "cpu_monitor.cpu_percent"
    assert first_change[2] == AlertLevel.WARNING
    assert len(notifications) == 1, "One notification should be sent"
    assert "WARNING" in notifications[0]
    assert "testhost" in notifications[0]

    # Phase 3: both metrics exceed their critical limits.
    notifications.clear()
    state_changes = run_check({"cpu_percent": 95.0, "load_1min": 9.0})
    assert len(state_changes) == 2, "Two metrics should change state"
    assert len(notifications) == 2, "Two notifications should be sent"

    # Phase 4: both metrics drop back under their warning limits.
    notifications.clear()
    state_changes = run_check({"cpu_percent": 50.0, "load_1min": 1.0})
    assert len(state_changes) == 2, "Two metrics should recover"
    assert len(notifications) == 2, "Two recovery notifications"
    assert any("RECOVERED" in n for n in notifications), "Should have recovery notification"

    print(" ✓ Plugin data checking and notifications work")
|
|
|
|
|
|
def test_nested_metrics():
    """Check per-partition thresholds nested under ``partitions``.

    Two partitions are configured; only ``/home`` is given a sample above
    its warning limit, so exactly one state change is expected.
    """
    print("\nTest 7: Nested metrics (partitions)...")

    partition_limits = {
        "/": {"percent": {"warning": 80.0, "critical": 90.0}},
        "/home": {"percent": {"warning": 85.0, "critical": 95.0}},
    }
    config = {"thresholds": {"disk_monitor": {"partitions": partition_limits}}}

    notifications = []

    def record(m):
        # Mirrors list.append's return value, as the checker receives it.
        return notifications.append(m)

    checker = ThresholdChecker(config, notification_callback=record)
    alert_states = {}

    # "free_gb" has no configured threshold; only "percent" does.
    partition_samples = {
        "/": {"percent": 75.0, "free_gb": 50.0},
        "/home": {"percent": 88.0, "free_gb": 100.0},  # Should trigger WARNING
    }
    plugin_data = {"partitions": partition_samples}

    state_changes = checker.check_plugin_data(
        host_name="testhost",
        plugin_name="disk_monitor",
        data=plugin_data,
        alert_states=alert_states,
    )

    assert len(state_changes) == 1, "One partition should trigger alert"
    only_change = state_changes[0]
    assert "/home" in only_change[0], "Should be /home partition"
    assert only_change[2] == AlertLevel.WARNING
    assert len(notifications) == 1

    print(" ✓ Nested metric checking works")
|
|
|
|
|
|
def test_alert_summary():
    """Check the per-level counts and the active-alert list.

    Three metrics across two plugins are pushed over their limits: one ends
    at WARNING and two at CRITICAL.
    """
    print("\nTest 8: Alert summaries...")

    config = {
        "thresholds": {
            "cpu_monitor": {
                "cpu_percent": {"warning": 80.0, "critical": 90.0},
                "load_1min": {"warning": 4.0, "critical": 8.0},
            },
            "memory_monitor": {
                "percent": {"warning": 85.0, "critical": 95.0},
            },
        }
    }

    checker = ThresholdChecker(config)
    alert_states = {}

    # Populate alert_states: cpu_percent -> WARNING, load_1min -> CRITICAL.
    cpu_sample = {
        "cpu_percent": 85.0,  # WARNING
        "load_1min": 9.0,  # CRITICAL
    }
    checker.check_plugin_data("testhost", "cpu_monitor", cpu_sample, alert_states)

    # Memory usage -> CRITICAL.
    memory_sample = {
        "percent": 96.0,  # CRITICAL
    }
    checker.check_plugin_data("testhost", "memory_monitor", memory_sample, alert_states)

    # Counts per level.
    summary = checker.get_alert_summary(alert_states)
    assert summary["warning"] == 1, "Should have 1 warning"
    assert summary["critical"] == 2, "Should have 2 critical"

    # Every alert above is currently active.
    active = checker.get_active_alerts(alert_states)
    assert len(active) == 3, "Should have 3 active alerts"

    print(" ✓ Alert summaries work correctly")
|
|
|
|
|
|
def run_all_tests():
    """Run every test in this module in order and report the outcome.

    Returns:
        0 if all tests complete, 1 as soon as one raises. The failure (an
        assertion or any other exception) is printed with its traceback.
    """
    banner = "=" * 70
    print(banner)
    print("THRESHOLD SYSTEM TEST SUITE")
    print(banner)

    try:
        # The tuple is built inside the try block so that a failure while
        # resolving a test name is reported like any other unexpected error.
        for test in (
            test_threshold_config_basic,
            test_threshold_operators,
            test_hysteresis,
            test_alert_state,
            test_threshold_checker_parsing,
            test_check_plugin_data,
            test_nested_metrics,
            test_alert_summary,
        ):
            test()

        print("\n" + banner)
        print("✓ ALL TESTS PASSED")
        print(banner)
        return 0

    except AssertionError as e:
        # A test expectation was not met.
        print(f"\n✗ TEST FAILED: {e}")
        import traceback

        traceback.print_exc()
        return 1
    except Exception as e:
        # Anything else, e.g. an error raised by the code under test.
        print(f"\n✗ UNEXPECTED ERROR: {e}")
        import traceback

        traceback.print_exc()
        return 1
|
|
|
|
|
|
if __name__ == "__main__":
    # Use the runner's return value (0 or 1) as the process exit status.
    exit_code = run_all_tests()
    sys.exit(exit_code)
|