#!/usr/bin/env python3
"""
Test suite for the threshold checking and alerting system.

Tests cover:
- Threshold configuration parsing
- Threshold evaluation (all operators)
- Hysteresis functionality
- Alert state tracking
- State change detection
- Notification triggering
- Re-notification logic (listed for completeness; no test in this file
  exercises it yet)

The module can be executed directly; ``run_all_tests`` then runs every test
in order and the process exit status is 0 on success and 1 on failure.
"""

import sys
import time
import traceback
from pathlib import Path

# Add parent directory to path for imports.
# This must run before the ``hbd`` import below.
sys.path.insert(0, str(Path(__file__).parent.parent))

from hbd.threshold import (
    ThresholdChecker,
    ThresholdConfig,
    AlertLevel,
    AlertState,
    ComparisonOperator,
)


def test_threshold_config_basic():
    """Check ``ThresholdConfig.evaluate`` with a ``>`` operator.

    Values below, exactly at, between, and above the warning/critical
    thresholds are evaluated and the resulting ``AlertLevel`` is asserted.
    """
    print("Test 1: Basic threshold configuration...")

    config = ThresholdConfig(
        metric_path="cpu_monitor.cpu_percent",
        warning=80.0,
        critical=90.0,
        operator=">",
    )

    # Test below warning
    result = config.evaluate(50.0)
    assert result == AlertLevel.OK, f"Expected OK, got {result}"

    # Test exactly at warning: ">" is strict, so the boundary value is OK
    result = config.evaluate(80.0)
    assert result == AlertLevel.OK, f"Expected OK at boundary, got {result}"

    # Test above warning but below critical
    result = config.evaluate(85.0)
    assert result == AlertLevel.WARNING, f"Expected WARNING, got {result}"

    # Test above critical
    result = config.evaluate(95.0)
    assert result == AlertLevel.CRITICAL, f"Expected CRITICAL, got {result}"

    print(" ✓ Basic threshold configuration works")


def test_threshold_operators():
    """Check ``evaluate`` for the ``>``, ``<``, ``>=`` and ``<=`` operators."""
    print("\nTest 2: Comparison operators...")

    # Greater than operator
    config_gt = ThresholdConfig(
        metric_path="test.metric",
        warning=80.0,
        critical=90.0,
        operator=">",
    )
    assert config_gt.evaluate(85.0) == AlertLevel.WARNING
    assert config_gt.evaluate(75.0) == AlertLevel.OK

    # Less than operator (for inverse thresholds like available memory)
    config_lt = ThresholdConfig(
        metric_path="memory.available_mb",
        warning=1000,
        critical=500,
        operator="<",
    )
    assert config_lt.evaluate(800) == AlertLevel.WARNING, "Should warn when below 1000"
    assert config_lt.evaluate(400) == AlertLevel.CRITICAL, (
        "Should be critical when below 500"
    )
    assert config_lt.evaluate(1500) == AlertLevel.OK, "Should be OK when above 1000"

    # Greater than or equal (warning only, no critical threshold configured)
    config_gte = ThresholdConfig(
        metric_path="test.metric",
        warning=80.0,
        operator=">=",
    )
    assert config_gte.evaluate(80.0) == AlertLevel.WARNING
    assert config_gte.evaluate(79.9) == AlertLevel.OK

    # Less than or equal (warning only, no critical threshold configured)
    config_lte = ThresholdConfig(
        metric_path="test.metric",
        warning=20.0,
        operator="<=",
    )
    assert config_lte.evaluate(20.0) == AlertLevel.WARNING
    assert config_lte.evaluate(20.1) == AlertLevel.OK

    print(" ✓ All comparison operators work correctly")


def test_hysteresis():
    """Check ``evaluate_with_hysteresis`` against flapping.

    Uses a ``>`` threshold (warning 80, critical 90) with ``hysteresis=0.1``
    and asserts that an alert level is only left once the value has improved
    past the threshold by the hysteresis margin.
    """
    print("\nTest 3: Hysteresis...")

    config = ThresholdConfig(
        metric_path="cpu_monitor.cpu_percent",
        warning=80.0,
        critical=90.0,
        operator=">",
        hysteresis=0.1,  # 10% hysteresis
    )

    # Start at OK, go to WARNING
    result = config.evaluate_with_hysteresis(85.0, AlertLevel.OK)
    assert result == AlertLevel.WARNING, "Should enter WARNING state"

    # Try to recover with insufficient improvement (within hysteresis)
    # Warning threshold is 80, hysteresis is 10%, so need to go below 80 - 8 = 72
    result = config.evaluate_with_hysteresis(77.0, AlertLevel.WARNING)
    assert result == AlertLevel.WARNING, "Should stay in WARNING (hysteresis)"

    # Recover with sufficient improvement
    result = config.evaluate_with_hysteresis(70.0, AlertLevel.WARNING)
    assert result == AlertLevel.OK, "Should recover to OK"

    # Test critical hysteresis
    result = config.evaluate_with_hysteresis(95.0, AlertLevel.WARNING)
    assert result == AlertLevel.CRITICAL, "Should escalate to CRITICAL"

    # Try to recover from critical with insufficient improvement
    # Critical threshold is 90, hysteresis is 10%, so need to go below 90 - 9 = 81
    result = config.evaluate_with_hysteresis(85.0, AlertLevel.CRITICAL)
    assert result == AlertLevel.CRITICAL, (
        "Should stay CRITICAL (hysteresis, still above 81)"
    )

    # Sufficient improvement to drop from CRITICAL (below 81)
    result = config.evaluate_with_hysteresis(75.0, AlertLevel.CRITICAL)
    assert result == AlertLevel.OK, "Should drop to OK (below warning threshold)"

    # Now test dropping from CRITICAL to WARNING
    result = config.evaluate_with_hysteresis(95.0, AlertLevel.OK)
    assert result == AlertLevel.CRITICAL, "Should go to CRITICAL"

    # Drop to between warning and critical, but still in hysteresis zone
    result = config.evaluate_with_hysteresis(82.0, AlertLevel.CRITICAL)
    assert result == AlertLevel.CRITICAL, "Should stay CRITICAL (in hysteresis)"

    # Drop below critical hysteresis but still above warning threshold
    # At 80.5, we're above WARNING (80) so should evaluate to WARNING
    result = config.evaluate_with_hysteresis(80.5, AlertLevel.CRITICAL)
    assert result == AlertLevel.WARNING, (
        "Should drop to WARNING when below critical hysteresis"
    )

    print(" ✓ Hysteresis prevents flapping")


def test_alert_state():
    """Check ``AlertState`` bookkeeping.

    Asserts the initial level and notification count, that ``update``
    returns True only when the level changes, and that ``last_value`` is
    refreshed on every update.
    """
    print("\nTest 4: Alert state tracking...")

    alert = AlertState("cpu_monitor.cpu_percent")

    # Initial state
    assert alert.level == AlertLevel.OK
    assert alert.notification_count == 0

    # Update to WARNING - should trigger notification
    changed = alert.update(AlertLevel.WARNING, 85.0)
    assert changed == True, "State change should return True"
    assert alert.level == AlertLevel.WARNING
    assert alert.last_value == 85.0

    # Update with same level - no notification, but the value is still recorded
    changed = alert.update(AlertLevel.WARNING, 86.0)
    assert changed == False, "No state change should return False"
    assert alert.last_value == 86.0

    # Escalate to CRITICAL
    changed = alert.update(AlertLevel.CRITICAL, 95.0)
    assert changed == True, "Escalation should trigger notification"
    assert alert.level == AlertLevel.CRITICAL

    # Recover to OK
    changed = alert.update(AlertLevel.OK, 50.0)
    assert changed == True, "Recovery should trigger notification"
    assert alert.level == AlertLevel.OK

    print(" ✓ Alert state tracking works correctly")


def test_threshold_checker_parsing():
    """Check that ``ThresholdChecker`` parses a YAML-shaped config dict.

    Asserts the flattened metric paths present in ``checker.thresholds``
    (including per-partition disk entries) and the parsed operators.
    """
    print("\nTest 5: Configuration parsing...")

    config = {
        "thresholds": {
            "cpu_monitor": {
                "cpu_percent": {
                    "warning": 80.0,
                    "critical": 90.0,
                    "operator": ">",
                    "hysteresis": 0.1,
                },
                "load_1min": {
                    "warning": 4.0,
                    "critical": 8.0,
                },
            },
            "memory_monitor": {
                "percent": {
                    "warning": 85.0,
                    "critical": 95.0,
                },
                "available_mb": {
                    "warning": 1000,
                    "critical": 500,
                    "operator": "<",
                },
            },
            "disk_monitor": {
                "partitions": {
                    "/": {
                        "percent": {
                            "warning": 80.0,
                            "critical": 90.0,
                        },
                    },
                    "/home": {
                        "percent": {
                            "warning": 85.0,
                            "critical": 95.0,
                        },
                    },
                },
            },
        }
    }

    checker = ThresholdChecker(config)

    # Verify thresholds were parsed
    assert "cpu_monitor.cpu_percent" in checker.thresholds
    assert "cpu_monitor.load_1min" in checker.thresholds
    assert "memory_monitor.percent" in checker.thresholds
    assert "memory_monitor.available_mb" in checker.thresholds
    # Partition entries are keyed without the intermediate "partitions" level
    assert "disk_monitor./.percent" in checker.thresholds
    assert "disk_monitor./home.percent" in checker.thresholds

    # Verify operators were parsed correctly
    assert checker.thresholds["cpu_monitor.cpu_percent"].operator == ComparisonOperator.GT
    assert checker.thresholds["memory_monitor.available_mb"].operator == ComparisonOperator.LT

    print(f" ✓ Parsed {len(checker.thresholds)} thresholds correctly")


def test_check_plugin_data():
    """Check ``ThresholdChecker.check_plugin_data`` end to end.

    Feeds four successive samples (OK, WARNING, CRITICAL, recovery) for one
    host/plugin while reusing the same ``alert_states`` dict, and asserts
    the returned state changes and the messages passed to the notification
    callback.
    """
    print("\nTest 6: Plugin data checking...")

    config = {
        "thresholds": {
            "cpu_monitor": {
                "cpu_percent": {
                    "warning": 80.0,
                    "critical": 90.0,
                },
                "load_1min": {
                    "warning": 4.0,
                    "critical": 8.0,
                },
            },
        }
    }

    # Collect every notification message so the test can inspect them
    notifications = []

    def notification_callback(msg):
        notifications.append(msg)

    checker = ThresholdChecker(config, notification_callback=notification_callback)
    # Shared across calls so the checker sees the previous level of each metric
    alert_states = {}

    # First check - OK
    plugin_data = {
        "cpu_percent": 50.0,
        "load_1min": 2.0,
    }
    state_changes = checker.check_plugin_data(
        host_name="testhost",
        plugin_name="cpu_monitor",
        data=plugin_data,
        alert_states=alert_states,
    )
    assert len(state_changes) == 0, "No thresholds violated, no state changes"
    assert len(notifications) == 0, "No notifications should be sent"

    # Second check - WARNING
    plugin_data = {
        "cpu_percent": 85.0,
        "load_1min": 2.0,
    }
    state_changes = checker.check_plugin_data(
        host_name="testhost",
        plugin_name="cpu_monitor",
        data=plugin_data,
        alert_states=alert_states,
    )
    assert len(state_changes) == 1, "One metric should change state"
    assert state_changes[0][0] == "cpu_monitor.cpu_percent"
    assert state_changes[0][2] == AlertLevel.WARNING
    assert len(notifications) == 1, "One notification should be sent"
    assert "WARNING" in notifications[0]
    assert "testhost" in notifications[0]

    # Third check - CRITICAL
    plugin_data = {
        "cpu_percent": 95.0,
        "load_1min": 9.0,
    }
    notifications.clear()
    state_changes = checker.check_plugin_data(
        host_name="testhost",
        plugin_name="cpu_monitor",
        data=plugin_data,
        alert_states=alert_states,
    )
    assert len(state_changes) == 2, "Two metrics should change state"
    assert len(notifications) == 2, "Two notifications should be sent"

    # Fourth check - Recovery
    plugin_data = {
        "cpu_percent": 50.0,
        "load_1min": 1.0,
    }
    notifications.clear()
    state_changes = checker.check_plugin_data(
        host_name="testhost",
        plugin_name="cpu_monitor",
        data=plugin_data,
        alert_states=alert_states,
    )
    assert len(state_changes) == 2, "Two metrics should recover"
    assert len(notifications) == 2, "Two recovery notifications"
    assert any("RECOVERED" in n for n in notifications), "Should have recovery notification"

    print(" ✓ Plugin data checking and notifications work")


def test_nested_metrics():
    """Check threshold evaluation for nested per-partition disk metrics.

    Only the ``/home`` partition exceeds its warning threshold, so exactly
    one state change and one notification are expected.
    """
    print("\nTest 7: Nested metrics (partitions)...")

    config = {
        "thresholds": {
            "disk_monitor": {
                "partitions": {
                    "/": {
                        "percent": {
                            "warning": 80.0,
                            "critical": 90.0,
                        },
                    },
                    "/home": {
                        "percent": {
                            "warning": 85.0,
                            "critical": 95.0,
                        },
                    },
                },
            },
        }
    }

    notifications = []
    checker = ThresholdChecker(config, notification_callback=lambda m: notifications.append(m))
    alert_states = {}

    plugin_data = {
        "partitions": {
            "/": {
                "percent": 75.0,
                "free_gb": 50.0,
            },
            "/home": {
                "percent": 88.0,  # Should trigger WARNING
                "free_gb": 100.0,
            },
        },
    }

    state_changes = checker.check_plugin_data(
        host_name="testhost",
        plugin_name="disk_monitor",
        data=plugin_data,
        alert_states=alert_states,
    )

    assert len(state_changes) == 1, "One partition should trigger alert"
    assert "/home" in state_changes[0][0], "Should be /home partition"
    assert state_changes[0][2] == AlertLevel.WARNING
    assert len(notifications) == 1

    print(" ✓ Nested metric checking works")


def test_alert_summary():
    """Check ``get_alert_summary`` and ``get_active_alerts``.

    Drives one metric to WARNING and two to CRITICAL across two plugins,
    then asserts the per-level counts and the number of active alerts.
    """
    print("\nTest 8: Alert summaries...")

    config = {
        "thresholds": {
            "cpu_monitor": {
                "cpu_percent": {"warning": 80.0, "critical": 90.0},
                "load_1min": {"warning": 4.0, "critical": 8.0},
            },
            "memory_monitor": {
                "percent": {"warning": 85.0, "critical": 95.0},
            },
        }
    }

    checker = ThresholdChecker(config)
    alert_states = {}

    # Create some alert states
    plugin_data = {
        "cpu_percent": 85.0,  # WARNING
        "load_1min": 9.0,  # CRITICAL
    }
    checker.check_plugin_data("testhost", "cpu_monitor", plugin_data, alert_states)

    plugin_data = {
        "percent": 96.0,  # CRITICAL
    }
    checker.check_plugin_data("testhost", "memory_monitor", plugin_data, alert_states)

    # Get summary
    summary = checker.get_alert_summary(alert_states)
    assert summary["warning"] == 1, "Should have 1 warning"
    assert summary["critical"] == 2, "Should have 2 critical"

    # Get active alerts
    active = checker.get_active_alerts(alert_states)
    assert len(active) == 3, "Should have 3 active alerts"

    print(" ✓ Alert summaries work correctly")


def run_all_tests():
    """Run every test in this module in order, stopping at the first failure.

    Returns:
        0 if all tests passed, 1 if any test raised. Assertion failures and
        unexpected exceptions are reported separately, each with a traceback.
    """
    print("=" * 70)
    print("THRESHOLD SYSTEM TEST SUITE")
    print("=" * 70)

    try:
        test_threshold_config_basic()
        test_threshold_operators()
        test_hysteresis()
        test_alert_state()
        test_threshold_checker_parsing()
        test_check_plugin_data()
        test_nested_metrics()
        test_alert_summary()
    except AssertionError as e:
        print(f"\n✗ TEST FAILED: {e}")
        traceback.print_exc()
        return 1
    except Exception as e:
        # Top-level boundary of the script: report anything unexpected
        # (import problems, API mismatches, ...) instead of crashing.
        print(f"\n✗ UNEXPECTED ERROR: {e}")
        traceback.print_exc()
        return 1
    else:
        print("\n" + "=" * 70)
        print("✓ ALL TESTS PASSED")
        print("=" * 70)
        return 0


if __name__ == "__main__":
    sys.exit(run_all_tests())