Major refactoring of the codebase, including restructuring of files and directories, renaming of modules and classes, and improvements to the overall organization and readability of the code. This refactoring aims to enhance maintainability, scalability, and clarity of the codebase while preserving existing functionality. The changes include:

- Restructuring of the project directory into client and server components
- Renaming of modules and classes to better reflect their purpose and functionality
- Moving common utilities and configurations to a shared location
- Updating import statements to reflect the new structure
- Adding new documentation files for better clarity on various aspects of the project
- Removing deprecated or unused code to streamline the codebase
- Ensuring that all existing functionality is preserved and that the codebase remains functional after the refactoring
2026-03-29 11:13:40 -04:00
parent 7e2038ecac
commit 0543266c92
65 changed files with 11371 additions and 140 deletions
@@ -0,0 +1,495 @@
+#!/usr/bin/env python3
+"""
+Test suite for the threshold checking and alerting system.
+
+Tests cover:
+- Threshold configuration parsing
+- Threshold evaluation (all operators)
+- Hysteresis functionality
+- Alert state tracking
+- State change detection
+- Notification triggering
+- Re-notification logic (listed as a goal; no test in this file exercises it yet)
+"""
+
+import sys
+import time
+from pathlib import Path
+
+# Add parent directory to path for imports
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from hbd.threshold import (
+    ThresholdChecker,
+    ThresholdConfig,
+    AlertLevel,
+    AlertState,
+    ComparisonOperator,
+)
+
+
+def test_threshold_config_basic():
+    """Test basic threshold configuration."""
+    print("Test 1: Basic threshold configuration...")
+    
+    config = ThresholdConfig(
+        metric_path="cpu_monitor.cpu_percent",
+        warning=80.0,
+        critical=90.0,
+        operator=">",
+    )
+    
+    # Test below warning
+    result = config.evaluate(50.0)
+    assert result == AlertLevel.OK, f"Expected OK, got {result}"
+    
+    # Test at warning
+    result = config.evaluate(80.0)
+    assert result == AlertLevel.OK, f"Expected OK at boundary, got {result}"
+    
+    # Test above warning but below critical
+    result = config.evaluate(85.0)
+    assert result == AlertLevel.WARNING, f"Expected WARNING, got {result}"
+    
+    # Test above critical
+    result = config.evaluate(95.0)
+    assert result == AlertLevel.CRITICAL, f"Expected CRITICAL, got {result}"
+    
+    print("  ✓ Basic threshold configuration works")
+
+
+def test_threshold_operators():
+    """Test all comparison operators."""
+    print("\nTest 2: Comparison operators...")
+    
+    # Greater than operator
+    config_gt = ThresholdConfig(
+        metric_path="test.metric",
+        warning=80.0,
+        critical=90.0,
+        operator=">",
+    )
+    assert config_gt.evaluate(85.0) == AlertLevel.WARNING
+    assert config_gt.evaluate(75.0) == AlertLevel.OK
+    
+    # Less than operator (for inverse thresholds like available memory)
+    config_lt = ThresholdConfig(
+        metric_path="memory.available_mb",
+        warning=1000,
+        critical=500,
+        operator="<",
+    )
+    assert config_lt.evaluate(800) == AlertLevel.WARNING, "Should warn when below 1000"
+    assert config_lt.evaluate(400) == AlertLevel.CRITICAL, "Should be critical when below 500"
+    assert config_lt.evaluate(1500) == AlertLevel.OK, "Should be OK when above 1000"
+    
+    # Greater than or equal
+    config_gte = ThresholdConfig(
+        metric_path="test.metric",
+        warning=80.0,
+        operator=">=",
+    )
+    assert config_gte.evaluate(80.0) == AlertLevel.WARNING
+    assert config_gte.evaluate(79.9) == AlertLevel.OK
+    
+    # Less than or equal
+    config_lte = ThresholdConfig(
+        metric_path="test.metric",
+        warning=20.0,
+        operator="<=",
+    )
+    assert config_lte.evaluate(20.0) == AlertLevel.WARNING
+    assert config_lte.evaluate(20.1) == AlertLevel.OK
+    
+    print("  ✓ All comparison operators work correctly")
+
+
+def test_hysteresis():
+    """Test hysteresis to prevent flapping."""
+    print("\nTest 3: Hysteresis...")
+    
+    config = ThresholdConfig(
+        metric_path="cpu_monitor.cpu_percent",
+        warning=80.0,
+        critical=90.0,
+        operator=">",
+        hysteresis=0.1,  # 10% hysteresis
+    )
+    
+    # Start at OK, go to WARNING
+    result = config.evaluate_with_hysteresis(85.0, AlertLevel.OK)
+    assert result == AlertLevel.WARNING, "Should enter WARNING state"
+    
+    # Try to recover with insufficient improvement (within hysteresis)
+    # Warning threshold is 80, hysteresis is 10%, so need to go below 80 - 8 = 72
+    result = config.evaluate_with_hysteresis(77.0, AlertLevel.WARNING)
+    assert result == AlertLevel.WARNING, "Should stay in WARNING (hysteresis)"
+    
+    # Recover with sufficient improvement
+    result = config.evaluate_with_hysteresis(70.0, AlertLevel.WARNING)
+    assert result == AlertLevel.OK, "Should recover to OK"
+    
+    # Test critical hysteresis
+    result = config.evaluate_with_hysteresis(95.0, AlertLevel.WARNING)
+    assert result == AlertLevel.CRITICAL, "Should escalate to CRITICAL"
+    
+    # Try to recover from critical with insufficient improvement
+    # Critical threshold is 90, hysteresis is 10%, so need to go below 90 - 9 = 81
+    result = config.evaluate_with_hysteresis(85.0, AlertLevel.CRITICAL)
+    assert result == AlertLevel.CRITICAL, "Should stay CRITICAL (hysteresis, still above 81)"
+    
+    # Sufficient improvement to drop from CRITICAL (below 81)
+    result = config.evaluate_with_hysteresis(75.0, AlertLevel.CRITICAL)
+    assert result == AlertLevel.OK, "Should drop to OK (below warning threshold)"
+    
+    # Now test dropping from CRITICAL to WARNING
+    result = config.evaluate_with_hysteresis(95.0, AlertLevel.OK)
+    assert result == AlertLevel.CRITICAL, "Should go to CRITICAL"
+    
+    # Drop to between warning and critical, but still in hysteresis zone
+    result = config.evaluate_with_hysteresis(82.0, AlertLevel.CRITICAL)
+    assert result == AlertLevel.CRITICAL, "Should stay CRITICAL (in hysteresis)"
+    
+    # Drop below critical hysteresis but still above warning threshold
+    # At 80.1, we're above WARNING (80) so should evaluate to WARNING
+    result = config.evaluate_with_hysteresis(80.5, AlertLevel.CRITICAL)
+    assert result == AlertLevel.WARNING, "Should drop to WARNING when below critical hysteresis"
+    
+    print("  ✓ Hysteresis prevents flapping")
+
+
+def test_alert_state():
+    """Test alert state tracking."""
+    print("\nTest 4: Alert state tracking...")
+    
+    alert = AlertState("cpu_monitor.cpu_percent")
+    
+    # Initial state
+    assert alert.level == AlertLevel.OK
+    assert alert.notification_count == 0
+    
+    # Update to WARNING - should trigger notification
+    changed = alert.update(AlertLevel.WARNING, 85.0)
+    assert changed == True, "State change should return True"
+    assert alert.level == AlertLevel.WARNING
+    assert alert.last_value == 85.0
+    
+    # Update with same level - no notification
+    changed = alert.update(AlertLevel.WARNING, 86.0)
+    assert changed == False, "No state change should return False"
+    assert alert.last_value == 86.0
+    
+    # Escalate to CRITICAL
+    changed = alert.update(AlertLevel.CRITICAL, 95.0)
+    assert changed == True, "Escalation should trigger notification"
+    assert alert.level == AlertLevel.CRITICAL
+    
+    # Recover to OK
+    changed = alert.update(AlertLevel.OK, 50.0)
+    assert changed == True, "Recovery should trigger notification"
+    assert alert.level == AlertLevel.OK
+    
+    print("  ✓ Alert state tracking works correctly")
+
+
+def test_threshold_checker_parsing():
+    """Test parsing threshold configuration from YAML structure."""
+    print("\nTest 5: Configuration parsing...")
+    
+    config = {
+        "thresholds": {
+            "cpu_monitor": {
+                "cpu_percent": {
+                    "warning": 80.0,
+                    "critical": 90.0,
+                    "operator": ">",
+                    "hysteresis": 0.1,
+                },
+                "load_1min": {
+                    "warning": 4.0,
+                    "critical": 8.0,
+                },
+            },
+            "memory_monitor": {
+                "percent": {
+                    "warning": 85.0,
+                    "critical": 95.0,
+                },
+                "available_mb": {
+                    "warning": 1000,
+                    "critical": 500,
+                    "operator": "<",
+                },
+            },
+            "disk_monitor": {
+                "partitions": {
+                    "/": {
+                        "percent": {
+                            "warning": 80.0,
+                            "critical": 90.0,
+                        },
+                    },
+                    "/home": {
+                        "percent": {
+                            "warning": 85.0,
+                            "critical": 95.0,
+                        },
+                    },
+                },
+            },
+        }
+    }
+    
+    checker = ThresholdChecker(config)
+    
+    # Verify thresholds were parsed
+    assert "cpu_monitor.cpu_percent" in checker.thresholds
+    assert "cpu_monitor.load_1min" in checker.thresholds
+    assert "memory_monitor.percent" in checker.thresholds
+    assert "memory_monitor.available_mb" in checker.thresholds
+    assert "disk_monitor./.percent" in checker.thresholds
+    assert "disk_monitor./home.percent" in checker.thresholds
+    
+    # Verify operators were parsed correctly
+    assert checker.thresholds["cpu_monitor.cpu_percent"].operator == ComparisonOperator.GT
+    assert checker.thresholds["memory_monitor.available_mb"].operator == ComparisonOperator.LT
+    
+    print(f"  ✓ Parsed {len(checker.thresholds)} thresholds correctly")
+
+
+def test_check_plugin_data():
+    """Test checking plugin data against thresholds."""
+    print("\nTest 6: Plugin data checking...")
+    
+    config = {
+        "thresholds": {
+            "cpu_monitor": {
+                "cpu_percent": {
+                    "warning": 80.0,
+                    "critical": 90.0,
+                },
+                "load_1min": {
+                    "warning": 4.0,
+                    "critical": 8.0,
+                },
+            },
+        }
+    }
+    
+    notifications = []
+    
+    def notification_callback(msg):
+        notifications.append(msg)
+    
+    checker = ThresholdChecker(config, notification_callback=notification_callback)
+    alert_states = {}
+    
+    # First check - OK
+    plugin_data = {
+        "cpu_percent": 50.0,
+        "load_1min": 2.0,
+    }
+    
+    state_changes = checker.check_plugin_data(
+        host_name="testhost",
+        plugin_name="cpu_monitor",
+        data=plugin_data,
+        alert_states=alert_states,
+    )
+    
+    assert len(state_changes) == 0, "No thresholds violated, no state changes"
+    assert len(notifications) == 0, "No notifications should be sent"
+    
+    # Second check - WARNING
+    plugin_data = {
+        "cpu_percent": 85.0,
+        "load_1min": 2.0,
+    }
+    
+    state_changes = checker.check_plugin_data(
+        host_name="testhost",
+        plugin_name="cpu_monitor",
+        data=plugin_data,
+        alert_states=alert_states,
+    )
+    
+    assert len(state_changes) == 1, "One metric should change state"
+    assert state_changes[0][0] == "cpu_monitor.cpu_percent"
+    assert state_changes[0][2] == AlertLevel.WARNING
+    assert len(notifications) == 1, "One notification should be sent"
+    assert "WARNING" in notifications[0]
+    assert "testhost" in notifications[0]
+    
+    # Third check - CRITICAL
+    plugin_data = {
+        "cpu_percent": 95.0,
+        "load_1min": 9.0,
+    }
+    
+    notifications.clear()
+    state_changes = checker.check_plugin_data(
+        host_name="testhost",
+        plugin_name="cpu_monitor",
+        data=plugin_data,
+        alert_states=alert_states,
+    )
+    
+    assert len(state_changes) == 2, "Two metrics should change state"
+    assert len(notifications) == 2, "Two notifications should be sent"
+    
+    # Fourth check - Recovery
+    plugin_data = {
+        "cpu_percent": 50.0,
+        "load_1min": 1.0,
+    }
+    
+    notifications.clear()
+    state_changes = checker.check_plugin_data(
+        host_name="testhost",
+        plugin_name="cpu_monitor",
+        data=plugin_data,
+        alert_states=alert_states,
+    )
+    
+    assert len(state_changes) == 2, "Two metrics should recover"
+    assert len(notifications) == 2, "Two recovery notifications"
+    assert any("RECOVERED" in n for n in notifications), "Should have recovery notification"
+    
+    print("  ✓ Plugin data checking and notifications work")
+
+
+def test_nested_metrics():
+    """Test checking nested metrics like disk partitions."""
+    print("\nTest 7: Nested metrics (partitions)...")
+    
+    config = {
+        "thresholds": {
+            "disk_monitor": {
+                "partitions": {
+                    "/": {
+                        "percent": {
+                            "warning": 80.0,
+                            "critical": 90.0,
+                        },
+                    },
+                    "/home": {
+                        "percent": {
+                            "warning": 85.0,
+                            "critical": 95.0,
+                        },
+                    },
+                },
+            },
+        }
+    }
+    
+    notifications = []
+    checker = ThresholdChecker(config, notification_callback=lambda m: notifications.append(m))
+    alert_states = {}
+    
+    plugin_data = {
+        "partitions": {
+            "/": {
+                "percent": 75.0,
+                "free_gb": 50.0,
+            },
+            "/home": {
+                "percent": 88.0,  # Should trigger WARNING
+                "free_gb": 100.0,
+            },
+        },
+    }
+    
+    state_changes = checker.check_plugin_data(
+        host_name="testhost",
+        plugin_name="disk_monitor",
+        data=plugin_data,
+        alert_states=alert_states,
+    )
+    
+    assert len(state_changes) == 1, "One partition should trigger alert"
+    assert "/home" in state_changes[0][0], "Should be /home partition"
+    assert state_changes[0][2] == AlertLevel.WARNING
+    assert len(notifications) == 1
+    
+    print("  ✓ Nested metric checking works")
+
+
+def test_alert_summary():
+    """Test getting alert summaries."""
+    print("\nTest 8: Alert summaries...")
+    
+    config = {
+        "thresholds": {
+            "cpu_monitor": {
+                "cpu_percent": {"warning": 80.0, "critical": 90.0},
+                "load_1min": {"warning": 4.0, "critical": 8.0},
+            },
+            "memory_monitor": {
+                "percent": {"warning": 85.0, "critical": 95.0},
+            },
+        }
+    }
+    
+    checker = ThresholdChecker(config)
+    alert_states = {}
+    
+    # Create some alert states
+    plugin_data = {
+        "cpu_percent": 85.0,  # WARNING
+        "load_1min": 9.0,     # CRITICAL
+    }
+    checker.check_plugin_data("testhost", "cpu_monitor", plugin_data, alert_states)
+    
+    plugin_data = {
+        "percent": 96.0,  # CRITICAL
+    }
+    checker.check_plugin_data("testhost", "memory_monitor", plugin_data, alert_states)
+    
+    # Get summary
+    summary = checker.get_alert_summary(alert_states)
+    assert summary["warning"] == 1, "Should have 1 warning"
+    assert summary["critical"] == 2, "Should have 2 critical"
+    
+    # Get active alerts
+    active = checker.get_active_alerts(alert_states)
+    assert len(active) == 3, "Should have 3 active alerts"
+    
+    print("  ✓ Alert summaries work correctly")
+
+
+def run_all_tests():
+    """Run all tests."""
+    print("=" * 70)
+    print("THRESHOLD SYSTEM TEST SUITE")
+    print("=" * 70)
+    
+    try:
+        test_threshold_config_basic()
+        test_threshold_operators()
+        test_hysteresis()
+        test_alert_state()
+        test_threshold_checker_parsing()
+        test_check_plugin_data()
+        test_nested_metrics()
+        test_alert_summary()
+        
+        print("\n" + "=" * 70)
+        print("✓ ALL TESTS PASSED")
+        print("=" * 70)
+        return 0
+    
+    except AssertionError as e:
+        print(f"\n✗ TEST FAILED: {e}")
+        import traceback
+        traceback.print_exc()
+        return 1
+    except Exception as e:
+        print(f"\n✗ UNEXPECTED ERROR: {e}")
+        import traceback
+        traceback.print_exc()
+        return 1
+
+
+# Script entry point: run the suite and use its return value as the exit status.
+if __name__ == "__main__":
+    sys.exit(run_all_tests())