heartbeat/tests/test_threshold.py

#!/usr/bin/env python3
"""
Test suite for the threshold checking and alerting system.

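Tests cover:
- Threshold configuration parsing
- Threshold evaluation (all operators)
- Hysteresis functionality
- Alert state tracking
- State change detection
- Notification triggering (including recovery notifications)
- Nested metrics (disk partitions)
- Alert summaries and active-alert listing

Not yet covered: re-notification logic.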
"""

import sys
import time
from pathlib import Path

# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))

from hbd.threshold import (
    ThresholdChecker,
    ThresholdConfig,
    AlertLevel,
    AlertState,
    ComparisonOperator,
)


def test_threshold_config_basic():
    """Check that a ">" threshold maps sample values to the expected levels.

    Uses warning=80 / critical=90 and probes one value below the warning
    limit, one exactly on it, one between the limits and one above critical.
    """
    print("Test 1: Basic threshold configuration...")

    cpu_threshold = ThresholdConfig(
        metric_path="cpu_monitor.cpu_percent",
        warning=80.0,
        critical=90.0,
        operator=">",
    )

    # (sample value, expected level, description used in the failure message)
    scenarios = (
        (50.0, AlertLevel.OK, "OK"),
        # Exactly on the warning limit: the test expects ">" to be strict.
        (80.0, AlertLevel.OK, "OK at boundary"),
        (85.0, AlertLevel.WARNING, "WARNING"),
        (95.0, AlertLevel.CRITICAL, "CRITICAL"),
    )

    for sample, expected, description in scenarios:
        level = cpu_threshold.evaluate(sample)
        assert level == expected, f"Expected {description}, got {level}"

    print("  ✓ Basic threshold configuration works")


def test_threshold_operators():
    """Exercise each comparison operator used here: ">", "<", ">=" and "<="."""
    print("\nTest 2: Comparison operators...")

    ok = AlertLevel.OK
    warning = AlertLevel.WARNING

    # ">": alert once the value rises above the limit.
    above = ThresholdConfig(
        metric_path="test.metric", warning=80.0, critical=90.0, operator=">"
    )
    assert above.evaluate(85.0) == warning
    assert above.evaluate(75.0) == ok

    # "<": inverse threshold (e.g. available memory), where smaller values
    # are worse and the critical limit sits below the warning limit.
    below = ThresholdConfig(
        metric_path="memory.available_mb", warning=1000, critical=500, operator="<"
    )
    assert below.evaluate(800) == warning, "Should warn when below 1000"
    assert below.evaluate(400) == AlertLevel.CRITICAL, "Should be critical when below 500"
    assert below.evaluate(1500) == ok, "Should be OK when above 1000"

    # ">=": the boundary value itself already counts as a violation.
    at_or_above = ThresholdConfig(metric_path="test.metric", warning=80.0, operator=">=")
    assert at_or_above.evaluate(80.0) == warning
    assert at_or_above.evaluate(79.9) == ok

    # "<=": same boundary behaviour for the inverse direction.
    at_or_below = ThresholdConfig(metric_path="test.metric", warning=20.0, operator="<=")
    assert at_or_below.evaluate(20.0) == warning
    assert at_or_below.evaluate(20.1) == ok

    print("  ✓ All comparison operators work correctly")


def test_hysteresis():
    """Check that hysteresis keeps an alert raised until the value clearly recovers.

    Each step feeds a value plus the current level into
    evaluate_with_hysteresis() and compares the returned level. The
    expectations assume hysteresis=0.1 means 10% of the threshold, i.e.
    recovery from WARNING below 80 - 8 = 72 and from CRITICAL below
    90 - 9 = 81.
    """
    print("\nTest 3: Hysteresis...")

    ok, warning, critical = AlertLevel.OK, AlertLevel.WARNING, AlertLevel.CRITICAL

    threshold = ThresholdConfig(
        metric_path="cpu_monitor.cpu_percent",
        warning=80.0,
        critical=90.0,
        operator=">",
        hysteresis=0.1,
    )

    # (value, current level, expected level, failure message), evaluated in order.
    steps = (
        # OK -> WARNING once the warning limit is exceeded.
        (85.0, ok, warning, "Should enter WARNING state"),
        # 77 is under the limit but not under the 72 recovery point.
        (77.0, warning, warning, "Should stay in WARNING (hysteresis)"),
        # 70 is under the recovery point, so the alert clears.
        (70.0, warning, ok, "Should recover to OK"),
        # Escalation is immediate.
        (95.0, warning, critical, "Should escalate to CRITICAL"),
        # 85 is under the critical limit but not under the 81 recovery point.
        (85.0, critical, critical, "Should stay CRITICAL (hysteresis, still above 81)"),
        # 75 is under the critical recovery point and under the warning limit.
        (75.0, critical, ok, "Should drop to OK (below warning threshold)"),
        # Straight from OK to CRITICAL.
        (95.0, ok, critical, "Should go to CRITICAL"),
        # 82 lies between the limits but is still above the 81 recovery point.
        (82.0, critical, critical, "Should stay CRITICAL (in hysteresis)"),
        # 80.5 is under the 81 recovery point yet above the warning limit (80).
        (80.5, critical, warning, "Should drop to WARNING when below critical hysteresis"),
    )

    for value, current, expected, message in steps:
        outcome = threshold.evaluate_with_hysteresis(value, current)
        assert outcome == expected, message

    print("  ✓ Hysteresis prevents flapping")


def test_alert_state():
    """Verify AlertState level tracking and the change flag returned by update().

    Walks one metric through OK -> WARNING -> WARNING -> CRITICAL -> OK and
    checks that update() reports a change only when the level differs from
    the previous one, and that the latest value is recorded even when the
    level is unchanged.
    """
    print("\nTest 4: Alert state tracking...")

    alert = AlertState("cpu_monitor.cpu_percent")

    # A fresh state starts at OK with no notifications counted.
    assert alert.level == AlertLevel.OK
    assert alert.notification_count == 0

    # OK -> WARNING is a level change and must be reported.
    changed = alert.update(AlertLevel.WARNING, 85.0)
    assert changed, "State change should return True"
    assert alert.level == AlertLevel.WARNING
    assert alert.last_value == 85.0

    # Same level again: no change reported, but the value is still refreshed.
    changed = alert.update(AlertLevel.WARNING, 86.0)
    assert not changed, "No state change should return False"
    assert alert.last_value == 86.0

    # WARNING -> CRITICAL escalation counts as a change.
    changed = alert.update(AlertLevel.CRITICAL, 95.0)
    assert changed, "Escalation should trigger notification"
    assert alert.level == AlertLevel.CRITICAL

    # CRITICAL -> OK recovery counts as a change too.
    changed = alert.update(AlertLevel.OK, 50.0)
    assert changed, "Recovery should trigger notification"
    assert alert.level == AlertLevel.OK

    print("  ✓ Alert state tracking works correctly")


def test_threshold_checker_parsing():
    """Check that ThresholdChecker flattens a nested config into dotted metric paths.

    The config mirrors the YAML layout: plugin -> metric -> limits, with
    disk partitions nested one level deeper under "partitions".
    """
    print("\nTest 5: Configuration parsing...")

    cpu_section = {
        "cpu_percent": {
            "warning": 80.0,
            "critical": 90.0,
            "operator": ">",
            "hysteresis": 0.1,
        },
        # No explicit operator: relies on whatever default the checker applies.
        "load_1min": {"warning": 4.0, "critical": 8.0},
    }
    memory_section = {
        "percent": {"warning": 85.0, "critical": 95.0},
        "available_mb": {"warning": 1000, "critical": 500, "operator": "<"},
    }
    disk_section = {
        "partitions": {
            "/": {"percent": {"warning": 80.0, "critical": 90.0}},
            "/home": {"percent": {"warning": 85.0, "critical": 95.0}},
        },
    }

    checker = ThresholdChecker(
        {
            "thresholds": {
                "cpu_monitor": cpu_section,
                "memory_monitor": memory_section,
                "disk_monitor": disk_section,
            }
        }
    )
    parsed = checker.thresholds

    # Every configured metric should be reachable under its dotted path;
    # partition entries are expected without the "partitions" segment.
    assert "cpu_monitor.cpu_percent" in parsed
    assert "cpu_monitor.load_1min" in parsed
    assert "memory_monitor.percent" in parsed
    assert "memory_monitor.available_mb" in parsed
    assert "disk_monitor./.percent" in parsed
    assert "disk_monitor./home.percent" in parsed

    # Operator strings should have been converted to ComparisonOperator members.
    assert parsed["cpu_monitor.cpu_percent"].operator == ComparisonOperator.GT
    assert parsed["memory_monitor.available_mb"].operator == ComparisonOperator.LT

    print(f"  ✓ Parsed {len(parsed)} thresholds correctly")


def test_check_plugin_data():
    """Drive cpu_monitor data through OK -> WARNING -> CRITICAL -> recovery.

    After each round the test inspects the state changes returned by
    check_plugin_data() and the messages delivered to the notification
    callback.
    """
    print("\nTest 6: Plugin data checking...")

    limits = {
        "thresholds": {
            "cpu_monitor": {
                "cpu_percent": {"warning": 80.0, "critical": 90.0},
                "load_1min": {"warning": 4.0, "critical": 8.0},
            },
        }
    }

    sent = []
    checker = ThresholdChecker(limits, notification_callback=lambda msg: sent.append(msg))
    states = {}

    def check(cpu_percent, load_1min):
        """Submit one cpu_monitor sample for "testhost" and return the state changes."""
        return checker.check_plugin_data(
            host_name="testhost",
            plugin_name="cpu_monitor",
            data={"cpu_percent": cpu_percent, "load_1min": load_1min},
            alert_states=states,
        )

    # Round 1: everything within limits.
    changes = check(50.0, 2.0)
    assert len(changes) == 0, "No thresholds violated, no state changes"
    assert len(sent) == 0, "No notifications should be sent"

    # Round 2: only cpu_percent crosses its warning limit.
    changes = check(85.0, 2.0)
    assert len(changes) == 1, "One metric should change state"
    assert changes[0][0] == "cpu_monitor.cpu_percent"
    assert changes[0][2] == AlertLevel.WARNING
    assert len(sent) == 1, "One notification should be sent"
    assert "WARNING" in sent[0]
    assert "testhost" in sent[0]

    # Round 3: both metrics exceed their critical limits.
    sent.clear()
    changes = check(95.0, 9.0)
    assert len(changes) == 2, "Two metrics should change state"
    assert len(sent) == 2, "Two notifications should be sent"

    # Round 4: both metrics fall back within limits.
    sent.clear()
    changes = check(50.0, 1.0)
    assert len(changes) == 2, "Two metrics should recover"
    assert len(sent) == 2, "Two recovery notifications"
    assert any("RECOVERED" in n for n in sent), "Should have recovery notification"

    print("  ✓ Plugin data checking and notifications work")


def test_nested_metrics():
    """Check per-partition thresholds nested under disk_monitor "partitions"."""
    print("\nTest 7: Nested metrics (partitions)...")

    partition_limits = {
        "/": {"percent": {"warning": 80.0, "critical": 90.0}},
        "/home": {"percent": {"warning": 85.0, "critical": 95.0}},
    }
    limits = {"thresholds": {"disk_monitor": {"partitions": partition_limits}}}

    received = []

    def record(m):
        received.append(m)

    checker = ThresholdChecker(limits, notification_callback=record)
    states = {}

    # "/" stays under its 80% warning limit; "/home" at 88% exceeds its 85%
    # limit. free_gb has no threshold configured.
    sample = {
        "partitions": {
            "/": {"percent": 75.0, "free_gb": 50.0},
            "/home": {"percent": 88.0, "free_gb": 100.0},
        },
    }

    changes = checker.check_plugin_data(
        host_name="testhost",
        plugin_name="disk_monitor",
        data=sample,
        alert_states=states,
    )

    assert len(changes) == 1, "One partition should trigger alert"
    first_change = changes[0]
    assert "/home" in first_change[0], "Should be /home partition"
    assert first_change[2] == AlertLevel.WARNING
    assert len(received) == 1

    print("  ✓ Nested metric checking works")


def test_alert_summary():
    """Check get_alert_summary() counts and get_active_alerts() after raising alerts."""
    print("\nTest 8: Alert summaries...")

    limits = {
        "thresholds": {
            "cpu_monitor": {
                "cpu_percent": {"warning": 80.0, "critical": 90.0},
                "load_1min": {"warning": 4.0, "critical": 8.0},
            },
            "memory_monitor": {
                "percent": {"warning": 85.0, "critical": 95.0},
            },
        }
    }

    checker = ThresholdChecker(limits)
    states = {}

    # Seed the shared state dict: one WARNING (cpu_percent) and two CRITICAL
    # alerts (load_1min, memory percent).
    samples = (
        ("cpu_monitor", {"cpu_percent": 85.0, "load_1min": 9.0}),
        ("memory_monitor", {"percent": 96.0}),
    )
    for plugin, data in samples:
        checker.check_plugin_data("testhost", plugin, data, states)

    # Per-level counts.
    counts = checker.get_alert_summary(states)
    assert counts["warning"] == 1, "Should have 1 warning"
    assert counts["critical"] == 2, "Should have 2 critical"

    # All three raised alerts should be listed as active.
    active_alerts = checker.get_active_alerts(states)
    assert len(active_alerts) == 3, "Should have 3 active alerts"

    print("  ✓ Alert summaries work correctly")


def run_all_tests():
    """Run every test in order and report the outcome.

    Stops at the first failing test, prints the error and its traceback,
    and returns 1. Returns 0 when all tests pass.
    """
    import traceback

    rule = "=" * 70
    print(rule)
    print("THRESHOLD SYSTEM TEST SUITE")
    print(rule)

    suite = (
        test_threshold_config_basic,
        test_threshold_operators,
        test_hysteresis,
        test_alert_state,
        test_threshold_checker_parsing,
        test_check_plugin_data,
        test_nested_metrics,
        test_alert_summary,
    )

    try:
        for test in suite:
            test()

        print("\n" + rule)
        print("✓ ALL TESTS PASSED")
        print(rule)
        return 0

    except Exception as exc:
        # Failed assertions are reported as test failures, anything else as
        # an unexpected error; both print the traceback and yield exit code 1.
        if isinstance(exc, AssertionError):
            print(f"\n✗ TEST FAILED: {exc}")
        else:
            print(f"\n✗ UNEXPECTED ERROR: {exc}")
        # Must run inside the handler: the active exception is cleared afterwards.
        traceback.print_exc()
        return 1


if __name__ == "__main__":
    # Use the suite result (0 = all passed, 1 = failure) as the process exit status.
    raise SystemExit(run_all_tests())