Major refactoring of the codebase, including restructuring of files and directories, renaming of modules and classes, and improvements to the overall organization and readability of the code. This refactoring aims to enhance maintainability, scalability, and clarity of the codebase while preserving existing functionality. The changes include:

- Restructuring of the project directory into client and server components
- Renaming of modules and classes to better reflect their purpose and functionality
- Moving common utilities and configurations to a shared location
- Updating import statements to reflect the new structure
- Adding new documentation files for better clarity on various aspects of the project
- Removing deprecated or unused code to streamline the codebase
- Ensuring that all existing functionality is preserved and that the codebase still works after the refactoring
This commit is contained in:
Andreas Wrede
2026-03-29 11:13:40 -04:00
parent 7e2038ecac
commit 0543266c92
65 changed files with 11371 additions and 140 deletions
+495
View File
@@ -0,0 +1,495 @@
#!/usr/bin/env python3
"""
Test suite for the threshold checking and alerting system.
Tests cover:
- Threshold configuration parsing
- Threshold evaluation (all operators)
- Hysteresis functionality
- Alert state tracking
- State change detection
- Notification triggering
- Re-notification logic
"""
import sys
import time
from pathlib import Path
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))
from hbd.threshold import (
ThresholdChecker,
ThresholdConfig,
AlertLevel,
AlertState,
ComparisonOperator,
)
def test_threshold_config_basic():
    """Check ThresholdConfig.evaluate() around a ">" warning/critical pair.

    Uses warning=80 and critical=90 on ``cpu_monitor.cpu_percent`` and probes
    one reading per region, plus a reading exactly on the warning boundary,
    which this test expects the strict ">" operator to report as OK.
    """
    print("Test 1: Basic threshold configuration...")

    cpu_threshold = ThresholdConfig(
        metric_path="cpu_monitor.cpu_percent",
        warning=80.0,
        critical=90.0,
        operator=">",
    )

    # (reading, expected level, prefix of the failure message)
    scenarios = (
        (50.0, AlertLevel.OK, "Expected OK"),                # below warning
        (80.0, AlertLevel.OK, "Expected OK at boundary"),    # exactly at warning
        (85.0, AlertLevel.WARNING, "Expected WARNING"),      # between the two
        (95.0, AlertLevel.CRITICAL, "Expected CRITICAL"),    # above critical
    )
    for reading, expected, label in scenarios:
        result = cpu_threshold.evaluate(reading)
        assert result == expected, f"{label}, got {result}"

    print(" ✓ Basic threshold configuration works")
def test_threshold_operators():
    """Exercise evaluate() once for each comparison operator string.

    Covers ">", "<", ">=" and "<=". The ">=" and "<=" configurations define
    only a warning level, so they are probed exactly on the boundary and just
    on the non-alerting side of it.
    """
    print("\nTest 2: Comparison operators...")

    # ">" : alert when the reading rises above the limit.
    rising = ThresholdConfig(
        metric_path="test.metric",
        warning=80.0,
        critical=90.0,
        operator=">",
    )
    for reading, expected in ((85.0, AlertLevel.WARNING), (75.0, AlertLevel.OK)):
        assert rising.evaluate(reading) == expected

    # "<" : inverse threshold (e.g. available memory), where lower is worse.
    falling = ThresholdConfig(
        metric_path="memory.available_mb",
        warning=1000,
        critical=500,
        operator="<",
    )
    falling_cases = (
        (800, AlertLevel.WARNING, "Should warn when below 1000"),
        (400, AlertLevel.CRITICAL, "Should be critical when below 500"),
        (1500, AlertLevel.OK, "Should be OK when above 1000"),
    )
    for reading, expected, reason in falling_cases:
        assert falling.evaluate(reading) == expected, reason

    # ">=" : the boundary value itself must already warn.
    at_or_above = ThresholdConfig(
        metric_path="test.metric",
        warning=80.0,
        operator=">=",
    )
    for reading, expected in ((80.0, AlertLevel.WARNING), (79.9, AlertLevel.OK)):
        assert at_or_above.evaluate(reading) == expected

    # "<=" : mirror image of the case above.
    at_or_below = ThresholdConfig(
        metric_path="test.metric",
        warning=20.0,
        operator="<=",
    )
    for reading, expected in ((20.0, AlertLevel.WARNING), (20.1, AlertLevel.OK)):
        assert at_or_below.evaluate(reading) == expected

    print(" ✓ All comparison operators work correctly")
def test_hysteresis():
    """Check that evaluate_with_hysteresis() resists flapping near a limit.

    The config uses warning=80, critical=90, operator ">" and hysteresis=0.1.
    The expectations below treat that as a 10% margin: leaving WARNING needs a
    reading below 80 - 8 = 72, and leaving CRITICAL needs one below
    90 - 9 = 81. Each step passes the previous level explicitly, so the steps
    are independent of one another.
    """
    print("\nTest 3: Hysteresis...")

    config = ThresholdConfig(
        metric_path="cpu_monitor.cpu_percent",
        warning=80.0,
        critical=90.0,
        operator=">",
        hysteresis=0.1,  # 10% hysteresis
    )

    ok = AlertLevel.OK
    warning = AlertLevel.WARNING
    critical = AlertLevel.CRITICAL

    # (reading, level held before the reading, expected level, failure message)
    steps = (
        # OK -> WARNING once the warning limit is exceeded.
        (85.0, ok, warning, "Should enter WARNING state"),
        # 77 is under 80 but not under the 72 recovery point: still WARNING.
        (77.0, warning, warning, "Should stay in WARNING (hysteresis)"),
        # 70 is under the recovery point: back to OK.
        (70.0, warning, ok, "Should recover to OK"),
        # WARNING -> CRITICAL once the critical limit is exceeded.
        (95.0, warning, critical, "Should escalate to CRITICAL"),
        # 85 is under 90 but not under the 81 recovery point: still CRITICAL.
        (85.0, critical, critical, "Should stay CRITICAL (hysteresis, still above 81)"),
        # 75 clears the critical recovery point and is below the warning limit.
        (75.0, critical, ok, "Should drop to OK (below warning threshold)"),
        # A jump straight from OK to CRITICAL.
        (95.0, ok, critical, "Should go to CRITICAL"),
        # 82 is between the limits but still inside the critical margin.
        (82.0, critical, critical, "Should stay CRITICAL (in hysteresis)"),
        # 80.5 is under the 81 recovery point yet above the warning limit (80).
        (80.5, critical, warning, "Should drop to WARNING when below critical hysteresis"),
    )
    for reading, previous, expected, reason in steps:
        result = config.evaluate_with_hysteresis(reading, previous)
        assert result == expected, reason

    print(" ✓ Hysteresis prevents flapping")
def test_alert_state():
    """Walk one AlertState through OK -> WARNING -> CRITICAL -> OK.

    The assertions pin what this suite relies on: a fresh state is OK with a
    notification count of zero, ``update(level, value)`` reports whether the
    level changed, and ``last_value`` follows the most recent reading even
    when the level stays the same.
    """
    print("\nTest 4: Alert state tracking...")

    alert = AlertState("cpu_monitor.cpu_percent")

    # Initial state
    assert alert.level == AlertLevel.OK
    assert alert.notification_count == 0

    # OK -> WARNING is a level change, so update() must report it.
    # Truthiness is asserted directly instead of comparing with ``== True``.
    changed = alert.update(AlertLevel.WARNING, 85.0)
    assert changed, "State change should return True"
    assert alert.level == AlertLevel.WARNING
    assert alert.last_value == 85.0

    # Same level with a new reading: no change reported, but the value is stored.
    changed = alert.update(AlertLevel.WARNING, 86.0)
    assert not changed, "No state change should return False"
    assert alert.last_value == 86.0

    # Escalate to CRITICAL
    changed = alert.update(AlertLevel.CRITICAL, 95.0)
    assert changed, "Escalation should trigger notification"
    assert alert.level == AlertLevel.CRITICAL

    # Recover to OK
    changed = alert.update(AlertLevel.OK, 50.0)
    assert changed, "Recovery should trigger notification"
    assert alert.level == AlertLevel.OK

    print(" ✓ Alert state tracking works correctly")
def test_threshold_checker_parsing():
    """Check that ThresholdChecker turns a nested config dict into thresholds.

    The dict mirrors the structure of the YAML configuration. After
    construction, ``checker.thresholds`` is expected to be keyed by dotted
    metric paths, with disk partitions appearing as
    ``disk_monitor.<mount point>.percent``, and operator strings are expected
    to be converted to ComparisonOperator members.
    """
    print("\nTest 5: Configuration parsing...")

    cpu_rules = {
        "cpu_percent": {
            "warning": 80.0,
            "critical": 90.0,
            "operator": ">",
            "hysteresis": 0.1,
        },
        "load_1min": {"warning": 4.0, "critical": 8.0},
    }
    memory_rules = {
        "percent": {"warning": 85.0, "critical": 95.0},
        "available_mb": {"warning": 1000, "critical": 500, "operator": "<"},
    }
    disk_rules = {
        "partitions": {
            "/": {"percent": {"warning": 80.0, "critical": 90.0}},
            "/home": {"percent": {"warning": 85.0, "critical": 95.0}},
        },
    }
    config = {
        "thresholds": {
            "cpu_monitor": cpu_rules,
            "memory_monitor": memory_rules,
            "disk_monitor": disk_rules,
        }
    }

    checker = ThresholdChecker(config)

    # Every configured metric must be present under its dotted path.
    expected_paths = (
        "cpu_monitor.cpu_percent",
        "cpu_monitor.load_1min",
        "memory_monitor.percent",
        "memory_monitor.available_mb",
        "disk_monitor./.percent",
        "disk_monitor./home.percent",
    )
    for path in expected_paths:
        assert path in checker.thresholds

    # Explicit operator strings must map onto the enum.
    expected_operators = (
        ("cpu_monitor.cpu_percent", ComparisonOperator.GT),
        ("memory_monitor.available_mb", ComparisonOperator.LT),
    )
    for path, operator in expected_operators:
        assert checker.thresholds[path].operator == operator

    print(f" ✓ Parsed {len(checker.thresholds)} thresholds correctly")
def test_check_plugin_data():
    """Drive check_plugin_data() through OK, WARNING, CRITICAL and recovery.

    The same ``alert_states`` dict is passed on every call so state carries
    over between samples. Each call returns a sequence of state changes; this
    test reads index 0 of an entry as the metric path and index 2 as the new
    level. Notifications are captured through the checker's callback.
    """
    print("\nTest 6: Plugin data checking...")

    config = {
        "thresholds": {
            "cpu_monitor": {
                "cpu_percent": {"warning": 80.0, "critical": 90.0},
                "load_1min": {"warning": 4.0, "critical": 8.0},
            },
        }
    }

    notifications = []
    checker = ThresholdChecker(
        config,
        notification_callback=lambda msg: notifications.append(msg),
    )
    alert_states = {}

    def submit(cpu_percent, load_1min):
        """Send one cpu_monitor sample for "testhost"; return the state changes."""
        return checker.check_plugin_data(
            host_name="testhost",
            plugin_name="cpu_monitor",
            data={"cpu_percent": cpu_percent, "load_1min": load_1min},
            alert_states=alert_states,
        )

    # 1) Both metrics under their limits: nothing happens.
    state_changes = submit(50.0, 2.0)
    assert len(state_changes) == 0, "No thresholds violated, no state changes"
    assert len(notifications) == 0, "No notifications should be sent"

    # 2) CPU crosses its warning limit; load stays fine.
    state_changes = submit(85.0, 2.0)
    assert len(state_changes) == 1, "One metric should change state"
    first_change = state_changes[0]
    assert first_change[0] == "cpu_monitor.cpu_percent"
    assert first_change[2] == AlertLevel.WARNING
    assert len(notifications) == 1, "One notification should be sent"
    message = notifications[0]
    assert "WARNING" in message
    assert "testhost" in message

    # 3) Both metrics exceed their critical limits.
    notifications.clear()
    state_changes = submit(95.0, 9.0)
    assert len(state_changes) == 2, "Two metrics should change state"
    assert len(notifications) == 2, "Two notifications should be sent"

    # 4) Both metrics return to normal.
    notifications.clear()
    state_changes = submit(50.0, 1.0)
    assert len(state_changes) == 2, "Two metrics should recover"
    assert len(notifications) == 2, "Two recovery notifications"
    assert any("RECOVERED" in n for n in notifications), "Should have recovery notification"

    print(" ✓ Plugin data checking and notifications work")
def test_nested_metrics():
    """Check thresholds on metrics nested under ``partitions``.

    Two partitions are configured. Only "/home" (88% against a warning limit
    of 85) should change state; "/" (75% against 80) should not. The extra
    ``free_gb`` field has no threshold configured.
    """
    print("\nTest 7: Nested metrics (partitions)...")

    partition_rules = {
        "/": {"percent": {"warning": 80.0, "critical": 90.0}},
        "/home": {"percent": {"warning": 85.0, "critical": 95.0}},
    }
    config = {
        "thresholds": {
            "disk_monitor": {
                "partitions": partition_rules,
            },
        }
    }

    notifications = []

    def record(m):
        notifications.append(m)

    checker = ThresholdChecker(config, notification_callback=record)
    alert_states = {}

    plugin_data = {
        "partitions": {
            "/": {"percent": 75.0, "free_gb": 50.0},
            "/home": {"percent": 88.0, "free_gb": 100.0},  # Should trigger WARNING
        },
    }

    state_changes = checker.check_plugin_data(
        host_name="testhost",
        plugin_name="disk_monitor",
        data=plugin_data,
        alert_states=alert_states,
    )

    assert len(state_changes) == 1, "One partition should trigger alert"
    only_change = state_changes[0]
    assert "/home" in only_change[0], "Should be /home partition"
    assert only_change[2] == AlertLevel.WARNING
    assert len(notifications) == 1

    print(" ✓ Nested metric checking works")
def test_alert_summary():
    """Check get_alert_summary() and get_active_alerts() on mixed states.

    Three metrics are pushed out of OK, one to WARNING and two to CRITICAL,
    through two check_plugin_data() calls that share one ``alert_states``
    dict. The summary counts and the number of active alerts must match.
    """
    print("\nTest 8: Alert summaries...")

    config = {
        "thresholds": {
            "cpu_monitor": {
                "cpu_percent": {"warning": 80.0, "critical": 90.0},
                "load_1min": {"warning": 4.0, "critical": 8.0},
            },
            "memory_monitor": {
                "percent": {"warning": 85.0, "critical": 95.0},
            },
        }
    }

    checker = ThresholdChecker(config)
    alert_states = {}

    # Populate the shared state: one sample per plugin.
    cpu_sample = {
        "cpu_percent": 85.0,  # WARNING
        "load_1min": 9.0,  # CRITICAL
    }
    checker.check_plugin_data("testhost", "cpu_monitor", cpu_sample, alert_states)

    memory_sample = {
        "percent": 96.0,  # CRITICAL
    }
    checker.check_plugin_data("testhost", "memory_monitor", memory_sample, alert_states)

    # Counts per level.
    summary = checker.get_alert_summary(alert_states)
    assert summary["warning"] == 1, "Should have 1 warning"
    assert summary["critical"] == 2, "Should have 2 critical"

    # Everything that is not OK counts as active.
    active = checker.get_active_alerts(alert_states)
    assert len(active) == 3, "Should have 3 active alerts"

    print(" ✓ Alert summaries work correctly")
def run_all_tests():
    """Run every test in order and return a process exit status.

    Stops at the first failure. Returns 0 when all tests pass and 1 when a
    test raises; an AssertionError is reported as a test failure, any other
    exception as an unexpected error, and the traceback is printed either way.
    """
    banner = "=" * 70
    print(banner)
    print("THRESHOLD SYSTEM TEST SUITE")
    print(banner)

    try:
        suite = (
            test_threshold_config_basic,
            test_threshold_operators,
            test_hysteresis,
            test_alert_state,
            test_threshold_checker_parsing,
            test_check_plugin_data,
            test_nested_metrics,
            test_alert_summary,
        )
        for test_case in suite:
            test_case()

        print("\n" + banner)
        print("✓ ALL TESTS PASSED")
        print(banner)
        return 0
    except Exception as exc:
        # Imported lazily: only needed on the failure path.
        import traceback

        if isinstance(exc, AssertionError):
            print(f"\n✗ TEST FAILED: {exc}")
        else:
            print(f"\n✗ UNEXPECTED ERROR: {exc}")
        traceback.print_exc()
        return 1
# Script entry point: the suite's return value becomes the process exit status.
if __name__ == "__main__":
    exit_status = run_all_tests()
    sys.exit(exit_status)