Major refactoring of the codebase, including restructuring of files and directories, renaming of modules and classes, and improvements to the overall organization and readability of the code. This refactoring aims to enhance maintainability, scalability, and clarity of the codebase while preserving existing functionality. The changes include:

- Restructuring of the project directory into client and server components
- Renaming of modules and classes to better reflect their purpose and functionality
- Moving common utilities and configurations to a shared location
- Updating import statements to reflect the new structure
- Adding new documentation files for better clarity on various aspects of the project
- Removing deprecated or unused code to streamline the codebase
- Ensuring that all existing functionality is preserved and that the codebase remains functional after the refactoring
2026-03-29 11:13:40 -04:00
parent 7e2038ecac
commit 0543266c92
65 changed files with 11371 additions and 140 deletions
@@ -0,0 +1,320 @@
+#!/usr/bin/env python3
+"""
+Demonstration of the threshold alerting system.
+
+This script shows how thresholds work by simulating plugin data
+with values that cross various threshold boundaries.
+"""
+
+import sys
+import time
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from hbd.threshold import ThresholdChecker, AlertLevel
+
+
+def demo_basic_thresholds():
+    """Walk one CPU metric up through its thresholds and back down.
+
+    Replays a fixed series of ``cpu_percent`` readings through a
+    ``ThresholdChecker`` configured with warning 80, critical 90, operator
+    ``>`` and hysteresis 0.1, printing the recorded level, every reported
+    transition and, at the end, how many notifications arrived.
+
+    Output goes to stdout; nothing is returned.
+    """
+    rule = "=" * 70
+    print(rule)
+    print("DEMO 1: Basic Threshold Checking")
+    print(rule)
+
+    # One plugin, one metric, configured with the ">" operator.
+    cpu_rule = {"warning": 80.0, "critical": 90.0, "operator": ">", "hysteresis": 0.1}
+    config = {"thresholds": {"cpu_monitor": {"cpu_percent": cpu_rule}}}
+
+    sent = []
+
+    def record_and_echo(message):
+        # Keep the message for the final tally and show it immediately.
+        sent.append(message)
+        print(f"  📧 NOTIFICATION: {message}")
+
+    checker = ThresholdChecker(config, notification_callback=record_and_echo)
+    alert_states = {}
+
+    # (reading, narration) pairs, replayed in order.
+    readings = (
+        (50.0, "Normal operation"),
+        (85.0, "Crosses WARNING threshold"),
+        (87.0, "Still in WARNING"),
+        (95.0, "Escalates to CRITICAL"),
+        (92.0, "Still CRITICAL (in hysteresis)"),
+        (85.0, "Still CRITICAL (above recovery threshold of 81)"),
+        (79.0, "Recovers to OK"),
+        (50.0, "Back to normal"),
+    )
+
+    print("\nSimulating CPU usage over time:")
+    print("-" * 70)
+
+    for reading, note in readings:
+        print(f"\n📊 CPU: {reading}% - {note}")
+
+        transitions = checker.check_plugin_data(
+            host_name="testhost",
+            plugin_name="cpu_monitor",
+            data={"cpu_percent": reading},
+            alert_states=alert_states,
+        )
+
+        state = alert_states.get("cpu_monitor.cpu_percent")
+        if state:
+            print(f"  Current state: {state.level.name}")
+
+        # A falsy result (None or empty) means there is nothing to report.
+        for _metric, previous, current, _value in transitions or ():
+            print(f"  ⚠️  State change: {previous.name} → {current.name}")
+
+    print(f"\n📈 Summary: {len(sent)} notifications sent")
+    print(rule)
+
+
+def demo_multiple_metrics():
+    """Track four metrics across two plugins and print alert summaries.
+
+    Each scenario supplies one ``cpu_monitor`` sample and one
+    ``memory_monitor`` sample. After both are checked against the shared
+    ``alert_states`` mapping, the per-level counts from
+    ``get_alert_summary`` and the entries from ``get_active_alerts`` are
+    printed. Output goes to stdout; nothing is returned.
+    """
+    rule = "=" * 70
+    print("\n\n" + rule)
+    print("DEMO 2: Multiple Metrics and Alert Summary")
+    print(rule)
+
+    # Only available_mb sets an operator ("<"); the other rules omit it.
+    cpu_rules = {
+        "cpu_percent": {"warning": 80.0, "critical": 90.0},
+        "load_1min": {"warning": 4.0, "critical": 8.0},
+    }
+    memory_rules = {
+        "percent": {"warning": 85.0, "critical": 95.0},
+        "available_mb": {
+            "warning": 1000,
+            "critical": 500,
+            "operator": "<",
+        },
+    }
+    config = {
+        "thresholds": {
+            "cpu_monitor": cpu_rules,
+            "memory_monitor": memory_rules,
+        }
+    }
+
+    notifications = []
+    checker = ThresholdChecker(config, notification_callback=notifications.append)
+    alert_states = {}
+
+    print("\nSimulating a system under load:")
+    print("-" * 70)
+
+    # (label, cpu_monitor sample, memory_monitor sample), replayed in order.
+    scenarios = (
+        (
+            "Initial state - all OK",
+            {"cpu_percent": 50.0, "load_1min": 2.0},
+            {"percent": 60.0, "available_mb": 2000},
+        ),
+        (
+            "CPU spikes to WARNING",
+            {"cpu_percent": 85.0, "load_1min": 2.0},
+            {"percent": 60.0, "available_mb": 2000},
+        ),
+        (
+            "Memory also reaches WARNING",
+            {"cpu_percent": 85.0, "load_1min": 2.0},
+            {"percent": 88.0, "available_mb": 800},
+        ),
+        (
+            "CPU escalates to CRITICAL",
+            {"cpu_percent": 95.0, "load_1min": 5.0},
+            {"percent": 88.0, "available_mb": 800},
+        ),
+        (
+            "System recovering",
+            {"cpu_percent": 70.0, "load_1min": 2.0},
+            {"percent": 65.0, "available_mb": 1500},
+        ),
+    )
+
+    for label, cpu_sample, memory_sample in scenarios:
+        print(f"\n📍 {label}")
+
+        # CPU first, then memory, both accumulating into alert_states.
+        samples = (("cpu_monitor", cpu_sample), ("memory_monitor", memory_sample))
+        for plugin, sample in samples:
+            checker.check_plugin_data("testhost", plugin, sample, alert_states)
+
+        summary = checker.get_alert_summary(alert_states)
+        ok, warning, critical = summary["ok"], summary["warning"], summary["critical"]
+        print(f"  Alerts: OK={ok}, WARNING={warning}, CRITICAL={critical}")
+
+        active = checker.get_active_alerts(alert_states)
+        if not active:
+            continue
+        print("  Active alerts:")
+        for alert in active:
+            print(f"    - {alert.metric_path}: {alert.level.name} (value={alert.last_value})")
+
+    print(f"\n📈 Total notifications sent: {len(notifications)}")
+    print(rule)
+
+
+def demo_hysteresis():
+    """Demonstrate hysteresis for readings that hover near a threshold.
+
+    Replays ``cpu_percent`` readings around the 90% critical limit
+    (hysteresis 0.1) and reports, for every reading, the recorded level
+    and whether ``check_plugin_data`` returned any state changes.
+    Output goes to stdout; nothing is returned.
+    """
+    print("\n\n" + "=" * 70)
+    print("DEMO 3: Hysteresis Prevents Flapping")
+    print("=" * 70)
+
+    cpu_rule = {"warning": 80.0, "critical": 90.0, "hysteresis": 0.1}  # 10% hysteresis
+    config = {"thresholds": {"cpu_monitor": {"cpu_percent": cpu_rule}}}
+
+    notifications = []
+    checker = ThresholdChecker(config, notification_callback=lambda m: notifications.append(m))
+    alert_states = {}
+
+    print("\nCritical threshold: 90%")
+    print("Hysteresis: 10%")
+    print("Recovery threshold: 81% (90 - 10% of 90)")
+    print("\nSimulating CPU fluctuating near CRITICAL threshold:")
+    print("-" * 70)
+
+    # Simulate fluctuating values
+    test_values = [
+        (75.0, "Normal"),
+        (92.0, "Crosses CRITICAL"),
+        (88.0, "Drops but still above 81% (stays CRITICAL)"),
+        (86.0, "Still above 81% (stays CRITICAL)"),
+        (83.0, "Still above 81% (stays CRITICAL)"),
+        (80.0, "Below 81% - recovers to OK"),
+        (88.0, "Rises again but below 90% (stays OK)"),
+        (91.0, "Crosses CRITICAL again"),
+    ]
+
+    for value, description in test_values:
+        print(f"\n📊 CPU: {value:5.1f}% - {description}")
+
+        plugin_data = {"cpu_percent": value}
+        state_changes = checker.check_plugin_data(
+            "testhost",
+            "cpu_monitor",
+            plugin_data,
+            alert_states,
+        )
+
+        # .get() yields None when no state is recorded for the metric; skip
+        # the state line in that case instead of raising AttributeError.
+        current_state = alert_states.get("cpu_monitor.cpu_percent")
+        if current_state is not None:
+            print(f"  State: {current_state.level.name}")
+
+        if state_changes:
+            print("  📧 Notification sent (state changed)")
+        else:
+            print("  ✓  No notification (state unchanged - hysteresis working)")
+
+    print(f"\n📈 Notifications sent: {len(notifications)} (without hysteresis would be ≥6)")
+    print("=" * 70)
+
+
+def demo_inverse_threshold():
+    """Demonstrate "<" thresholds, where low readings raise the alert.
+
+    Replays ``available_mb`` readings against a rule with warning 1000,
+    critical 500, operator ``<`` and hysteresis 0.1, printing the recorded
+    level and each reported transition. Output goes to stdout; nothing is
+    returned.
+    """
+    print("\n\n" + "=" * 70)
+    print("DEMO 4: Inverse Thresholds (Alert When Low)")
+    print("=" * 70)
+
+    # Warn when < 1000 MB, critical when < 500 MB.
+    memory_rule = {"warning": 1000, "critical": 500, "operator": "<", "hysteresis": 0.1}
+    config = {"thresholds": {"memory_monitor": {"available_mb": memory_rule}}}
+
+    notifications = []
+    checker = ThresholdChecker(config, notification_callback=lambda m: notifications.append(m))
+    alert_states = {}
+
+    print("\nMonitoring available memory (alert when LOW):")
+    print("WARNING when < 1000 MB, CRITICAL when < 500 MB")
+    print("-" * 70)
+
+    test_values = [
+        (2000, "Plenty of memory"),
+        (800, "Drops below 1000 MB - WARNING"),
+        (450, "Drops below 500 MB - CRITICAL"),
+        (520, "Rises but still in hysteresis zone - stays CRITICAL"),
+        (600, "Enough recovery - back to WARNING"),
+        (1200, "Fully recovered - OK"),
+    ]
+
+    for value, description in test_values:
+        print(f"\n💾 Available: {value} MB - {description}")
+
+        plugin_data = {"available_mb": value}
+        state_changes = checker.check_plugin_data(
+            "testhost",
+            "memory_monitor",
+            plugin_data,
+            alert_states,
+        )
+
+        # .get() yields None when no state is recorded for the metric; skip
+        # the state line in that case instead of raising AttributeError.
+        current_state = alert_states.get("memory_monitor.available_mb")
+        if current_state is not None:
+            print(f"  State: {current_state.level.name}")
+
+        if state_changes:
+            for _metric, old_level, new_level, _val in state_changes:
+                print(f"  📧 {old_level.name} → {new_level.name}")
+
+    print(f"\n📈 Notifications sent: {len(notifications)}")
+    print("=" * 70)
+
+
+if __name__ == "__main__":
+    rule = "=" * 70
+    box = "═" * 68
+    print("\n")
+    print(f"╔{box}╗")
+    print(f"║{' ' * 15}THRESHOLD ALERTING DEMONSTRATION{' ' * 21}║")
+    print(f"╚{box}╝")
+
+    # Run the demos in their numbered order.
+    for demo in (demo_basic_thresholds, demo_multiple_metrics,
+                 demo_hysteresis, demo_inverse_threshold):
+        demo()
+
+    print(f"\n\n{rule}", "DEMONSTRATION COMPLETE", rule, "\nKey takeaways:", sep="\n")
+    for takeaway in (
+        "Thresholds detect when metrics exceed configured limits",
+        "Notifications sent only on state changes, not every check",
+        "Hysteresis prevents alert flapping",
+        "Supports both 'greater than' and 'less than' thresholds",
+        "Multiple metrics can be monitored simultaneously",
+    ):
+        print(f"  • {takeaway}")
+    print("\nFor full documentation, see docs/THRESHOLD_ALERTING.md", rule, "", sep="\n")