Major refactoring of the codebase, including restructuring of files and directories, renaming of modules and classes, and improvements to the overall organization and readability of the code. This refactoring aims to enhance maintainability, scalability, and clarity of the codebase while preserving existing functionality. The changes include:
- Restructuring of the project directory into client and server components - Renaming of modules and classes to better reflect their purpose and functionality - Moving common utilities and configurations to a shared location - Updating import statements to reflect the new structure - Adding new documentation files for better clarity on various aspects of the project - Removing deprecated or unused code to streamline the codebase - Ensuring that all existing functionality is preserved and that the codebase remains functional after the refactoring.
This commit is contained in:
@@ -0,0 +1,254 @@
|
||||
# ==============================================================================
|
||||
# Heartbeat Daemon Threshold Configuration Example
|
||||
# ==============================================================================
|
||||
# This file demonstrates threshold configuration for the Heartbeat monitoring system.
|
||||
# Thresholds can be defined for any metric collected by monitoring plugins.
|
||||
#
|
||||
# Threshold levels:
|
||||
# - WARNING: First level of concern, typically for early notification
|
||||
# - CRITICAL: Severe condition requiring immediate attention
|
||||
#
|
||||
# Alert notifications are sent when:
|
||||
# - A metric crosses from OK to WARNING or CRITICAL
|
||||
# - A metric crosses from WARNING to CRITICAL
|
||||
# - A metric recovers (returns to a lower severity level)
|
||||
#
|
||||
# Re-notifications are sent for ongoing alerts based on threshold_renotify_interval.
|
||||
# ==============================================================================
|
||||
|
||||
# Global threshold settings
|
||||
threshold_renotify_interval: 3600 # Re-notify every hour for ongoing alerts (seconds)
|
||||
|
||||
# Threshold definitions per plugin
|
||||
thresholds:
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# CPU Monitor Thresholds
|
||||
# ----------------------------------------------------------------------------
|
||||
cpu_monitor:
|
||||
# Overall CPU usage percentage (0-100)
|
||||
cpu_percent:
|
||||
warning: 80.0 # Warn when CPU usage exceeds 80%
|
||||
critical: 90.0 # Critical when CPU usage exceeds 90%
|
||||
operator: ">" # Alert when value is GREATER than threshold
|
||||
hysteresis: 0.1 # 10% hysteresis to prevent flapping
|
||||
enabled: true
|
||||
|
||||
# 1-minute load average
|
||||
load_1min:
|
||||
warning: 4.0 # Warn when 1-min load exceeds 4.0
|
||||
critical: 8.0 # Critical when 1-min load exceeds 8.0
|
||||
operator: ">"
|
||||
hysteresis: 0.15 # 15% hysteresis
|
||||
enabled: true
|
||||
|
||||
# 5-minute load average
|
||||
load_5min:
|
||||
warning: 3.0
|
||||
critical: 6.0
|
||||
operator: ">"
|
||||
hysteresis: 0.15
|
||||
enabled: true
|
||||
|
||||
# 15-minute load average
|
||||
load_15min:
|
||||
warning: 2.0
|
||||
critical: 4.0
|
||||
operator: ">"
|
||||
hysteresis: 0.15
|
||||
enabled: true
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Memory Monitor Thresholds
|
||||
# ----------------------------------------------------------------------------
|
||||
memory_monitor:
|
||||
# Memory usage percentage
|
||||
percent:
|
||||
warning: 85.0 # Warn at 85% memory usage
|
||||
critical: 95.0 # Critical at 95% memory usage
|
||||
operator: ">"
|
||||
hysteresis: 0.1
|
||||
enabled: true
|
||||
|
||||
# Available memory in MB (inverse threshold - alert when LOW)
|
||||
available_mb:
|
||||
warning: 1000 # Warn when less than 1GB available
|
||||
critical: 500 # Critical when less than 500MB available
|
||||
operator: "<" # Alert when value is LESS than threshold
|
||||
hysteresis: 0.1
|
||||
enabled: true
|
||||
|
||||
# Swap usage percentage
|
||||
swap_percent:
|
||||
warning: 50.0 # Warn at 50% swap usage
|
||||
critical: 80.0 # Critical at 80% swap usage
|
||||
operator: ">"
|
||||
hysteresis: 0.1
|
||||
enabled: true
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Disk Monitor Thresholds
|
||||
# ----------------------------------------------------------------------------
|
||||
disk_monitor:
|
||||
# Partition-specific thresholds
|
||||
# Use the mount point as the key
|
||||
partitions:
|
||||
# Root filesystem
|
||||
/:
|
||||
percent:
|
||||
warning: 80.0 # Warn at 80% disk usage
|
||||
critical: 90.0 # Critical at 90% disk usage
|
||||
operator: ">"
|
||||
hysteresis: 0.05 # 5% hysteresis for disk (more stable)
|
||||
enabled: true
|
||||
|
||||
free_gb:
|
||||
warning: 10.0 # Warn when less than 10GB free
|
||||
critical: 5.0 # Critical when less than 5GB free
|
||||
operator: "<"
|
||||
hysteresis: 0.1
|
||||
enabled: true
|
||||
|
||||
# Home filesystem (if separate partition)
|
||||
/home:
|
||||
percent:
|
||||
warning: 85.0
|
||||
critical: 95.0
|
||||
operator: ">"
|
||||
hysteresis: 0.05
|
||||
enabled: true
|
||||
|
||||
# Var filesystem (logs, etc.)
|
||||
/var:
|
||||
percent:
|
||||
warning: 80.0
|
||||
critical: 90.0
|
||||
operator: ">"
|
||||
hysteresis: 0.05
|
||||
enabled: true
|
||||
|
||||
free_gb:
|
||||
warning: 5.0 # Var needs space for logs
|
||||
critical: 2.0
|
||||
operator: "<"
|
||||
hysteresis: 0.1
|
||||
enabled: true
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Network Monitor Thresholds
|
||||
# ----------------------------------------------------------------------------
|
||||
network_monitor:
|
||||
# Total error count across all interfaces
|
||||
errors_total:
|
||||
warning: 100 # Warn at 100 errors
|
||||
critical: 1000 # Critical at 1000 errors
|
||||
operator: ">"
|
||||
hysteresis: 0.2 # 20% hysteresis for counters
|
||||
enabled: true
|
||||
|
||||
# Total dropped packets
|
||||
dropin_total:
|
||||
warning: 50
|
||||
critical: 200
|
||||
operator: ">"
|
||||
hysteresis: 0.2
|
||||
enabled: true
|
||||
|
||||
dropout_total:
|
||||
warning: 50
|
||||
critical: 200
|
||||
operator: ">"
|
||||
hysteresis: 0.2
|
||||
enabled: true
|
||||
|
||||
# TCP connections in TIME_WAIT state
|
||||
connections_TIME_WAIT:
|
||||
warning: 1000 # Warn at 1000 TIME_WAIT connections
|
||||
critical: 5000 # Critical at 5000 TIME_WAIT connections
|
||||
operator: ">"
|
||||
hysteresis: 0.2
|
||||
enabled: true
|
||||
|
||||
# Total established connections
|
||||
connections_ESTABLISHED:
|
||||
warning: 500
|
||||
critical: 1000
|
||||
operator: ">"
|
||||
hysteresis: 0.1
|
||||
enabled: true
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Nagios Plugin Thresholds (if using nagios_runner)
|
||||
# ----------------------------------------------------------------------------
|
||||
nagios_runner:
|
||||
# Nagios plugins report exit codes:
|
||||
# 0 = OK, 1 = WARNING, 2 = CRITICAL, 3 = UNKNOWN
|
||||
# We can threshold on the exit_code directly
|
||||
exit_code:
|
||||
warning: 1 # Map Nagios WARNING to our WARNING
|
||||
critical: 2 # Map Nagios CRITICAL to our CRITICAL
|
||||
operator: ">=" # Alert when exit code >= threshold
|
||||
hysteresis: 0.0 # No hysteresis for exit codes
|
||||
enabled: true
|
||||
|
||||
# ==============================================================================
|
||||
# Notification Configuration
|
||||
# ==============================================================================
|
||||
# Configure notification methods (email, pushover, etc.)
|
||||
# These are used when threshold violations occur
|
||||
|
||||
# Email notifications
|
||||
toemail:
|
||||
- admin@example.com
|
||||
- oncall@example.com
|
||||
fromemail: heartbeat@example.com
|
||||
smtpserver: smtp.example.com
|
||||
smtpport: 587
|
||||
smtpuser: heartbeat@example.com
|
||||
smtppassword: your-password-here
|
||||
|
||||
# Pushover notifications (optional)
|
||||
# pushover_token: your-pushover-app-token
|
||||
# pushover_user: your-pushover-user-key
|
||||
|
||||
# Mattermost webhook (optional)
|
||||
# mattermost_url: https://mattermost.example.com/hooks/your-webhook-id
|
||||
|
||||
# ==============================================================================
|
||||
# Watched Hosts
|
||||
# ==============================================================================
|
||||
# Hosts in this list will trigger notifications for:
|
||||
# - Heartbeat timeouts/overdue
|
||||
# - Threshold violations
|
||||
# - Boot messages
|
||||
watchhosts:
|
||||
- webserver01
|
||||
- database01
|
||||
- mailserver
|
||||
- critical-app
|
||||
|
||||
# ==============================================================================
|
||||
# Additional Server Settings
|
||||
# ==============================================================================
|
||||
hb_port: 50003 # UDP port for heartbeat messages
|
||||
hbd_port: 50004 # HTTP port for web interface
|
||||
grace: 10 # Grace period for overdue detection (seconds)
|
||||
debug: 0 # Debug level (0-3)
|
||||
verbose: false # Verbose output
|
||||
|
||||
# Journal settings (message logging)
|
||||
journal_enabled: true
|
||||
journal_path: /var/log/heartbeat/messages.journal
|
||||
journal_max_size: 104857600 # 100MB before rotation
|
||||
journal_max_backups: 10
|
||||
|
||||
# ==============================================================================
|
||||
# Example: Production Configuration with Conservative Thresholds
|
||||
# ==============================================================================
|
||||
# For production systems, consider:
|
||||
# - Higher warning thresholds to reduce alert fatigue
|
||||
# - Appropriate hysteresis values (5-15% typical)
|
||||
# - Re-notification intervals matching on-call rotation
|
||||
# - Multiple escalation contacts
|
||||
# - Integration with incident management systems
|
||||
# ==============================================================================
|
||||
Reference in New Issue
Block a user