Files
heartbeat/hbd/config_thresholds_example.yaml
T
Andreas Wrede 0543266c92 Major refactoring of the codebase, including restructuring of files and directories, renaming of modules and classes, and improvements to the overall organization and readability of the code. This refactoring aims to enhance maintainability, scalability, and clarity of the codebase while preserving existing functionality. The changes include:
- Restructuring of the project directory into client and server components
- Renaming of modules and classes to better reflect their purpose and functionality
- Moving common utilities and configurations to a shared location
- Updating import statements to reflect the new structure
- Adding new documentation files for better clarity on various aspects of the project
- Removing deprecated or unused code to streamline the codebase
- Ensuring that all existing functionality is preserved and that the codebase remains functional after the refactoring.
2026-03-29 11:13:40 -04:00

255 lines
8.6 KiB
YAML

# ==============================================================================
# Heartbeat Daemon Threshold Configuration Example
# ==============================================================================
# This file demonstrates threshold configuration for the Heartbeat monitoring system.
# Thresholds can be defined for any metric collected by monitoring plugins.
#
# Threshold levels:
# - WARNING: First level of concern, typically for early notification
# - CRITICAL: Severe condition requiring immediate attention
#
# Alert notifications are sent when:
# - A metric crosses from OK to WARNING or CRITICAL
# - A metric crosses from WARNING to CRITICAL
# - A metric recovers (returns to a lower severity level)
#
# Re-notifications are sent for ongoing alerts based on threshold_renotify_interval.
# ==============================================================================
# Global threshold settings
# Seconds between repeat notifications while an alert remains active
# (3600 = one hour).
threshold_renotify_interval: 3600
# Threshold definitions per plugin.
# Nesting is: plugin -> metric -> settings. Every metric entry below sets:
#   warning / critical  threshold values for the two alert levels
#   operator            comparison applied between the value and the threshold
#   hysteresis          fraction (0.1 = 10%) used to prevent flapping
#   enabled             whether the threshold is enabled
# The nesting must be kept: without it the per-metric keys (warning, critical,
# ...) collide as duplicate keys of a single mapping.
thresholds:
  # --------------------------------------------------------------------------
  # CPU Monitor Thresholds
  # --------------------------------------------------------------------------
  cpu_monitor:
    # Overall CPU usage percentage (0-100)
    cpu_percent:
      warning: 80.0  # Warn when CPU usage exceeds 80%
      critical: 90.0  # Critical when CPU usage exceeds 90%
      operator: ">"  # Alert when value is GREATER than threshold
      hysteresis: 0.1  # 10% hysteresis to prevent flapping
      enabled: true
    # 1-minute load average
    load_1min:
      warning: 4.0  # Warn when 1-min load exceeds 4.0
      critical: 8.0  # Critical when 1-min load exceeds 8.0
      operator: ">"
      hysteresis: 0.15  # 15% hysteresis
      enabled: true
    # 5-minute load average
    load_5min:
      warning: 3.0
      critical: 6.0
      operator: ">"
      hysteresis: 0.15
      enabled: true
    # 15-minute load average
    load_15min:
      warning: 2.0
      critical: 4.0
      operator: ">"
      hysteresis: 0.15
      enabled: true
  # --------------------------------------------------------------------------
  # Memory Monitor Thresholds
  # --------------------------------------------------------------------------
  memory_monitor:
    # Memory usage percentage
    percent:
      warning: 85.0  # Warn at 85% memory usage
      critical: 95.0  # Critical at 95% memory usage
      operator: ">"
      hysteresis: 0.1
      enabled: true
    # Available memory in MB (inverse threshold - alert when LOW)
    available_mb:
      warning: 1000  # Warn when less than 1GB available
      critical: 500  # Critical when less than 500MB available
      operator: "<"  # Alert when value is LESS than threshold
      hysteresis: 0.1
      enabled: true
    # Swap usage percentage
    swap_percent:
      warning: 50.0  # Warn at 50% swap usage
      critical: 80.0  # Critical at 80% swap usage
      operator: ">"
      hysteresis: 0.1
      enabled: true
  # --------------------------------------------------------------------------
  # Disk Monitor Thresholds
  # --------------------------------------------------------------------------
  disk_monitor:
    # Partition-specific thresholds.
    # Use the mount point as the key; metrics nest one level below it.
    partitions:
      # Root filesystem
      /:
        percent:
          warning: 80.0  # Warn at 80% disk usage
          critical: 90.0  # Critical at 90% disk usage
          operator: ">"
          hysteresis: 0.05  # 5% hysteresis for disk (more stable)
          enabled: true
        free_gb:
          warning: 10.0  # Warn when less than 10GB free
          critical: 5.0  # Critical when less than 5GB free
          operator: "<"
          hysteresis: 0.1
          enabled: true
      # Home filesystem (if separate partition)
      /home:
        percent:
          warning: 85.0
          critical: 95.0
          operator: ">"
          hysteresis: 0.05
          enabled: true
      # Var filesystem (logs, etc.)
      /var:
        percent:
          warning: 80.0
          critical: 90.0
          operator: ">"
          hysteresis: 0.05
          enabled: true
        free_gb:
          warning: 5.0  # Var needs space for logs
          critical: 2.0
          operator: "<"
          hysteresis: 0.1
          enabled: true
  # --------------------------------------------------------------------------
  # Network Monitor Thresholds
  # --------------------------------------------------------------------------
  network_monitor:
    # Total error count across all interfaces
    errors_total:
      warning: 100  # Warn at 100 errors
      critical: 1000  # Critical at 1000 errors
      operator: ">"
      hysteresis: 0.2  # 20% hysteresis for counters
      enabled: true
    # Total dropped packets (dropin_total and dropout_total)
    dropin_total:
      warning: 50
      critical: 200
      operator: ">"
      hysteresis: 0.2
      enabled: true
    dropout_total:
      warning: 50
      critical: 200
      operator: ">"
      hysteresis: 0.2
      enabled: true
    # TCP connections in TIME_WAIT state
    connections_TIME_WAIT:
      warning: 1000  # Warn at 1000 TIME_WAIT connections
      critical: 5000  # Critical at 5000 TIME_WAIT connections
      operator: ">"
      hysteresis: 0.2
      enabled: true
    # Total established connections
    connections_ESTABLISHED:
      warning: 500
      critical: 1000
      operator: ">"
      hysteresis: 0.1
      enabled: true
  # --------------------------------------------------------------------------
  # Nagios Plugin Thresholds (if using nagios_runner)
  # --------------------------------------------------------------------------
  nagios_runner:
    # Nagios plugins report exit codes:
    # 0 = OK, 1 = WARNING, 2 = CRITICAL, 3 = UNKNOWN
    # We can threshold on the exit_code directly.
    # Note: with ">=" an UNKNOWN result (3) also satisfies the critical
    # threshold of 2, so it is treated as CRITICAL.
    exit_code:
      warning: 1  # Map Nagios WARNING to our WARNING
      critical: 2  # Map Nagios CRITICAL to our CRITICAL
      operator: ">="  # Alert when exit code >= threshold
      hysteresis: 0.0  # No hysteresis for exit codes
      enabled: true
# ==============================================================================
# Notification Configuration
# ==============================================================================
# Configure notification methods (email, pushover, etc.)
# These are used when threshold violations occur
# Email notifications: recipient list, sender address, and SMTP settings.
toemail:
  - admin@example.com
  - oncall@example.com
fromemail: heartbeat@example.com
smtpserver: smtp.example.com
smtpport: 587
smtpuser: heartbeat@example.com
# Placeholder value: replace it before use, and keep real credentials out of
# version control.
smtppassword: your-password-here
# Pushover notifications (optional)
# pushover_token: your-pushover-app-token
# pushover_user: your-pushover-user-key
# Mattermost webhook (optional)
# mattermost_url: https://mattermost.example.com/hooks/your-webhook-id
# ==============================================================================
# Watched Hosts
# ==============================================================================
# Hosts in this list will trigger notifications for:
# - Heartbeat timeouts/overdue
# - Threshold violations
# - Boot messages
# One host name per list entry.
watchhosts:
  - webserver01
  - database01
  - mailserver
  - critical-app
# ==============================================================================
# Additional Server Settings
# ==============================================================================
# UDP port for heartbeat messages
hb_port: 50003
# HTTP port for the web interface
hbd_port: 50004
# Grace period for overdue detection, in seconds
grace: 10
# Debug level (0-3)
debug: 0
# Verbose output
verbose: false
# Journal settings (message logging)
journal_enabled: true
journal_path: /var/log/heartbeat/messages.journal
# Size reached before rotation: 104857600 bytes = 100 * 1024 * 1024 (100MB)
journal_max_size: 104857600
# NOTE(review): presumably the number of rotated journal files kept - confirm
journal_max_backups: 10
# ==============================================================================
# Production Guidance: Choosing Conservative Thresholds
# ==============================================================================
# For production systems, consider:
# - Higher warning thresholds to reduce alert fatigue
# - Appropriate hysteresis values (5-15% typical)
# - Re-notification intervals matching on-call rotation
# - Multiple escalation contacts
# - Integration with incident management systems
# ==============================================================================