# heartbeat/hbd/config_thresholds_example.yaml

# ==============================================================================
# Heartbeat Daemon Threshold Configuration Example
# ==============================================================================
# This file demonstrates threshold configuration for the Heartbeat monitoring system.
# Thresholds can be defined for any metric collected by monitoring plugins.
#
# Threshold levels:
#   - WARNING: First level of concern, typically for early notification
#   - CRITICAL: Severe condition requiring immediate attention
#
# Alert notifications are sent when:
#   - A metric crosses from OK to WARNING or CRITICAL
#   - A metric crosses from WARNING to CRITICAL
#   - A metric recovers (returns to a lower severity level)
#
# Re-notifications are sent for ongoing alerts based on threshold_renotify_interval.
# ==============================================================================

# Global threshold settings
# Interval, in seconds, between repeat notifications for an alert that is
# still ongoing (3600 s = one hour).
threshold_renotify_interval: 3600

# Threshold definitions per plugin
thresholds:

  # ----------------------------------------------------------------------------
  # CPU Monitor Thresholds
  # ----------------------------------------------------------------------------
  cpu_monitor:
    # Overall CPU usage, expressed as a percentage (0-100).
    cpu_percent:
      # Alert when the value is GREATER than a threshold.
      operator: '>'
      # WARNING once usage exceeds 80 %, CRITICAL once it exceeds 90 %.
      warning: 80.0
      critical: 90.0
      # 10 % hysteresis to prevent flapping.
      hysteresis: 0.1
      enabled: true

    # Load average over 1 minute.
    load_1min:
      operator: '>'
      # WARNING once the 1-min load exceeds 4.0, CRITICAL once it exceeds 8.0.
      warning: 4.0
      critical: 8.0
      # 15 % hysteresis.
      hysteresis: 0.15
      enabled: true

    # Load average over 5 minutes.
    load_5min:
      operator: '>'
      warning: 3.0
      critical: 6.0
      hysteresis: 0.15
      enabled: true

    # Load average over 15 minutes.
    load_15min:
      operator: '>'
      warning: 2.0
      critical: 4.0
      hysteresis: 0.15
      enabled: true

  # ----------------------------------------------------------------------------
  # Memory Monitor Thresholds
  # ----------------------------------------------------------------------------
  memory_monitor:
    # Memory in use, as a percentage.
    percent:
      operator: '>'
      # WARNING at 85 % memory usage, CRITICAL at 95 %.
      warning: 85.0
      critical: 95.0
      hysteresis: 0.1
      enabled: true

    # Memory still available, in MB. This is an inverse threshold: the alert
    # fires when the value is LOW rather than high.
    available_mb:
      # Alert when the value is LESS than a threshold.
      operator: '<'
      # WARNING below 1000 MB (about 1 GB), CRITICAL below 500 MB.
      warning: 1000
      critical: 500
      hysteresis: 0.1
      enabled: true

    # Swap in use, as a percentage.
    swap_percent:
      operator: '>'
      # WARNING at 50 % swap usage, CRITICAL at 80 %.
      warning: 50.0
      critical: 80.0
      hysteresis: 0.1
      enabled: true

  # ----------------------------------------------------------------------------
  # Disk Monitor Thresholds
  # ----------------------------------------------------------------------------
  disk_monitor:
    # Thresholds are declared per partition; each key under `partitions`
    # is a mount point.
    partitions:
      # Root filesystem.
      '/':
        percent:
          operator: '>'
          # WARNING at 80 % disk usage, CRITICAL at 90 %.
          warning: 80.0
          critical: 90.0
          # 5 % hysteresis for disk (more stable).
          hysteresis: 0.05
          enabled: true

        free_gb:
          # Inverse threshold: alert when free space is LESS than a threshold.
          operator: '<'
          # WARNING under 10 GB free, CRITICAL under 5 GB free.
          warning: 10.0
          critical: 5.0
          hysteresis: 0.1
          enabled: true

      # Home filesystem (only applies when /home is a separate partition).
      '/home':
        percent:
          operator: '>'
          warning: 85.0
          critical: 95.0
          hysteresis: 0.05
          enabled: true

      # Var filesystem (logs, etc.).
      '/var':
        percent:
          operator: '>'
          warning: 80.0
          critical: 90.0
          hysteresis: 0.05
          enabled: true

        free_gb:
          operator: '<'
          # /var needs space for logs: WARNING under 5 GB, CRITICAL under 2 GB.
          warning: 5.0
          critical: 2.0
          hysteresis: 0.1
          enabled: true

  # ----------------------------------------------------------------------------
  # ZFS Monitor Thresholds
  # ----------------------------------------------------------------------------
  zfs_monitor:
    # Pool health check. This is the built-in default, repeated here for
    # reference or as a starting point for an override.
    # `status` is numeric: 0 = ONLINE, 1 = DEGRADED,
    # 2 = SUSPENDED, FAULTED, UNAVAIL, ...
    # Name a specific pool as the key, or use '*' to apply the same rule to
    # every pool.
    pools:
      '*':
        status:
          operator: '>='
          # WARNING when the pool is DEGRADED.
          warning: 1
          # CRITICAL when the pool is SUSPENDED/FAULTED/UNAVAIL.
          critical: 2
          # No hysteresis: a degraded pool is always alerting.
          hysteresis: 0.0
          # Fire immediately; do not wait for a second collection.
          grace: 0
          display: 'ZFS pool {pool_name} is {health}'

      # Per-pool capacity thresholds (optional; add the pools you care about)
      # tank:
      #   capacity:
      #     warning: 75.0       # Warn at 75% used
      #     critical: 90.0      # Critical at 90% used
      #     operator: ">"
      #     hysteresis: 0.05

  # ----------------------------------------------------------------------------
  # Network Monitor Thresholds
  # ----------------------------------------------------------------------------
  network_monitor:
    # Error count, totalled across all interfaces.
    errors_total:
      operator: '>'
      # WARNING at 100 errors, CRITICAL at 1000 errors.
      warning: 100
      critical: 1000
      # 20 % hysteresis for counters.
      hysteresis: 0.2
      enabled: true

    # Total dropped packets.
    # NOTE(review): `dropin` / `dropout` presumably mean inbound / outbound
    # drops -- confirm against the network plugin.
    dropin_total:
      operator: '>'
      warning: 50
      critical: 200
      hysteresis: 0.2
      enabled: true

    dropout_total:
      operator: '>'
      warning: 50
      critical: 200
      hysteresis: 0.2
      enabled: true

    # TCP connections in the TIME_WAIT state.
    connections_TIME_WAIT:
      operator: '>'
      # WARNING at 1000 TIME_WAIT connections, CRITICAL at 5000.
      warning: 1000
      critical: 5000
      hysteresis: 0.2
      enabled: true

    # Total established connections.
    connections_ESTABLISHED:
      operator: '>'
      warning: 500
      critical: 1000
      hysteresis: 0.1
      enabled: true

  # ----------------------------------------------------------------------------
  # Nagios Plugin Thresholds (if using nagios_runner)
  # ----------------------------------------------------------------------------
  nagios_runner:
    # A Nagios plugin reports its result through its exit code:
    #   0 = OK, 1 = WARNING, 2 = CRITICAL, 3 = UNKNOWN
    # so the exit code itself can be thresholded directly.
    exit_code:
      # Alert when the exit code is >= a threshold.
      # Note: with `>=` and `critical: 2`, an UNKNOWN result (3) also meets
      # the critical threshold.
      operator: '>='
      # Nagios WARNING (1) maps to our WARNING.
      warning: 1
      # Nagios CRITICAL (2) maps to our CRITICAL.
      critical: 2
      # No hysteresis for exit codes.
      hysteresis: 0.0
      enabled: true

# ==============================================================================
# Notification Configuration
# ==============================================================================
# Configure notification methods (email, pushover, etc.)
# These are used when threshold violations occur

# Email notifications
# Addresses and SMTP connection settings used for email notifications.
toemail:
  - admin@example.com
  - oncall@example.com
fromemail: heartbeat@example.com
smtpserver: smtp.example.com
smtpport: 587
smtpuser: heartbeat@example.com
# Placeholder only: substitute the real credential locally and do not commit
# it to version control. Keep the value single-quoted so that a password
# made up only of digits, or containing YAML-significant characters (for
# example ` #`, `: `, or a leading `*`, `!`, `&`, `@`), is still read as a
# literal string. Inside single quotes, write a literal ' as ''.
smtppassword: 'your-password-here'

# Pushover notifications (optional)
# pushover_token: your-pushover-app-token
# pushover_user: your-pushover-user-key

# Mattermost webhook (optional)
# mattermost_url: https://mattermost.example.com/hooks/your-webhook-id

# ==============================================================================
# Watched Hosts
# ==============================================================================
# Hosts in this list will trigger notifications for:
#   - Heartbeat timeouts/overdue
#   - Threshold violations
#   - Boot messages
# One host name per entry; quoted so each is always read as a string.
watchhosts:
  - 'webserver01'
  - 'database01'
  - 'mailserver'
  - 'critical-app'

# ==============================================================================
# Additional Server Settings
# ==============================================================================
# UDP port for heartbeat messages.
hb_port: 50003
# HTTP port for the web interface.
hbd_port: 50004
# Grace period, in seconds, for overdue detection.
grace: 10
# Debug level (0-3).
debug: 0
# Verbose output.
verbose: false

# Journal settings (message logging)
journal_enabled: true
journal_path: '/var/log/heartbeat/messages.journal'
# Size at which the journal is rotated:
# 104857600 bytes = 100 * 1024 * 1024 (100 MB).
journal_max_size: 104857600
# NOTE(review): presumably the number of rotated journal files kept --
# confirm against the daemon.
journal_max_backups: 10

# ==============================================================================
# Production Tuning Considerations
# ==============================================================================
# For production systems, consider:
#   - Higher warning thresholds to reduce alert fatigue
#   - Appropriate hysteresis values (5-15% typical)
#   - Re-notification intervals matching on-call rotation
#   - Multiple escalation contacts
#   - Integration with incident management systems
# ==============================================================================