# eole-prometheus/tmpl/alert-rules.yml

#
# Alert Rules
#
groups:
- name: EoleRules
  rules:
  # Instance is down: fires when the "up" series of a scraped target has been
  # 0 for one minute. No label filter is applied, so every job and instance
  # known to Prometheus is covered.
  - alert: JobInstanceDown
    expr: up == 0
    for: 1m
    annotations:
      # Annotation names are case-sensitive; lower-case keys keep this rule
      # consistent with the other rules of this group.
      description: 'Job {{ $labels.job }} instance {{ $labels.instance }} is down.'
      summary: Job instance is down

  # Heavy CPU usage: the per-instance average of non-idle CPU time, expressed
  # as a percentage, is above 80.
  # rate() is used instead of irate(): irate() only considers the last two
  # samples of the range, so a single short spike would trigger the alert.
  # NOTE(review): node_cpu is the metric name exposed by node_exporter before
  # 0.16 (later renamed node_cpu_seconds_total) - confirm the deployed version.
  - alert: cpu_threshold_exceeded
    expr: >-
      (100 * (1 - avg by(instance)
      (rate(node_cpu{job="%%{job_name_node}",mode="idle"}[5m]))))
      > 80
    annotations:
      description: >-
        This device's cpu usage has exceeded the threshold with a value
        of {{ $value }}.
      summary: 'Instance {{ $labels.instance }} CPU usage is dangerously high'

  # Heavy memory usage: fires when the memory still available on the node
  # (free + cached + buffers) divided by 1e+06 falls below 80.
  # NOTE(review): the node_memory_* metrics are presumably in bytes, which
  # makes the threshold 80 megabytes - confirm against the exporter in use.
  - alert: mem_threshold_exceeded
    expr: >-
      (node_memory_MemFree{job="%%{job_name_node}"}
      + node_memory_Cached{job="%%{job_name_node}"}
      + node_memory_Buffers{job="%%{job_name_node}"})
      / 1e+06 < 80
    annotations:
      # The alert value is the remaining memory, not the memory in use, so the
      # description reports it as such.
      description: >-
        This device's available memory (free + cached + buffers) has dropped
        below the threshold, with {{ $value }} MB remaining.
      summary: 'Instance {{ $labels.instance }} memory usage is dangerously high'

  # Heavy "/" use: fires when more than 90 percent of the root filesystem is
  # in use. Usage is computed as 100 * (1 - avail / size) so that the alert
  # value is the used percentage that the annotations talk about.
  # Both selectors are restricted to mountpoint "/" so only the root
  # filesystem series are evaluated.
  - alert: filesystem_threshold_exceeded
    expr: >-
      100 * (1
      - node_filesystem_avail{job="%%{job_name_node}",mountpoint="/"}
      / node_filesystem_size{job="%%{job_name_node}",mountpoint="/"})
      > 90
    annotations:
      description: >-
        This device's filesystem usage has exceeded the threshold with
        a value of {{ $value }}.
      summary: 'Instance {{ $labels.instance }} filesystem usage is dangerously high'