# eole-prometheus/tmpl/alert-rules.yml

#
# Alert Rules
#
# Top-level list of rule groups; this file declares a single group.
groups:
- name: EoleRules
  # Alert definitions of the group, one list entry per alert.
  rules:
  # Instance is Down
  # Fires once `up == 0` has held for 1m on a job/instance pair.
  # Annotation keys are lowercase (annotation names are case-sensitive) so
  # they match the description/summary keys of the other rules in this group.
  - alert: JobInstanceDown
    expr: up == 0
    for: 1m
    annotations:
      description: Job {{ $labels.job }} instance {{ $labels.instance }} is down.
      summary: Job instance is down

  # Heavy CPU usage
  # Per instance: 100 * (1 - average idle-mode irate of node_cpu over 5m).
  # The alert condition is a result above 80.
  - alert: cpu_threshold_exceeded
    expr: >-
      (100 * (1 - avg by(instance)
      (irate(node_cpu{job="%%{job_name_node}",mode="idle"}[5m]))))
      > 80
    annotations:
      summary: Instance {{ $labels.instance }} CPU usage is dangerously high
      description: >-
        This device's cpu usage has exceeded the threshold
        with a value of {{ $value }}.

  # Heavy Memory usage
  # Sums the MemFree, Cached and Buffers gauges, divides by 1e+06 and fires
  # when the result drops below 80.
  # NOTE(review): presumably the gauges are in bytes, making the threshold
  # 80 megabytes of reclaimable memory - confirm against the exporter.
  - alert: mem_threshold_exceeded
    expr: >-
      (node_memory_MemFree{job="%%{job_name_node}"}
      + node_memory_Cached{job="%%{job_name_node}"}
      + node_memory_Buffers{job="%%{job_name_node}"})
      / 1e+06 < 80
    annotations:
      summary: Instance {{ $labels.instance }} memory usage is dangerously high
      description: >-
        This device's memory usage has exceeded the threshold
        with a value of {{ $value }}.

  # Heavy "/" use
  # Ratio of available space to filesystem size, times 100; the alert
  # condition is a result below 20.
  # NOTE(review): only the numerator carries the mountpoint matcher; PromQL
  # one-to-one label matching should still limit the result to "/" - confirm.
  - alert: filesystem_threshold_exceeded
    expr: >-
      node_filesystem_avail{job="%%{job_name_node}",mountpoint="/"}
      / node_filesystem_size{job="%%{job_name_node}"}
      * 100 < 20
    annotations:
      summary: Instance {{ $labels.instance }} filesystem usage is dangerously high
      description: >-
        This device's filesystem usage has exceeded the threshold
        with a value of {{ $value }}.

  # Heavy CPU temperature
  # Average of node_hwmon_temp_celsius per instance; the alert condition is
  # a result above 50.
  # The job matcher uses the same template variable as the other node_*
  # rules of this group rather than a hard-coded job name, so the rule keeps
  # matching when the job is configured under a different name.
  - alert: cpu_temp_threshold_exceeded
    expr: >-
      avg by(instance) (node_hwmon_temp_celsius{job="%%{job_name_node}"})
      > 50
    annotations:
      description: >-
        This device's cpu temperature has exceeded the threshold
        with a value of {{ $value }}.
      summary: Instance {{ $labels.instance }} CPU temperature is dangerously high
Adding alert rules file template 2018-06-11 09:04:58 +02:00			`#`
			`# Alert Rules`
			`#`
			`groups:`
			`- name: EoleRules`
			`rules:`
			`# Instance is Down`
			`- alert: JobInstanceDown`
			`expr: up == 0`
			`for: 1m`
			`annotations:`
			`DESCRIPTION: Job {{ $labels.job }} instance {{ $labels.instance }} is down.`
			`SUMMARY: Job instance is down`

			`# Heavy CPU usage`
			`- alert: cpu_threshold_exceeded`
			`expr: (100 * (1 - avg by(instance) (irate(node_cpu{job="%%{job_name_node}",mode="idle"}[5m]))))`
			`> 80`
			`annotations:`
			`description: This device's cpu usage has exceeded the threshold with a value`
			`of {{ $value }}.`
			`summary: Instance {{ $labels.instance }} CPU usage is dangerously high`

			`# Heavy Memory usage`
			`- alert: mem_threshold_exceeded`
			`expr: (node_memory_MemFree{job="%%{job_name_node}"} + node_memory_Cached{job="%%{job_name_node}"} + node_memory_Buffers{job="%%{job_name_node}"})`
			`/ 1e+06 < 80`
			`annotations:`
			`description: This device's memory usage has exceeded the threshold with a value`
			`of {{ $value }}.`
			`summary: Instance {{ $labels.instance }} memory usage is dangerously high`

			`# Heavy "/" use`
			`- alert: filesystem_threshold_exceeded`
			`expr: node_filesystem_avail{job="%%{job_name_node}",mountpoint="/"} / node_filesystem_size{job="%%{job_name_node}"}`
réglage niveau alert du filesystème 2018-06-26 14:06:37 +02:00			`* 100 < 20`
Adding alert rules file template 2018-06-11 09:04:58 +02:00			`annotations:`
			`description: This device's filesystem usage has exceeded the threshold with`
			`a value of {{ $value }}.`
			`summary: Instance {{ $labels.instance }} filesystem usage is dangerously high`

ajout alerte température CPU 2019-06-07 15:22:30 +02:00			`# Heavy CPU temperature`
			`- alert: cpu_temp_threshold_exceeded`
			`expr: avg(node_hwmon_temp_celsius{job="node"}) BY (instance)`
			`> 50`
			`annotations:`
			`description: This device's cpu temperature has exceeded the threshold with a value`
			`of {{ $value }}.`
			`summary: Instance {{ $labels.instance }} CPU temperature is dangerously high`