2018-06-11 09:04:58 +02:00
|
|
|
#
|
|
|
|
# Alert Rules
|
|
|
|
#
|
|
|
|
groups:
|
|
|
|
- name: EoleRules
|
|
|
|
rules:
|
|
|
|
# Instance is Down
# Fires when the `up` metric of a target has been 0 for one full minute.
- alert: JobInstanceDown
  expr: up == 0
  for: 1m
  annotations:
    # Annotation keys are case-sensitive. They are lower-case here so that
    # this rule exposes the same `description` / `summary` keys as the other
    # rules of this group.
    description: Job {{ $labels.job }} instance {{ $labels.instance }} is down.
    summary: Job instance is down
|
|
|
|
|
|
|
|
# Heavy CPU usage
# Fires when the non-idle CPU share of an instance, expressed in percent,
# is above 80. The idle share is averaged per instance over a 5m window.
# NOTE(review): %%{job_name_node} looks like a template placeholder that is
# substituted before Prometheus loads this file - confirm.
- alert: cpu_threshold_exceeded
  # rate() is used instead of irate(): irate() only considers the last two
  # samples of the window, which makes an alert expression react to very
  # short spikes.
  expr: (100 * (1 - avg by(instance) (rate(node_cpu{job="%%{job_name_node}",mode="idle"}[5m]))))
    > 80
  annotations:
    description: This device's cpu usage has exceeded the threshold with a value
      of {{ $value }}.
    summary: Instance {{ $labels.instance }} CPU usage is dangerously high
|
|
|
|
|
|
|
|
# Heavy Memory usage
# Fires when MemFree + Cached + Buffers, divided by 1e+06, drops below 80.
# NOTE(review): assumes the node_memory_* metrics are reported in bytes, so
# the compared value is in megabytes - confirm against the exporter version.
- alert: mem_threshold_exceeded
  expr: (node_memory_MemFree{job="%%{job_name_node}"} + node_memory_Cached{job="%%{job_name_node}"} + node_memory_Buffers{job="%%{job_name_node}"})
    / 1e+06 < 80
  annotations:
    # {{ $value }} is the result of the expression above, i.e. the amount of
    # memory still available, not the amount in use; the text says so.
    description: This device's available memory has dropped below the threshold
      with a value of {{ $value }} MB.
    summary: Instance {{ $labels.instance }} memory usage is dangerously high
|
|
|
|
|
|
|
|
# Heavy "/" use
# Fires when the space available on the "/" mountpoint is below 20 percent
# of the filesystem size.
- alert: filesystem_threshold_exceeded
  # Both sides select mountpoint="/" explicitly, so the denominator no longer
  # fetches every mountpoint only to have label matching discard them.
  expr: node_filesystem_avail{job="%%{job_name_node}",mountpoint="/"} / node_filesystem_size{job="%%{job_name_node}",mountpoint="/"}
    * 100 < 20
  annotations:
    # {{ $value }} is the percentage of space still available, not the
    # percentage in use; the text says so.
    description: This device's filesystem free space has dropped below the
      threshold with a value of {{ $value }}%.
    summary: Instance {{ $labels.instance }} filesystem usage is dangerously high
|
|
|
|
|
2019-06-07 15:22:30 +02:00
|
|
|
# Heavy CPU temperature
# Fires when the average of node_hwmon_temp_celsius over all series of an
# instance is above 50.
- alert: cpu_temp_threshold_exceeded
  # The job selector uses the same %%{job_name_node} placeholder as the other
  # node rules of this group instead of a hard-coded "node", so the rule keeps
  # matching when the job is configured under another name.
  expr: avg(node_hwmon_temp_celsius{job="%%{job_name_node}"}) BY (instance)
    > 50
  annotations:
    description: This device's cpu temperature has exceeded the threshold with a value
      of {{ $value }}.
    summary: Instance {{ $labels.instance }} CPU temperature is dangerously high
|
|
|
|
|