eole-prometheus/tmpl/alert-rules.yml

51 lines
1.8 KiB
YAML
Raw Normal View History

2018-06-11 09:04:58 +02:00
#
# Alert Rules
#
groups:
  # Single rule group holding every alert defined in this file.
  # The job_name_node placeholder used in the selectors below is template
  # syntax, presumably substituted before Prometheus loads the file - confirm.
  # NOTE(review): keep literal percent signs out of comments; the template
  # engine appears to treat them as syntax - confirm.
  - name: EoleRules
    rules:
      # Instance is Down
      # Fires once a target has reported up == 0 for one minute.
      - alert: JobInstanceDown
        expr: up == 0
        for: 1m
        annotations:
          # Lowercase keys, matching every other rule in this group.
          description: >-
            Job {{ $labels.job }} instance {{ $labels.instance }} is down.
          summary: Job instance is down

      # Heavy CPU usage
      # 100 * (1 - average idle rate per instance) must exceed 80 to fire.
      - alert: cpu_threshold_exceeded
        expr: >-
          (100 * (1 - avg by(instance)
          (irate(node_cpu{job="%%{job_name_node}",mode="idle"}[5m]))))
          > 80
        annotations:
          description: >-
            This device's cpu usage has exceeded the threshold with a value
            of {{ $value }}.
          summary: Instance {{ $labels.instance }} CPU usage is dangerously high

      # Heavy Memory usage
      # Fires when MemFree + Cached + Buffers, divided by 1e+06, drops
      # below 80; the reported value is that remaining amount
      # (presumably megabytes - confirm the metric unit).
      - alert: mem_threshold_exceeded
        expr: >-
          (node_memory_MemFree{job="%%{job_name_node}"}
          + node_memory_Cached{job="%%{job_name_node}"}
          + node_memory_Buffers{job="%%{job_name_node}"})
          / 1e+06 < 80
        annotations:
          description: >-
            This device's memory usage has exceeded the threshold with a value
            of {{ $value }}.
          summary: Instance {{ $labels.instance }} memory usage is dangerously high

      # Heavy "/" use
      # Fires when available space on "/" is below 20 percent of its size.
      # Both sides select mountpoint="/" so the ratio is explicitly limited
      # to the root filesystem.
      - alert: filesystem_threshold_exceeded
        expr: >-
          node_filesystem_avail{job="%%{job_name_node}",mountpoint="/"}
          / node_filesystem_size{job="%%{job_name_node}",mountpoint="/"}
          * 100 < 20
        annotations:
          description: >-
            This device's filesystem usage has exceeded the threshold with
            a value of {{ $value }}.
          summary: Instance {{ $labels.instance }} filesystem usage is dangerously high

      # Heavy CPU temperature
      # Averages all node_hwmon_temp_celsius series per instance and fires
      # above 50. The job selector uses the same placeholder as the other
      # rules instead of a hard-coded job name.
      - alert: cpu_temp_threshold_exceeded
        expr: >-
          avg(node_hwmon_temp_celsius{job="%%{job_name_node}"}) BY (instance)
          > 50
        annotations:
          description: >-
            This device's cpu temperature has exceeded the threshold with a value
            of {{ $value }}.
          summary: Instance {{ $labels.instance }} CPU temperature is dangerously high