#
# Alert Rules
#
# Prometheus alerting rules covering scrape health and host-level metrics
# (CPU, memory, root filesystem, hardware temperature).
#
# The job_name_node placeholder used inside the job="..." selectors is
# presumably substituted by the templating step that renders this file
# before Prometheus loads it -- TODO confirm against the deployment tooling.
#
# Only JobInstanceDown declares a "for" duration; the other rules have none,
# so they fire on the first evaluation in which their expression matches.
#
---
groups:
  - name: EoleRules
    rules:
      # Instance is Down
      # Fires when the "up" series of any scraped target has been 0 for one
      # minute. Annotation keys are lowercase like every other rule in this
      # group: annotation names are case-sensitive, so mixed case would make
      # notification templates miss this alert's summary/description.
      - alert: JobInstanceDown
        expr: up == 0
        for: 1m
        annotations:
          description: >-
            Job {{ $labels.job }} instance {{ $labels.instance }} is down.
          summary: Job instance is down

      # Heavy CPU usage
      # 100 * (1 - idle rate), averaged per instance over all idle-mode
      # node_cpu series, using irate over a 5m window; fires above 80.
      - alert: cpu_threshold_exceeded
        expr: >-
          (100 * (1 - avg by(instance)
          (irate(node_cpu{job="%%{job_name_node}",mode="idle"}[5m])))) > 80
        annotations:
          description: >-
            This device's cpu usage has exceeded the threshold
            with a value of {{ $value }}.
          summary: "Instance {{ $labels.instance }} CPU usage is dangerously high"

      # Heavy Memory usage
      # Sums MemFree + Cached + Buffers, divides by 1e+06 and fires when the
      # result drops below 80, i.e. when reclaimable memory is LOW.
      # NOTE(review): assumes the metrics are in bytes, making the threshold
      # roughly 80 MB -- confirm against the node_exporter version in use.
      - alert: mem_threshold_exceeded
        expr: >-
          (node_memory_MemFree{job="%%{job_name_node}"}
          + node_memory_Cached{job="%%{job_name_node}"}
          + node_memory_Buffers{job="%%{job_name_node}"})
          / 1e+06 < 80
        annotations:
          description: >-
            This device's memory usage has exceeded the threshold
            with a value of {{ $value }}.
          summary: "Instance {{ $labels.instance }} memory usage is dangerously high"

      # Heavy "/" use
      # Ratio of available to total space, times 100; fires below 20.
      # Only the left operand filters on mountpoint="/"; the division pairs
      # series with identical label sets, so only "/" appears in the result.
      - alert: filesystem_threshold_exceeded
        expr: >-
          node_filesystem_avail{job="%%{job_name_node}",mountpoint="/"}
          / node_filesystem_size{job="%%{job_name_node}"} * 100 < 20
        annotations:
          description: >-
            This device's filesystem usage has exceeded the threshold
            with a value of {{ $value }}.
          summary: "Instance {{ $labels.instance }} filesystem usage is dangerously high"

      # Heavy CPU temperature
      # Averages every node_hwmon_temp_celsius series per instance and fires
      # above 50. There is no chip/sensor selector, so the average covers all
      # hwmon sensors the exporter reports, not only CPU ones.
      # The job selector uses the same placeholder as the other node rules
      # (it was previously hard-coded to "node").
      - alert: cpu_temp_threshold_exceeded
        expr: >-
          avg(node_hwmon_temp_celsius{job="%%{job_name_node}"})
          BY (instance) > 50
        annotations:
          description: >-
            This device's cpu temperature has exceeded the threshold
            with a value of {{ $value }}.
          summary: "Instance {{ $labels.instance }} CPU temperature is dangerously high"