Compare commits
2 Commits
master
...
ajout_regl
Author | SHA1 | Date |
---|---|---|
vfebvre | a3472eaea5 | |
vfebvre | 4ef331f6a1 | |
|
@@ -5,6 +5,7 @@
|
||||||
<file filelist='prometheus' name='/etc/prometheus/prometheus.yml' mkdir='True' rm='True'/>
|
<file filelist='prometheus' name='/etc/prometheus/prometheus.yml' mkdir='True' rm='True'/>
|
||||||
<file filelist='prometheus-alertmanager' name='/etc/prometheus/alertmanager.yml' mkdir='True' rm='True'/>
|
<file filelist='prometheus-alertmanager' name='/etc/prometheus/alertmanager.yml' mkdir='True' rm='True'/>
|
||||||
<file filelist='prometheus-alertmanager' name='/etc/prometheus/rules.d/alert-rules.yml' mkdir='True' rm='True'/>
|
<file filelist='prometheus-alertmanager' name='/etc/prometheus/rules.d/alert-rules.yml' mkdir='True' rm='True'/>
|
||||||
|
<file filelist='prometheus-alertmanager' name='/etc/prometheus/rules.d/alert-rules-node-exporter.yml' mkdir='True' rm='True'/>
|
||||||
<file filelist='prometheus-alertmanager' name='/etc/prometheus/rules.d/predict-rules.yml' mkdir='True' rm='True'/>
|
<file filelist='prometheus-alertmanager' name='/etc/prometheus/rules.d/predict-rules.yml' mkdir='True' rm='True'/>
|
||||||
<file filelist='grafana' name='/etc/grafana/grafana.ini' mkdir='True' rm='True'/>
|
<file filelist='grafana' name='/etc/grafana/grafana.ini' mkdir='True' rm='True'/>
|
||||||
<file filelist='grafana' name='/etc/grafana/provisioning/dashboards/eole.yml' source='grafana-dashboards.yml' mkdir='True' rm='True'/>
|
<file filelist='grafana' name='/etc/grafana/provisioning/dashboards/eole.yml' source='grafana-dashboards.yml' mkdir='True' rm='True'/>
|
||||||
|
|
|
@@ -0,0 +1,35 @@
|
||||||
|
#
|
||||||
|
# Alert Rules
|
||||||
|
#
|
||||||
|
groups:
|
||||||
|
- name: GeneralNodeExporterRules
|
||||||
|
rules:
|
||||||
|
# TooMuch Data IN
|
||||||
|
- alert: TooMuchNetworkThroughputIn
|
||||||
|
expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host unusual network throughput in (instance {{ $labels.instance }})
|
||||||
|
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: AlertInodes
|
||||||
|
expr: node_filesystem_files_free{mountpoint ="/"} / node_filesystem_files{mountpoint="/"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/"} == 0
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host out of inodes (instance {{ $labels.instance }})
|
||||||
|
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostSwapIsFillingUp
|
||||||
|
expr: (1 - (node_memory_SwapFree / node_memory_SwapTotal)) * 100 > 80
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host swap is filling up (instance {{ $labels.instance }})
|
||||||
|
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
|
|
@@ -34,6 +34,16 @@ groups:
|
||||||
- alert: filesystem_threshold_exceeded
|
- alert: filesystem_threshold_exceeded
|
||||||
expr: node_filesystem_avail{job="%%{job_name_node}",mountpoint="/"} / node_filesystem_size{job="%%{job_name_node}"}
|
expr: node_filesystem_avail{job="%%{job_name_node}",mountpoint="/"} / node_filesystem_size{job="%%{job_name_node}"}
|
||||||
* 100 < 20
|
* 100 < 20
|
||||||
|
for: 2m
|
||||||
|
annotations:
|
||||||
|
description: This device's filesystem usage has exceeded the threshold with
|
||||||
|
a value of {{ $value }}.
|
||||||
|
summary: Instance {{ $labels.instance }} filesystem usage is dangerously high
|
||||||
|
|
||||||
|
# Heavy "/var" use
|
||||||
|
- alert: var_filesystem_threshold_exceeded
|
||||||
|
expr: node_filesystem_avail{job="node",mountpoint="/var"} / node_filesystem_size{job="node"}
|
||||||
|
* 100 < 20
|
||||||
annotations:
|
annotations:
|
||||||
description: This device's filesystem usage has exceeded the threshold with
|
description: This device's filesystem usage has exceeded the threshold with
|
||||||
a value of {{ $value }}.
|
a value of {{ $value }}.
|
||||||
|
@@ -42,7 +52,7 @@ groups:
|
||||||
# Heavy CPU temperature
|
# Heavy CPU temperature
|
||||||
- alert: cpu_temp_threshold_exceeded
|
- alert: cpu_temp_threshold_exceeded
|
||||||
expr: avg(node_hwmon_temp_celsius{job="node"}) BY (instance)
|
expr: avg(node_hwmon_temp_celsius{job="node"}) BY (instance)
|
||||||
> 70
|
> 50
|
||||||
annotations:
|
annotations:
|
||||||
description: This device's cpu temperature has exceeded the threshold with a value
|
description: This device's cpu temperature has exceeded the threshold with a value
|
||||||
of {{ $value }}.
|
of {{ $value }}.
|
||||||
|
|
Loading…
Reference in New Issue