ajout de règles supplémentaires pour les alertes

ajout surveillance /var
2021-10-21 16:09:53 +02:00 · 2021-10-21 11:00:11 +02:00
3 changed files with 47 additions and 1 deletions
--- a/dicos/70_prometheus.xml
+++ b/dicos/70_prometheus.xml
@@ -5,6 +5,7 @@
        <file filelist='prometheus'   name='/etc/prometheus/prometheus.yml'          mkdir='True' rm='True'/>
        <file filelist='prometheus-alertmanager' name='/etc/prometheus/alertmanager.yml'        mkdir='True' rm='True'/>
        <file filelist='prometheus-alertmanager' name='/etc/prometheus/rules.d/alert-rules.yml' mkdir='True' rm='True'/>
+        <file filelist='prometheus-alertmanager' name='/etc/prometheus/rules.d/alert-rules-node-exporter.yml' mkdir='True' rm='True'/>
        <file filelist='prometheus-alertmanager' name='/etc/prometheus/rules.d/predict-rules.yml' mkdir='True' rm='True'/>
        <file filelist='grafana'      name='/etc/grafana/grafana.ini'                mkdir='True' rm='True'/>
        <file filelist='grafana'      name='/etc/grafana/provisioning/dashboards/eole.yml' source='grafana-dashboards.yml'     mkdir='True' rm='True'/>
--- a/tmpl/alert-rules-node-exporter.yml
+++ b/tmpl/alert-rules-node-exporter.yml
@@ -0,0 +1,35 @@
+#
+# Alert rules based on node_exporter metrics (network, inodes, swap)
+#
+groups:
+- name: GeneralNodeExporterRules
+  rules:
+  # Inbound traffic summed per instance stays above 100 MB/s (bytes / 1024 / 1024)
+  - alert: TooMuchNetworkThroughputIn
+    expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host unusual network throughput in (instance {{ $labels.instance }})
+      description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+  # Less than 10% of inodes free on a root filesystem that is not read-only
+  - alert: AlertInodes
+    expr: node_filesystem_files_free{mountpoint="/"} / node_filesystem_files{mountpoint="/"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/"} == 0
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host out of inodes (instance {{ $labels.instance }})
+      description: "Disk is almost running out of available inodes (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+  # Over 80% of swap in use. NOTE(review): node_exporter >= 0.16 exposes node_memory_Swap*_bytes - confirm names
+  - alert: HostSwapIsFillingUp
+    expr: (1 - (node_memory_SwapFree / node_memory_SwapTotal)) * 100 > 80
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host swap is filling up (instance {{ $labels.instance }})
+      description: "Swap is filling up (>80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/tmpl/alert-rules.yml
+++ b/tmpl/alert-rules.yml
@@ -34,15 +34,25 @@ groups:
  - alert: filesystem_threshold_exceeded
    expr: node_filesystem_avail{job="%%{job_name_node}",mountpoint="/"} / node_filesystem_size{job="%%{job_name_node}"}
      * 100 < 20
+    for: 2m  # the expression must stay true for 2 minutes before the alert fires
    annotations:
      description: This device's filesystem usage has exceeded the threshold with
        a value of {{ $value }}.
      summary: Instance {{ $labels.instance }} filesystem usage is dangerously high

+  # Heavy "/var" use: less than 20% available for at least 2 minutes
+  - alert: var_filesystem_threshold_exceeded
+    expr: node_filesystem_avail{job="%%{job_name_node}",mountpoint="/var"} / node_filesystem_size{job="%%{job_name_node}"}
+      * 100 < 20
+    for: 2m
+    annotations:
+      description: This device's filesystem usage has exceeded the threshold with a value of {{ $value }}.
+      summary: Instance {{ $labels.instance }} filesystem usage is dangerously high
+
  # Heavy CPU temperature
  - alert: cpu_temp_threshold_exceeded
    expr: avg(node_hwmon_temp_celsius{job="node"}) BY (instance)
-      > 70
+      > 50
    annotations:
      description: This device's cpu temperature has exceeded the threshold with a value
        of {{ $value }}.
Author	SHA1	Message	Date
vfebvre	a3472eaea5	ajout de règles supplémentaires pour les alertes	2021-10-21 16:09:53 +02:00
vfebvre	4ef331f6a1	ajout surveillance /var	2021-10-21 11:00:11 +02:00