Browse Source

Adding alert rules file template

2.6.2/master
Philippe Caseiro 1 year ago
parent
commit
1013775b1a
3 changed files with 46 additions and 4 deletions
  1. 4
    3
      dicos/70_prometheus.xml
  2. 41
    0
      tmpl/alert-rules.yml
  3. 1
    1
      tmpl/prometheus.yml

+ 4
- 3
dicos/70_prometheus.xml View File

@@ -1,9 +1,10 @@
1 1
 <?xml version="1.0" encoding="utf-8"?>
2 2
 <creole>
3 3
     <files>
4
-        <file filelist='prometheus'   name='/etc/prometheus/prometheus.yml'   mkdir='True' rm='True'/>
5
-        <file filelist='alertmanager' name='/etc/prometheus/alertmanager.yml' mkdir='True' rm='True'/>
6
-        <file filelist='grafana'      name='/etc/grafana/grafana.ini'         mkdir='True' rm='True'/>
4
+        <file filelist='prometheus'   name='/etc/prometheus/prometheus.yml'          mkdir='True' rm='True'/>
5
+        <file filelist='alertmanager' name='/etc/prometheus/alertmanager.yml'        mkdir='True' rm='True'/>
6
+        <file filelist='alertmanager' name='/etc/prometheus/rules.d/alert-rules.yml' mkdir='True' rm='True'/>
7
+        <file filelist='grafana'      name='/etc/grafana/grafana.ini'                mkdir='True' rm='True'/>
7 8
 
8 9
         <service>prometheus</service>
9 10
         <service>alertmanager</service>

+ 41
- 0
tmpl/alert-rules.yml View File

@@ -0,0 +1,41 @@
1
+#
2
+# Alert Rules
3
+#
4
+groups:
5
+- name: EoleRules
6
+  rules:
7
+  # Instance is Down
8
+  - alert: JobInstanceDown
9
+    expr: up == 0
10
+    for: 1m
11
+    annotations:
12
+      DESCRIPTION: Job {{ $labels.job }} instance {{ $labels.instance }} is down.
13
+      SUMMARY: Job instance is down
14
+
15
+  # Heavy CPU usage
16
+  - alert: cpu_threshold_exceeded
17
+    expr: (100 * (1 - avg by(instance) (irate(node_cpu{job="%%{job_name_node}",mode="idle"}[5m]))))
18
+      > 80
19
+    annotations:
20
+      description: This device's cpu usage has exceeded the threshold with a value
21
+        of {{ $value }}.
22
+      summary: Instance {{ $labels.instance }} CPU usage is dangerously high
23
+
24
+  # Heavy Memory usage
25
+  - alert: mem_threshold_exceeded
26
+    expr: (node_memory_MemFree{job="%%{job_name_node}"} + node_memory_Cached{job="%%{job_name_node}"} + node_memory_Buffers{job="%%{job_name_node}"})
27
+      / 1e+06 < 80
28
+    annotations:
29
+      description: This device's memory usage has exceeded the threshold with a value
30
+        of {{ $value }}.
31
+      summary: Instance {{ $labels.instance }} memory usage is dangerously high
32
+
33
+  # Heavy "/" filesystem usage
34
+  - alert: filesystem_threshold_exceeded
35
+    expr: node_filesystem_avail{job="%%{job_name_node}",mountpoint="/"} / node_filesystem_size{job="%%{job_name_node}"}
36
+      * 100 < 90
37
+    annotations:
38
+      description: This device's filesystem usage has exceeded the threshold with
39
+        a value of {{ $value }}.
40
+      summary: Instance {{ $labels.instance }} filesystem usage is dangerously high
41
+

+ 1
- 1
tmpl/prometheus.yml View File

@@ -5,7 +5,7 @@ global:
5 5
   scrape_timeout: %%prometheusScrapeTimeout
6 6
 
7 7
 rule_files:
8
-- "/etc/prometheus/rules.d/*.yml"
8
+  - "/etc/prometheus/rules.d/*.yml"
9 9
 
10 10
 scrape_configs:
11 11
   - job_name: %%prometheusJobName

Loading…
Cancel
Save