Browse Source

Adding alertmanager support

Philippe Caseiro 11 months ago
parent
commit
c10edef336
3 changed files with 214 additions and 28 deletions
  1. 94
    28
      dicos/70_prometheus.xml
  2. 119
    0
      tmpl/alertmanager.yml
  3. 1
    0
      tmpl/prometheus.yml

+ 94
- 28
dicos/70_prometheus.xml View File

@@ -1,9 +1,12 @@
1 1
 <?xml version="1.0" encoding="utf-8"?>
2 2
 <creole>
3 3
     <files>
4
-        <file filelist='prometheus' name='/etc/prometheus/prometheus.yml' source='prometheus.yml' mkdir='True' rm='True'/>
5
-        <file filelist='grafana' name='/etc/grafana/grafana.ini' source='grafana.ini' mkdir='True' rm='True'/>
4
+        <file filelist='prometheus' name='/etc/prometheus/prometheus.yml'   mkdir='True' rm='True'/>
5
+        <file filelist='prometheus' name='/etc/prometheus/alertmanager.yml' mkdir='True' rm='True'/>
6
+        <file filelist='grafana'    name='/etc/grafana/grafana.ini'         mkdir='True' rm='True'/>
7
+
6 8
         <service>prometheus</service>
9
+        <service>alertmanager</service>
7 10
         <service>grafana-server</service>
8 11
         <service_access service='prometheus'>
9 12
             <port service_accesslist="saLemon">80</port>
@@ -46,21 +49,21 @@
46 49
     <family name='Clients prometheus'>
47 50
             <variable name='ajout_client_prometheus' type='oui/non' description="Ajouter un nouveau client à Prometheus">
48 51
                 <value>non</value>
49
-			</variable>
50
-			<!-- Client standard  -->
51
-			<variable name='prCli' type='string' description='Nom du client prometheus' multi='True'/>
52
-			<variable name='prCliIP' type='ip' description="Adresse IP du client prometheus"/>
53
-			<variable name='prCliSonde' type='string' description="Sonde a utiliser pour ce client">
54
-				<value>Node Exporter</value>
55
-			</variable>
52
+            </variable>
53
+            <!-- Client standard  -->
54
+            <variable name='prCli' type='string' description='Nom du client prometheus' multi='True'/>
55
+            <variable name='prCliIP' type='ip' description="Adresse IP du client prometheus"/>
56
+            <variable name='prCliSonde' type='string' description="Sonde a utiliser pour ce client">
57
+                <value>Node Exporter</value>
58
+            </variable>
56 59
 
57 60
             <variable name='addPrOpenCli' type='oui/non' description="Ajouter un client personnalisé">
58 61
                 <value>non</value>
59
-			</variable>
60
-			<!-- Client libre  -->
61
-			<variable name='prOpenCli' type='string' description='Nom du client personnalisé prometheus' multi='True'/>
62
-			<variable name='prOpenCliIP' type='ip' description="Adresse IP"/>
63
-			<variable name='prOpenCliPort' type='number' description="Port d'écoute de la sonde"/>
62
+            </variable>
63
+            <!-- Client libre  -->
64
+            <variable name='prOpenCli' type='string' description='Nom du client personnalisé prometheus' multi='True'/>
65
+            <variable name='prOpenCliIP' type='ip' description="Adresse IP"/>
66
+            <variable name='prOpenCliPort' type='number' description="Port d'écoute de la sonde"/>
64 67
         </family>
65 68
     <family name="grafana">
66 69
         <variable name='grafana_domain' type='string' description="Nom de Domaine ou IP pour accèder à l'interface Grafana" mandatory='True'>
@@ -79,21 +82,84 @@
79 82
                 <value>false</value>
80 83
             </variable>
81 84
     </family>
85
+
86
+    <family name="alertes prometheus">
87
+        <variable name='alSMTPHost' type='string' description="Adresse du serveur SMTP pour l'envois des alertes"/>
88
+        <variable name='alSMTPPort' type='string' description="Port d'écoute du serveur SMTP pour l'envois des alertes"/>
89
+        <variable name='alFrom' type='string' description="Adresse d'origine des emails d'alerte"/>
90
+        <variable name='alSMTPAuth' type='oui/non' description="Authentification requise sur le serveur SMTP ?">
91
+            <value>non</value>
92
+        </variable>
93
+        <variable name='alSMTPUser' type='string' description="Utilisateur SMTP"/>
94
+        <variable name='alSMTPPass' type='string' description="Mot de passe"/>
95
+
96
+        <variable name='alDefaultReceiver' type='string' description='Nom du "receiver" par défaut'/>
97
+
98
+        <variable name='alReceiver' type='string' description="Nom du destinataire" multi='True'/> <!-- master of the alReceiver group; iterated by the alertmanager template -->
99
+        <variable name='alReceiverEmail' type='string' description="Adresse email du destinataire"/>
100
+
101
+        <variable name='alRoute' type='string' description="Nom de la règle de distribution des alertes" multi='True'/> <!-- master of the alRoute group -->
102
+        <variable name='alRouteMatchSource' type='string' description='Source de correspondance'/>
103
+        <variable name='alRouteMatchValue' type='string' description='Valeur attendue'/>
104
+        <variable name='alRouteMatchReceiver' type='string' description="Equipe destinataire de l'alerte"/>
105
+
106
+        <variable name='alRouteRegxp' type='string' description="Règle de distribution des alertes" multi='True'/> <!-- master of the alRouteRegxp group -->
107
+        <variable name='alRouteMatchRegExpSource' type='string' description='Source de correspondance'/>
108
+        <variable name='alRouteMatchRegExp' type='string' description='Expression régulière'/>
109
+        <variable name='alRouteMatchRegxpRecv' type='string' description="Equipe destinataire de l'alerte (regxp)"/>
110
+
111
+        <variable name='alSubRoute' type='string' description="Nom de la règle maitresse" multi='True'/> <!-- master of the alSubRoute group; value is compared to the parent route name in the template -->
112
+        <variable name='alSubRouteMatchSource' type='string' description='Source de correspondance'/>
113
+        <variable name='alSubRouteMatchValue' type='string' description='Valeur attendue'/>
114
+        <variable name='alSubRouteMatchReceiver' type='string' description="Equipe destinataire de l'alerte"/>
115
+    </family>
116
+
117
+    <separators>
118
+        <separator name='alSMTPHost'>Configuration SMTP pour l'envoi des alertes</separator>
119
+        <separator name='alDefaultReceiver'>Destinataires</separator>
120
+        <separator name='alRoute'>Règles de distribution</separator>
121
+        <separator name='alSubRoute'>Sous-règles de distribution</separator>
122
+    </separators>
123
+
82 124
     </variables>
83 125
     <constraints>
84
-		<group master='prCli'>
85
-			<slave>prCliIP</slave>
86
-			<slave>prCliSonde</slave>
87
-		</group>
88 126
 
89
-		<group master='prOpenCli'>
90
-			<slave>prOpenCliIP</slave>
91
-			<slave>prOpenCliPort</slave>
92
-		</group>
127
+        <group master='alReceiver'>
128
+            <slave>alReceiverEmail</slave>
129
+            <!-- prCliSonde is intentionally not a slave here: it already belongs to the 'prCli' group -->
130
+        </group>
131
+
132
+        <group master='alRoute'>
133
+            <slave>alRouteMatchSource</slave>
134
+            <slave>alRouteMatchValue</slave>
135
+            <slave>alRouteMatchReceiver</slave>
136
+        </group>
137
+
138
+        <group master='alRouteRegxp'>
139
+            <slave>alRouteMatchRegExpSource</slave>
140
+            <slave>alRouteMatchRegExp</slave>
141
+            <slave>alRouteMatchRegxpRecv</slave>
142
+        </group>
93 143
 
94
-		<check name='valid_enum' target='prCliSonde'>
95
-			<param>['Node Exporter','Port']</param>
96
-		</check>
144
+        <group master='alSubRoute'>
145
+            <slave>alSubRouteMatchSource</slave>
146
+            <slave>alSubRouteMatchValue</slave>
147
+            <slave>alSubRouteMatchReceiver</slave>
148
+        </group>
149
+
150
+        <group master='prCli'>
151
+            <slave>prCliIP</slave>
152
+            <slave>prCliSonde</slave>
153
+        </group>
154
+
155
+        <group master='prOpenCli'>
156
+            <slave>prOpenCliIP</slave>
157
+            <slave>prOpenCliPort</slave>
158
+        </group>
159
+
160
+        <check name='valid_enum' target='prCliSonde'>
161
+            <param>['Node Exporter','Port']</param>
162
+        </check>
97 163
 
98 164
         <condition name='disabled_if_in' source='activer_prometheus'>
99 165
             <param>non</param>
@@ -112,12 +178,12 @@
112 178
             <target type='variable'>prCliIP</target>
113 179
             <target type='variable'>prCliSonde</target>
114 180
         </condition>
115
-		<condition name='disabled_if_in' source='addPrOpenCli'>
116
-			<param>non</param>
181
+        <condition name='disabled_if_in' source='addPrOpenCli'>
182
+            <param>non</param>
117 183
             <target type='variable'>prOpenCli</target>
118 184
             <target type='variable'>prOpenCliIP</target>
119 185
             <target type='variable'>prOpenCliPort</target>
120
-		</condition>
186
+        </condition>
121 187
     </constraints>
122 188
     <help>
123 189
     </help>

+ 119
- 0
tmpl/alertmanager.yml View File

@@ -0,0 +1,119 @@
1
+global:
2
+  # The smarthost and SMTP sender used for mail notifications.
3
+  smtp_smarthost: '%%alSMTPHost:%%alSMTPPort'
4
+  smtp_from: '%%alFrom'
5
+%if %%getVar('alSMTPAuth','non') == 'oui'
6
+  smtp_auth_username: '%%alSMTPUser'
7
+  smtp_auth_password: '%%alSMTPPass'
8
+%end if
9
+  # The auth token for Hipchat.
10
+  #hipchat_auth_token: '1234556789'
11
+  # Alternative host for Hipchat.
12
+  #hipchat_api_url: 'https://hipchat.foobar.org/'
13
+
14
+# The directory from which notification templates are read.
15
+templates: 
16
+- '/etc/alertmanager/template/*.tmpl'
17
+
18
+# The root route on which each incoming alert enters.
19
+route:
20
+  # The labels by which incoming alerts are grouped together. For example,
21
+  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
22
+  # be batched into a single group.
23
+  group_by: ['alertname', 'cluster', 'service']
24
+
25
+  # When a new group of alerts is created by an incoming alert, wait at
26
+  # least 'group_wait' to send the initial notification.
27
+  # This way ensures that you get multiple alerts for the same group that start
28
+  # firing shortly after another are batched together on the first 
29
+  # notification.
30
+  group_wait: 30s
31
+
32
+  # When the first notification was sent, wait 'group_interval' to send a batch
33
+  # of new alerts that started firing for that group.
34
+  group_interval: 5m
35
+
36
+  # If an alert has successfully been sent, wait 'repeat_interval' to
37
+  # resend them.
38
+  repeat_interval: 3h 
39
+
40
+  # A default receiver
41
+  receiver: %%alDefaultReceiver
42
+
43
+  # All the above attributes are inherited by all child routes and can 
44
+  # overwritten on each.
45
+
46
+  # The child route trees.
47
+  routes:
48
+  # This route performs a regular expression match on alert labels to
49
+  # catch alerts that are related to a list of services.
50
+%for route in %%getVar('alRouteRegxp',[])
51
+  - match_re:
52
+      %%{route.alRouteMatchRegExpSource}: %%{route.alRouteMatchRegExp}
53
+    receiver: %%route.alRouteMatchRegxpRecv
54
+  %if not is_empty(%%getVar('alSubRoute',[]))
55
+    routes:
56
+    %for sroute in %%getVar('alSubRoute',[])
57
+    # The service has a sub-route for critical alerts, any alerts
58
+    # that do not match, i.e. severity != critical, fall-back to the
59
+    # parent node and are sent to 'team-X-mails'
60
+      %if %%sroute == %%route
61
+    - match:
62
+        %%{sroute.alSubRouteMatchSource}: %%{sroute.alSubRouteMatchValue}
63
+      receiver: %%sroute.alSubRouteMatchReceiver
64
+      %end if
65
+    %end for
66
+  %end if
67
+%end for
68
+%for rt in %%getVar('alRoute',[])
69
+  - match:
70
+      %%{rt.alRouteMatchSource}: %%{rt.alRouteMatchValue}
71
+    receiver: %%rt.alRouteMatchReceiver
72
+
73
+  %if not is_empty(%%getVar('alSubRoute',[]))
74
+    routes:
75
+    %for sroute in %%getVar('alSubRoute',[])
76
+      %if %%sroute == %%rt
77
+    - match:
78
+        %%{sroute.alSubRouteMatchSource}: %%{sroute.alSubRouteMatchValue}
79
+      receiver: %%sroute.alSubRouteMatchReceiver
80
+      %end if
81
+    %end for
82
+  %end if
83
+%end for
84
+
85
+#  # This route handles all alerts coming from a database service. If there's
86
+#  # no team to handle it, it defaults to the DB team.
87
+#  - match:
88
+#      service: database
89
+#    receiver: team-DB-pager
90
+#    # Also group alerts by affected database.
91
+#    group_by: [alertname, cluster, database]
92
+#    routes:
93
+#    - match:
94
+#        owner: team-X
95
+#      receiver: team-X-pager
96
+#    - match:
97
+#        owner: team-Y
98
+#      receiver: team-Y-pager
99
+
100
+
101
+# Inhibition rules allow to mute a set of alerts given that another alert is
102
+# firing.
103
+# We use this to mute any warning-level notifications if the same alert is 
104
+# already critical.
105
+inhibit_rules:
106
+- source_match:
107
+    severity: 'critical'
108
+  target_match:
109
+    severity: 'warning'
110
+  # Apply inhibition if the alertname is the same.
111
+  equal: ['alertname', 'cluster', 'service']
112
+
113
+
114
+receivers:
115
+%for rcv in %%getVar('alReceiver',[])
116
+- name: '%%rcv'
117
+  email_configs:
118
+  - to: '%%rcv.alReceiverEmail'
119
+%end for

+ 1
- 0
tmpl/prometheus.yml View File

@@ -40,6 +40,7 @@ scrape_configs:
40 40
     %end if
41 41
 ]
42 42
 %end if
43
+
43 44
 #alerting:
44 45
 #  alertmanagers:
45 46
 #  - scheme: https