diff --git a/dicos/70_prometheus.xml b/dicos/70_prometheus.xml
index edc0de9..e686b6c 100644
--- a/dicos/70_prometheus.xml
+++ b/dicos/70_prometheus.xml
@@ -1,9 +1,12 @@
+ prometheus
+ alertmanager
  grafana-server
  80
@@ -46,21 +49,21 @@
  non
- Node Exporter
+ Node Exporter
  non
@@ -79,21 +82,84 @@
  false
+ non
+ Configuration SMTP pour l'envoi des alertes
+ Destinataires
+ Règles de distribution
+ Sous-règles de distribution
- prCliIP
- prCliSonde
- prOpenCliIP
- prOpenCliPort
+ alReceiverEmail
+ prCliSonde
- ['Node Exporter','Port']
+ alRouteMatchSource
+ alRouteMatchValue
+ alRouteMatchReceiver
+ alRouteMatchRegExpSource
+ alRouteMatchRegExp
+ alRouteMatchRegxpRecv
+ alSubRouteMatchSource
+ alSubRouteMatchValue
+ alSubRouteMatchReceiver
+ prCliIP
+ prCliSonde
+ prOpenCliIP
+ prOpenCliPort
+ ['Node Exporter','Port']
  non
@@ -112,12 +178,12 @@
  prCliIP
  prCliSonde
- non
+ non
  prOpenCli
  prOpenCliIP
  prOpenCliPort
diff --git a/tmpl/alertmanager.yml b/tmpl/alertmanager.yml
new file mode 100644
index 0000000..9f716cc
--- /dev/null
+++ b/tmpl/alertmanager.yml
@@ -0,0 +1,119 @@
+global:
+  # The smarthost and SMTP sender used for mail notifications.
+  smtp_smarthost: '%%alSMTPHost:%%alSMTPPort'
+  smtp_from: '%%alFrom'
+%if %%getVar('alSMTPAuth','non') == 'oui'
+  smtp_auth_username: '%%alSMTPUser'
+  smtp_auth_password: '%%alSMTPPass'
+%end if
+  # The auth token for Hipchat.
+  #hipchat_auth_token: '1234556789'
+  # Alternative host for Hipchat.
+  #hipchat_api_url: 'https://hipchat.foobar.org/'
+
+# The directory from which notification templates are read.
+templates:
+- '/etc/alertmanager/template/*.tmpl'
+
+# The root route on which each incoming alert enters.
+route:
+  # The labels by which incoming alerts are grouped together. For example,
+  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
+  # be batched into a single group.
+  group_by: ['alertname', 'cluster', 'service']
+
+  # When a new group of alerts is created by an incoming alert, wait at
+  # least 'group_wait' to send the initial notification.
+  # This ensures that multiple alerts for the same group that start firing
+  # shortly after one another are batched together on the first
+  # notification.
+  group_wait: 30s
+
+  # When the first notification was sent, wait 'group_interval' to send a batch
+  # of new alerts that started firing for that group.
+  group_interval: 5m
+
+  # If an alert has successfully been sent, wait 'repeat_interval' to
+  # resend it.
+  repeat_interval: 3h
+
+  # A default receiver
+  receiver: %%alDefaultReceiver
+
+  # All the above attributes are inherited by all child routes and can be
+  # overwritten on each.
+
+  # The child route trees.
+  routes:
+  # This route performs a regular expression match on alert labels to
+  # catch alerts that are related to a list of services.
+%for route in %%getVar('alRouteRegxp',[])
+  - match_re:
+      %%{route.alRouteMatchRegExpSource}: %%{route.alRouteMatchRegExp}
+    receiver: %%route.alRouteMatchRegxpRecv
+  %if not %%is_empty('alSubRoute')
+    routes:
+  %for sroute in %%getVar('alSubRoute',[])
+    # The service has a sub-route for critical alerts, any alerts
+    # that do not match, i.e. severity != critical, fall-back to the
+    # parent node and are sent to 'team-X-mails'
+    %if %%sroute == %%route
+    - match:
+        %%{sroute.alSubRouteMatchSource}: %%{sroute.alSubRouteMatchValue}
+      receiver: %%sroute.alSubRouteMatchReceiver
+    %end if
+  %end for
+  %end if
+%end for
+%for rt in %%getVar('alRoute',[])
+  - match:
+      %%{rt.alRouteMatchSource}: %%{rt.alRouteMatchValue}
+    receiver: %%rt.alRouteMatchReceiver
+
+  %if not %%is_empty('alSubRoute')
+    routes:
+  %for sroute in %%getVar('alSubRoute',[])
+    %if %%sroute == %%rt
+    - match:
+        %%{sroute.alSubRouteMatchSource}: %%{sroute.alSubRouteMatchValue}
+      receiver: %%sroute.alSubRouteMatchReceiver
+    %end if
+  %end for
+  %end if
+%end for
+
+# # This route handles all alerts coming from a database service. If there's
+# # no team to handle it, it defaults to the DB team.
+# - match:
+#     service: database
+#   receiver: team-DB-pager
+#   # Also group alerts by affected database.
+#   group_by: [alertname, cluster, database]
+#   routes:
+#   - match:
+#       owner: team-X
+#     receiver: team-X-pager
+#   - match:
+#       owner: team-Y
+#     receiver: team-Y-pager
+
+
+# Inhibition rules allow muting a set of alerts given that another alert is
+# firing.
+# We use this to mute any warning-level notifications if the same alert is
+# already critical.
+inhibit_rules:
+- source_match:
+    severity: 'critical'
+  target_match:
+    severity: 'warning'
+  # Apply inhibition if the alertname is the same.
+  equal: ['alertname', 'cluster', 'service']
+
+
+receivers:
+%for rcv in %%getVar('alReceiver',[])
+- name: '%%rcv'
+  email_configs:
+  - to: '%%rcv.alReceiverEmail'
+%end for
diff --git a/tmpl/prometheus.yml b/tmpl/prometheus.yml
index 59363ed..31de0c7 100644
--- a/tmpl/prometheus.yml
+++ b/tmpl/prometheus.yml
@@ -40,6 +40,7 @@ scrape_configs:
 %end if
 ]
 %end if
+
 #alerting:
 #  alertmanagers:
 #  - scheme: https
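
For review, a minimal sketch of what tmpl/alertmanager.yml could render to once Creole expands the %% variables. All concrete values below (smtp.example.com, alerts@example.com, the 'admin' receiver, the single service=database route) are hypothetical placeholders chosen for illustration, not values defined by this dictionary; they only show the shape of the generated Alertmanager configuration.

global:
  # Hypothetical rendering of %%alSMTPHost:%%alSMTPPort and %%alFrom,
  # with alSMTPAuth left at 'non' so no smtp_auth_* keys are emitted.
  smtp_smarthost: 'smtp.example.com:25'
  smtp_from: 'alerts@example.com'

templates:
- '/etc/alertmanager/template/*.tmpl'

route:
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 3h
  # %%alDefaultReceiver
  receiver: admin
  routes:
  # Assumes one alRoute entry (alRouteMatchSource=service,
  # alRouteMatchValue=database, alRouteMatchReceiver=admin)
  # and no alRouteRegxp or alSubRoute entries.
  - match:
      service: database
    receiver: admin

inhibit_rules:
- source_match:
    severity: 'critical'
  target_match:
    severity: 'warning'
  equal: ['alertname', 'cluster', 'service']

receivers:
# Assumes one alReceiver entry 'admin' with alReceiverEmail=admin@example.com.
- name: 'admin'
  email_configs:
  - to: 'admin@example.com'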