From 1013775b1a326b8475c19501b7fb57c935ca7867 Mon Sep 17 00:00:00 2001 From: Philippe Caseiro Date: Mon, 11 Jun 2018 09:04:58 +0200 Subject: [PATCH 1/3] Adding alert rules file template --- dicos/70_prometheus.xml | 7 ++++--- tmpl/alert-rules.yml | 41 +++++++++++++++++++++++++++++++++++++++++ tmpl/prometheus.yml | 2 +- 3 files changed, 46 insertions(+), 4 deletions(-) create mode 100644 tmpl/alert-rules.yml diff --git a/dicos/70_prometheus.xml b/dicos/70_prometheus.xml index 4fda90d..0dcbe3f 100644 --- a/dicos/70_prometheus.xml +++ b/dicos/70_prometheus.xml @@ -1,9 +1,10 @@ - - - + + + + prometheus alertmanager diff --git a/tmpl/alert-rules.yml b/tmpl/alert-rules.yml new file mode 100644 index 0000000..2eb4b80 --- /dev/null +++ b/tmpl/alert-rules.yml @@ -0,0 +1,41 @@ +# +# Alert Rules +# +groups: +- name: EoleRules + rules: + # Instance is Down + - alert: JobInstanceDown + expr: up == 0 + for: 1m + annotations: + DESCRIPTION: Job {{ $labels.job }} instance {{ $labels.instance }} is down. + SUMMARY: Job instance is down + + # Heavy CPU usage + - alert: cpu_threshold_exceeded + expr: (100 * (1 - avg by(instance) (irate(node_cpu{job="%%{job_name_node}",mode="idle"}[5m])))) + > 80 + annotations: + description: This device's cpu usage has exceeded the threshold with a value + of {{ $value }}. + summary: Instance {{ $labels.instance }} CPU usage is dangerously high + + # Heavy Memory usage + - alert: mem_threshold_exceeded + expr: (node_memory_MemFree{job="%%{job_name_node}"} + node_memory_Cached{job="%%{job_name_node}"} + node_memory_Buffers{job="%%{job_name_node}"}) + / 1e+06 < 80 + annotations: + description: This device's memory usage has exceeded the threshold with a value + of {{ $value }}. 
+ summary: Instance {{ $labels.instance }} memory usage is dangerously high + + # Heavy "/" use + - alert: filesystem_threshold_exceeded + expr: node_filesystem_avail{job="%%{job_name_node}",mountpoint="/"} / node_filesystem_size{job="%%{job_name_node}"} + * 100 < 90 + annotations: + description: This device's filesystem usage has exceeded the threshold with + a value of {{ $value }}. + summary: Instance {{ $labels.instance }} filesystem usage is dangerously high + diff --git a/tmpl/prometheus.yml b/tmpl/prometheus.yml index edefb87..fc3588d 100644 --- a/tmpl/prometheus.yml +++ b/tmpl/prometheus.yml @@ -5,7 +5,7 @@ global: scrape_timeout: %%prometheusScrapeTimeout rule_files: -- "/etc/prometheus/rules.d/*.yml" + - "/etc/prometheus/rules.d/*.yml" scrape_configs: - job_name: %%prometheusJobName From 33643232d49f793c98d646adf224188f55fefa70 Mon Sep 17 00:00:00 2001 From: Philippe Caseiro Date: Mon, 11 Jun 2018 09:24:40 +0200 Subject: [PATCH 2/3] Supporting global smtp gateway usage --- dicos/70_prometheus.xml | 13 +++++++++++++ tmpl/alertmanager.yml | 24 +++++++++++++++++++----- 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/dicos/70_prometheus.xml b/dicos/70_prometheus.xml index 0dcbe3f..9d57761 100644 --- a/dicos/70_prometheus.xml +++ b/dicos/70_prometheus.xml @@ -103,6 +103,9 @@ + + non + @@ -211,6 +214,16 @@ ['','service','severity'] + + oui + alSMTPUser + alSMTPPass + alSMTPPort + alSMTPTLS + alSMTPHost + alSMTPAuth + + non alSMTPUser diff --git a/tmpl/alertmanager.yml b/tmpl/alertmanager.yml index 1acb668..e72e74c 100644 --- a/tmpl/alertmanager.yml +++ b/tmpl/alertmanager.yml @@ -1,16 +1,30 @@ global: # The smarthost and SMTP sender used for mail notifications. 
+%if %%alSMTPUseSys == 'oui' + %if %%tls_smtp == "non" + smtp_smarthost: '%%exim_relay_smtp:25' + smtp_require_tls: false + %elif %%tls_smtp == "port 25" + smtp_smarthost: '%%exim_relay_smtp:25' + smtp_require_tls: true + %else + smtp_smarthost: '%%exim_relay_smtp:465' + smtp_require_tls: true + %end if + smtp_from: '%%alFrom' +%else smtp_smarthost: '%%alSMTPHost:%%alSMTPPort' smtp_from: '%%alFrom' -%if %%getVar('alSMTPAuth','non') == 'oui' + %if %%getVar('alSMTPAuth','non') == 'oui' smtp_auth_username: '%%alSMTPUser' - smtp_auth_password: 'alSMTPPass' + smtp_auth_password: '%%alSMTPPass' -%end if + %end if -%if %%getVar('alSMTPTLS','non') == 'oui' + %if %%getVar('alSMTPTLS','non') == 'oui' smtp_require_tls: true -%else + %else smtp_require_tls: false + %end if %end if # The auth token for Hipchat. From 37342e870037f6ca89d8b942107eafc75fdac26e Mon Sep 17 00:00:00 2001 From: Philippe Caseiro Date: Mon, 11 Jun 2018 09:35:28 +0200 Subject: [PATCH 3/3] Fix diagnose --- diagnose/70-grafana | 3 ++- diagnose/70-prometheus | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/diagnose/70-grafana b/diagnose/70-grafana index f674512..9269cd7 100644 --- a/diagnose/70-grafana +++ b/diagnose/70-grafana @@ -1,6 +1,7 @@ #!/bin/bash -if [ $(CreoleGet activer_grafana) = "oui" ];then +if [[ $(CreoleGet activer_grafana) == "oui" ]] +then . /usr/lib/eole/diagnose.sh EchoGras "*** Accès au serveur grafana" diff --git a/diagnose/70-prometheus b/diagnose/70-prometheus index 3aed96b..ab472bf 100644 --- a/diagnose/70-prometheus +++ b/diagnose/70-prometheus @@ -1,6 +1,7 @@ #!/bin/bash -if [ $(CreoleGet activer_prometheus) = "oui" ];then +if [[ $(CreoleGet activer_prometheus) == "oui" ]] +then . /usr/lib/eole/diagnose.sh EchoGras "*** Accès au serveur Prometheus"