From 1013775b1a326b8475c19501b7fb57c935ca7867 Mon Sep 17 00:00:00 2001
From: Philippe Caseiro
Date: Mon, 11 Jun 2018 09:04:58 +0200
Subject: [PATCH] Adding alert rules file template

---
Review notes (this section is ignored by `git am`):
- Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this patch; verify context lines against the repository.
- The hunk body for dicos/70_prometheus.xml appears to have lost its XML
  markup in this copy and is kept verbatim; restore it from the original
  commit before applying.
- tmpl/alert-rules.yml was amended in review (lower-case annotation keys,
  filesystem threshold expressed as usage, more specific comments); the blob
  id on its index line predates that change.

 dicos/70_prometheus.xml |  7 ++++---
 tmpl/alert-rules.yml    | 41 +++++++++++++++++++++++++++++++++++++++++
 tmpl/prometheus.yml     |  2 +-
 3 files changed, 46 insertions(+), 4 deletions(-)
 create mode 100644 tmpl/alert-rules.yml

diff --git a/dicos/70_prometheus.xml b/dicos/70_prometheus.xml
index 4fda90d..0dcbe3f 100644
--- a/dicos/70_prometheus.xml
+++ b/dicos/70_prometheus.xml
@@ -1,9 +1,10 @@ - - - + + + + prometheus alertmanager
diff --git a/tmpl/alert-rules.yml b/tmpl/alert-rules.yml
new file mode 100644
index 0000000..2eb4b80
--- /dev/null
+++ b/tmpl/alert-rules.yml
@@ -0,0 +1,41 @@
+#
+# Alert rules for Prometheus (this file is a template, rendered before use)
+#
+groups:
+- name: EoleRules
+  rules:
+  # Instance is down: a scrape target has reported up == 0 for one minute
+  - alert: JobInstanceDown
+    expr: up == 0
+    for: 1m
+    annotations:
+      description: Job {{ $labels.job }} instance {{ $labels.instance }} is down.
+      summary: Job instance is down
+
+  # Heavy CPU usage: non-idle CPU share, averaged per instance, above 80 percent
+  - alert: cpu_threshold_exceeded
+    expr: (100 * (1 - avg by(instance) (irate(node_cpu{job="%%{job_name_node}",mode="idle"}[5m]))))
+      > 80
+    annotations:
+      description: This device's cpu usage has exceeded the threshold with a value
+        of {{ $value }}.
+      summary: Instance {{ $labels.instance }} CPU usage is dangerously high
+
+  # Heavy memory usage: fires when (MemFree + Cached + Buffers) / 1e+06 is below 80
+  - alert: mem_threshold_exceeded
+    expr: (node_memory_MemFree{job="%%{job_name_node}"} + node_memory_Cached{job="%%{job_name_node}"} + node_memory_Buffers{job="%%{job_name_node}"})
+      / 1e+06 < 80
+    annotations:
+      description: This device's memory usage has exceeded the threshold with a value
+        of {{ $value }}.
+      summary: Instance {{ $labels.instance }} memory usage is dangerously high
+
+  # Heavy "/" use: more than 90 percent of the root filesystem is used
+  - alert: filesystem_threshold_exceeded
+    expr: 100 - (node_filesystem_avail{job="%%{job_name_node}",mountpoint="/"} / node_filesystem_size{job="%%{job_name_node}",mountpoint="/"}
+      * 100) > 90
+    annotations:
+      description: This device's filesystem usage has exceeded the threshold with
+        a value of {{ $value }}.
+      summary: Instance {{ $labels.instance }} filesystem usage is dangerously high
+
diff --git a/tmpl/prometheus.yml b/tmpl/prometheus.yml
index edefb87..fc3588d 100644
--- a/tmpl/prometheus.yml
+++ b/tmpl/prometheus.yml
@@ -5,7 +5,7 @@ global:
   scrape_timeout: %%prometheusScrapeTimeout

 rule_files:
-- "/etc/prometheus/rules.d/*.yml"
+  - "/etc/prometheus/rules.d/*.yml"

 scrape_configs:
   - job_name: %%prometheusJobName