From c4683c5bad8c8bbacee7d8913fd5f39559fa50ae Mon Sep 17 00:00:00 2001 From: Dalton Hubble Date: Sun, 1 Mar 2020 23:20:33 -0800 Subject: [PATCH] Refresh Prometheus alerts and Grafana dashboards * Add 2 min wait before KubeNodeUnreachable to be less noisy on preemptible clusters * Add a BlackboxProbeFailure alert for any failing probes for services annotated `prometheus.io/probe: true` --- CHANGES.md | 3 + addons/grafana/dashboards-k8s-nodes.yaml | 194 +- .../grafana/dashboards-k8s-resources-1.yaml | 646 +++++- .../grafana/dashboards-k8s-resources-2.yaml | 769 +++++--- addons/grafana/dashboards-k8s.yaml | 1739 +---------------- addons/grafana/dashboards-prom.yaml | 854 +++++++- addons/prometheus/rules.yaml | 303 ++- 7 files changed, 2289 insertions(+), 2219 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index f21beb51..71da2372 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -27,9 +27,12 @@ Notable changes between versions. * Update nginx-ingress from v0.28.0 to [v0.30.0](https://github.com/kubernetes/ingress-nginx/releases/tag/nginx-0.30.0) * Update Prometheus from v2.15.2 to [v2.16.0](https://github.com/prometheus/prometheus/releases/tag/v2.16.0) + * Refresh Prometheus rules and alerts + * Add a BlackboxProbeFailure alert * Update kube-state-metrics from v1.9.4 to v1.9.5 * Update node-exporter from v0.18.1 to [v1.0.0-rc.0](https://github.com/prometheus/node_exporter/releases/tag/v1.0.0-rc.0) * Update Grafana from v6.6.1 to v6.6.2 + * Refresh Grafana dashboards ## v1.17.3 diff --git a/addons/grafana/dashboards-k8s-nodes.yaml b/addons/grafana/dashboards-k8s-nodes.yaml index 1b42cd42..9a67f088 100644 --- a/addons/grafana/dashboards-k8s-nodes.yaml +++ b/addons/grafana/dashboards-k8s-nodes.yaml @@ -21,7 +21,7 @@ data: "links": [ ], - "refresh": "", + "refresh": "10s", "rows": [ { "collapse": false, @@ -558,15 +558,15 @@ data: }, "id": 8, "legend": { - "alignAsTable": "true", + "alignAsTable": true, "avg": false, - "current": "true", + "current": true, "max": false, 
"min": false, - "rightSide": "true", - "show": "true", + "rightSide": true, + "show": true, "total": false, - "values": "true" + "values": true }, "lines": true, "linewidth": 1, @@ -649,15 +649,15 @@ data: }, "id": 9, "legend": { - "alignAsTable": "true", + "alignAsTable": true, "avg": false, - "current": "true", + "current": true, "max": false, "min": false, - "rightSide": "true", - "show": "true", + "rightSide": true, + "show": true, "total": false, - "values": "true" + "values": true }, "lines": true, "linewidth": 1, @@ -753,15 +753,15 @@ data: }, "id": 10, "legend": { - "alignAsTable": "true", + "alignAsTable": true, "avg": false, - "current": "true", + "current": true, "max": false, "min": false, - "rightSide": "true", - "show": "true", + "rightSide": true, + "show": true, "total": false, - "values": "true" + "values": true }, "lines": true, "linewidth": 1, @@ -857,15 +857,15 @@ data: }, "id": 11, "legend": { - "alignAsTable": "true", + "alignAsTable": true, "avg": false, - "current": "true", + "current": true, "max": false, "min": false, - "rightSide": "true", - "show": "true", + "rightSide": true, + "show": true, "total": false, - "values": "true" + "values": true }, "lines": true, "linewidth": 1, @@ -955,15 +955,15 @@ data: }, "id": 12, "legend": { - "alignAsTable": "true", + "alignAsTable": true, "avg": false, - "current": "true", + "current": true, "max": false, "min": false, - "rightSide": "true", - "show": "true", + "rightSide": true, + "show": true, "total": false, - "values": "true" + "values": true }, "lines": true, "linewidth": 1, @@ -1066,17 +1066,17 @@ data: }, "id": 13, "legend": { - "alignAsTable": "true", + "alignAsTable": true, "avg": false, - "current": "true", - "hideEmpty": "true", - "hideZero": "true", + "current": true, + "hideEmpty": true, + "hideZero": true, "max": false, "min": false, - "rightSide": "true", - "show": "true", + "rightSide": true, + "show": true, "total": false, - "values": "true" + "values": true }, "lines": true, 
"linewidth": 1, @@ -1159,17 +1159,17 @@ data: }, "id": 14, "legend": { - "alignAsTable": "true", + "alignAsTable": true, "avg": false, - "current": "true", - "hideEmpty": "true", - "hideZero": "true", + "current": true, + "hideEmpty": true, + "hideZero": true, "max": false, "min": false, - "rightSide": "true", - "show": "true", + "rightSide": true, + "show": true, "total": false, - "values": "true" + "values": true }, "lines": true, "linewidth": 1, @@ -1265,17 +1265,17 @@ data: }, "id": 15, "legend": { - "alignAsTable": "true", + "alignAsTable": true, "avg": false, - "current": "true", - "hideEmpty": "true", - "hideZero": "true", + "current": true, + "hideEmpty": true, + "hideZero": true, "max": false, "min": false, - "rightSide": "true", + "rightSide": true, "show": true, "total": false, - "values": "true" + "values": true }, "lines": true, "linewidth": 1, @@ -1371,15 +1371,15 @@ data: }, "id": 16, "legend": { - "alignAsTable": "true", + "alignAsTable": true, "avg": false, - "current": "true", + "current": true, "max": false, "min": false, - "rightSide": "true", - "show": "true", + "rightSide": true, + "show": true, "total": false, - "values": "true" + "values": true }, "lines": true, "linewidth": 1, @@ -1462,15 +1462,15 @@ data: }, "id": 17, "legend": { - "alignAsTable": "true", + "alignAsTable": true, "avg": false, - "current": "true", + "current": true, "max": false, "min": false, - "rightSide": "true", - "show": "true", + "rightSide": true, + "show": true, "total": false, - "values": "true" + "values": true }, "lines": true, "linewidth": 1, @@ -1567,15 +1567,15 @@ data: }, "id": 18, "legend": { - "alignAsTable": "true", + "alignAsTable": true, "avg": false, - "current": "true", + "current": true, "max": false, "min": false, - "rightSide": "true", - "show": "true", + "rightSide": true, + "show": true, "total": false, - "values": "true" + "values": true }, "lines": true, "linewidth": 1, @@ -1658,15 +1658,15 @@ data: }, "id": 19, "legend": { - "alignAsTable": 
"true", + "alignAsTable": true, "avg": false, - "current": "true", + "current": true, "max": false, "min": false, - "rightSide": "true", - "show": "true", + "rightSide": true, + "show": true, "total": false, - "values": "true" + "values": true }, "lines": true, "linewidth": 1, @@ -1762,15 +1762,15 @@ data: }, "id": 20, "legend": { - "alignAsTable": "true", + "alignAsTable": true, "avg": false, - "current": "true", + "current": true, "max": false, "min": false, - "rightSide": "true", - "show": "true", + "rightSide": true, + "show": true, "total": false, - "values": "true" + "values": true }, "lines": true, "linewidth": 1, @@ -1991,15 +1991,15 @@ data: }, "id": 22, "legend": { - "alignAsTable": "true", + "alignAsTable": true, "avg": false, - "current": "true", + "current": true, "max": false, "min": false, - "rightSide": "true", - "show": "true", + "rightSide": true, + "show": true, "total": false, - "values": "true" + "values": true }, "lines": true, "linewidth": 1, @@ -2373,8 +2373,8 @@ data: "list": [ { "current": { - "text": "Prometheus", - "value": "Prometheus" + "text": "default", + "value": "default" }, "hide": 0, "label": null, @@ -2427,7 +2427,7 @@ data: "options": [ ], - "query": "label_values(kubelet_runtime_operations{cluster=\"$cluster\", job=\"kubelet\"}, instance)", + "query": "label_values(kubelet_runtime_operations_total{cluster=\"$cluster\", job=\"kubelet\"}, instance)", "refresh": 2, "regex": "", "sort": 1, @@ -2496,7 +2496,7 @@ data: "links": [ ], - "refresh": "", + "refresh": "10s", "rows": [ { "collapse": false, @@ -2691,15 +2691,15 @@ data: }, "id": 4, "legend": { - "alignAsTable": "true", + "alignAsTable": true, "avg": false, - "current": "true", + "current": true, "max": false, "min": false, - "rightSide": "true", - "show": "true", + "rightSide": true, + "show": true, "total": false, - "values": "true" + "values": true }, "lines": true, "linewidth": 1, @@ -2886,15 +2886,15 @@ data: }, "id": 6, "legend": { - "alignAsTable": "true", + 
"alignAsTable": true, "avg": false, - "current": "true", + "current": true, "max": false, "min": false, - "rightSide": "true", - "show": "true", + "rightSide": true, + "show": true, "total": false, - "values": "true" + "values": true }, "lines": true, "linewidth": 1, @@ -3206,15 +3206,15 @@ data: }, "id": 9, "legend": { - "alignAsTable": "true", + "alignAsTable": true, "avg": false, - "current": "true", + "current": true, "max": false, "min": false, - "rightSide": "true", - "show": "true", + "rightSide": true, + "show": true, "total": false, - "values": "true" + "values": true }, "lines": true, "linewidth": 1, @@ -3588,8 +3588,8 @@ data: "list": [ { "current": { - "text": "Prometheus", - "value": "Prometheus" + "text": "default", + "value": "default" }, "hide": 0, "label": null, diff --git a/addons/grafana/dashboards-k8s-resources-1.yaml b/addons/grafana/dashboards-k8s-resources-1.yaml index 110f2f33..b59a48be 100644 --- a/addons/grafana/dashboards-k8s-resources-1.yaml +++ b/addons/grafana/dashboards-k8s-resources-1.yaml @@ -2458,8 +2458,8 @@ data: "list": [ { "current": { - "text": "Prometheus", - "value": "Prometheus" + "text": "default", + "value": "default" }, "hide": 0, "label": null, @@ -2508,7 +2508,7 @@ data: "text": "5m", "value": "5m" }, - "datasource": "prometheus", + "datasource": "$datasource", "hide": 2, "includeAll": false, "label": null, @@ -2533,6 +2533,33 @@ data: "tagsQuery": "", "type": "interval", "useTags": false + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(node_cpu_seconds_total, cluster)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false } ] }, @@ -2586,6 +2613,354 @@ data: ], "refresh": "10s", "rows": [ + { + "collapse": false, + "height": "100px", 
+ "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "format": "percentunit", + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"})", + "format": "time_series", + "instant": true, + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "70,80", + "timeFrom": null, + "timeShift": null, + "title": "CPU Utilisation (from requests)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "singlestat", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "format": "percentunit", + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + 
"seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"})", + "format": "time_series", + "instant": true, + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "70,80", + "timeFrom": null, + "timeShift": null, + "title": "CPU Utilisation (from limits)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "singlestat", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "format": "percentunit", + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\"}) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\"})", + "format": "time_series", + "instant": true, + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "70,80", + "timeFrom": null, + "timeShift": null, + "title": "Memory 
Utilization (from requests)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "singlestat", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "format": "percentunit", + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\"}) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\"})", + "format": "time_series", + "instant": true, + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "70,80", + "timeFrom": null, + "timeShift": null, + "title": "Memory Utilisation (from limits)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "singlestat", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": 
null, + "showTitle": false, + "title": "Headlines", + "titleSize": "h6" + }, { "collapse": false, "height": "250px", @@ -2599,7 +2974,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 1, + "id": 5, "legend": { "avg": false, "current": false, @@ -2620,7 +2995,26 @@ data: "points": false, "renderer": "flot", "seriesOverrides": [ - + { + "alias": "quota - requests", + "color": "#F2495C", + "dashes": true, + "fill": 0, + "hideTooltip": true, + "legend": false, + "linewidth": 2, + "stack": false + }, + { + "alias": "quota - limits", + "color": "#FF9830", + "dashes": true, + "fill": 0, + "hideTooltip": true, + "legend": false, + "linewidth": 2, + "stack": false + } ], "spaceLength": 10, "span": 12, @@ -2634,6 +3028,22 @@ data: "legendFormat": "{{pod}}", "legendLink": null, "step": 10 + }, + { + "expr": "scalar(kube_resourcequota{cluster=\"$cluster\", namespace=\"$namespace\", type=\"hard\",resource=\"requests.cpu\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "quota - requests", + "legendLink": null, + "step": 10 + }, + { + "expr": "scalar(kube_resourcequota{cluster=\"$cluster\", namespace=\"$namespace\", type=\"hard\",resource=\"limits.cpu\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "quota - limits", + "legendLink": null, + "step": 10 } ], "thresholds": [ @@ -2697,7 +3107,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 2, + "id": 6, "legend": { "avg": false, "current": false, @@ -2964,7 +3374,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 3, + "id": 7, "legend": { "avg": false, "current": false, @@ -2985,7 +3395,26 @@ data: "points": false, "renderer": "flot", "seriesOverrides": [ - + { + "alias": "quota - requests", + "color": "#F2495C", + "dashes": true, + "fill": 0, + "hideTooltip": true, + "legend": false, + "linewidth": 2, + "stack": false + }, + { + "alias": "quota - limits", + "color": "#FF9830", + "dashes": true, + "fill": 
0, + "hideTooltip": true, + "legend": false, + "linewidth": 2, + "stack": false + } ], "spaceLength": 10, "span": 12, @@ -2999,6 +3428,22 @@ data: "legendFormat": "{{pod}}", "legendLink": null, "step": 10 + }, + { + "expr": "scalar(kube_resourcequota{cluster=\"$cluster\", namespace=\"$namespace\", type=\"hard\",resource=\"requests.memory\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "quota - requests", + "legendLink": null, + "step": 10 + }, + { + "expr": "scalar(kube_resourcequota{cluster=\"$cluster\", namespace=\"$namespace\", type=\"hard\",resource=\"limits.memory\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "quota - limits", + "legendLink": null, + "step": 10 } ], "thresholds": [ @@ -3062,7 +3507,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 4, + "id": 8, "legend": { "avg": false, "current": false, @@ -3410,7 +3855,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 5, + "id": 9, "legend": { "avg": false, "current": false, @@ -3704,7 +4149,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 6, + "id": 10, "legend": { "avg": false, "current": false, @@ -3802,7 +4247,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 7, + "id": 11, "legend": { "avg": false, "current": false, @@ -3900,7 +4345,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 8, + "id": 12, "legend": { "avg": false, "current": false, @@ -3998,7 +4443,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 9, + "id": 13, "legend": { "avg": false, "current": false, @@ -4096,7 +4541,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 10, + "id": 14, "legend": { "avg": false, "current": false, @@ -4194,7 +4639,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 11, + "id": 15, "legend": { "avg": false, "current": false, @@ -4289,8 +4734,8 @@ data: 
"list": [ { "current": { - "text": "Prometheus", - "value": "Prometheus" + "text": "default", + "value": "default" }, "hide": 0, "label": null, @@ -4303,60 +4748,6 @@ data: "regex": "", "type": "datasource" }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 2, - "includeAll": false, - "label": "cluster", - "multi": false, - "name": "cluster", - "options": [ - - ], - "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "namespace", - "multi": false, - "name": "namespace", - "options": [ - - ], - "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, { "allValue": null, "auto": false, @@ -4366,7 +4757,7 @@ data: "text": "5m", "value": "5m" }, - "datasource": "prometheus", + "datasource": "$datasource", "hide": 2, "includeAll": false, "label": null, @@ -4391,6 +4782,60 @@ data: "tagsQuery": "", "type": "interval", "useTags": false + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kube_pod_info, cluster)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": 
"namespace", + "options": [ + + ], + "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false } ] }, @@ -5265,8 +5710,8 @@ data: "list": [ { "current": { - "text": "Prometheus", - "value": "Prometheus" + "text": "default", + "value": "default" }, "hide": 0, "label": null, @@ -5281,14 +5726,49 @@ data: }, { "allValue": null, + "auto": false, + "auto_count": 30, + "auto_min": "10s", "current": { - "text": "prod", - "value": "prod" + "text": "5m", + "value": "5m" }, "datasource": "$datasource", "hide": 2, "includeAll": false, - "label": "cluster", + "label": null, + "multi": false, + "name": "interval", + "options": [ + { + "selected": true, + "text": "4h", + "value": "4h" + } + ], + "query": "4h", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "interval", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": null, "multi": false, "name": "cluster", "options": [ @@ -5297,7 +5777,7 @@ data: "query": "label_values(kube_pod_info, cluster)", "refresh": 1, "regex": "", - "sort": 2, + "sort": 1, "tagValuesQuery": "", "tags": [ @@ -5309,13 +5789,13 @@ data: { "allValue": null, "current": { - "text": "prod", - "value": "prod" + "text": "", + "value": "" }, "datasource": "$datasource", "hide": 0, "includeAll": false, - "label": "node", + "label": null, "multi": false, "name": "node", "options": [ @@ -5324,7 +5804,7 @@ data: "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, node)", "refresh": 1, "regex": "", - "sort": 2, + "sort": 1, "tagValuesQuery": "", "tags": [ diff --git a/addons/grafana/dashboards-k8s-resources-2.yaml b/addons/grafana/dashboards-k8s-resources-2.yaml index 60dcfec4..2475dbac 
100644 --- a/addons/grafana/dashboards-k8s-resources-2.yaml +++ b/addons/grafana/dashboards-k8s-resources-2.yaml @@ -50,7 +50,24 @@ data: "points": false, "renderer": "flot", "seriesOverrides": [ - + { + "alias": "requests", + "color": "#F2495C", + "fill": 0, + "hideTooltip": true, + "legend": true, + "linewidth": 2, + "stack": false + }, + { + "alias": "limits", + "color": "#FF9830", + "fill": 0, + "hideTooltip": true, + "legend": true, + "linewidth": 2, + "stack": false + } ], "spaceLength": 10, "span": 12, @@ -64,6 +81,22 @@ data: "legendFormat": "{{container}}", "legendLink": null, "step": 10 + }, + { + "expr": "sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"})\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "requests", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"})\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "limits", + "legendLink": null, + "step": 10 } ], "thresholds": [ @@ -126,8 +159,113 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, + "fill": 10, "id": 2, + "legend": { + "avg": false, + "current": true, + "max": true, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(container_cpu_cfs_throttled_periods_total{namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", cluster=\"$cluster\"}[5m])) by (container) /sum(increase(container_cpu_cfs_periods_total{namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", cluster=\"$cluster\"}[5m])) 
by (container)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{container}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 1, + "yaxis": "left" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Throttling", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CPU Throttling", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 3, "legend": { "avg": false, "current": false, @@ -394,7 +532,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 3, + "id": 4, "legend": { "avg": false, "current": false, @@ -415,7 +553,26 @@ data: "points": false, "renderer": "flot", "seriesOverrides": [ - + { + "alias": "requests", + "color": "#F2495C", + "dashes": true, + "fill": 0, + "hideTooltip": true, + "legend": false, + "linewidth": 2, + "stack": false + }, + { + "alias": "limits", + "color": "#FF9830", + "dashes": true, + "fill": 0, + "hideTooltip": true, + "legend": false, + "linewidth": 2, + "stack": false + } ], "spaceLength": 10, "span": 12, @@ -423,26 +580,26 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_rss{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", container!=\"\"}) by (container)", 
+ "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", container!=\"\"}) by (container)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{container}} (RSS)", + "legendFormat": "{{container}}", "legendLink": null, "step": 10 }, { - "expr": "sum(container_memory_cache{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", container!=\"\"}) by (container)", + "expr": "sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"})\n", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{container}} (Cache)", + "legendFormat": "requests", "legendLink": null, "step": 10 }, { - "expr": "sum(container_memory_swap{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", container!=\"\"}) by (container)", + "expr": "sum(\n kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"})\n", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{container}} (Swap)", + "legendFormat": "limits", "legendLink": null, "step": 10 } @@ -508,7 +665,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 4, + "id": 5, "legend": { "avg": false, "current": false, @@ -856,7 +1013,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 5, + "id": 6, "legend": { "avg": false, "current": false, @@ -954,7 +1111,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 6, + "id": 7, "legend": { "avg": false, "current": false, @@ -1052,7 +1209,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 7, + "id": 8, "legend": { "avg": false, "current": false, @@ -1150,7 +1307,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 8, + "id": 9, "legend": { "avg": false, "current": false, @@ -1248,7 +1405,7 @@ data: "dashes": 
false, "datasource": "$datasource", "fill": 10, - "id": 9, + "id": 10, "legend": { "avg": false, "current": false, @@ -1346,7 +1503,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 10, + "id": 11, "legend": { "avg": false, "current": false, @@ -1441,8 +1598,8 @@ data: "list": [ { "current": { - "text": "Prometheus", - "value": "Prometheus" + "text": "default", + "value": "default" }, "hide": 0, "label": null, @@ -1455,87 +1612,6 @@ data: "regex": "", "type": "datasource" }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 2, - "includeAll": false, - "label": "cluster", - "multi": false, - "name": "cluster", - "options": [ - - ], - "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "namespace", - "multi": false, - "name": "namespace", - "options": [ - - ], - "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "pod", - "multi": false, - "name": "pod", - "options": [ - - ], - "query": "label_values(kube_pod_info{cluster=\"$cluster\", namespace=\"$namespace\"}, pod)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, { "allValue": null, "auto": false, @@ -1545,7 +1621,7 @@ data: "text": "5m", "value": "5m" }, - "datasource": "prometheus", + 
"datasource": "$datasource", "hide": 2, "includeAll": false, "label": null, @@ -1570,6 +1646,87 @@ data: "tagsQuery": "", "type": "interval", "useTags": false + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kube_pod_info, cluster)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "namespace", + "options": [ + + ], + "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "pod", + "options": [ + + ], + "query": "label_values(kube_pod_info{cluster=\"$cluster\", namespace=\"$namespace\"}, pod)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false } ] }, @@ -3441,8 +3598,8 @@ data: "list": [ { "current": { - "text": "Prometheus", - "value": "Prometheus" + "text": "default", + "value": "default" }, "hide": 0, "label": null, @@ -3455,114 +3612,6 @@ data: "regex": "", "type": "datasource" }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 2, - "includeAll": false, - "label": "cluster", - "multi": false, - "name": "cluster", - "options": [ - - ], - "query": "label_values(kube_pod_info, 
cluster)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "namespace", - "multi": false, - "name": "namespace", - "options": [ - - ], - "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "workload", - "multi": false, - "name": "workload", - "options": [ - - ], - "query": "label_values(mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}, workload)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "type", - "multi": false, - "name": "type", - "options": [ - - ], - "query": "label_values(mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\"}, workload_type)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, { "allValue": null, "auto": false, @@ -3572,7 +3621,7 @@ data: "text": "5m", "value": "5m" }, - "datasource": "prometheus", + "datasource": "$datasource", "hide": 2, "includeAll": false, "label": null, @@ -3597,6 +3646,114 @@ data: "tagsQuery": "", "type": "interval", "useTags": false + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": 
"$datasource", + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kube_pod_info, cluster)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "namespace", + "options": [ + + ], + "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "workload", + "options": [ + + ], + "query": "label_values(mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}, workload)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "type", + "options": [ + + ], + "query": "label_values(mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\"}, workload_type)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false } ] }, @@ -3684,7 +3841,26 @@ data: "points": false, "renderer": "flot", "seriesOverrides": [ - + { + "alias": "quota - requests", + "color": "#F2495C", + "dashes": true, + "fill": 0, + "hideTooltip": true, + "legend": false, + "linewidth": 2, + "stack": false + 
}, + { + "alias": "quota - limits", + "color": "#FF9830", + "dashes": true, + "fill": 0, + "hideTooltip": true, + "legend": false, + "linewidth": 2, + "stack": false + } ], "spaceLength": 10, "span": 12, @@ -3698,6 +3874,22 @@ data: "legendFormat": "{{workload}} - {{workload_type}}", "legendLink": null, "step": 10 + }, + { + "expr": "scalar(kube_resourcequota{cluster=\"$cluster\", namespace=\"$namespace\", type=\"hard\",resource=\"requests.cpu\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "quota - requests", + "legendLink": null, + "step": 10 + }, + { + "expr": "scalar(kube_resourcequota{cluster=\"$cluster\", namespace=\"$namespace\", type=\"hard\",resource=\"limits.cpu\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "quota - limits", + "legendLink": null, + "step": 10 } ], "thresholds": [ @@ -4094,7 +4286,26 @@ data: "points": false, "renderer": "flot", "seriesOverrides": [ - + { + "alias": "quota - requests", + "color": "#F2495C", + "dashes": true, + "fill": 0, + "hideTooltip": true, + "legend": false, + "linewidth": 2, + "stack": false + }, + { + "alias": "quota - limits", + "color": "#FF9830", + "dashes": true, + "fill": 0, + "hideTooltip": true, + "legend": false, + "linewidth": 2, + "stack": false + } ], "spaceLength": 10, "span": 12, @@ -4108,6 +4319,22 @@ data: "legendFormat": "{{workload}} - {{workload_type}}", "legendLink": null, "step": 10 + }, + { + "expr": "scalar(kube_resourcequota{cluster=\"$cluster\", namespace=\"$namespace\", type=\"hard\",resource=\"requests.memory\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "quota - requests", + "legendLink": null, + "step": 10 + }, + { + "expr": "scalar(kube_resourcequota{cluster=\"$cluster\", namespace=\"$namespace\", type=\"hard\",resource=\"limits.memory\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "quota - limits", + "legendLink": null, + "step": 10 } ], "thresholds": [ @@ -5576,8 +5803,8 @@ data: 
"list": [ { "current": { - "text": "Prometheus", - "value": "Prometheus" + "text": "default", + "value": "default" }, "hide": 0, "label": null, @@ -5590,60 +5817,6 @@ data: "regex": "", "type": "datasource" }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 2, - "includeAll": false, - "label": "cluster", - "multi": false, - "name": "cluster", - "options": [ - - ], - "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "namespace", - "multi": false, - "name": "namespace", - "options": [ - - ], - "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, { "allValue": null, "auto": false, @@ -5653,7 +5826,7 @@ data: "text": "5m", "value": "5m" }, - "datasource": "prometheus", + "datasource": "$datasource", "hide": 2, "includeAll": false, "label": null, @@ -5706,6 +5879,60 @@ data: "tagValuesQuery": "", "tags": [ + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kube_pod_info, cluster)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": 
null, + "multi": false, + "name": "namespace", + "options": [ + + ], + "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + ], "tagsQuery": "", "type": "query", diff --git a/addons/grafana/dashboards-k8s.yaml b/addons/grafana/dashboards-k8s.yaml index 56a2e5bc..744fa39f 100644 --- a/addons/grafana/dashboards-k8s.yaml +++ b/addons/grafana/dashboards-k8s.yaml @@ -21,7 +21,7 @@ data: "links": [ ], - "refresh": "", + "refresh": "10s", "rows": [ { "collapse": false, @@ -88,7 +88,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "sum(up{job=\"apiserver\"})", + "expr": "sum(up{job=\"apiserver\", cluster=\"$cluster\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -155,28 +155,28 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(apiserver_request_total{job=\"apiserver\", instance=~\"$instance\",code=~\"2..\"}[5m]))", + "expr": "sum(rate(apiserver_request_total{job=\"apiserver\", instance=~\"$instance\",code=~\"2..\", cluster=\"$cluster\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "2xx", "refId": "A" }, { - "expr": "sum(rate(apiserver_request_total{job=\"apiserver\", instance=~\"$instance\",code=~\"3..\"}[5m]))", + "expr": "sum(rate(apiserver_request_total{job=\"apiserver\", instance=~\"$instance\",code=~\"3..\", cluster=\"$cluster\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "3xx", "refId": "B" }, { - "expr": "sum(rate(apiserver_request_total{job=\"apiserver\", instance=~\"$instance\",code=~\"4..\"}[5m]))", + "expr": "sum(rate(apiserver_request_total{job=\"apiserver\", instance=~\"$instance\",code=~\"4..\", cluster=\"$cluster\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "4xx", "refId": "C" }, { - "expr": "sum(rate(apiserver_request_total{job=\"apiserver\", instance=~\"$instance\",code=~\"5..\"}[5m]))", + "expr": 
"sum(rate(apiserver_request_total{job=\"apiserver\", instance=~\"$instance\",code=~\"5..\", cluster=\"$cluster\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "5xx", @@ -237,15 +237,15 @@ data: }, "id": 4, "legend": { - "alignAsTable": "true", + "alignAsTable": true, "avg": false, - "current": "true", + "current": true, "max": false, "min": false, - "rightSide": "true", - "show": "true", + "rightSide": true, + "show": true, "total": false, - "values": "true" + "values": true }, "lines": true, "linewidth": 1, @@ -267,7 +267,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\", instance=~\"$instance\"}[5m])) by (verb, le))", + "expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\", instance=~\"$instance\", verb!=\"WATCH\", cluster=\"$cluster\"}[5m])) by (verb, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{verb}}", @@ -371,7 +371,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(workqueue_adds_total{job=\"apiserver\", instance=~\"$instance\"}[5m])) by (instance, name)", + "expr": "sum(rate(workqueue_adds_total{job=\"apiserver\", instance=~\"$instance\", cluster=\"$cluster\"}[5m])) by (instance, name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} {{name}}", @@ -462,7 +462,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(workqueue_depth{job=\"apiserver\", instance=~\"$instance\"}[5m])) by (instance, name)", + "expr": "sum(rate(workqueue_depth{job=\"apiserver\", instance=~\"$instance\", cluster=\"$cluster\"}[5m])) by (instance, name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} {{name}}", @@ -523,15 +523,15 @@ data: }, "id": 7, "legend": { - "alignAsTable": "true", + "alignAsTable": true, "avg": false, - "current": "true", + "current": true, "max": false, "min": false, - 
"rightSide": "true", - "show": "true", + "rightSide": true, + "show": true, "total": false, - "values": "true" + "values": true }, "lines": true, "linewidth": 1, @@ -553,7 +553,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{job=\"apiserver\", instance=~\"$instance\"}[5m])) by (instance, name, le))", + "expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{job=\"apiserver\", instance=~\"$instance\", cluster=\"$cluster\"}[5m])) by (instance, name, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} {{name}}", @@ -657,7 +657,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "etcd_helper_cache_entry_total{job=\"apiserver\", instance=~\"$instance\"}", + "expr": "etcd_helper_cache_entry_total{job=\"apiserver\", instance=~\"$instance\", cluster=\"$cluster\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -748,14 +748,14 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(etcd_helper_cache_hit_total{job=\"apiserver\",instance=~\"$instance\"}[5m])) by (intance)", + "expr": "sum(rate(etcd_helper_cache_hit_total{job=\"apiserver\",instance=~\"$instance\", cluster=\"$cluster\"}[5m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} hit", "refId": "A" }, { - "expr": "sum(rate(etcd_helper_cache_miss_total{job=\"apiserver\",instance=~\"$instance\"}[5m])) by (instance)", + "expr": "sum(rate(etcd_helper_cache_miss_total{job=\"apiserver\",instance=~\"$instance\", cluster=\"$cluster\"}[5m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} miss", @@ -846,14 +846,14 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99,sum(rate(etcd_request_cache_get_duration_seconds_bucket{job=\"apiserver\",instance=~\"$instance\"}[5m])) by (instance, le))", + "expr": 
"histogram_quantile(0.99,sum(rate(etcd_request_cache_get_duration_seconds_bucket{job=\"apiserver\",instance=~\"$instance\", cluster=\"$cluster\"}[5m])) by (instance, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} get", "refId": "A" }, { - "expr": "histogram_quantile(0.99,sum(rate(etcd_request_cache_add_duration_seconds_bucket{job=\"apiserver\",instance=~\"$instance\"}[5m])) by (instance, le))", + "expr": "histogram_quantile(0.99,sum(rate(etcd_request_cache_add_duration_seconds_bucket{job=\"apiserver\",instance=~\"$instance\", cluster=\"$cluster\"}[5m])) by (instance, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} miss", @@ -957,7 +957,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "process_resident_memory_bytes{job=\"apiserver\",instance=~\"$instance\"}", + "expr": "process_resident_memory_bytes{job=\"apiserver\",instance=~\"$instance\", cluster=\"$cluster\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -1048,7 +1048,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "rate(process_cpu_seconds_total{job=\"apiserver\",instance=~\"$instance\"}[5m])", + "expr": "rate(process_cpu_seconds_total{job=\"apiserver\",instance=~\"$instance\", cluster=\"$cluster\"}[5m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -1139,7 +1139,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "go_goroutines{job=\"apiserver\",instance=~\"$instance\"}", + "expr": "go_goroutines{job=\"apiserver\",instance=~\"$instance\", cluster=\"$cluster\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -1205,8 +1205,8 @@ data: "list": [ { "current": { - "text": "Prometheus", - "value": "Prometheus" + "text": "default", + "value": "default" }, "hide": 0, "label": null, @@ -1219,6 +1219,33 @@ data: "regex": "", "type": "datasource" }, + { + "allValue": null, + "current": { + "text": "prod", + "value": 
"prod" + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(apiserver_request_total, cluster)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, { "allValue": null, "current": { @@ -1233,7 +1260,7 @@ data: "options": [ ], - "query": "label_values(apiserver_request_total{job=\"apiserver\"}, instance)", + "query": "label_values(apiserver_request_total{job=\"apiserver\", cluster=\"$cluster\"}, instance)", "refresh": 2, "regex": "", "sort": 1, @@ -1302,7 +1329,7 @@ data: "links": [ ], - "refresh": "", + "refresh": "10s", "rows": [ { "collapse": false, @@ -1406,15 +1433,15 @@ data: }, "id": 3, "legend": { - "alignAsTable": "true", + "alignAsTable": true, "avg": false, - "current": "true", + "current": true, "max": false, "min": false, - "rightSide": "true", - "show": "true", + "rightSide": true, + "show": true, "total": false, - "values": "true" + "values": true }, "lines": true, "linewidth": 1, @@ -1510,15 +1537,15 @@ data: }, "id": 4, "legend": { - "alignAsTable": "true", + "alignAsTable": true, "avg": false, - "current": "true", + "current": true, "max": false, "min": false, - "rightSide": "true", - "show": "true", + "rightSide": true, + "show": true, "total": false, - "values": "true" + "values": true }, "lines": true, "linewidth": 1, @@ -1614,15 +1641,15 @@ data: }, "id": 5, "legend": { - "alignAsTable": "true", + "alignAsTable": true, "avg": false, - "current": "true", + "current": true, "max": false, "min": false, - "rightSide": "true", - "show": "true", + "rightSide": true, + "show": true, "total": false, - "values": "true" + "values": true }, "lines": true, "linewidth": 1, @@ -1934,15 +1961,15 @@ data: }, "id": 8, "legend": { - "alignAsTable": "true", + "alignAsTable": true, "avg": false, - "current": "true", + "current": true, "max": false, 
"min": false, - "rightSide": "true", - "show": "true", + "rightSide": true, + "show": true, "total": false, - "values": "true" + "values": true }, "lines": true, "linewidth": 1, @@ -2316,8 +2343,8 @@ data: "list": [ { "current": { - "text": "Prometheus", - "value": "Prometheus" + "text": "default", + "value": "default" }, "hide": 0, "label": null, @@ -2413,7 +2440,7 @@ data: "links": [ ], - "refresh": "", + "refresh": "10s", "rows": [ { "collapse": false, @@ -2815,8 +2842,8 @@ data: "list": [ { "current": { - "text": "Prometheus", - "value": "Prometheus" + "text": "default", + "value": "default" }, "hide": 0, "label": null, @@ -2943,664 +2970,6 @@ data: "uid": "919b92a8e8041bd567af9edab12c840c", "version": 0 } - pods.json: |- - { - "__inputs": [ - - ], - "__requires": [ - - ], - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": "$datasource", - "enable": true, - "expr": "time() == BOOL timestamp(rate(kube_pod_container_status_restarts_total{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[2m]) > 0)", - "hide": false, - "iconColor": "rgba(215, 44, 44, 1)", - "name": "Restarts", - "showIn": 0, - "tags": [ - "restart" - ], - "type": "rows" - } - ] - }, - "editable": false, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [ - - ], - "refresh": "", - "rows": [ - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "gridPos": { - - }, - "id": 2, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - 
"spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by(container) (container_memory_usage_bytes{job=\"kubernetes-cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container=~\"$container\", container!=\"POD\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Current: {{ container }}", - "refId": "A" - }, - { - "expr": "sum by(container) (kube_pod_container_resource_requests{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\", pod=\"$pod\", container=~\"$container\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Requested: {{ container }}", - "refId": "B" - }, - { - "expr": "sum by(container) (kube_pod_container_resource_limits{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\", pod=\"$pod\", container=~\"$container\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Limit: {{ container }}", - "refId": "C" - }, - { - "expr": "sum by(container) (container_memory_cache{job=\"kubernetes-cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\", container=~\"$container\", container!=\"POD\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Cache: {{ container }}", - "refId": "D" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory Usage", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": 
false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "gridPos": { - - }, - "id": 3, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by (container) (irate(container_cpu_usage_seconds_total{job=\"kubernetes-cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", image!=\"\", pod=\"$pod\", container=~\"$container\", container!=\"POD\"}[4m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Current: {{ container }}", - "refId": "A" - }, - { - "expr": "sum by(container) (kube_pod_container_resource_requests{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\", pod=\"$pod\", container=~\"$container\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Requested: {{ container }}", - "refId": "B" - }, - { - "expr": "sum by(container) (kube_pod_container_resource_limits{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\", pod=\"$pod\", container=~\"$container\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Limit: {{ container }}", - "refId": "C" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Usage", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - 
"buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "gridPos": { - - }, - "id": 4, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sort_desc(sum by (pod) (irate(container_network_receive_bytes_total{job=\"kubernetes-cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[4m])))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "RX: {{ pod }}", - "refId": "A" - }, - { - "expr": "sort_desc(sum by (pod) (irate(container_network_transmit_bytes_total{job=\"kubernetes-cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[4m])))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "TX: {{ pod }}", - "refId": "B" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Network I/O", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": 
null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "gridPos": { - - }, - "id": 5, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "max by (container) (kube_pod_container_status_restarts_total{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container=~\"$container\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Restarts: {{ container }}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Total Restarts Per Container", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - 
"max": null, - "min": 0, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - "kubernetes-mixin" - ], - "templating": { - "list": [ - { - "current": { - "text": "Prometheus", - "value": "Prometheus" - }, - "hide": 0, - "label": null, - "name": "datasource", - "options": [ - - ], - "query": "prometheus", - "refresh": 1, - "regex": "", - "type": "datasource" - }, - { - "allValue": null, - "current": { - - }, - "datasource": "$datasource", - "hide": 2, - "includeAll": false, - "label": "cluster", - "multi": false, - "name": "cluster", - "options": [ - - ], - "query": "label_values(kube_pod_info, cluster)", - "refresh": 2, - "regex": "", - "sort": 1, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "Namespace", - "multi": false, - "name": "namespace", - "options": [ - - ], - "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", - "refresh": 2, - "regex": "", - "sort": 1, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "Pod", - "multi": false, - "name": "pod", - "options": [ - - ], - "query": "label_values(kube_pod_info{cluster=\"$cluster\", namespace=~\"$namespace\"}, pod)", - "refresh": 2, - "regex": "", - "sort": 1, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": true, - "label": "Container", - "multi": false, - "name": "container", - 
"options": [ - - ], - "query": "label_values(kube_pod_container_info{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}, container)", - "refresh": 2, - "regex": "", - "sort": 1, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "", - "title": "Kubernetes / Pods", - "uid": "ab4f13a9892a76a4d21ce8c2445bf4ea", - "version": 0 - } scheduler.json: |- { "__inputs": [ @@ -3622,7 +2991,7 @@ data: "links": [ ], - "refresh": "", + "refresh": "10s", "rows": [ { "collapse": false, @@ -3726,15 +3095,15 @@ data: }, "id": 3, "legend": { - "alignAsTable": "true", + "alignAsTable": true, "avg": false, - "current": "true", + "current": true, "max": false, "min": false, - "rightSide": "true", - "show": "true", + "rightSide": true, + "show": true, "total": false, - "values": "true" + "values": true }, "lines": true, "linewidth": 1, @@ -3838,15 +3207,15 @@ data: }, "id": 4, "legend": { - "alignAsTable": "true", + "alignAsTable": true, "avg": false, - "current": "true", + "current": true, "max": false, "min": false, - "rightSide": "true", - "show": "true", + "rightSide": true, + "show": true, "total": false, - "values": "true" + "values": true }, "lines": true, "linewidth": 1, @@ -4179,15 +3548,15 @@ data: }, "id": 7, "legend": { - "alignAsTable": "true", + "alignAsTable": true, "avg": false, - "current": "true", + "current": true, "max": false, "min": false, - "rightSide": "true", - "show": "true", + "rightSide": true, + "show": true, "total": false, - "values": "true" + "values": true }, "lines": true, "linewidth": 1, @@ -4561,8 +3930,8 @@ data: "list": [ { "current": { - "text": "Prometheus", - "value": "Prometheus" + 
"text": "default", + "value": "default" }, "hide": 0, "label": null, @@ -4637,910 +4006,6 @@ data: "uid": "2e6b6a3b4bddf1427b3a55aa1311c656", "version": 0 } - statefulset.json: |- - { - "__inputs": [ - - ], - "__requires": [ - - ], - "annotations": { - "list": [ - - ] - }, - "editable": false, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [ - - ], - "refresh": "", - "rows": [ - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 2, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "cores", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(rate(container_cpu_usage_seconds_total{job=\"kubernetes-cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$statefulset.*\"}[3m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "title": "CPU", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - 
"colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 3, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "GB", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(container_memory_usage_bytes{job=\"kubernetes-cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$statefulset.*\"}) / 1024^3", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "title": "Memory", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 4, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - 
"nullText": null, - "postfix": "Bps", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(rate(container_network_transmit_bytes_total{job=\"kubernetes-cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$statefulset.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\",pod=~\"$statefulset.*\"}[3m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "title": "Network", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "height": "100px", - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 5, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - 
"to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "max(kube_statefulset_replicas{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "title": "Desired Replicas", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 6, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "min(kube_statefulset_status_replicas_current{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - 
"thresholds": "", - "title": "Replicas of current version", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 7, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "max(kube_statefulset_status_observed_generation{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "title": "Observed Generation", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 
100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 8, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "max(kube_statefulset_metadata_generation{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "title": "Metadata Generation", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "gridPos": { - - }, - "id": 9, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - 
"renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "max(kube_statefulset_replicas{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "replicas specified", - "refId": "A" - }, - { - "expr": "max(kube_statefulset_status_replicas{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "replicas created", - "refId": "B" - }, - { - "expr": "min(kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "ready", - "refId": "C" - }, - { - "expr": "min(kube_statefulset_status_replicas_current{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "replicas of current version", - "refId": "D" - }, - { - "expr": "min(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "updated", - "refId": "E" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Replicas", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": 
null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - "kubernetes-mixin" - ], - "templating": { - "list": [ - { - "current": { - "text": "Prometheus", - "value": "Prometheus" - }, - "hide": 0, - "label": null, - "name": "datasource", - "options": [ - - ], - "query": "prometheus", - "refresh": 1, - "regex": "", - "type": "datasource" - }, - { - "allValue": null, - "current": { - - }, - "datasource": "$datasource", - "hide": 2, - "includeAll": false, - "label": "cluster", - "multi": false, - "name": "cluster", - "options": [ - - ], - "query": "label_values(kube_statefulset_metadata_generation, cluster)", - "refresh": 2, - "regex": "", - "sort": 1, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "Namespace", - "multi": false, - "name": "namespace", - "options": [ - - ], - "query": "label_values(kube_statefulset_metadata_generation{job=\"kube-state-metrics\", cluster=\"$cluster\"}, namespace)", - "refresh": 2, - "regex": "", - "sort": 1, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "Name", - "multi": false, - "name": "statefulset", - "options": [ - - ], - "query": "label_values(kube_statefulset_metadata_generation{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\"}, statefulset)", - "refresh": 2, - "regex": "", - "sort": 1, - "tagValuesQuery": "", - "tags": [ - - ], 
- "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "", - "title": "Kubernetes / StatefulSets", - "uid": "a31c1f46e6f727cb37c0d731a7245005", - "version": 0 - } kind: ConfigMap metadata: name: grafana-dashboards-k8s diff --git a/addons/grafana/dashboards-prom.yaml b/addons/grafana/dashboards-prom.yaml index 0c97844c..3eb0c030 100644 --- a/addons/grafana/dashboards-prom.yaml +++ b/addons/grafana/dashboards-prom.yaml @@ -2,6 +2,12 @@ apiVersion: v1 data: prometheus-remote-write.json: |- { + "__inputs": [ + + ], + "__requires": [ + + ], "annotations": { "list": [ @@ -11,14 +17,15 @@ data: "gnetId": null, "graphTooltip": 0, "hideControls": false, + "id": null, "links": [ ], - "refresh": "10s", + "refresh": "", "rows": [ { "collapse": false, - "height": "250px", + "collapsed": false, "panels": [ { "aliasColors": { @@ -29,12 +36,17 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 1, + "gridPos": { + + }, + "id": 2, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, + "rightSide": false, "show": true, "total": false, "values": false @@ -44,11 +56,12 @@ data: "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], @@ -58,12 +71,11 @@ data: "steppedLine": false, "targets": [ { - "expr": "prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"} - ignoring(queue) group_right(instance) prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}", + "expr": "(\n 
prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"} \n- \n ignoring(queue) group_right(instance) prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}\n)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", - "legendLink": null, - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -89,11 +101,11 @@ data: }, "yaxes": [ { - "format": "s", + "format": "short", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { @@ -102,7 +114,7 @@ data: "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] }, @@ -115,12 +127,17 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 2, + "gridPos": { + + }, + "id": 3, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, + "rightSide": false, "show": true, "total": false, "values": false @@ -130,11 +147,12 @@ data: "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], @@ -144,12 +162,11 @@ data: "steppedLine": false, "targets": [ { - "expr": "rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) - ignoring (queue) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", + "expr": "(\n rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) \n- \n ignoring (queue) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", - "legendLink": 
null, - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -179,7 +196,7 @@ data: "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { @@ -188,7 +205,7 @@ data: "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] } @@ -198,11 +215,12 @@ data: "repeatRowId": null, "showTitle": true, "title": "Timestamps", - "titleSize": "h6" + "titleSize": "h6", + "type": "row" }, { "collapse": false, - "height": "250px", + "collapsed": false, "panels": [ { "aliasColors": { @@ -213,12 +231,17 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 3, + "gridPos": { + + }, + "id": 4, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, + "rightSide": false, "show": true, "total": false, "values": false @@ -228,11 +251,12 @@ data: "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], @@ -242,12 +266,11 @@ data: "steppedLine": false, "targets": [ { - "expr": "rate(prometheus_remote_storage_samples_in_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])- ignoring(queue) group_right(instance) rate(prometheus_remote_storage_succeeded_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) - rate(prometheus_remote_storage_dropped_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", + "expr": "rate(\n prometheus_remote_storage_samples_in_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n- \n ignoring(queue) group_right(instance) rate(prometheus_remote_storage_succeeded_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) \n- \n rate(prometheus_remote_storage_dropped_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", - "legendLink": null, - "step": 10 + "refId": 
"A" } ], "thresholds": [ @@ -277,7 +300,7 @@ data: "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { @@ -286,7 +309,7 @@ data: "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] } @@ -296,11 +319,12 @@ data: "repeatRowId": null, "showTitle": true, "title": "Samples", - "titleSize": "h6" + "titleSize": "h6", + "type": "row" }, { "collapse": false, - "height": "250px", + "collapsed": false, "panels": [ { "aliasColors": { @@ -311,12 +335,17 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 4, + "gridPos": { + + }, + "id": 5, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, + "rightSide": false, "show": true, "total": false, "values": false @@ -326,16 +355,18 @@ data: "links": [ ], - "nullPointMode": "null as zero", + "minSpan": 6, + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, - "span": 6, + "span": 12, "stack": false, "steppedLine": false, "targets": [ @@ -344,8 +375,7 @@ data: "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", - "legendLink": null, - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -353,7 +383,7 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Num. 
Shards", + "title": "Current Shards", "tooltip": { "shared": true, "sort": 0, @@ -375,7 +405,7 @@ data: "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { @@ -384,7 +414,7 @@ data: "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] }, @@ -397,12 +427,17 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 5, + "gridPos": { + + }, + "id": 6, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, + "rightSide": false, "show": true, "total": false, "values": false @@ -412,11 +447,298 @@ data: "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "prometheus_remote_storage_shards_max{cluster=~\"$cluster\", instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Max Shards", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 7, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": 
false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "prometheus_remote_storage_shards_min{cluster=~\"$cluster\", instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Min Shards", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 8, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "prometheus_remote_storage_shards_desired{cluster=~\"$cluster\", instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "refId": "A" + } + ], + 
"thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Desired Shards", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Shards", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 9, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], @@ -430,8 +752,7 @@ data: "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", - "legendLink": null, - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -439,7 +760,7 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Capacity", + "title": "Shard Capacity", "tooltip": { "shared": true, "sort": 0, @@ -461,7 +782,7 @@ data: "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { @@ -470,7 +791,98 @@ data: "logBase": 1, "max": null, "min": null, - "show": false + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": 
"$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 10, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "prometheus_remote_storage_pending_samples{cluster=~\"$cluster\", instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Pending Samples", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true } ] } @@ -479,12 +891,13 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Shards", - "titleSize": "h6" + "title": "Shard Details", + "titleSize": "h6", + "type": "row" }, { "collapse": false, - "height": "250px", + "collapsed": false, "panels": [ { "aliasColors": { @@ -495,12 +908,17 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 6, + "gridPos": { + + }, + "id": 11, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, + "rightSide": false, "show": true, "total": false, "values": false @@ -510,11 +928,207 @@ data: "links": [ ], - "nullPointMode": "null as zero", + 
"nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "prometheus_tsdb_wal_segment_current{cluster=~\"$cluster\", instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{cluster}}:{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "TSDB Current Segment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 12, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "prometheus_wal_watcher_current_segment{cluster=~\"$cluster\", instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Remote Write Current 
Segment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Segments", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 13, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], @@ -528,8 +1142,7 @@ data: "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", - "legendLink": null, - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -559,7 +1172,7 @@ data: "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { @@ -568,7 +1181,7 @@ data: "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] }, @@ -581,12 +1194,17 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 7, + "gridPos": { + + }, + "id": 14, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, + "rightSide": false, "show": true, "total": false, "values": false @@ -596,11 +1214,12 @@ data: "links": [ ], - 
"nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], @@ -614,8 +1233,7 @@ data: "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", - "legendLink": null, - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -645,7 +1263,7 @@ data: "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { @@ -654,7 +1272,7 @@ data: "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] }, @@ -667,12 +1285,17 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 8, + "gridPos": { + + }, + "id": 15, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, + "rightSide": false, "show": true, "total": false, "values": false @@ -682,11 +1305,12 @@ data: "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], @@ -700,8 +1324,7 @@ data: "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", - "legendLink": null, - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -731,7 +1354,7 @@ data: "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { @@ -740,7 +1363,7 @@ data: "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] }, @@ -753,12 +1376,17 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 9, + "gridPos": { + + }, + "id": 16, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, + "rightSide": false, "show": true, "total": false, "values": false @@ -768,11 +1396,12 @@ data: "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": 
"flot", + "repeat": null, "seriesOverrides": [ ], @@ -786,8 +1415,7 @@ data: "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", - "legendLink": null, - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -817,7 +1445,7 @@ data: "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { @@ -826,7 +1454,7 @@ data: "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] } @@ -835,8 +1463,9 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Misc Rates.", - "titleSize": "h6" + "title": "Misc. Rates", + "titleSize": "h6", + "type": "row" } ], "schemaVersion": 14, @@ -847,10 +1476,6 @@ data: "templating": { "list": [ { - "current": { - "text": "Prometheus", - "value": "Prometheus" - }, "hide": 0, "label": null, "name": "datasource", @@ -865,23 +1490,30 @@ data: { "allValue": null, "current": { - "selected": true, - "text": "All", - "value": "$__all" + "text": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "value": { + "selected": true, + "text": "All", + "value": "$__all" + } }, "datasource": "$datasource", "hide": 0, "includeAll": true, - "label": "instance", - "multi": true, + "label": null, + "multi": false, "name": "instance", "options": [ ], "query": "label_values(prometheus_build_info, instance)", - "refresh": 1, + "refresh": 2, "regex": "", - "sort": 2, + "sort": 0, "tagValuesQuery": "", "tags": [ @@ -893,23 +1525,56 @@ data: { "allValue": null, "current": { - "selected": true, - "text": "All", - "value": "$__all" + "text": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "value": { + "selected": true, + "text": "All", + "value": "$__all" + } }, "datasource": "$datasource", "hide": 0, "includeAll": true, - "label": "cluster", - "multi": true, + "label": null, + "multi": false, "name": "cluster", "options": [ ], "query": "label_values(kube_pod_container_info{image=~\".*prometheus.*\"}, cluster)", - 
"refresh": 1, + "refresh": 2, "regex": "", - "sort": 2, + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "queue", + "options": [ + + ], + "query": "label_values(prometheus_remote_storage_shards{cluster=~\"$cluster\", instance=~\"$instance\"}, queue)", + "refresh": 2, + "regex": "", + "sort": 0, "tagValuesQuery": "", "tags": [ @@ -921,7 +1586,7 @@ data: ] }, "time": { - "from": "now-1h", + "from": "now-6h", "to": "now" }, "timepicker": { @@ -949,9 +1614,8 @@ data: "30d" ] }, - "timezone": "utc", + "timezone": "browser", "title": "Prometheus Remote Write", - "uid": "", "version": 0 } prometheus.json: |- @@ -2048,8 +2712,8 @@ data: "list": [ { "current": { - "text": "Prometheus", - "value": "Prometheus" + "text": "default", + "value": "default" }, "hide": 0, "label": null, diff --git a/addons/prometheus/rules.yaml b/addons/prometheus/rules.yaml index 804f9e71..e1ffbc6f 100644 --- a/addons/prometheus/rules.yaml +++ b/addons/prometheus/rules.yaml @@ -42,10 +42,10 @@ data: { "alert": "etcdHighNumberOfLeaderChanges", "annotations": { - "message": "etcd cluster \"{{ $labels.job }}\": instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last 30 minutes." + "message": "etcd cluster \"{{ $labels.job }}\": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated." 
}, - "expr": "rate(etcd_server_leader_changes_seen_total{job=~\".*etcd.*\"}[15m]) > 3\n", - "for": "15m", + "expr": "increase((max by (job) (etcd_server_leader_changes_seen_total{job=~\".*etcd.*\"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~\".*etcd.*\"}))[15m:1m]) >= 3\n", + "for": "5m", "labels": { "severity": "warning" } @@ -145,25 +145,132 @@ data: kube.yaml: |- { "groups": [ + { + "name": "kube-apiserver-error", + "rules": [ + { + "expr": "sum by (status_class) (\n label_replace(\n rate(apiserver_request_total{job=\"apiserver\"}[5m]\n ), \"status_class\", \"${1}xx\", \"code\", \"([0-9])..\")\n)\n", + "labels": { + "job": "apiserver" + }, + "record": "status_class:apiserver_request_total:rate5m" + }, + { + "expr": "sum by (status_class) (\n label_replace(\n rate(apiserver_request_total{job=\"apiserver\"}[30m]\n ), \"status_class\", \"${1}xx\", \"code\", \"([0-9])..\")\n)\n", + "labels": { + "job": "apiserver" + }, + "record": "status_class:apiserver_request_total:rate30m" + }, + { + "expr": "sum by (status_class) (\n label_replace(\n rate(apiserver_request_total{job=\"apiserver\"}[1h]\n ), \"status_class\", \"${1}xx\", \"code\", \"([0-9])..\")\n)\n", + "labels": { + "job": "apiserver" + }, + "record": "status_class:apiserver_request_total:rate1h" + }, + { + "expr": "sum by (status_class) (\n label_replace(\n rate(apiserver_request_total{job=\"apiserver\"}[2h]\n ), \"status_class\", \"${1}xx\", \"code\", \"([0-9])..\")\n)\n", + "labels": { + "job": "apiserver" + }, + "record": "status_class:apiserver_request_total:rate2h" + }, + { + "expr": "sum by (status_class) (\n label_replace(\n rate(apiserver_request_total{job=\"apiserver\"}[6h]\n ), \"status_class\", \"${1}xx\", \"code\", \"([0-9])..\")\n)\n", + "labels": { + "job": "apiserver" + }, + "record": "status_class:apiserver_request_total:rate6h" + }, + { + "expr": "sum by (status_class) (\n label_replace(\n rate(apiserver_request_total{job=\"apiserver\"}[1d]\n ), \"status_class\", \"${1}xx\", 
\"code\", \"([0-9])..\")\n)\n", + "labels": { + "job": "apiserver" + }, + "record": "status_class:apiserver_request_total:rate1d" + }, + { + "expr": "sum by (status_class) (\n label_replace(\n rate(apiserver_request_total{job=\"apiserver\"}[3d]\n ), \"status_class\", \"${1}xx\", \"code\", \"([0-9])..\")\n)\n", + "labels": { + "job": "apiserver" + }, + "record": "status_class:apiserver_request_total:rate3d" + }, + { + "expr": "sum(status_class:apiserver_request_total:rate5m{job=\"apiserver\",status_class=\"5xx\"})\n/\nsum(status_class:apiserver_request_total:rate5m{job=\"apiserver\"})\n", + "labels": { + "job": "apiserver" + }, + "record": "status_class_5xx:apiserver_request_total:ratio_rate5m" + }, + { + "expr": "sum(status_class:apiserver_request_total:rate30m{job=\"apiserver\",status_class=\"5xx\"})\n/\nsum(status_class:apiserver_request_total:rate30m{job=\"apiserver\"})\n", + "labels": { + "job": "apiserver" + }, + "record": "status_class_5xx:apiserver_request_total:ratio_rate30m" + }, + { + "expr": "sum(status_class:apiserver_request_total:rate1h{job=\"apiserver\",status_class=\"5xx\"})\n/\nsum(status_class:apiserver_request_total:rate1h{job=\"apiserver\"})\n", + "labels": { + "job": "apiserver" + }, + "record": "status_class_5xx:apiserver_request_total:ratio_rate1h" + }, + { + "expr": "sum(status_class:apiserver_request_total:rate2h{job=\"apiserver\",status_class=\"5xx\"})\n/\nsum(status_class:apiserver_request_total:rate2h{job=\"apiserver\"})\n", + "labels": { + "job": "apiserver" + }, + "record": "status_class_5xx:apiserver_request_total:ratio_rate2h" + }, + { + "expr": "sum(status_class:apiserver_request_total:rate6h{job=\"apiserver\",status_class=\"5xx\"})\n/\nsum(status_class:apiserver_request_total:rate6h{job=\"apiserver\"})\n", + "labels": { + "job": "apiserver" + }, + "record": "status_class_5xx:apiserver_request_total:ratio_rate6h" + }, + { + "expr": 
"sum(status_class:apiserver_request_total:rate1d{job=\"apiserver\",status_class=\"5xx\"})\n/\nsum(status_class:apiserver_request_total:rate1d{job=\"apiserver\"})\n", + "labels": { + "job": "apiserver" + }, + "record": "status_class_5xx:apiserver_request_total:ratio_rate1d" + }, + { + "expr": "sum(status_class:apiserver_request_total:rate3d{job=\"apiserver\",status_class=\"5xx\"})\n/\nsum(status_class:apiserver_request_total:rate3d{job=\"apiserver\"})\n", + "labels": { + "job": "apiserver" + }, + "record": "status_class_5xx:apiserver_request_total:ratio_rate3d" + } + ] + }, { "name": "kube-apiserver.rules", "rules": [ { - "expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\"}[5m])) without(instance, pod))\n", + "expr": "sum(rate(apiserver_request_duration_seconds_sum{subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|PROXY|CONNECT\"}[5m])) without(instance, pod)\n/\nsum(rate(apiserver_request_duration_seconds_count{subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|PROXY|CONNECT\"}[5m])) without(instance, pod)\n", + "record": "cluster:apiserver_request_duration_seconds:mean5m" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|PROXY|CONNECT\"}[5m])) without(instance, pod))\n", "labels": { "quantile": "0.99" }, "record": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile" }, { - "expr": "histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\"}[5m])) without(instance, pod))\n", + "expr": "histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|PROXY|CONNECT\"}[5m])) without(instance, pod))\n", "labels": { "quantile": "0.9" }, "record": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile" }, { - "expr": "histogram_quantile(0.5, 
sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\"}[5m])) without(instance, pod))\n", + "expr": "histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|PROXY|CONNECT\"}[5m])) without(instance, pod))\n", "labels": { "quantile": "0.5" }, @@ -179,23 +286,23 @@ data: "record": "namespace:container_cpu_usage_seconds_total:sum_rate" }, { - "expr": "sum by (namespace, pod, container) (\n rate(container_cpu_usage_seconds_total{job=\"kubernetes-cadvisor\", image!=\"\", container!=\"POD\"}[5m])\n) * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)\n", + "expr": "sum by (cluster, namespace, pod, container) (\n rate(container_cpu_usage_seconds_total{job=\"kubernetes-cadvisor\", image!=\"\", container!=\"POD\"}[5m])\n) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (\n 1, max by(cluster, namespace, pod, node) (kube_pod_info)\n)\n", "record": "node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate" }, { - "expr": "container_memory_working_set_bytes{job=\"kubernetes-cadvisor\", image!=\"\"}\n* on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)\n", + "expr": "container_memory_working_set_bytes{job=\"kubernetes-cadvisor\", image!=\"\"}\n* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,\n max by(namespace, pod, node) (kube_pod_info)\n)\n", "record": "node_namespace_pod_container:container_memory_working_set_bytes" }, { - "expr": "container_memory_rss{job=\"kubernetes-cadvisor\", image!=\"\"}\n* on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)\n", + "expr": "container_memory_rss{job=\"kubernetes-cadvisor\", image!=\"\"}\n* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,\n max by(namespace, pod, node) (kube_pod_info)\n)\n", "record": "node_namespace_pod_container:container_memory_rss" }, { - "expr": 
"container_memory_cache{job=\"kubernetes-cadvisor\", image!=\"\"}\n* on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)\n", + "expr": "container_memory_cache{job=\"kubernetes-cadvisor\", image!=\"\"}\n* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,\n max by(namespace, pod, node) (kube_pod_info)\n)\n", "record": "node_namespace_pod_container:container_memory_cache" }, { - "expr": "container_memory_swap{job=\"kubernetes-cadvisor\", image!=\"\"}\n* on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)\n", + "expr": "container_memory_swap{job=\"kubernetes-cadvisor\", image!=\"\"}\n* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,\n max by(namespace, pod, node) (kube_pod_info)\n)\n", "record": "node_namespace_pod_container:container_memory_swap" }, { @@ -203,29 +310,29 @@ data: "record": "namespace:container_memory_usage_bytes:sum" }, { - "expr": "sum by (namespace, label_name) (\n sum(kube_pod_container_resource_requests_memory_bytes{job=\"kube-state-metrics\"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)) by (namespace, pod)\n * on (namespace, pod)\n group_left(label_name) kube_pod_labels{job=\"kube-state-metrics\"}\n)\n", + "expr": "sum by (namespace) (\n sum by (namespace, pod) (\n max by (namespace, pod, container) (\n kube_pod_container_resource_requests_memory_bytes{job=\"kube-state-metrics\"}\n ) * on(namespace, pod) group_left() max by (namespace, pod) (\n kube_pod_status_phase{phase=~\"Pending|Running\"} == 1\n )\n )\n)\n", "record": "namespace:kube_pod_container_resource_requests_memory_bytes:sum" }, { - "expr": "sum by (namespace, label_name) (\n sum(kube_pod_container_resource_requests_cpu_cores{job=\"kube-state-metrics\"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)) by (namespace, pod)\n * on 
(namespace, pod)\n group_left(label_name) kube_pod_labels{job=\"kube-state-metrics\"}\n)\n", + "expr": "sum by (namespace) (\n sum by (namespace, pod) (\n max by (namespace, pod, container) (\n kube_pod_container_resource_requests_cpu_cores{job=\"kube-state-metrics\"}\n ) * on(namespace, pod) group_left() max by (namespace, pod) (\n kube_pod_status_phase{phase=~\"Pending|Running\"} == 1\n )\n )\n)\n", "record": "namespace:kube_pod_container_resource_requests_cpu_cores:sum" }, { - "expr": "sum(\n label_replace(\n label_replace(\n kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"ReplicaSet\"},\n \"replicaset\", \"$1\", \"owner_name\", \"(.*)\"\n ) * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{job=\"kube-state-metrics\"},\n \"workload\", \"$1\", \"owner_name\", \"(.*)\"\n )\n) by (namespace, workload, pod)\n", + "expr": "max by (cluster, namespace, workload, pod) (\n label_replace(\n label_replace(\n kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"ReplicaSet\"},\n \"replicaset\", \"$1\", \"owner_name\", \"(.*)\"\n ) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) (\n 1, max by (replicaset, namespace, owner_name) (\n kube_replicaset_owner{job=\"kube-state-metrics\"}\n )\n ),\n \"workload\", \"$1\", \"owner_name\", \"(.*)\"\n )\n)\n", "labels": { "workload_type": "deployment" }, "record": "mixin_pod_workload" }, { - "expr": "sum(\n label_replace(\n kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"DaemonSet\"},\n \"workload\", \"$1\", \"owner_name\", \"(.*)\"\n )\n) by (namespace, workload, pod)\n", + "expr": "max by (cluster, namespace, workload, pod) (\n label_replace(\n kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"DaemonSet\"},\n \"workload\", \"$1\", \"owner_name\", \"(.*)\"\n )\n)\n", "labels": { "workload_type": "daemonset" }, "record": "mixin_pod_workload" }, { - "expr": "sum(\n label_replace(\n kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"StatefulSet\"},\n 
\"workload\", \"$1\", \"owner_name\", \"(.*)\"\n )\n) by (namespace, workload, pod)\n", + "expr": "max by (cluster, namespace, workload, pod) (\n label_replace(\n kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"StatefulSet\"},\n \"workload\", \"$1\", \"owner_name\", \"(.*)\"\n )\n)\n", "labels": { "workload_type": "statefulset" }, @@ -305,23 +412,49 @@ data: "name": "node.rules", "rules": [ { - "expr": "sum(min(kube_pod_info) by (node))", + "expr": "sum(min(kube_pod_info) by (cluster, node))\n", "record": ":kube_pod_info_node_count:" }, { - "expr": "max(label_replace(kube_pod_info{job=\"kube-state-metrics\"}, \"pod\", \"$1\", \"pod\", \"(.*)\")) by (node, namespace, pod)\n", + "expr": "topk by(namespace, pod) (1,\n max by (node, namespace, pod) (\n label_replace(kube_pod_info{job=\"kube-state-metrics\"}, \"pod\", \"$1\", \"pod\", \"(.*)\")\n))\n", "record": "node_namespace_pod:kube_pod_info:" }, { - "expr": "count by (node) (sum by (node, cpu) (\n node_cpu_seconds_total{job=\"node-exporter\"}\n* on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n))\n", + "expr": "count by (cluster, node) (sum by (node, cpu) (\n node_cpu_seconds_total{job=\"node-exporter\"}\n* on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n))\n", "record": "node:node_num_cpu:sum" }, { - "expr": "sum(\n node_memory_MemAvailable_bytes{job=\"node-exporter\"} or\n (\n node_memory_Buffers_bytes{job=\"node-exporter\"} +\n node_memory_Cached_bytes{job=\"node-exporter\"} +\n node_memory_MemFree_bytes{job=\"node-exporter\"} +\n node_memory_Slab_bytes{job=\"node-exporter\"}\n )\n)\n", + "expr": "sum(\n node_memory_MemAvailable_bytes{job=\"node-exporter\"} or\n (\n node_memory_Buffers_bytes{job=\"node-exporter\"} +\n node_memory_Cached_bytes{job=\"node-exporter\"} +\n node_memory_MemFree_bytes{job=\"node-exporter\"} +\n node_memory_Slab_bytes{job=\"node-exporter\"}\n )\n) by (cluster)\n", "record": ":node_memory_MemAvailable_bytes:sum" } ] }, + { + 
"name": "kubelet.rules", + "rules": [ + { + "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job=\"kubelet\"})\n", + "labels": { + "quantile": "0.99" + }, + "record": "node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile" + }, + { + "expr": "histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job=\"kubelet\"})\n", + "labels": { + "quantile": "0.9" + }, + "record": "node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile" + }, + { + "expr": "histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job=\"kubelet\"})\n", + "labels": { + "quantile": "0.5" + }, + "record": "node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile" + } + ] + }, { "name": "kubernetes-apps", "rules": [ @@ -343,7 +476,7 @@ data: "message": "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes.", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready" }, - "expr": "sum by (namespace, pod) (kube_pod_status_phase{job=\"kube-state-metrics\", phase=~\"Failed|Pending|Unknown\"} * on(namespace, pod) group_left(owner_kind) kube_pod_owner{owner_kind!=\"Job\"}) > 0\n", + "expr": "sum by (namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{job=\"kube-state-metrics\", phase=~\"Pending|Unknown\"}) * on(namespace, pod) group_left(owner_kind) max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!=\"Job\"})) > 0\n", "for": "15m", "labels": { "severity": "critical" @@ -367,7 +500,7 @@ data: "message": "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 
minutes.", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch" }, - "expr": "kube_deployment_spec_replicas{job=\"kube-state-metrics\"}\n !=\nkube_deployment_status_replicas_available{job=\"kube-state-metrics\"}\n", + "expr": "(\n kube_deployment_spec_replicas{job=\"kube-state-metrics\"}\n !=\n kube_deployment_status_replicas_available{job=\"kube-state-metrics\"}\n) and (\n changes(kube_deployment_status_replicas_updated{job=\"kube-state-metrics\"}[5m])\n ==\n 0\n)\n", "for": "15m", "labels": { "severity": "critical" @@ -379,7 +512,7 @@ data: "message": "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch" }, - "expr": "kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\"}\n !=\nkube_statefulset_status_replicas{job=\"kube-state-metrics\"}\n", + "expr": "(\n kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\"}\n !=\n kube_statefulset_status_replicas{job=\"kube-state-metrics\"}\n) and (\n changes(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\"}[5m])\n ==\n 0\n)\n", "for": "15m", "labels": { "severity": "critical" @@ -528,7 +661,7 @@ data: "message": "Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure.", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit" }, - "expr": "sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum)\n /\nsum(kube_node_status_allocatable_cpu_cores)\n >\n(count(kube_node_status_allocatable_cpu_cores)-1) / count(kube_node_status_allocatable_cpu_cores)\n", + "expr": "sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum{})\n 
/\nsum(kube_node_status_allocatable_cpu_cores)\n >\n(count(kube_node_status_allocatable_cpu_cores)-1) / count(kube_node_status_allocatable_cpu_cores)\n", "for": "5m", "labels": { "severity": "warning" @@ -540,7 +673,7 @@ data: "message": "Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure.", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit" }, - "expr": "sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum)\n /\nsum(kube_node_status_allocatable_memory_bytes)\n >\n(count(kube_node_status_allocatable_memory_bytes)-1)\n /\ncount(kube_node_status_allocatable_memory_bytes)\n", + "expr": "sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{})\n /\nsum(kube_node_status_allocatable_memory_bytes)\n >\n(count(kube_node_status_allocatable_memory_bytes)-1)\n /\ncount(kube_node_status_allocatable_memory_bytes)\n", "for": "5m", "labels": { "severity": "warning" @@ -618,7 +751,7 @@ data: "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays" }, "expr": "(\n kubelet_volume_stats_available_bytes{job=\"kubelet\"}\n /\n kubelet_volume_stats_capacity_bytes{job=\"kubelet\"}\n) < 0.15\nand\npredict_linear(kubelet_volume_stats_available_bytes{job=\"kubelet\"}[6h], 4 * 24 * 3600) < 0\n", - "for": "5m", + "for": "1h", "labels": { "severity": "critical" } @@ -666,17 +799,44 @@ data: } ] }, + { + "name": "kube-apiserver-error-alerts", + "rules": [ + { + "alert": "ErrorBudgetBurn", + "annotations": { + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-errorbudgetburn" + }, + "expr": "(\n status_class_5xx:apiserver_request_total:ratio_rate1h{job=\"apiserver\"} > (14.4*0.010000)\n and\n status_class_5xx:apiserver_request_total:ratio_rate5m{job=\"apiserver\"} > (14.4*0.010000)\n)\nor\n(\n 
status_class_5xx:apiserver_request_total:ratio_rate6h{job=\"apiserver\"} > (6*0.010000)\n and\n status_class_5xx:apiserver_request_total:ratio_rate30m{job=\"apiserver\"} > (6*0.010000)\n)\n", + "labels": { + "job": "apiserver", + "severity": "critical" + } + }, + { + "alert": "ErrorBudgetBurn", + "annotations": { + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-errorbudgetburn" + }, + "expr": "(\n status_class_5xx:apiserver_request_total:ratio_rate1d{job=\"apiserver\"} > (3*0.010000)\n and\n status_class_5xx:apiserver_request_total:ratio_rate2h{job=\"apiserver\"} > (3*0.010000)\n)\nor\n(\n status_class_5xx:apiserver_request_total:ratio_rate3d{job=\"apiserver\"} > (0.010000)\n and\n status_class_5xx:apiserver_request_total:ratio_rate6h{job=\"apiserver\"} > (0.010000)\n)\n", + "labels": { + "job": "apiserver", + "severity": "warning" + } + } + ] + }, { "name": "kubernetes-system-apiserver", "rules": [ { "alert": "KubeAPILatencyHigh", "annotations": { - "message": "The API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.", + "message": "The API server has an abnormal latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh" }, - "expr": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job=\"apiserver\",quantile=\"0.99\",subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|PROXY|CONNECT\"} > 1\n", - "for": "10m", + "expr": "(\n cluster:apiserver_request_duration_seconds:mean5m{job=\"apiserver\"}\n >\n on (verb) group_left()\n (\n avg by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job=\"apiserver\"} >= 0)\n +\n 2*stddev by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job=\"apiserver\"} >= 0)\n )\n) > on (verb) group_left()\n1.2 * avg by (verb) 
(cluster:apiserver_request_duration_seconds:mean5m{job=\"apiserver\"} >= 0)\nand on (verb,resource)\ncluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job=\"apiserver\",quantile=\"0.99\"}\n>\n1\n", + "for": "5m", "labels": { "severity": "warning" } @@ -687,7 +847,7 @@ data: "message": "The API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh" }, - "expr": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job=\"apiserver\",quantile=\"0.99\",subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|PROXY|CONNECT\"} > 4\n", + "expr": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job=\"apiserver\",quantile=\"0.99\"} > 4\n", "for": "10m", "labels": { "severity": "critical" @@ -747,7 +907,7 @@ data: "message": "A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days.", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration" }, - "expr": "apiserver_client_certificate_expiration_seconds_count{job=\"apiserver\"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) < 604800\n", + "expr": "apiserver_client_certificate_expiration_seconds_count{job=\"apiserver\"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) < 604800\n", "labels": { "severity": "warning" } @@ -758,11 +918,34 @@ data: "message": "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration" }, - 
"expr": "apiserver_client_certificate_expiration_seconds_count{job=\"apiserver\"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) < 86400\n", + "expr": "apiserver_client_certificate_expiration_seconds_count{job=\"apiserver\"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) < 86400\n", "labels": { "severity": "critical" } }, + { + "alert": "AggregatedAPIErrors", + "annotations": { + "message": "An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. The number of errors have increased for it in the past five minutes. High values indicate that the availability of the service changes too often.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors" + }, + "expr": "sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2\n", + "labels": { + "severity": "warning" + } + }, + { + "alert": "AggregatedAPIDown", + "annotations": { + "message": "An aggregated API {{ $labels.name }}/{{ $labels.namespace }} is down. 
It has not been available at least for the past five minutes.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown" + }, + "expr": "sum by(name, namespace)(sum_over_time(aggregator_unavailable_apiservice[5m])) > 0\n", + "for": "5m", + "labels": { + "severity": "warning" + } + }, { "alert": "KubeAPIDown", "annotations": { @@ -799,6 +982,7 @@ data: "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeunreachable" }, "expr": "kube_node_spec_taint{job=\"kube-state-metrics\",key=\"node.kubernetes.io/unreachable\",effect=\"NoSchedule\"} == 1\n", + "for": "2m", "labels": { "severity": "warning" } @@ -815,6 +999,42 @@ data: "severity": "warning" } }, + { + "alert": "KubeNodeReadinessFlapping", + "annotations": { + "message": "The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping" + }, + "expr": "sum(changes(kube_node_status_condition{status=\"true\",condition=\"Ready\"}[15m])) by (node) > 2\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeletPlegDurationHigh", + "annotations": { + "message": "The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh" + }, + "expr": "node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile=\"0.99\"} >= 10\n", + "for": "5m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeletPodStartUpLatencyHigh", + "annotations": { + "message": "Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.", + "runbook_url": 
"https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh" + }, + "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job=\"kubelet\"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name > 5\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, { "alert": "KubeletDown", "annotations": { @@ -1124,7 +1344,7 @@ data: { "alert": "PrometheusRemoteStorageFailures", "annotations": { - "description": "Prometheus {{$labels.instance}} failed to send {{ printf \"%.1f\" $value }}% of the samples to queue {{$labels.queue}}.", + "description": "Prometheus {{$labels.instance}} failed to send {{ printf \"%.1f\" $value }}% of the samples to {{ if $labels.queue }}{{ $labels.queue }}{{ else }}{{ $labels.url }}{{ end }}.", "summary": "Prometheus fails to send samples to remote storage." }, "expr": "(\n rate(prometheus_remote_storage_failed_samples_total{job=\"prometheus\"}[5m])\n/\n (\n rate(prometheus_remote_storage_failed_samples_total{job=\"prometheus\"}[5m])\n +\n rate(prometheus_remote_storage_succeeded_samples_total{job=\"prometheus\"}[5m])\n )\n)\n* 100\n> 1\n", @@ -1136,7 +1356,7 @@ data: { "alert": "PrometheusRemoteWriteBehind", "annotations": { - "description": "Prometheus {{$labels.instance}} remote write is {{ printf \"%.1f\" $value }}s behind for queue {{$labels.queue}}.", + "description": "Prometheus {{$labels.instance}} remote write is {{ printf \"%.1f\" $value }}s behind for {{ if $labels.queue }}{{ $labels.queue }}{{ else }}{{ $labels.url }}{{ end }}.", "summary": "Prometheus remote write is behind." 
}, "expr": "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n(\n max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job=\"prometheus\"}[5m])\n- on(job, instance) group_right\n max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job=\"prometheus\"}[5m])\n)\n> 120\n", @@ -1148,10 +1368,10 @@ data: { "alert": "PrometheusRemoteWriteDesiredShards", "annotations": { - "description": "Prometheus {{$labels.instance}} remote write desired shards calculation wants to run {{ printf $value }} shards, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance=\"%s\",job=\"prometheus\"}` $labels.instance | query | first | value }}.", + "description": "Prometheus {{$labels.instance}} remote write desired shards calculation wants to run {{ $value }} shards, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance=\"%s\",job=\"prometheus\"}` $labels.instance | query | first | value }}.", "summary": "Prometheus remote write desired shards calculation wants to run more than configured max shards." 
}, - "expr": "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n(\n max_over_time(prometheus_remote_storage_shards_desired{job=\"prometheus\"}[5m])\n> on(job, instance) group_right\n max_over_time(prometheus_remote_storage_shards_max{job=\"prometheus\"}[5m])\n)\n", + "expr": "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n(\n max_over_time(prometheus_remote_storage_shards_desired{job=\"prometheus\"}[5m])\n>\n max_over_time(prometheus_remote_storage_shards_max{job=\"prometheus\"}[5m])\n)\n", "for": "15m", "labels": { "severity": "warning" @@ -1201,6 +1421,17 @@ data: "labels": { "severity": "warning" } + }, + { + "alert": "BlackboxProbeFailure", + "annotations": { + "message": "Blackbox probe {{$labels.instance}} failed" + }, + "expr": "probe_success == 0", + "for": "2m", + "labels": { + "severity": "critical" + } } ] },