From e838d4dc3d4b80fe22cf57da2971d47c1b1418ad Mon Sep 17 00:00:00 2001 From: Dalton Hubble Date: Sun, 13 Sep 2020 14:42:07 -0700 Subject: [PATCH] Refresh Prometheus rules/alerts and Grafana dashboards * Refresh upstream Prometheus rules/alerts and Grafana dashboards --- CHANGES.md | 1 + addons/grafana/dashboards-coredns.yaml | 9 + addons/grafana/dashboards-etcd.yaml | 3 +- addons/grafana/dashboards-k8s-nodes.yaml | 38 +- .../grafana/dashboards-k8s-resources-1.yaml | 148 +- .../grafana/dashboards-k8s-resources-2.yaml | 200 +- addons/grafana/dashboards-k8s.yaml | 2052 ++++++++++++++--- addons/grafana/dashboards-nginx-ingress.yaml | 6 + addons/grafana/dashboards-node-exporter.yaml | 7 + addons/grafana/dashboards-prom.yaml | 60 +- addons/prometheus/rules.yaml | 701 ++++-- 11 files changed, 2476 insertions(+), 749 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index eadf7bef..3c66f928 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -6,6 +6,7 @@ Notable changes between versions. ### Addons +* Refresh Prometheus rules/alerts and Grafana dashboards ([#831](https://github.com/poseidon/typhoon/pull/831)) * Reduce apiserver metrics cardinality for non-core APIs ([#830](https://github.com/poseidon/typhoon/pull/830)) ## v1.19.1 diff --git a/addons/grafana/dashboards-coredns.yaml b/addons/grafana/dashboards-coredns.yaml index 8bd507aa..60994376 100644 --- a/addons/grafana/dashboards-coredns.yaml +++ b/addons/grafana/dashboards-coredns.yaml @@ -49,6 +49,7 @@ data: "min": false, "rightSide": false, "show": "true", + "sideWidth": null, "total": false, "values": "true" }, @@ -140,6 +141,7 @@ data: "min": false, "rightSide": false, "show": "true", + "sideWidth": null, "total": false, "values": "true" }, @@ -231,6 +233,7 @@ data: "min": false, "rightSide": false, "show": "true", + "sideWidth": null, "total": false, "values": "true" }, @@ -335,6 +338,7 @@ data: "min": false, "rightSide": false, "show": "true", + "sideWidth": null, "total": false, "values": false }, @@ -440,6 +444,7 @@ 
data: "min": false, "rightSide": false, "show": "true", + "sideWidth": null, "total": false, "values": false }, @@ -544,6 +549,7 @@ data: "min": false, "rightSide": false, "show": "true", + "sideWidth": null, "total": false, "values": false }, @@ -649,6 +655,7 @@ data: "min": false, "rightSide": false, "show": "true", + "sideWidth": null, "total": false, "values": false }, @@ -767,6 +774,7 @@ data: "min": false, "rightSide": false, "show": "true", + "sideWidth": null, "total": false, "values": false }, @@ -858,6 +866,7 @@ data: "min": false, "rightSide": false, "show": "true", + "sideWidth": null, "total": false, "values": false }, diff --git a/addons/grafana/dashboards-etcd.yaml b/addons/grafana/dashboards-etcd.yaml index bf219e71..6fa287eb 100644 --- a/addons/grafana/dashboards-etcd.yaml +++ b/addons/grafana/dashboards-etcd.yaml @@ -11,7 +11,6 @@ data: "editable": true, "gnetId": null, "hideControls": false, - "id": 6, "links": [ ], @@ -343,7 +342,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "etcd_debugging_mvcc_db_total_size_in_bytes{job=\"$cluster\"}", + "expr": "etcd_mvcc_db_total_size_in_bytes{job=\"$cluster\"}", "hide": false, "interval": "", "intervalFactor": 2, diff --git a/addons/grafana/dashboards-k8s-nodes.yaml b/addons/grafana/dashboards-k8s-nodes.yaml index 9a67f088..ed86e719 100644 --- a/addons/grafana/dashboards-k8s-nodes.yaml +++ b/addons/grafana/dashboards-k8s-nodes.yaml @@ -565,6 +565,7 @@ data: "min": false, "rightSide": true, "show": true, + "sideWidth": null, "total": false, "values": true }, @@ -656,6 +657,7 @@ data: "min": false, "rightSide": true, "show": true, + "sideWidth": null, "total": false, "values": true }, @@ -760,6 +762,7 @@ data: "min": false, "rightSide": true, "show": true, + "sideWidth": null, "total": false, "values": true }, @@ -864,6 +867,7 @@ data: "min": false, "rightSide": true, "show": true, + "sideWidth": null, "total": false, "values": true }, @@ -962,6 +966,7 @@ data: "min": false, "rightSide": true, 
"show": true, + "sideWidth": null, "total": false, "values": true }, @@ -1075,6 +1080,7 @@ data: "min": false, "rightSide": true, "show": true, + "sideWidth": null, "total": false, "values": true }, @@ -1168,6 +1174,7 @@ data: "min": false, "rightSide": true, "show": true, + "sideWidth": null, "total": false, "values": true }, @@ -1274,6 +1281,7 @@ data: "min": false, "rightSide": true, "show": true, + "sideWidth": null, "total": false, "values": true }, @@ -1378,6 +1386,7 @@ data: "min": false, "rightSide": true, "show": true, + "sideWidth": null, "total": false, "values": true }, @@ -1469,6 +1478,7 @@ data: "min": false, "rightSide": true, "show": true, + "sideWidth": null, "total": false, "values": true }, @@ -1574,6 +1584,7 @@ data: "min": false, "rightSide": true, "show": true, + "sideWidth": null, "total": false, "values": true }, @@ -1665,6 +1676,7 @@ data: "min": false, "rightSide": true, "show": true, + "sideWidth": null, "total": false, "values": true }, @@ -1769,6 +1781,7 @@ data: "min": false, "rightSide": true, "show": true, + "sideWidth": null, "total": false, "values": true }, @@ -1873,6 +1886,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -1998,6 +2012,7 @@ data: "min": false, "rightSide": true, "show": true, + "sideWidth": null, "total": false, "values": true }, @@ -2021,7 +2036,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_latency_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", instance=~\"$instance\"}[5m])) by (instance, verb, url, le))", + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", instance=~\"$instance\"}[5m])) by (instance, verb, url, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} {{verb}} {{url}}", @@ -2102,6 +2117,7 @@ data: "min": false, "rightSide": false, "show": true, + 
"sideWidth": null, "total": false, "values": false }, @@ -2193,6 +2209,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -2284,6 +2301,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -2470,7 +2488,7 @@ data: "30d" ] }, - "timezone": "", + "timezone": "UTC", "title": "Kubernetes / Kubelet", "uid": "3138fa155d5915769fbded898ac09fd9", "version": 0 @@ -2607,6 +2625,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -2698,6 +2717,7 @@ data: "min": false, "rightSide": true, "show": true, + "sideWidth": null, "total": false, "values": true }, @@ -2802,6 +2822,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -2893,6 +2914,7 @@ data: "min": false, "rightSide": true, "show": true, + "sideWidth": null, "total": false, "values": true }, @@ -2997,6 +3019,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -3109,6 +3132,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -3132,7 +3156,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_latency_seconds_bucket{job=\"kube-proxy\",instance=~\"$instance\",verb=\"POST\"}[5m])) by (verb, url, le))", + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{job=\"kube-proxy\",instance=~\"$instance\",verb=\"POST\"}[5m])) by (verb, url, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{verb}} {{url}}", @@ -3213,6 +3237,7 @@ data: "min": false, "rightSide": true, "show": true, + "sideWidth": null, "total": false, "values": true }, @@ -3236,7 +3261,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, 
sum(rate(rest_client_request_latency_seconds_bucket{job=\"kube-proxy\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))", + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{job=\"kube-proxy\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{verb}} {{url}}", @@ -3317,6 +3342,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -3408,6 +3434,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -3499,6 +3526,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -3659,7 +3687,7 @@ data: "30d" ] }, - "timezone": "", + "timezone": "UTC", "title": "Kubernetes / Proxy", "uid": "632e265de029684c40b21cb76bca4f94", "version": 0 diff --git a/addons/grafana/dashboards-k8s-resources-1.yaml b/addons/grafana/dashboards-k8s-resources-1.yaml index 70f20724..d88629a9 100644 --- a/addons/grafana/dashboards-k8s-resources-1.yaml +++ b/addons/grafana/dashboards-k8s-resources-1.yaml @@ -31,6 +31,7 @@ data: "fill": 1, "format": "percentunit", "id": 1, + "interval": "1m", "legend": { "avg": false, "current": false, @@ -686,6 +687,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 0, "link": true, + "linkTargetBlank": false, "linkTooltip": "Drill down to pods", "linkUrl": "./d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1", "pattern": "Value #A", @@ -704,6 +706,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 0, "link": true, + "linkTargetBlank": false, "linkTooltip": "Drill down to workloads", "linkUrl": "./d/a87fb0d919ec0ea5f6543124e16c42a5/k8s-resources-workloads-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1", "pattern": 
"Value #B", @@ -722,6 +725,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #C", @@ -740,6 +744,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #D", @@ -758,6 +763,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #E", @@ -776,6 +782,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #F", @@ -794,6 +801,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #G", @@ -812,6 +820,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": true, + "linkTargetBlank": false, "linkTooltip": "Drill down to pods", "linkUrl": "./d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell", "pattern": "namespace", @@ -839,7 +848,7 @@ data: ], "targets": [ { - "expr": "count(mixin_pod_workload{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(kube_pod_owner{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -848,7 +857,7 @@ data: "step": 10 }, { - "expr": "count(avg(mixin_pod_workload{cluster=\"$cluster\"}) by (workload, namespace)) by (namespace)", + "expr": "count(avg(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\"}) by (workload, namespace)) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -1105,6 +1114,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 0, "link": true, + "linkTargetBlank": false, "linkTooltip": 
"Drill down to pods", "linkUrl": "./d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1", "pattern": "Value #A", @@ -1123,6 +1133,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 0, "link": true, + "linkTargetBlank": false, "linkTooltip": "Drill down to workloads", "linkUrl": "./d/a87fb0d919ec0ea5f6543124e16c42a5/k8s-resources-workloads-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1", "pattern": "Value #B", @@ -1141,6 +1152,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #C", @@ -1159,6 +1171,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #D", @@ -1177,6 +1190,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #E", @@ -1195,6 +1209,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #F", @@ -1213,6 +1228,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #G", @@ -1231,6 +1247,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": true, + "linkTargetBlank": false, "linkTooltip": "Drill down to pods", "linkUrl": "./d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell", "pattern": "namespace", @@ -1258,7 +1275,7 @@ data: ], "targets": [ { - "expr": "count(mixin_pod_workload{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(kube_pod_owner{cluster=\"$cluster\"}) by 
(namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -1267,7 +1284,7 @@ data: "step": 10 }, { - "expr": "count(avg(mixin_pod_workload{cluster=\"$cluster\"}) by (workload, namespace)) by (namespace)", + "expr": "count(avg(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\"}) by (workload, namespace)) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -1384,6 +1401,7 @@ data: "datasource": "$datasource", "fill": 1, "id": 11, + "interval": "1m", "legend": { "avg": false, "current": false, @@ -1426,6 +1444,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #A", @@ -1444,6 +1463,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #B", @@ -1462,6 +1482,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #C", @@ -1480,6 +1501,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #D", @@ -1498,6 +1520,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #E", @@ -1516,6 +1539,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #F", @@ -1534,6 +1558,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": true, + "linkTargetBlank": false, "linkTooltip": "Drill down to pods", "linkUrl": "./d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell", 
"pattern": "namespace", @@ -2472,33 +2497,6 @@ data: "regex": "", "type": "datasource" }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 2, - "includeAll": false, - "label": "cluster", - "multi": false, - "name": "cluster", - "options": [ - - ], - "query": "label_values(node_cpu_seconds_total, cluster)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, { "allValue": null, "current": { @@ -2557,7 +2555,7 @@ data: "30d" ] }, - "timezone": "", + "timezone": "UTC", "title": "Kubernetes / Compute Resources / Cluster", "uid": "efa86fd1d0c121a26444b636a3f509a8", "version": 0 @@ -2789,7 +2787,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\"}) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\"})", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -2873,7 +2871,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\"}) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\"})", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -3115,6 +3113,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", 
"pattern": "Value #A", @@ -3133,6 +3132,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #B", @@ -3151,6 +3151,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #C", @@ -3169,6 +3170,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #D", @@ -3187,6 +3189,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #E", @@ -3205,6 +3208,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": true, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", "pattern": "pod", @@ -3387,7 +3391,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\"}) by (pod)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -3515,6 +3519,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #A", @@ -3533,6 +3538,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #B", @@ -3551,6 +3557,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + 
"linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #C", @@ -3569,6 +3576,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #D", @@ -3587,6 +3595,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #E", @@ -3605,6 +3614,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #F", @@ -3623,6 +3633,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #G", @@ -3641,6 +3652,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #H", @@ -3659,6 +3671,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": true, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", "pattern": "pod", @@ -3686,7 +3699,7 @@ data: ], "targets": [ { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\"}) by (pod)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3704,7 +3717,7 @@ data: "step": 10 }, { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\"}) by (pod) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\"}) by (pod)", 
+ "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) by (pod) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3722,7 +3735,7 @@ data: "step": 10 }, { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\"}) by (pod) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\"}) by (pod)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) by (pod) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3821,6 +3834,7 @@ data: "datasource": "$datasource", "fill": 1, "id": 9, + "interval": "1m", "legend": { "avg": false, "current": false, @@ -3863,6 +3877,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #A", @@ -3881,6 +3896,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #B", @@ -3899,6 +3915,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #C", @@ -3917,6 +3934,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #D", @@ -3935,6 +3953,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #E", @@ -3953,6 +3972,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", 
"decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #F", @@ -3971,6 +3991,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": true, + "linkTargetBlank": false, "linkTooltip": "Drill down to pods", "linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", "pattern": "pod", @@ -4798,7 +4819,7 @@ data: "30d" ] }, - "timezone": "", + "timezone": "UTC", "title": "Kubernetes / Compute Resources / Namespace (Pods)", "uid": "85a562078cdf77779eaa1add43ccec1e", "version": 0 @@ -4861,7 +4882,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=\"$node\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -4973,6 +4994,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #A", @@ -4991,6 +5013,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #B", @@ -5009,6 +5032,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #C", @@ -5027,6 +5051,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #D", @@ -5045,6 +5070,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", 
"pattern": "Value #E", @@ -5063,6 +5089,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "pod", @@ -5090,7 +5117,7 @@ data: ], "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=\"$node\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5099,7 +5126,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", node=\"$node\"}) by (pod)", + "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5108,7 +5135,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=\"$node\"}) by (pod) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", node=\"$node\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5117,7 +5144,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", node=\"$node\"}) by (pod)", + "expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5126,7 +5153,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=\"$node\"}) by (pod) / 
sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", node=\"$node\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5226,7 +5253,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=\"$node\", container!=\"\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\", container!=\"\"}) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -5338,6 +5365,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #A", @@ -5356,6 +5384,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #B", @@ -5374,6 +5403,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #C", @@ -5392,6 +5422,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #D", @@ -5410,6 +5441,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #E", @@ -5428,6 +5460,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #F", @@ -5446,6 +5479,7 @@ data: 
"dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #G", @@ -5464,6 +5498,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #H", @@ -5482,6 +5517,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "pod", @@ -5509,7 +5545,7 @@ data: ], "targets": [ { - "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=\"$node\",container!=\"\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5518,7 +5554,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", node=\"$node\"}) by (pod)", + "expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5527,7 +5563,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=\"$node\",container!=\"\"}) by (pod) / sum(kube_pod_container_resource_requests_memory_bytes{node=\"$node\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod) / sum(kube_pod_container_resource_requests_memory_bytes{node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5536,7 +5572,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", node=\"$node\"}) by (pod)", + "expr": 
"sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5545,7 +5581,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=\"$node\",container!=\"\"}) by (pod) / sum(kube_pod_container_resource_limits_memory_bytes{node=\"$node\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod) / sum(kube_pod_container_resource_limits_memory_bytes{node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5554,7 +5590,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_memory_rss{cluster=\"$cluster\", node=\"$node\",container!=\"\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_memory_rss{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5563,7 +5599,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_memory_cache{cluster=\"$cluster\", node=\"$node\",container!=\"\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_memory_cache{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5572,7 +5608,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_memory_swap{cluster=\"$cluster\", node=\"$node\",container!=\"\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_memory_swap{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5691,7 +5727,7 @@ data: "hide": 0, "includeAll": false, "label": null, - "multi": false, + "multi": true, "name": "node", "options": [ @@ -5739,7 +5775,7 @@ data: "30d" ] }, - 
"timezone": "", + "timezone": "UTC", "title": "Kubernetes / Compute Resources / Node (Pods)", "uid": "200ac8fdbfbb74b39aff88118e4d1c2c", "version": 0 diff --git a/addons/grafana/dashboards-k8s-resources-2.yaml b/addons/grafana/dashboards-k8s-resources-2.yaml index efe2c00b..99200bed 100644 --- a/addons/grafana/dashboards-k8s-resources-2.yaml +++ b/addons/grafana/dashboards-k8s-resources-2.yaml @@ -189,7 +189,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(increase(container_cpu_cfs_throttled_periods_total{namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", cluster=\"$cluster\"}[5m])) by (container) /sum(increase(container_cpu_cfs_periods_total{namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", cluster=\"$cluster\"}[5m])) by (container)", + "expr": "sum(increase(container_cpu_cfs_throttled_periods_total{namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", container!=\"\", cluster=\"$cluster\"}[5m])) by (container) /sum(increase(container_cpu_cfs_periods_total{namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", container!=\"\", cluster=\"$cluster\"}[5m])) by (container)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{container}}", @@ -203,7 +203,7 @@ data: "fill": true, "line": true, "op": "gt", - "value": 1, + "value": 0.80000000000000004, "yaxis": "left" } ], @@ -308,6 +308,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #A", @@ -326,6 +327,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #B", @@ -344,6 +346,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #C", @@ -362,6 +365,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": 
false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #D", @@ -380,6 +384,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #E", @@ -398,6 +403,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "container", @@ -580,7 +586,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", container!=\"\"}) by (container)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", container!=\"\", image!=\"\"}) by (container)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{container}}", @@ -708,6 +714,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #A", @@ -726,6 +733,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #B", @@ -744,6 +752,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #C", @@ -762,6 +771,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #D", @@ -780,6 +790,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #E", @@ -798,6 +809,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": 
false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #F", @@ -816,6 +828,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #G", @@ -834,6 +847,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #H", @@ -852,6 +866,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "container", @@ -879,7 +894,7 @@ data: ], "targets": [ { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", container!=\"\"}) by (container)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", container!=\"\", image!=\"\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -897,7 +912,7 @@ data: "step": 10 }, { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", image!=\"\"}) by (container) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -915,7 +930,7 @@ data: "step": 10 }, { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\"}) by (container) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by 
(container)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", image!=\"\"}) by (container) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -1014,6 +1029,7 @@ data: "datasource": "$datasource", "fill": 10, "id": 6, + "interval": "1m", "legend": { "avg": false, "current": false, @@ -1112,6 +1128,7 @@ data: "datasource": "$datasource", "fill": 10, "id": 7, + "interval": "1m", "legend": { "avg": false, "current": false, @@ -1210,6 +1227,7 @@ data: "datasource": "$datasource", "fill": 10, "id": 8, + "interval": "1m", "legend": { "avg": false, "current": false, @@ -1308,6 +1326,7 @@ data: "datasource": "$datasource", "fill": 10, "id": 9, + "interval": "1m", "legend": { "avg": false, "current": false, @@ -1406,6 +1425,7 @@ data: "datasource": "$datasource", "fill": 10, "id": 10, + "interval": "1m", "legend": { "avg": false, "current": false, @@ -1504,6 +1524,7 @@ data: "datasource": "$datasource", "fill": 10, "id": 11, + "interval": "1m", "legend": { "avg": false, "current": false, @@ -1724,7 +1745,7 @@ data: "30d" ] }, - "timezone": "", + "timezone": "UTC", "title": "Kubernetes / Compute Resources / Pod", "uid": "6581e46e4e5c7ba40a07646395ef7b23", "version": 0 @@ -1787,7 +1808,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) 
namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -1899,6 +1920,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #A", @@ -1917,6 +1939,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #B", @@ -1935,6 +1958,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #C", @@ -1953,6 +1977,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #D", @@ -1971,6 +1996,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #E", @@ -1989,6 +2015,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": true, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", "pattern": "pod", @@ -2016,7 +2043,7 @@ data: ], "targets": [ { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", 
namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -2025,7 +2052,7 @@ data: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -2034,7 +2061,7 @@ data: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", 
workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -2043,7 +2070,7 @@ data: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -2052,7 +2079,7 @@ data: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * 
on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -2152,7 +2179,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -2264,6 +2291,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #A", @@ -2282,6 +2310,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #B", @@ -2300,6 +2329,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #C", @@ -2318,6 +2348,7 @@ data: "dateFormat": 
"YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #D", @@ -2336,6 +2367,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #E", @@ -2354,6 +2386,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": true, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", "pattern": "pod", @@ -2381,7 +2414,7 @@ data: ], "targets": [ { - "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -2390,7 +2423,7 @@ data: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) 
namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -2399,7 +2432,7 @@ data: "step": 10 }, { - "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -2408,7 +2441,7 @@ data: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n 
kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -2417,7 +2450,7 @@ data: "step": 10 }, { - "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -2489,6 +2522,7 @@ data: "datasource": "$datasource", "fill": 1, "id": 5, + "interval": "1m", "legend": { "avg": false, "current": false, @@ -2531,6 +2565,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + 
"linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #A", @@ -2549,6 +2584,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #B", @@ -2567,6 +2603,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #C", @@ -2585,6 +2622,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #D", @@ -2603,6 +2641,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #E", @@ -2621,6 +2660,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #F", @@ -2639,6 +2679,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": true, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", "pattern": "pod", @@ -2666,7 +2707,7 @@ data: ], "targets": [ { - "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", 
namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -2675,7 +2716,7 @@ data: "step": 10 }, { - "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -2684,7 +2725,7 @@ data: "step": 10 }, { - "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -2693,7 +2734,7 @@ data: "step": 10 }, { - "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", 
workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -2702,7 +2743,7 @@ data: "step": 10 }, { - "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -2711,7 +2752,7 @@ data: "step": 10 }, { - "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ 
-2811,7 +2852,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -2909,7 +2950,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -3007,7 +3048,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": 
"(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -3105,7 +3146,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -3203,7 +3244,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ 
-3301,7 +3342,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -3399,7 +3440,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -3497,7 +3538,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", 
+ "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -3646,7 +3687,7 @@ data: "options": [ ], - "query": "label_values(mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}, workload)", + "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\"}, workload)", "refresh": 1, "regex": "", "sort": 1, @@ -3673,7 +3714,7 @@ data: "options": [ ], - "query": "label_values(mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\"}, workload_type)", + "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\"}, workload_type)", "refresh": 1, "regex": "", "sort": 1, @@ -3716,7 +3757,7 @@ data: "30d" ] }, - "timezone": "", + "timezone": "UTC", "title": "Kubernetes / Compute Resources / Workload", "uid": "a164a7f0339f99e89cea5cb47e9be617", "version": 0 @@ -3798,7 +3839,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", 
workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}} - {{workload_type}}", @@ -3926,6 +3967,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 0, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #A", @@ -3944,6 +3986,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #B", @@ -3962,6 +4005,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #C", @@ -3980,6 +4024,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #D", @@ -3998,6 +4043,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #E", @@ -4016,6 +4062,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #F", @@ -4034,6 +4081,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": true, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "./d/a164a7f0339f99e89cea5cb47e9be617/k8s-resources-workload?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-workload=$__cell&var-type=$__cell_2", "pattern": "workload", @@ -4052,6 +4100,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "workload_type", @@ -4079,7 +4128,7 @@ data: ], "targets": [ { - "expr": "count(mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", 
workload_type=\"$type\"}) by (workload, workload_type)", + "expr": "count(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}) by (workload, workload_type)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4088,7 +4137,7 @@ data: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -4097,7 +4146,7 @@ data: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -4106,7 +4155,7 @@ data: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n 
group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -4115,7 +4164,7 @@ data: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -4124,7 +4173,7 @@ data: "step": 10 }, { - "expr": "sum(\n 
node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -4243,7 +4292,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by 
(workload, workload_type)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}} - {{workload_type}}", @@ -4371,6 +4420,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 0, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #A", @@ -4389,6 +4439,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #B", @@ -4407,6 +4458,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #C", @@ -4425,6 +4477,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #D", @@ -4443,6 +4496,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #E", @@ -4461,6 +4515,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #F", @@ -4479,6 +4534,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": true, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "./d/a164a7f0339f99e89cea5cb47e9be617/k8s-resources-workload?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-workload=$__cell&var-type=$__cell_2", "pattern": "workload", @@ -4497,6 +4553,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "workload_type", @@ -4524,7 +4581,7 @@ data: ], "targets": [ { - "expr": "count(mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}) by (workload, 
workload_type)", + "expr": "count(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}) by (workload, workload_type)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4533,7 +4590,7 @@ data: "step": 10 }, { - "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -4542,7 +4599,7 @@ data: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -4551,7 +4608,7 @@ data: "step": 10 }, { - "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", 
namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -4560,7 +4617,7 @@ data: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -4569,7 +4626,7 @@ data: "step": 10 }, { - "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\"}\n * 
on(namespace,pod)\n group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -4641,6 +4698,7 @@ data: "datasource": "$datasource", "fill": 1, "id": 5, + "interval": "1m", "legend": { "avg": false, "current": false, @@ -4683,6 +4741,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #A", @@ -4701,6 +4760,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #B", @@ -4719,6 +4779,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #C", @@ -4737,6 +4798,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + 
"linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #D", @@ -4755,6 +4817,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #E", @@ -4773,6 +4836,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #F", @@ -4791,6 +4855,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": true, + "linkTargetBlank": false, "linkTooltip": "Drill down to pods", "linkUrl": "./d/a164a7f0339f99e89cea5cb47e9be617/k8s-resources-workload?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-workload=$__cell&var-type=$type", "pattern": "workload", @@ -4809,6 +4874,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "workload_type", @@ -4836,7 +4902,7 @@ data: ], "targets": [ { - "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -4845,7 +4911,7 @@ data: "step": 10 }, { - "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) 
mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -4854,7 +4920,7 @@ data: "step": 10 }, { - "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -4863,7 +4929,7 @@ data: "step": 10 }, { - "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -4872,7 +4938,7 @@ data: "step": 10 }, { - "expr": 
"(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -4881,7 +4947,7 @@ data: "step": 10 }, { - "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -4981,7 +5047,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) 
namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", @@ -5079,7 +5145,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", @@ -5177,7 +5243,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", @@ -5275,7 +5341,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", 
namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", @@ -5373,7 +5439,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", @@ -5471,7 +5537,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on 
(namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", @@ -5569,7 +5635,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", @@ -5667,7 +5733,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod) \ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", @@ -5757,7 +5823,7 @@ data: "value": "deployment" }, "datasource": 
"$datasource", - "definition": "label_values(mixin_pod_workload{namespace=~\"$namespace\", workload=~\".+\"}, workload_type)", + "definition": "label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\"}, workload_type)", "hide": 0, "includeAll": false, "label": null, @@ -5766,7 +5832,7 @@ data: "options": [ ], - "query": "label_values(mixin_pod_workload{namespace=~\"$namespace\", workload=~\".+\"}, workload_type)", + "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\"}, workload_type)", "refresh": 1, "regex": "", "skipUrlSync": false, @@ -5864,7 +5930,7 @@ data: "30d" ] }, - "timezone": "", + "timezone": "UTC", "title": "Kubernetes / Compute Resources / Namespace (Workloads)", "uid": "a87fb0d919ec0ea5f6543124e16c42a5", "version": 0 diff --git a/addons/grafana/dashboards-k8s.yaml b/addons/grafana/dashboards-k8s.yaml index 744fa39f..481f039d 100644 --- a/addons/grafana/dashboards-k8s.yaml +++ b/addons/grafana/dashboards-k8s.yaml @@ -20,6 +20,24 @@ data: "id": null, "links": [ + ], + "panels": [ + { + "content": "The SLO (service level objective) and other metrics displayed on this dashboard are for informational purposes only.", + "datasource": null, + "description": "The SLO (service level objective) and other metrics displayed on this dashboard are for informational purposes only.", + "gridPos": { + "h": 2, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 2, + "mode": "markdown", + "span": 12, + "title": "Notice", + "type": "text" + } ], "refresh": "10s", "rows": [ @@ -37,7 +55,9 @@ data: "#d44a3a" ], "datasource": "$datasource", - "format": "none", + "decimals": 3, + "description": "How many percent of requests (both read and write) in 30 days have been answered successfully and fast enough?", + "format": "percentunit", "gauge": { "maxValue": 100, "minValue": 0, @@ -48,7 +68,7 @@ data: "gridPos": { }, - "id": 2, + "id": 3, "interval": null, "links": [ @@ -78,7 +98,7 
@@ data: "to": "null" } ], - "span": 2, + "span": 4, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, @@ -88,7 +108,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "sum(up{job=\"apiserver\", cluster=\"$cluster\"})", + "expr": "apiserver_request:availability30d{verb=\"all\", cluster=\"$cluster\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -96,7 +116,7 @@ data: } ], "thresholds": "", - "title": "Up", + "title": "Availability (30d) > 99.000%", "tooltip": { "shared": false }, @@ -109,7 +129,7 @@ data: "value": "null" } ], - "valueName": "min" + "valueName": "avg" }, { "aliasColors": { @@ -119,11 +139,13 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, + "decimals": 3, + "description": "How much error budget is left looking at our 99.000% availability guarantees?", + "fill": 10, "gridPos": { }, - "id": 3, + "id": 4, "legend": { "alignAsTable": false, "avg": false, @@ -132,6 +154,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -150,37 +173,16 @@ data: ], "spaceLength": 10, - "span": 5, + "span": 8, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(apiserver_request_total{job=\"apiserver\", instance=~\"$instance\",code=~\"2..\", cluster=\"$cluster\"}[5m]))", + "expr": "100 * (apiserver_request:availability30d{verb=\"all\", cluster=\"$cluster\"} - 0.990000)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "2xx", + "legendFormat": "errorbudget", "refId": "A" - }, - { - "expr": "sum(rate(apiserver_request_total{job=\"apiserver\", instance=~\"$instance\",code=~\"3..\", cluster=\"$cluster\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "3xx", - "refId": "B" - }, - { - "expr": "sum(rate(apiserver_request_total{job=\"apiserver\", instance=~\"$instance\",code=~\"4..\", cluster=\"$cluster\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, -
"legendFormat": "4xx", - "refId": "C" - }, - { - "expr": "sum(rate(apiserver_request_total{job=\"apiserver\", instance=~\"$instance\",code=~\"5..\", cluster=\"$cluster\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "5xx", - "refId": "D" } ], "thresholds": [ @@ -188,7 +190,7 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "RPC Rate", + "title": "ErrorBudget (30d) > 99.000%", "tooltip": { "shared": false, "sort": 0, @@ -206,7 +208,8 @@ data: }, "yaxes": [ { - "format": "ops", + "decimals": 3, + "format": "percentunit", "label": null, "logBase": 1, "max": null, @@ -214,7 +217,215 @@ data: "show": true }, { - "format": "ops", + "decimals": 3, + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "decimals": 3, + "description": "How many percent of read requests (LIST,GET) in 30 days have been answered successfully and fast enough?", + "format": "percentunit", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 5, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": 
{ + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "apiserver_request:availability30d{verb=\"read\", cluster=\"$cluster\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Read Availability (30d)", + "tooltip": { + "shared": false + }, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "How many read requests (LIST,GET) per second do the apiservers get by code?", + "fill": 10, + "gridPos": { + + }, + "id": 6, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/2../i", + "color": "#56A64B" + }, + { + "alias": "/3../i", + "color": "#F2CC0C" + }, + { + "alias": "/4../i", + "color": "#3274D9" + }, + { + "alias": "/5../i", + "color": "#E02F44" + } + ], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (code) (code_resource:apiserver_request_total:rate5m{verb=\"read\", cluster=\"$cluster\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ code }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Read SLI - Requests", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + 
"buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "reqps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "reqps", "label": null, "logBase": 1, "max": null, @@ -231,21 +442,23 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "description": "How many percent of read requests (LIST,GET) per second are returned with errors (5xx)?", "fill": 1, "gridPos": { }, - "id": 4, + "id": 7, "legend": { - "alignAsTable": true, + "alignAsTable": false, "avg": false, - "current": true, + "current": false, "max": false, "min": false, - "rightSide": true, + "rightSide": false, "show": true, + "sideWidth": null, "total": false, - "values": true + "values": false }, "lines": true, "linewidth": 1, @@ -262,15 +475,15 @@ data: ], "spaceLength": 10, - "span": 5, + "span": 3, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\", instance=~\"$instance\", verb!=\"WATCH\", cluster=\"$cluster\"}[5m])) by (verb, le))", + "expr": "sum by (resource) (code_resource:apiserver_request_total:rate5m{verb=\"read\",code=~\"5..\", cluster=\"$cluster\"}) / sum by (resource) (code_resource:apiserver_request_total:rate5m{verb=\"read\", cluster=\"$cluster\"})", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{verb}}", + "legendFormat": "{{ resource }}", "refId": "A" } ], @@ -279,7 +492,493 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Request duration 99th quantile", + "title": "Read SLI - Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { 
+ "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "How many seconds is the 99th percentile for reading (LIST|GET) a given resource?", + "fill": 1, + "gridPos": { + + }, + "id": 8, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{verb=\"read\", cluster=\"$cluster\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ resource }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Read SLI - Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + 
"rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "decimals": 3, + "description": "How many percent of write requests (POST|PUT|PATCH|DELETE) in 30 days have been answered successfully and fast enough?", + "format": "percentunit", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 9, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "apiserver_request:availability30d{verb=\"write\", cluster=\"$cluster\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Write Availability (30d)", + "tooltip": { + "shared": false + }, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "How many write requests (POST|PUT|PATCH|DELETE) per second do the apiservers get by code?", + "fill": 10, + "gridPos": { + + }, + "id": 10, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], 
+ "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/2../i", + "color": "#56A64B" + }, + { + "alias": "/3../i", + "color": "#F2CC0C" + }, + { + "alias": "/4../i", + "color": "#3274D9" + }, + { + "alias": "/5../i", + "color": "#E02F44" + } + ], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (code) (code_resource:apiserver_request_total:rate5m{verb=\"write\", cluster=\"$cluster\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ code }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Write SLI - Requests", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "reqps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "reqps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "How many percent of write requests (POST|PUT|PATCH|DELETE) per second are returned with errors (5xx)?", + "fill": 1, + "gridPos": { + + }, + "id": 11, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, 
+ "targets": [ + { + "expr": "sum by (resource) (code_resource:apiserver_request_total:rate5m{verb=\"write\",code=~\"5..\", cluster=\"$cluster\"}) / sum by (resource) (code_resource:apiserver_request_total:rate5m{verb=\"write\", cluster=\"$cluster\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ resource }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Write SLI - Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "How many seconds is the 99th percentile for writing (POST|PUT|PATCH|DELETE) a given resource?", + "fill": 1, + "gridPos": { + + }, + "id": 12, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{verb=\"write\", cluster=\"$cluster\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ resource }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": 
null, + "title": "Write SLI - Duration", "tooltip": { "shared": false, "sort": 0, @@ -339,7 +1038,7 @@ data: "gridPos": { }, - "id": 5, + "id": 13, "legend": { "alignAsTable": false, "avg": false, @@ -348,6 +1047,7 @@ data: "min": false, "rightSide": false, "show": false, + "sideWidth": null, "total": false, "values": false }, @@ -430,7 +1130,7 @@ data: "gridPos": { }, - "id": 6, + "id": 14, "legend": { "alignAsTable": false, "avg": false, @@ -439,6 +1139,7 @@ data: "min": false, "rightSide": false, "show": false, + "sideWidth": null, "total": false, "values": false }, @@ -521,7 +1222,7 @@ data: "gridPos": { }, - "id": 7, + "id": 15, "legend": { "alignAsTable": true, "avg": false, @@ -530,6 +1231,7 @@ data: "min": false, "rightSide": true, "show": true, + "sideWidth": null, "total": false, "values": true }, @@ -625,307 +1327,7 @@ data: "gridPos": { }, - "id": 8, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "etcd_helper_cache_entry_total{job=\"apiserver\", instance=~\"$instance\", cluster=\"$cluster\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "ETCD Cache Entry Total", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - 
"show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "gridPos": { - - }, - "id": 9, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(etcd_helper_cache_hit_total{job=\"apiserver\",instance=~\"$instance\", cluster=\"$cluster\"}[5m])) by (instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} hit", - "refId": "A" - }, - { - "expr": "sum(rate(etcd_helper_cache_miss_total{job=\"apiserver\",instance=~\"$instance\", cluster=\"$cluster\"}[5m])) by (instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} miss", - "refId": "B" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "ETCD Cache Hit/Miss Rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "gridPos": { - - }, - "id": 10, - "legend": 
{ - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99,sum(rate(etcd_request_cache_get_duration_seconds_bucket{job=\"apiserver\",instance=~\"$instance\", cluster=\"$cluster\"}[5m])) by (instance, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} get", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.99,sum(rate(etcd_request_cache_add_duration_seconds_bucket{job=\"apiserver\",instance=~\"$instance\", cluster=\"$cluster\"}[5m])) by (instance, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} miss", - "refId": "B" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "ETCD Cache Duration 99th Quantile", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "gridPos": { - - }, - 
"id": 11, + "id": 16, "legend": { "alignAsTable": false, "avg": false, @@ -934,6 +1336,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -1016,7 +1419,7 @@ data: "gridPos": { }, - "id": 12, + "id": 17, "legend": { "alignAsTable": false, "avg": false, @@ -1025,6 +1428,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -1107,7 +1511,7 @@ data: "gridPos": { }, - "id": 13, + "id": 18, "legend": { "alignAsTable": false, "avg": false, @@ -1116,6 +1520,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -1222,20 +1627,19 @@ data: { "allValue": null, "current": { - "text": "prod", - "value": "prod" + }, "datasource": "$datasource", "hide": 2, "includeAll": false, - "label": null, + "label": "cluster", "multi": false, "name": "cluster", "options": [ ], "query": "label_values(apiserver_request_total, cluster)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", @@ -1303,7 +1707,7 @@ data: "30d" ] }, - "timezone": "", + "timezone": "UTC", "title": "Kubernetes / API server", "uid": "09ec8aa1e996d6ffcd6817bbaff4db1b", "version": 0 @@ -1440,6 +1844,7 @@ data: "min": false, "rightSide": true, "show": true, + "sideWidth": null, "total": false, "values": true }, @@ -1544,6 +1949,7 @@ data: "min": false, "rightSide": true, "show": true, + "sideWidth": null, "total": false, "values": true }, @@ -1648,6 +2054,7 @@ data: "min": false, "rightSide": true, "show": true, + "sideWidth": null, "total": false, "values": true }, @@ -1752,6 +2159,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -1864,6 +2272,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -1887,7 +2296,7 @@ data: "steppedLine": false, "targets": [ { - "expr": 
"histogram_quantile(0.99, sum(rate(rest_client_request_latency_seconds_bucket{job=\"kube-controller-manager\", instance=~\"$instance\", verb=\"POST\"}[5m])) by (verb, url, le))", + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{job=\"kube-controller-manager\", instance=~\"$instance\", verb=\"POST\"}[5m])) by (verb, url, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{verb}} {{url}}", @@ -1968,6 +2377,7 @@ data: "min": false, "rightSide": true, "show": true, + "sideWidth": null, "total": false, "values": true }, @@ -1991,7 +2401,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_latency_seconds_bucket{job=\"kube-controller-manager\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))", + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{job=\"kube-controller-manager\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{verb}} {{url}}", @@ -2072,6 +2482,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -2163,6 +2574,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -2254,6 +2666,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -2414,7 +2827,7 @@ data: "30d" ] }, - "timezone": "", + "timezone": "UTC", "title": "Kubernetes / Controller Manager", "uid": "72e0e05bef5099e5f049b05fdc429ed4", "version": 0 @@ -2467,6 +2880,7 @@ data: "min": true, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": true }, @@ -2662,6 +3076,7 @@ data: "min": true, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": true }, @@ -2965,7 +3380,7 @@ data: "30d" ] }, - "timezone": "", + 
"timezone": "UTC", "title": "Kubernetes / Persistent Volumes", "uid": "919b92a8e8041bd567af9edab12c840c", "version": 0 @@ -3102,6 +3517,7 @@ data: "min": false, "rightSide": true, "show": true, + "sideWidth": null, "total": false, "values": true }, @@ -3214,6 +3630,7 @@ data: "min": false, "rightSide": true, "show": true, + "sideWidth": null, "total": false, "values": true }, @@ -3339,6 +3756,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -3451,6 +3869,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -3474,7 +3893,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_latency_seconds_bucket{job=\"kube-scheduler\", instance=~\"$instance\", verb=\"POST\"}[5m])) by (verb, url, le))", + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{job=\"kube-scheduler\", instance=~\"$instance\", verb=\"POST\"}[5m])) by (verb, url, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{verb}} {{url}}", @@ -3555,6 +3974,7 @@ data: "min": false, "rightSide": true, "show": true, + "sideWidth": null, "total": false, "values": true }, @@ -3578,7 +3998,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_latency_seconds_bucket{job=\"kube-scheduler\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))", + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{job=\"kube-scheduler\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{verb}} {{url}}", @@ -3659,6 +4079,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -3750,6 +4171,7 @@ data: "min": false, "rightSide": false, "show": true, + 
"sideWidth": null, "total": false, "values": false }, @@ -3841,6 +4263,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -4001,11 +4424,916 @@ data: "30d" ] }, - "timezone": "", + "timezone": "UTC", "title": "Kubernetes / Scheduler", "uid": "2e6b6a3b4bddf1427b3a55aa1311c656", "version": 0 } + statefulset.json: |- + { + "__inputs": [ + + ], + "__requires": [ + + ], + "annotations": { + "list": [ + + ] + }, + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ + + ], + "refresh": "", + "rows": [ + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 2, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "cores", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(rate(container_cpu_usage_seconds_total{job=\"kubernetes-cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$statefulset.*\"}[3m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "CPU", + "tooltip": { + "shared": false + }, + "type": 
"singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 3, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "GB", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(container_memory_usage_bytes{job=\"kubernetes-cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$statefulset.*\"}) / 1024^3", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Memory", + "tooltip": { + "shared": false + }, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 4, + "interval": null, + "links": [ + + ], + 
"mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "Bps", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(rate(container_network_transmit_bytes_total{job=\"kubernetes-cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$statefulset.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\",pod=~\"$statefulset.*\"}[3m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Network", + "tooltip": { + "shared": false + }, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "height": "100px", + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 5, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, 
+ "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "max(kube_statefulset_replicas{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Desired Replicas", + "tooltip": { + "shared": false + }, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 6, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "min(kube_statefulset_status_replicas_current{job=\"kube-state-metrics\", 
cluster=\"$cluster\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Replicas of current version", + "tooltip": { + "shared": false + }, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 7, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "max(kube_statefulset_status_observed_generation{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Observed Generation", + "tooltip": { + "shared": false + }, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + 
"colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 8, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "max(kube_statefulset_metadata_generation{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Metadata Generation", + "tooltip": { + "shared": false + }, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 9, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + 
"show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "max(kube_statefulset_replicas{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "replicas specified", + "refId": "A" + }, + { + "expr": "max(kube_statefulset_status_replicas{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "replicas created", + "refId": "B" + }, + { + "expr": "min(kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "ready", + "refId": "C" + }, + { + "expr": "min(kube_statefulset_status_replicas_current{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "replicas of current version", + "refId": "D" + }, + { + "expr": "min(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "updated", + "refId": "E" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Replicas", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": 
"individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "kubernetes-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kube_statefulset_metadata_generation, cluster)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "namespace", + "options": [ + + ], + "query": "label_values(kube_statefulset_metadata_generation{job=\"kube-state-metrics\", cluster=\"$cluster\"}, namespace)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "Name", + "multi": false, + "name": "statefulset", + "options": [ + + ], + "query": 
"label_values(kube_statefulset_metadata_generation{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\"}, statefulset)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "UTC", + "title": "Kubernetes / StatefulSets", + "uid": "a31c1f46e6f727cb37c0d731a7245005", + "version": 0 + } kind: ConfigMap metadata: name: grafana-dashboards-k8s diff --git a/addons/grafana/dashboards-nginx-ingress.yaml b/addons/grafana/dashboards-nginx-ingress.yaml index 7af93fa8..6228e760 100644 --- a/addons/grafana/dashboards-nginx-ingress.yaml +++ b/addons/grafana/dashboards-nginx-ingress.yaml @@ -308,6 +308,7 @@ data: "min": false, "rightSide": "true", "show": "true", + "sideWidth": null, "total": false, "values": "true" }, @@ -399,6 +400,7 @@ data: "min": false, "rightSide": "true", "show": "true", + "sideWidth": null, "total": false, "values": "true" }, @@ -503,6 +505,7 @@ data: "min": false, "rightSide": "true", "show": "true", + "sideWidth": null, "total": false, "values": "true" }, @@ -621,6 +624,7 @@ data: "min": false, "rightSide": false, "show": "true", + "sideWidth": null, "total": false, "values": "true" }, @@ -719,6 +723,7 @@ data: "min": false, "rightSide": false, "show": "true", + "sideWidth": null, "total": false, "values": "true" }, @@ -810,6 +815,7 @@ data: "min": false, "rightSide": false, "show": "true", + "sideWidth": null, "total": false, "values": "true" }, diff --git a/addons/grafana/dashboards-node-exporter.yaml b/addons/grafana/dashboards-node-exporter.yaml index 70e52399..682ed3ad 100644 --- a/addons/grafana/dashboards-node-exporter.yaml +++ 
b/addons/grafana/dashboards-node-exporter.yaml @@ -48,6 +48,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -140,6 +141,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -265,6 +267,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -471,6 +474,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -586,6 +590,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -704,6 +709,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -796,6 +802,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, diff --git a/addons/grafana/dashboards-prom.yaml b/addons/grafana/dashboards-prom.yaml index 3eb0c030..b9856418 100644 --- a/addons/grafana/dashboards-prom.yaml +++ b/addons/grafana/dashboards-prom.yaml @@ -48,6 +48,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -71,10 +72,10 @@ data: "steppedLine": false, "targets": [ { - "expr": "(\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"} \n- \n ignoring(queue) group_right(instance) prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}\n)\n", + "expr": "(\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"} \n- \n ignoring(remote_name, url) group_right(instance) prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}\n)\n", "format": "time_series", "intervalFactor": 2, - "legendFormat": 
"{{cluster}}:{{instance}}-{{queue}}", + "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", "refId": "A" } ], @@ -139,6 +140,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -162,10 +164,10 @@ data: "steppedLine": false, "targets": [ { - "expr": "(\n rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) \n- \n ignoring (queue) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n)\n", + "expr": "(\n rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) \n- \n ignoring (remote_name, url) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n)\n", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", "refId": "A" } ], @@ -243,6 +245,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -266,10 +269,10 @@ data: "steppedLine": false, "targets": [ { - "expr": "rate(\n prometheus_remote_storage_samples_in_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n- \n ignoring(queue) group_right(instance) rate(prometheus_remote_storage_succeeded_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) \n- \n rate(prometheus_remote_storage_dropped_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n", + "expr": "rate(\n prometheus_remote_storage_samples_in_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n- \n ignoring(remote_name, url) group_right(instance) rate(prometheus_remote_storage_succeeded_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n- \n 
rate(prometheus_remote_storage_dropped_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", "refId": "A" } ], @@ -347,6 +350,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -374,7 +378,7 @@ data: "expr": "prometheus_remote_storage_shards{cluster=~\"$cluster\", instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", "refId": "A" } ], @@ -439,6 +443,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -465,7 +470,7 @@ data: "expr": "prometheus_remote_storage_shards_max{cluster=~\"$cluster\", instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", "refId": "A" } ], @@ -530,6 +535,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -556,7 +562,7 @@ data: "expr": "prometheus_remote_storage_shards_min{cluster=~\"$cluster\", instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", "refId": "A" } ], @@ -621,6 +627,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -647,7 +654,7 @@ data: "expr": "prometheus_remote_storage_shards_desired{cluster=~\"$cluster\", instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "legendFormat": 
"{{cluster}}:{{instance}} {{remote_name}}:{{url}}", "refId": "A" } ], @@ -725,6 +732,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -751,7 +759,7 @@ data: "expr": "prometheus_remote_storage_shard_capacity{cluster=~\"$cluster\", instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", "refId": "A" } ], @@ -816,6 +824,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -842,7 +851,7 @@ data: "expr": "prometheus_remote_storage_pending_samples{cluster=~\"$cluster\", instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", "refId": "A" } ], @@ -920,6 +929,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -1011,6 +1021,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -1037,7 +1048,7 @@ data: "expr": "prometheus_wal_watcher_current_segment{cluster=~\"$cluster\", instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "legendFormat": "{{cluster}}:{{instance}} {{consumer}}", "refId": "A" } ], @@ -1115,6 +1126,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -1141,7 +1153,7 @@ data: "expr": "rate(prometheus_remote_storage_dropped_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", "refId": "A" } ], @@ 
-1206,6 +1218,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -1232,7 +1245,7 @@ data: "expr": "rate(prometheus_remote_storage_failed_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", "refId": "A" } ], @@ -1297,6 +1310,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -1323,7 +1337,7 @@ data: "expr": "rate(prometheus_remote_storage_retried_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", "refId": "A" } ], @@ -1388,6 +1402,7 @@ data: "min": false, "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -1414,7 +1429,7 @@ data: "expr": "rate(prometheus_remote_storage_enqueue_retries_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", "refId": "A" } ], @@ -1567,11 +1582,11 @@ data: "includeAll": true, "label": null, "multi": false, - "name": "queue", + "name": "url", "options": [ ], - "query": "label_values(prometheus_remote_storage_shards{cluster=~\"$cluster\", instance=~\"$instance\"}, queue)", + "query": "label_values(prometheus_remote_storage_shards{cluster=~\"$cluster\", instance=~\"$instance\"}, url)", "refresh": 2, "regex": "", "sort": 0, @@ -1690,6 +1705,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #A", @@ -1708,6 
+1724,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value #B", @@ -1726,6 +1743,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "instance", @@ -1744,6 +1762,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "job", @@ -1762,6 +1781,7 @@ data: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "version", @@ -2814,7 +2834,7 @@ data: ] }, "timezone": "utc", - "title": "Prometheus", + "title": "Prometheus Overview", "uid": "", "version": 0 } diff --git a/addons/prometheus/rules.yaml b/addons/prometheus/rules.yaml index 359cad7e..dea101fd 100644 --- a/addons/prometheus/rules.yaml +++ b/addons/prometheus/rules.yaml @@ -11,8 +11,8 @@ data: "annotations": { "message": "etcd cluster \"{{ $labels.job }}\": members are down ({{ $value }})." }, - "expr": "max by (job) (\n sum by (job) (up{job=~\".*etcd.*\"} == bool 0)\nor\n count by (job,endpoint) (\n sum by (job,endpoint,To) (rate(etcd_network_peer_sent_failures_total{job=~\".*etcd.*\"}[3m])) > 0.01\n )\n)\n> 0\n", - "for": "3m", + "expr": "max without (endpoint) (\n sum without (instance) (up{job=~\".*etcd.*\"} == bool 0)\nor\n count without (To) (\n sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~\".*etcd.*\"}[120s])) > 0.01\n )\n)\n> 0\n", + "for": "10m", "labels": { "severity": "critical" } @@ -22,7 +22,7 @@ data: "annotations": { "message": "etcd cluster \"{{ $labels.job }}\": insufficient members ({{ $value }})." 
}, - "expr": "sum(up{job=~\".*etcd.*\"} == bool 1) by (job) < ((count(up{job=~\".*etcd.*\"}) by (job) + 1) / 2)\n", + "expr": "sum(up{job=~\".*etcd.*\"} == bool 1) without (instance) < ((count(up{job=~\".*etcd.*\"}) without (instance) + 1) / 2)\n", "for": "3m", "labels": { "severity": "critical" @@ -44,18 +44,40 @@ data: "annotations": { "message": "etcd cluster \"{{ $labels.job }}\": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated." }, - "expr": "increase((max by (job) (etcd_server_leader_changes_seen_total{job=~\".*etcd.*\"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~\".*etcd.*\"}))[15m:1m]) >= 3\n", + "expr": "increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~\".*etcd.*\"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~\".*etcd.*\"}))[15m:1m]) >= 4\n", "for": "5m", "labels": { "severity": "warning" } }, + { + "alert": "etcdHighNumberOfFailedGRPCRequests", + "annotations": { + "message": "etcd cluster \"{{ $labels.job }}\": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}." + }, + "expr": "100 * sum(rate(grpc_server_handled_total{job=~\".*etcd.*\", grpc_code!=\"OK\"}[5m])) without (grpc_type, grpc_code)\n /\nsum(rate(grpc_server_handled_total{job=~\".*etcd.*\"}[5m])) without (grpc_type, grpc_code)\n > 1\n", + "for": "10m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "etcdHighNumberOfFailedGRPCRequests", + "annotations": { + "message": "etcd cluster \"{{ $labels.job }}\": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}." 
+ }, + "expr": "100 * sum(rate(grpc_server_handled_total{job=~\".*etcd.*\", grpc_code!=\"OK\"}[5m])) without (grpc_type, grpc_code)\n /\nsum(rate(grpc_server_handled_total{job=~\".*etcd.*\"}[5m])) without (grpc_type, grpc_code)\n > 5\n", + "for": "5m", + "labels": { + "severity": "critical" + } + }, { "alert": "etcdGRPCRequestsSlow", "annotations": { "message": "etcd cluster \"{{ $labels.job }}\": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}." }, - "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\", grpc_type=\"unary\"}[5m])) by (job, instance, grpc_service, grpc_method, le))\n> 0.15\n", + "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\", grpc_type=\"unary\"}[5m])) without(grpc_type))\n> 0.15\n", "for": "10m", "labels": { "severity": "critical" @@ -110,7 +132,7 @@ data: "annotations": { "message": "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}" }, - "expr": "sum(rate(etcd_http_failed_total{job=~\".*etcd.*\", code!=\"404\"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~\".*etcd.*\"}[5m]))\nBY (method) > 0.01\n", + "expr": "sum(rate(etcd_http_failed_total{job=~\".*etcd.*\", code!=\"404\"}[5m])) without (code) / sum(rate(etcd_http_received_total{job=~\".*etcd.*\"}[5m]))\nwithout (code) > 0.01\n", "for": "10m", "labels": { "severity": "warning" @@ -121,7 +143,7 @@ data: "annotations": { "message": "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}." 
}, - "expr": "sum(rate(etcd_http_failed_total{job=~\".*etcd.*\", code!=\"404\"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~\".*etcd.*\"}[5m]))\nBY (method) > 0.05\n", + "expr": "sum(rate(etcd_http_failed_total{job=~\".*etcd.*\", code!=\"404\"}[5m])) without (code) / sum(rate(etcd_http_received_total{job=~\".*etcd.*\"}[5m]))\nwithout (code) > 0.05\n", "for": "10m", "labels": { "severity": "critical" @@ -145,112 +167,137 @@ data: kube.yaml: |- { "groups": [ - { - "name": "kube-apiserver-error", - "rules": [ - { - "expr": "sum by (status_class) (\n label_replace(\n rate(apiserver_request_total{job=\"apiserver\"}[5m]\n ), \"status_class\", \"${1}xx\", \"code\", \"([0-9])..\")\n)\n", - "labels": { - "job": "apiserver" - }, - "record": "status_class:apiserver_request_total:rate5m" - }, - { - "expr": "sum by (status_class) (\n label_replace(\n rate(apiserver_request_total{job=\"apiserver\"}[30m]\n ), \"status_class\", \"${1}xx\", \"code\", \"([0-9])..\")\n)\n", - "labels": { - "job": "apiserver" - }, - "record": "status_class:apiserver_request_total:rate30m" - }, - { - "expr": "sum by (status_class) (\n label_replace(\n rate(apiserver_request_total{job=\"apiserver\"}[1h]\n ), \"status_class\", \"${1}xx\", \"code\", \"([0-9])..\")\n)\n", - "labels": { - "job": "apiserver" - }, - "record": "status_class:apiserver_request_total:rate1h" - }, - { - "expr": "sum by (status_class) (\n label_replace(\n rate(apiserver_request_total{job=\"apiserver\"}[2h]\n ), \"status_class\", \"${1}xx\", \"code\", \"([0-9])..\")\n)\n", - "labels": { - "job": "apiserver" - }, - "record": "status_class:apiserver_request_total:rate2h" - }, - { - "expr": "sum by (status_class) (\n label_replace(\n rate(apiserver_request_total{job=\"apiserver\"}[6h]\n ), \"status_class\", \"${1}xx\", \"code\", \"([0-9])..\")\n)\n", - "labels": { - "job": "apiserver" - }, - "record": "status_class:apiserver_request_total:rate6h" - }, - { - "expr": "sum by (status_class) (\n label_replace(\n 
rate(apiserver_request_total{job=\"apiserver\"}[1d]\n ), \"status_class\", \"${1}xx\", \"code\", \"([0-9])..\")\n)\n", - "labels": { - "job": "apiserver" - }, - "record": "status_class:apiserver_request_total:rate1d" - }, - { - "expr": "sum by (status_class) (\n label_replace(\n rate(apiserver_request_total{job=\"apiserver\"}[3d]\n ), \"status_class\", \"${1}xx\", \"code\", \"([0-9])..\")\n)\n", - "labels": { - "job": "apiserver" - }, - "record": "status_class:apiserver_request_total:rate3d" - }, - { - "expr": "sum(status_class:apiserver_request_total:rate5m{job=\"apiserver\",status_class=\"5xx\"})\n/\nsum(status_class:apiserver_request_total:rate5m{job=\"apiserver\"})\n", - "labels": { - "job": "apiserver" - }, - "record": "status_class_5xx:apiserver_request_total:ratio_rate5m" - }, - { - "expr": "sum(status_class:apiserver_request_total:rate30m{job=\"apiserver\",status_class=\"5xx\"})\n/\nsum(status_class:apiserver_request_total:rate30m{job=\"apiserver\"})\n", - "labels": { - "job": "apiserver" - }, - "record": "status_class_5xx:apiserver_request_total:ratio_rate30m" - }, - { - "expr": "sum(status_class:apiserver_request_total:rate1h{job=\"apiserver\",status_class=\"5xx\"})\n/\nsum(status_class:apiserver_request_total:rate1h{job=\"apiserver\"})\n", - "labels": { - "job": "apiserver" - }, - "record": "status_class_5xx:apiserver_request_total:ratio_rate1h" - }, - { - "expr": "sum(status_class:apiserver_request_total:rate2h{job=\"apiserver\",status_class=\"5xx\"})\n/\nsum(status_class:apiserver_request_total:rate2h{job=\"apiserver\"})\n", - "labels": { - "job": "apiserver" - }, - "record": "status_class_5xx:apiserver_request_total:ratio_rate2h" - }, - { - "expr": "sum(status_class:apiserver_request_total:rate6h{job=\"apiserver\",status_class=\"5xx\"})\n/\nsum(status_class:apiserver_request_total:rate6h{job=\"apiserver\"})\n", - "labels": { - "job": "apiserver" - }, - "record": "status_class_5xx:apiserver_request_total:ratio_rate6h" - }, - { - "expr": 
"sum(status_class:apiserver_request_total:rate1d{job=\"apiserver\",status_class=\"5xx\"})\n/\nsum(status_class:apiserver_request_total:rate1d{job=\"apiserver\"})\n", - "labels": { - "job": "apiserver" - }, - "record": "status_class_5xx:apiserver_request_total:ratio_rate1d" - }, - { - "expr": "sum(status_class:apiserver_request_total:rate3d{job=\"apiserver\",status_class=\"5xx\"})\n/\nsum(status_class:apiserver_request_total:rate3d{job=\"apiserver\"})\n", - "labels": { - "job": "apiserver" - }, - "record": "status_class_5xx:apiserver_request_total:ratio_rate3d" - } - ] - }, { "name": "kube-apiserver.rules", "rules": [ + { + "expr": "(\n (\n # too slow\n sum(rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[1d]))\n -\n (\n (\n sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"0.1\"}[1d]))\n or\n vector(0)\n )\n +\n sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"0.5\"}[1d]))\n +\n sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"5\"}[1d]))\n )\n )\n +\n # errors\n sum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[1d]))\n)\n/\nsum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1d]))\n", + "labels": { + "verb": "read" + }, + "record": "apiserver_request:burnrate1d" + }, + { + "expr": "(\n (\n # too slow\n sum(rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[1h]))\n -\n (\n (\n sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"0.1\"}[1h]))\n or\n vector(0)\n )\n +\n sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"0.5\"}[1h]))\n +\n 
sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"5\"}[1h]))\n )\n )\n +\n # errors\n sum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[1h]))\n)\n/\nsum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1h]))\n", + "labels": { + "verb": "read" + }, + "record": "apiserver_request:burnrate1h" + }, + { + "expr": "(\n (\n # too slow\n sum(rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[2h]))\n -\n (\n (\n sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"0.1\"}[2h]))\n or\n vector(0)\n )\n +\n sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"0.5\"}[2h]))\n +\n sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"5\"}[2h]))\n )\n )\n +\n # errors\n sum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[2h]))\n)\n/\nsum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[2h]))\n", + "labels": { + "verb": "read" + }, + "record": "apiserver_request:burnrate2h" + }, + { + "expr": "(\n (\n # too slow\n sum(rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[30m]))\n -\n (\n (\n sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"0.1\"}[30m]))\n or\n vector(0)\n )\n +\n sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"0.5\"}[30m]))\n +\n sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"5\"}[30m]))\n )\n )\n +\n # errors\n 
sum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[30m]))\n)\n/\nsum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[30m]))\n", + "labels": { + "verb": "read" + }, + "record": "apiserver_request:burnrate30m" + }, + { + "expr": "(\n (\n # too slow\n sum(rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[3d]))\n -\n (\n (\n sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"0.1\"}[3d]))\n or\n vector(0)\n )\n +\n sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"0.5\"}[3d]))\n +\n sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"5\"}[3d]))\n )\n )\n +\n # errors\n sum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[3d]))\n)\n/\nsum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[3d]))\n", + "labels": { + "verb": "read" + }, + "record": "apiserver_request:burnrate3d" + }, + { + "expr": "(\n (\n # too slow\n sum(rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))\n -\n (\n (\n sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"0.1\"}[5m]))\n or\n vector(0)\n )\n +\n sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"0.5\"}[5m]))\n +\n sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"5\"}[5m]))\n )\n )\n +\n # errors\n sum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[5m]))\n)\n/\nsum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))\n", + "labels": { + "verb": "read" + }, + "record": "apiserver_request:burnrate5m" + }, + { + "expr": "(\n (\n # too slow\n 
sum(rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[6h]))\n -\n (\n (\n sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"0.1\"}[6h]))\n or\n vector(0)\n )\n +\n sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"0.5\"}[6h]))\n +\n sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"5\"}[6h]))\n )\n )\n +\n # errors\n sum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[6h]))\n)\n/\nsum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[6h]))\n", + "labels": { + "verb": "read" + }, + "record": "apiserver_request:burnrate6h" + }, + { + "expr": "(\n (\n # too slow\n sum(rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1d]))\n -\n sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[1d]))\n )\n +\n sum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[1d]))\n)\n/\nsum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1d]))\n", + "labels": { + "verb": "write" + }, + "record": "apiserver_request:burnrate1d" + }, + { + "expr": "(\n (\n # too slow\n sum(rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1h]))\n -\n sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[1h]))\n )\n +\n sum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[1h]))\n)\n/\nsum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1h]))\n", + "labels": { + "verb": "write" + }, + "record": "apiserver_request:burnrate1h" + }, + { + "expr": "(\n (\n # too slow\n 
sum(rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[2h]))\n -\n sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[2h]))\n )\n +\n sum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[2h]))\n)\n/\nsum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[2h]))\n", + "labels": { + "verb": "write" + }, + "record": "apiserver_request:burnrate2h" + }, + { + "expr": "(\n (\n # too slow\n sum(rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[30m]))\n -\n sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[30m]))\n )\n +\n sum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[30m]))\n)\n/\nsum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[30m]))\n", + "labels": { + "verb": "write" + }, + "record": "apiserver_request:burnrate30m" + }, + { + "expr": "(\n (\n # too slow\n sum(rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[3d]))\n -\n sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[3d]))\n )\n +\n sum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[3d]))\n)\n/\nsum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[3d]))\n", + "labels": { + "verb": "write" + }, + "record": "apiserver_request:burnrate3d" + }, + { + "expr": "(\n (\n # too slow\n sum(rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))\n -\n sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[5m]))\n )\n +\n 
sum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[5m]))\n)\n/\nsum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))\n", + "labels": { + "verb": "write" + }, + "record": "apiserver_request:burnrate5m" + }, + { + "expr": "(\n (\n # too slow\n sum(rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[6h]))\n -\n sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[6h]))\n )\n +\n sum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[6h]))\n)\n/\nsum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[6h]))\n", + "labels": { + "verb": "write" + }, + "record": "apiserver_request:burnrate6h" + }, + { + "expr": "sum by (code,resource) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))\n", + "labels": { + "verb": "read" + }, + "record": "code_resource:apiserver_request_total:rate5m" + }, + { + "expr": "sum by (code,resource) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))\n", + "labels": { + "verb": "write" + }, + "record": "code_resource:apiserver_request_total:rate5m" + }, + { + "expr": "histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))) > 0\n", + "labels": { + "quantile": "0.99", + "verb": "read" + }, + "record": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile" + }, + { + "expr": "histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))) > 0\n", + "labels": { + "quantile": "0.99", + "verb": "write" + }, + "record": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile" + }, { "expr": 
"sum(rate(apiserver_request_duration_seconds_sum{subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT\"}[5m])) without(instance, pod)\n/\nsum(rate(apiserver_request_duration_seconds_count{subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT\"}[5m])) without(instance, pod)\n", "record": "cluster:apiserver_request_duration_seconds:mean5m" @@ -278,6 +325,143 @@ data: } ] }, + { + "interval": "3m", + "name": "kube-apiserver-availability.rules", + "rules": [ + { + "expr": "1 - (\n (\n # write too slow\n sum(increase(apiserver_request_duration_seconds_count{verb=~\"POST|PUT|PATCH|DELETE\"}[30d]))\n -\n sum(increase(apiserver_request_duration_seconds_bucket{verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[30d]))\n ) +\n (\n # read too slow\n sum(increase(apiserver_request_duration_seconds_count{verb=~\"LIST|GET\"}[30d]))\n -\n (\n (\n sum(increase(apiserver_request_duration_seconds_bucket{verb=~\"LIST|GET\",scope=~\"resource|\",le=\"0.1\"}[30d]))\n or\n vector(0)\n )\n +\n sum(increase(apiserver_request_duration_seconds_bucket{verb=~\"LIST|GET\",scope=\"namespace\",le=\"0.5\"}[30d]))\n +\n sum(increase(apiserver_request_duration_seconds_bucket{verb=~\"LIST|GET\",scope=\"cluster\",le=\"5\"}[30d]))\n )\n ) +\n # errors\n sum(code:apiserver_request_total:increase30d{code=~\"5..\"} or vector(0))\n)\n/\nsum(code:apiserver_request_total:increase30d)\n", + "labels": { + "verb": "all" + }, + "record": "apiserver_request:availability30d" + }, + { + "expr": "1 - (\n sum(increase(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[30d]))\n -\n (\n # too slow\n (\n sum(increase(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=~\"resource|\",le=\"0.1\"}[30d]))\n or\n vector(0)\n )\n +\n sum(increase(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"namespace\",le=\"0.5\"}[30d]))\n +\n 
sum(increase(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",scope=\"cluster\",le=\"5\"}[30d]))\n )\n +\n # errors\n sum(code:apiserver_request_total:increase30d{verb=\"read\",code=~\"5..\"} or vector(0))\n)\n/\nsum(code:apiserver_request_total:increase30d{verb=\"read\"})\n", + "labels": { + "verb": "read" + }, + "record": "apiserver_request:availability30d" + }, + { + "expr": "1 - (\n (\n # too slow\n sum(increase(apiserver_request_duration_seconds_count{verb=~\"POST|PUT|PATCH|DELETE\"}[30d]))\n -\n sum(increase(apiserver_request_duration_seconds_bucket{verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"}[30d]))\n )\n +\n # errors\n sum(code:apiserver_request_total:increase30d{verb=\"write\",code=~\"5..\"} or vector(0))\n)\n/\nsum(code:apiserver_request_total:increase30d{verb=\"write\"})\n", + "labels": { + "verb": "write" + }, + "record": "apiserver_request:availability30d" + }, + { + "expr": "sum by (code, verb) (increase(apiserver_request_total{job=\"apiserver\",verb=\"LIST\",code=~\"2..\"}[30d]))\n", + "record": "code_verb:apiserver_request_total:increase30d" + }, + { + "expr": "sum by (code, verb) (increase(apiserver_request_total{job=\"apiserver\",verb=\"GET\",code=~\"2..\"}[30d]))\n", + "record": "code_verb:apiserver_request_total:increase30d" + }, + { + "expr": "sum by (code, verb) (increase(apiserver_request_total{job=\"apiserver\",verb=\"POST\",code=~\"2..\"}[30d]))\n", + "record": "code_verb:apiserver_request_total:increase30d" + }, + { + "expr": "sum by (code, verb) (increase(apiserver_request_total{job=\"apiserver\",verb=\"PUT\",code=~\"2..\"}[30d]))\n", + "record": "code_verb:apiserver_request_total:increase30d" + }, + { + "expr": "sum by (code, verb) (increase(apiserver_request_total{job=\"apiserver\",verb=\"PATCH\",code=~\"2..\"}[30d]))\n", + "record": "code_verb:apiserver_request_total:increase30d" + }, + { + "expr": "sum by (code, verb) 
(increase(apiserver_request_total{job=\"apiserver\",verb=\"DELETE\",code=~\"2..\"}[30d]))\n", + "record": "code_verb:apiserver_request_total:increase30d" + }, + { + "expr": "sum by (code, verb) (increase(apiserver_request_total{job=\"apiserver\",verb=\"LIST\",code=~\"3..\"}[30d]))\n", + "record": "code_verb:apiserver_request_total:increase30d" + }, + { + "expr": "sum by (code, verb) (increase(apiserver_request_total{job=\"apiserver\",verb=\"GET\",code=~\"3..\"}[30d]))\n", + "record": "code_verb:apiserver_request_total:increase30d" + }, + { + "expr": "sum by (code, verb) (increase(apiserver_request_total{job=\"apiserver\",verb=\"POST\",code=~\"3..\"}[30d]))\n", + "record": "code_verb:apiserver_request_total:increase30d" + }, + { + "expr": "sum by (code, verb) (increase(apiserver_request_total{job=\"apiserver\",verb=\"PUT\",code=~\"3..\"}[30d]))\n", + "record": "code_verb:apiserver_request_total:increase30d" + }, + { + "expr": "sum by (code, verb) (increase(apiserver_request_total{job=\"apiserver\",verb=\"PATCH\",code=~\"3..\"}[30d]))\n", + "record": "code_verb:apiserver_request_total:increase30d" + }, + { + "expr": "sum by (code, verb) (increase(apiserver_request_total{job=\"apiserver\",verb=\"DELETE\",code=~\"3..\"}[30d]))\n", + "record": "code_verb:apiserver_request_total:increase30d" + }, + { + "expr": "sum by (code, verb) (increase(apiserver_request_total{job=\"apiserver\",verb=\"LIST\",code=~\"4..\"}[30d]))\n", + "record": "code_verb:apiserver_request_total:increase30d" + }, + { + "expr": "sum by (code, verb) (increase(apiserver_request_total{job=\"apiserver\",verb=\"GET\",code=~\"4..\"}[30d]))\n", + "record": "code_verb:apiserver_request_total:increase30d" + }, + { + "expr": "sum by (code, verb) (increase(apiserver_request_total{job=\"apiserver\",verb=\"POST\",code=~\"4..\"}[30d]))\n", + "record": "code_verb:apiserver_request_total:increase30d" + }, + { + "expr": "sum by (code, verb) 
(increase(apiserver_request_total{job=\"apiserver\",verb=\"PUT\",code=~\"4..\"}[30d]))\n", + "record": "code_verb:apiserver_request_total:increase30d" + }, + { + "expr": "sum by (code, verb) (increase(apiserver_request_total{job=\"apiserver\",verb=\"PATCH\",code=~\"4..\"}[30d]))\n", + "record": "code_verb:apiserver_request_total:increase30d" + }, + { + "expr": "sum by (code, verb) (increase(apiserver_request_total{job=\"apiserver\",verb=\"DELETE\",code=~\"4..\"}[30d]))\n", + "record": "code_verb:apiserver_request_total:increase30d" + }, + { + "expr": "sum by (code, verb) (increase(apiserver_request_total{job=\"apiserver\",verb=\"LIST\",code=~\"5..\"}[30d]))\n", + "record": "code_verb:apiserver_request_total:increase30d" + }, + { + "expr": "sum by (code, verb) (increase(apiserver_request_total{job=\"apiserver\",verb=\"GET\",code=~\"5..\"}[30d]))\n", + "record": "code_verb:apiserver_request_total:increase30d" + }, + { + "expr": "sum by (code, verb) (increase(apiserver_request_total{job=\"apiserver\",verb=\"POST\",code=~\"5..\"}[30d]))\n", + "record": "code_verb:apiserver_request_total:increase30d" + }, + { + "expr": "sum by (code, verb) (increase(apiserver_request_total{job=\"apiserver\",verb=\"PUT\",code=~\"5..\"}[30d]))\n", + "record": "code_verb:apiserver_request_total:increase30d" + }, + { + "expr": "sum by (code, verb) (increase(apiserver_request_total{job=\"apiserver\",verb=\"PATCH\",code=~\"5..\"}[30d]))\n", + "record": "code_verb:apiserver_request_total:increase30d" + }, + { + "expr": "sum by (code, verb) (increase(apiserver_request_total{job=\"apiserver\",verb=\"DELETE\",code=~\"5..\"}[30d]))\n", + "record": "code_verb:apiserver_request_total:increase30d" + }, + { + "expr": "sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~\"LIST|GET\"})\n", + "labels": { + "verb": "read" + }, + "record": "code:apiserver_request_total:increase30d" + }, + { + "expr": "sum by (code) 
(code_verb:apiserver_request_total:increase30d{verb=~\"POST|PUT|PATCH|DELETE\"})\n", + "labels": { + "verb": "write" + }, + "record": "code:apiserver_request_total:increase30d" + } + ] + }, { "name": "k8s.rules", "rules": [ @@ -286,23 +470,23 @@ data: "record": "namespace:container_cpu_usage_seconds_total:sum_rate" }, { - "expr": "sum by (cluster, namespace, pod, container) (\n rate(container_cpu_usage_seconds_total{job=\"kubernetes-cadvisor\", image!=\"\", container!=\"POD\"}[5m])\n) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (\n 1, max by(cluster, namespace, pod, node) (kube_pod_info)\n)\n", + "expr": "sum by (cluster, namespace, pod, container) (\n rate(container_cpu_usage_seconds_total{job=\"kubernetes-cadvisor\", image!=\"\", container!=\"POD\"}[5m])\n) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (\n 1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=\"\"})\n)\n", "record": "node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate" }, { - "expr": "container_memory_working_set_bytes{job=\"kubernetes-cadvisor\", image!=\"\"}\n* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,\n max by(namespace, pod, node) (kube_pod_info)\n)\n", + "expr": "container_memory_working_set_bytes{job=\"kubernetes-cadvisor\", image!=\"\"}\n* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,\n max by(namespace, pod, node) (kube_pod_info{node!=\"\"})\n)\n", "record": "node_namespace_pod_container:container_memory_working_set_bytes" }, { - "expr": "container_memory_rss{job=\"kubernetes-cadvisor\", image!=\"\"}\n* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,\n max by(namespace, pod, node) (kube_pod_info)\n)\n", + "expr": "container_memory_rss{job=\"kubernetes-cadvisor\", image!=\"\"}\n* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,\n max by(namespace, pod, node) (kube_pod_info{node!=\"\"})\n)\n", "record": 
"node_namespace_pod_container:container_memory_rss" }, { - "expr": "container_memory_cache{job=\"kubernetes-cadvisor\", image!=\"\"}\n* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,\n max by(namespace, pod, node) (kube_pod_info)\n)\n", + "expr": "container_memory_cache{job=\"kubernetes-cadvisor\", image!=\"\"}\n* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,\n max by(namespace, pod, node) (kube_pod_info{node!=\"\"})\n)\n", "record": "node_namespace_pod_container:container_memory_cache" }, { - "expr": "container_memory_swap{job=\"kubernetes-cadvisor\", image!=\"\"}\n* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,\n max by(namespace, pod, node) (kube_pod_info)\n)\n", + "expr": "container_memory_swap{job=\"kubernetes-cadvisor\", image!=\"\"}\n* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,\n max by(namespace, pod, node) (kube_pod_info{node!=\"\"})\n)\n", "record": "node_namespace_pod_container:container_memory_swap" }, { @@ -322,21 +506,21 @@ data: "labels": { "workload_type": "deployment" }, - "record": "mixin_pod_workload" + "record": "namespace_workload_pod:kube_pod_owner:relabel" }, { "expr": "max by (cluster, namespace, workload, pod) (\n label_replace(\n kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"DaemonSet\"},\n \"workload\", \"$1\", \"owner_name\", \"(.*)\"\n )\n)\n", "labels": { "workload_type": "daemonset" }, - "record": "mixin_pod_workload" + "record": "namespace_workload_pod:kube_pod_owner:relabel" }, { "expr": "max by (cluster, namespace, workload, pod) (\n label_replace(\n kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"StatefulSet\"},\n \"workload\", \"$1\", \"owner_name\", \"(.*)\"\n )\n)\n", "labels": { "workload_type": "statefulset" }, - "record": "mixin_pod_workload" + "record": "namespace_workload_pod:kube_pod_owner:relabel" } ] }, @@ -412,11 +596,11 @@ data: "name": "node.rules", "rules": [ { - "expr": "sum(min(kube_pod_info) by (cluster, node))\n", + 
"expr": "sum(min(kube_pod_info{node!=\"\"}) by (cluster, node))\n", "record": ":kube_pod_info_node_count:" }, { - "expr": "topk by(namespace, pod) (1,\n max by (node, namespace, pod) (\n label_replace(kube_pod_info{job=\"kube-state-metrics\"}, \"pod\", \"$1\", \"pod\", \"(.*)\")\n))\n", + "expr": "topk by(namespace, pod) (1,\n max by (node, namespace, pod) (\n label_replace(kube_pod_info{job=\"kube-state-metrics\",node!=\"\"}, \"pod\", \"$1\", \"pod\", \"(.*)\")\n))\n", "record": "node_namespace_pod:kube_pod_info:" }, { @@ -464,10 +648,10 @@ data: "message": "Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf \"%.2f\" $value }} times / 5 minutes.", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping" }, - "expr": "rate(kube_pod_container_status_restarts_total{job=\"kube-state-metrics\"}[15m]) * 60 * 5 > 0\n", + "expr": "rate(kube_pod_container_status_restarts_total{job=\"kube-state-metrics\"}[5m]) * 60 * 5 > 0\n", "for": "15m", "labels": { - "severity": "critical" + "severity": "warning" } }, { @@ -476,10 +660,10 @@ data: "message": "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes.", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready" }, - "expr": "sum by (namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{job=\"kube-state-metrics\", phase=~\"Pending|Unknown\"}) * on(namespace, pod) group_left(owner_kind) max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!=\"Job\"})) > 0\n", + "expr": "sum by (namespace, pod) (\n max by(namespace, pod) (\n kube_pod_status_phase{job=\"kube-state-metrics\", phase=~\"Pending|Unknown\"}\n ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (\n 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!=\"Job\"})\n )\n) > 0\n", "for": "15m", 
"labels": { - "severity": "critical" + "severity": "warning" } }, { @@ -491,7 +675,7 @@ data: "expr": "kube_deployment_status_observed_generation{job=\"kube-state-metrics\"}\n !=\nkube_deployment_metadata_generation{job=\"kube-state-metrics\"}\n", "for": "15m", "labels": { - "severity": "critical" + "severity": "warning" } }, { @@ -503,7 +687,7 @@ data: "expr": "(\n kube_deployment_spec_replicas{job=\"kube-state-metrics\"}\n !=\n kube_deployment_status_replicas_available{job=\"kube-state-metrics\"}\n) and (\n changes(kube_deployment_status_replicas_updated{job=\"kube-state-metrics\"}[5m])\n ==\n 0\n)\n", "for": "15m", "labels": { - "severity": "critical" + "severity": "warning" } }, { @@ -515,7 +699,7 @@ data: "expr": "(\n kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\"}\n !=\n kube_statefulset_status_replicas{job=\"kube-state-metrics\"}\n) and (\n changes(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\"}[5m])\n ==\n 0\n)\n", "for": "15m", "labels": { - "severity": "critical" + "severity": "warning" } }, { @@ -527,7 +711,7 @@ data: "expr": "kube_statefulset_status_observed_generation{job=\"kube-state-metrics\"}\n !=\nkube_statefulset_metadata_generation{job=\"kube-state-metrics\"}\n", "for": "15m", "labels": { - "severity": "critical" + "severity": "warning" } }, { @@ -536,22 +720,22 @@ data: "message": "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout" }, - "expr": "max without (revision) (\n kube_statefulset_status_current_revision{job=\"kube-state-metrics\"}\n unless\n kube_statefulset_status_update_revision{job=\"kube-state-metrics\"}\n)\n *\n(\n kube_statefulset_replicas{job=\"kube-state-metrics\"}\n !=\n kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\"}\n)\n", + "expr": "(\n max without (revision) (\n 
kube_statefulset_status_current_revision{job=\"kube-state-metrics\"}\n unless\n kube_statefulset_status_update_revision{job=\"kube-state-metrics\"}\n )\n *\n (\n kube_statefulset_replicas{job=\"kube-state-metrics\"}\n !=\n kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\"}\n )\n) and (\n changes(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\"}[5m])\n ==\n 0\n)\n", "for": "15m", "labels": { - "severity": "critical" + "severity": "warning" } }, { "alert": "KubeDaemonSetRolloutStuck", "annotations": { - "message": "Only {{ $value | humanizePercentage }} of the desired Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are scheduled and ready.", + "message": "DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes.", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck" }, - "expr": "kube_daemonset_status_number_ready{job=\"kube-state-metrics\"}\n /\nkube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"} < 1.00\n", + "expr": "(\n (\n kube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n ) or (\n kube_daemonset_status_number_misscheduled{job=\"kube-state-metrics\"}\n !=\n 0\n ) or (\n kube_daemonset_updated_number_scheduled{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n ) or (\n kube_daemonset_status_number_available{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n )\n) and (\n changes(kube_daemonset_updated_number_scheduled{job=\"kube-state-metrics\"}[5m])\n ==\n 0\n)\n", "for": "15m", "labels": { - "severity": "critical" + "severity": "warning" } }, { @@ -585,19 +769,7 @@ data: "runbook_url": 
"https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled" }, "expr": "kube_daemonset_status_number_misscheduled{job=\"kube-state-metrics\"} > 0\n", - "for": "10m", - "labels": { - "severity": "warning" - } - }, - { - "alert": "KubeCronJobRunning", - "annotations": { - "message": "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning" - }, - "expr": "time() - kube_cronjob_next_schedule_time{job=\"kube-state-metrics\"} > 3600\n", - "for": "1h", + "for": "15m", "labels": { "severity": "warning" } @@ -605,11 +777,11 @@ data: { "alert": "KubeJobCompletion", "annotations": { - "message": "Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than one hour to complete.", + "message": "Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete.", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion" }, "expr": "kube_job_spec_completions{job=\"kube-state-metrics\"} - kube_job_status_succeeded{job=\"kube-state-metrics\"} > 0\n", - "for": "1h", + "for": "12h", "labels": { "severity": "warning" } @@ -668,10 +840,10 @@ data: } }, { - "alert": "KubeMemOvercommit", + "alert": "KubeMemoryOvercommit", "annotations": { "message": "Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit" + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit" }, "expr": "sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{})\n /\nsum(kube_node_status_allocatable_memory_bytes)\n 
>\n(count(kube_node_status_allocatable_memory_bytes)-1)\n /\ncount(kube_node_status_allocatable_memory_bytes)\n", "for": "5m", @@ -680,10 +852,10 @@ data: } }, { - "alert": "KubeCPUOvercommit", + "alert": "KubeCPUQuotaOvercommit", "annotations": { "message": "Cluster has overcommitted CPU resource requests for Namespaces.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit" + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuquotaovercommit" }, "expr": "sum(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=\"cpu\"})\n /\nsum(kube_node_status_allocatable_cpu_cores)\n > 1.5\n", "for": "5m", @@ -692,10 +864,10 @@ data: } }, { - "alert": "KubeMemOvercommit", + "alert": "KubeMemoryQuotaOvercommit", "annotations": { "message": "Cluster has overcommitted memory resource requests for Namespaces.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit" + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryquotaovercommit" }, "expr": "sum(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=\"memory\"})\n /\nsum(kube_node_status_allocatable_memory_bytes{job=\"node-exporter\"})\n > 1.5\n", "for": "5m", @@ -704,12 +876,38 @@ data: } }, { - "alert": "KubeQuotaExceeded", + "alert": "KubeQuotaAlmostFull", + "annotations": { + "description": "Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaalmostfull", + "summary": "Namespace quota is going to be full." 
+ }, + "expr": "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"}\n / ignoring(instance, job, type)\n(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} > 0)\n > 0.9 < 1\n", + "for": "15m", + "labels": { + "severity": "info" + } + }, + { + "alert": "KubeQuotaFullyUsed", "annotations": { "message": "Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded" + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotafullyused" }, - "expr": "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"}\n / ignoring(instance, job, type)\n(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} > 0)\n > 0.90\n", + "expr": "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"}\n / ignoring(instance, job, type)\n(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} > 0)\n == 1\n", + "for": "15m", + "labels": { + "severity": "info" + } + }, + { + "alert": "KubeQuotaExceeded", + "annotations": { + "description": "Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded", + "summary": "Namespace quota has exceeded the limits." 
+ }, + "expr": "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"}\n / ignoring(instance, job, type)\n(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} > 0)\n > 1\n", "for": "15m", "labels": { "severity": "warning" @@ -721,10 +919,10 @@ data: "message": "{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh" }, - "expr": "sum(increase(container_cpu_cfs_throttled_periods_total{container!=\"\", }[5m])) by (container, pod, namespace)\n /\nsum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace)\n > ( 100 / 100 )\n", + "expr": "sum(increase(container_cpu_cfs_throttled_periods_total{container!=\"\", }[5m])) by (container, pod, namespace)\n /\nsum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace)\n > ( 80 / 100 )\n", "for": "15m", "labels": { - "severity": "warning" + "severity": "info" } } ] @@ -733,10 +931,10 @@ data: "name": "kubernetes-storage", "rules": [ { - "alert": "KubePersistentVolumeUsageCritical", + "alert": "KubePersistentVolumeFillingUp", "annotations": { "message": "The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical" + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup" }, "expr": "kubelet_volume_stats_available_bytes{job=\"kubelet\"}\n /\nkubelet_volume_stats_capacity_bytes{job=\"kubelet\"}\n < 0.03\n", "for": "1m", @@ -745,15 +943,15 @@ data: } }, { - "alert": "KubePersistentVolumeFullInFourDays", + "alert": 
"KubePersistentVolumeFillingUp", "annotations": { "message": "Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays" + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup" }, "expr": "(\n kubelet_volume_stats_available_bytes{job=\"kubelet\"}\n /\n kubelet_volume_stats_capacity_bytes{job=\"kubelet\"}\n) < 0.15\nand\npredict_linear(kubelet_volume_stats_available_bytes{job=\"kubelet\"}[6h], 4 * 24 * 3600) < 0\n", "for": "1h", "labels": { - "severity": "critical" + "severity": "warning" } }, { @@ -779,7 +977,7 @@ data: "message": "There are {{ $value }} different semantic versions of Kubernetes components running.", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch" }, - "expr": "count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~\"kube-dns|coredns\"},\"gitVersion\",\"$1\",\"gitVersion\",\"(v[0-9]*.[0-9]*.[0-9]*).*\"))) > 1\n", + "expr": "count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~\"kube-dns|coredns\"},\"gitVersion\",\"$1\",\"gitVersion\",\"(v[0-9]*.[0-9]*).*\"))) > 1\n", "for": "15m", "labels": { "severity": "warning" @@ -800,30 +998,62 @@ data: ] }, { - "name": "kube-apiserver-error-alerts", + "name": "kube-apiserver-slos", "rules": [ { - "alert": "ErrorBudgetBurn", + "alert": "KubeAPIErrorBudgetBurn", "annotations": { - "message": "High requests error budget burn for job=apiserver (current value: {{ $value }})", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-errorbudgetburn" + 
"message": "The API server is burning too much error budget", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn" }, - "expr": "(\n status_class_5xx:apiserver_request_total:ratio_rate1h{job=\"apiserver\"} > (14.4*0.010000)\n and\n status_class_5xx:apiserver_request_total:ratio_rate5m{job=\"apiserver\"} > (14.4*0.010000)\n)\nor\n(\n status_class_5xx:apiserver_request_total:ratio_rate6h{job=\"apiserver\"} > (6*0.010000)\n and\n status_class_5xx:apiserver_request_total:ratio_rate30m{job=\"apiserver\"} > (6*0.010000)\n)\n", + "expr": "sum(apiserver_request:burnrate1h) > (14.40 * 0.01000)\nand\nsum(apiserver_request:burnrate5m) > (14.40 * 0.01000)\n", + "for": "2m", "labels": { - "job": "apiserver", - "severity": "critical" + "long": "1h", + "severity": "critical", + "short": "5m" } }, { - "alert": "ErrorBudgetBurn", + "alert": "KubeAPIErrorBudgetBurn", "annotations": { - "message": "High requests error budget burn for job=apiserver (current value: {{ $value }})", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-errorbudgetburn" + "message": "The API server is burning too much error budget", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn" }, - "expr": "(\n status_class_5xx:apiserver_request_total:ratio_rate1d{job=\"apiserver\"} > (3*0.010000)\n and\n status_class_5xx:apiserver_request_total:ratio_rate2h{job=\"apiserver\"} > (3*0.010000)\n)\nor\n(\n status_class_5xx:apiserver_request_total:ratio_rate3d{job=\"apiserver\"} > (0.010000)\n and\n status_class_5xx:apiserver_request_total:ratio_rate6h{job=\"apiserver\"} > (0.010000)\n)\n", + "expr": "sum(apiserver_request:burnrate6h) > (6.00 * 0.01000)\nand\nsum(apiserver_request:burnrate30m) > (6.00 * 0.01000)\n", + "for": "15m", "labels": { - "job": "apiserver", - "severity": "warning" + "long": "6h", 
+ "severity": "critical", + "short": "30m" + } + }, + { + "alert": "KubeAPIErrorBudgetBurn", + "annotations": { + "message": "The API server is burning too much error budget", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn" + }, + "expr": "sum(apiserver_request:burnrate1d) > (3.00 * 0.01000)\nand\nsum(apiserver_request:burnrate2h) > (3.00 * 0.01000)\n", + "for": "1h", + "labels": { + "long": "1d", + "severity": "warning", + "short": "2h" + } + }, + { + "alert": "KubeAPIErrorBudgetBurn", + "annotations": { + "message": "The API server is burning too much error budget", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn" + }, + "expr": "sum(apiserver_request:burnrate3d) > (1.00 * 0.01000)\nand\nsum(apiserver_request:burnrate6h) > (1.00 * 0.01000)\n", + "for": "3h", + "labels": { + "long": "3d", + "severity": "warning", + "short": "6h" } } ] @@ -831,54 +1061,6 @@ data: { "name": "kubernetes-system-apiserver", "rules": [ - { - "alert": "KubeAPILatencyHigh", - "annotations": { - "message": "The API server has an abnormal latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh" - }, - "expr": "(\n cluster:apiserver_request_duration_seconds:mean5m{job=\"apiserver\"}\n >\n on (verb) group_left()\n (\n avg by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job=\"apiserver\"} >= 0)\n +\n 2*stddev by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job=\"apiserver\"} >= 0)\n )\n) > on (verb) group_left()\n1.2 * avg by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job=\"apiserver\"} >= 0)\nand on (verb,resource)\ncluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job=\"apiserver\",quantile=\"0.99\"}\n>\n1\n", - 
"for": "5m", - "labels": { - "severity": "warning" - } - }, - { - "alert": "KubeAPILatencyHigh", - "annotations": { - "message": "The API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh" - }, - "expr": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job=\"apiserver\",quantile=\"0.99\"} > 4\n", - "for": "10m", - "labels": { - "severity": "critical" - } - }, - { - "alert": "KubeAPIErrorsHigh", - "annotations": { - "message": "API server is returning errors for {{ $value | humanizePercentage }} of requests for {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh" - }, - "expr": "sum(rate(apiserver_request_total{job=\"apiserver\",code=~\"5..\"}[5m])) by (resource,subresource,verb)\n /\nsum(rate(apiserver_request_total{job=\"apiserver\"}[5m])) by (resource,subresource,verb) > 0.10\n", - "for": "10m", - "labels": { - "severity": "critical" - } - }, - { - "alert": "KubeAPIErrorsHigh", - "annotations": { - "message": "API server is returning errors for {{ $value | humanizePercentage }} of requests for {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh" - }, - "expr": "sum(rate(apiserver_request_total{job=\"apiserver\",code=~\"5..\"}[5m])) by (resource,subresource,verb)\n /\nsum(rate(apiserver_request_total{job=\"apiserver\"}[5m])) by (resource,subresource,verb) > 0.05\n", - "for": "10m", - "labels": { - "severity": "warning" - } - }, { "alert": "KubeClientCertificateExpiration", "annotations": { @@ -915,10 +1097,10 @@ data: { "alert": "AggregatedAPIDown", "annotations": { - 
"message": "An aggregated API {{ $labels.name }}/{{ $labels.namespace }} is down. It has not been available at least for the past five minutes.", + "message": "An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 5m.", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown" }, - "expr": "sum by(name, namespace)(sum_over_time(aggregator_unavailable_apiservice[5m])) > 0\n", + "expr": "(1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[5m]))) * 100 < 90\n", "for": "5m", "labels": { "severity": "warning" @@ -959,8 +1141,7 @@ data: "message": "{{ $labels.node }} is unreachable and some workloads may be rescheduled.", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeunreachable" }, - "expr": "kube_node_spec_taint{job=\"kube-state-metrics\",key=\"node.kubernetes.io/unreachable\",effect=\"NoSchedule\"} == 1\n", - "for": "2m", + "expr": "(kube_node_spec_taint{job=\"kube-state-metrics\",key=\"node.kubernetes.io/unreachable\",effect=\"NoSchedule\"} unless ignoring(key,value) kube_node_spec_taint{job=\"kube-state-metrics\",key=~\"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn\"}) == 1\n", "labels": { "severity": "warning" } @@ -971,7 +1152,7 @@ data: "message": "Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods" }, - "expr": "max(max(kubelet_running_pod_count{job=\"kubelet\"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job=\"kubelet\"}) by(node) / max(kube_node_status_capacity_pods{job=\"kube-state-metrics\"} != 1) by(node) > 0.95\n", + "expr": "count by(node) (\n 
(kube_pod_status_phase{job=\"kube-state-metrics\",phase=\"Running\"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job=\"kube-state-metrics\"})\n)\n/\nmax by(node) (\n kube_node_status_capacity_pods{job=\"kube-state-metrics\"} != 1\n) > 0.95\n", "for": "15m", "labels": { "severity": "warning" @@ -1007,7 +1188,7 @@ data: "message": "Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh" }, - "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job=\"kubelet\"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name > 60\n", + "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job=\"kubelet\"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job=\"kubelet\"} > 60\n", "for": "15m", "labels": { "severity": "warning" @@ -1350,14 +1531,25 @@ data: { "alert": "NodeHighNumberConntrackEntriesUsed", "annotations": { - "description": "{{ $value | humanizePercentage }} of conntrack entries are used", - "summary": "Number of conntrack are getting close to the limit" + "description": "{{ $value | humanizePercentage }} of conntrack entries are used.", + "summary": "Number of conntrack are getting close to the limit." }, "expr": "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75\n", "labels": { "severity": "warning" } }, + { + "alert": "NodeTextFileCollectorScrapeError", + "annotations": { + "description": "Node Exporter text file collector failed to scrape.", + "summary": "Node Exporter text file collector failed to scrape." 
+ }, + "expr": "node_textfile_scrape_error{job=\"node-exporter\"} == 1\n", + "labels": { + "severity": "warning" + } + }, { "alert": "NodeClockSkewDetected", "annotations": { @@ -1381,6 +1573,29 @@ data: "labels": { "severity": "warning" } + }, + { + "alert": "NodeRAIDDegraded", + "annotations": { + "description": "RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.", + "summary": "RAID Array is degraded" + }, + "expr": "node_md_disks_required - ignoring (state) (node_md_disks{state=\"active\"}) > 0\n", + "for": "15m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "NodeRAIDDiskFailure", + "annotations": { + "description": "At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.", + "summary": "Failed device in RAID array" + }, + "expr": "node_md_disks{state=\"fail\"} > 0\n", + "labels": { + "severity": "warning" + } } ] } @@ -1515,7 +1730,7 @@ data: { "alert": "PrometheusRemoteStorageFailures", "annotations": { - "description": "Prometheus {{$labels.instance}} failed to send {{ printf \"%.1f\" $value }}% of the samples to {{ if $labels.queue }}{{ $labels.queue }}{{ else }}{{ $labels.url }}{{ end }}.", + "description": "Prometheus {{$labels.instance}} failed to send {{ printf \"%.1f\" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}", "summary": "Prometheus fails to send samples to remote storage." 
}, "expr": "(\n rate(prometheus_remote_storage_failed_samples_total{job=\"prometheus\"}[5m])\n/\n (\n rate(prometheus_remote_storage_failed_samples_total{job=\"prometheus\"}[5m])\n +\n rate(prometheus_remote_storage_succeeded_samples_total{job=\"prometheus\"}[5m])\n )\n)\n* 100\n> 1\n", @@ -1527,7 +1742,7 @@ data: { "alert": "PrometheusRemoteWriteBehind", "annotations": { - "description": "Prometheus {{$labels.instance}} remote write is {{ printf \"%.1f\" $value }}s behind for {{ if $labels.queue }}{{ $labels.queue }}{{ else }}{{ $labels.url }}{{ end }}.", + "description": "Prometheus {{$labels.instance}} remote write is {{ printf \"%.1f\" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.", "summary": "Prometheus remote write is behind." }, "expr": "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n(\n max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job=\"prometheus\"}[5m])\n- on(job, instance) group_right\n max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job=\"prometheus\"}[5m])\n)\n> 120\n", @@ -1539,7 +1754,7 @@ data: { "alert": "PrometheusRemoteWriteDesiredShards", "annotations": { - "description": "Prometheus {{$labels.instance}} remote write desired shards calculation wants to run {{ $value }} shards, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance=\"%s\",job=\"prometheus\"}` $labels.instance | query | first | value }}.", + "description": "Prometheus {{$labels.instance}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance=\"%s\",job=\"prometheus\"}` $labels.instance | query | first | value }}.", "summary": "Prometheus remote write desired shards calculation wants to run more than 
configured max shards." }, "expr": "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n(\n max_over_time(prometheus_remote_storage_shards_desired{job=\"prometheus\"}[5m])\n>\n max_over_time(prometheus_remote_storage_shards_max{job=\"prometheus\"}[5m])\n)\n", @@ -1571,6 +1786,18 @@ data: "labels": { "severity": "warning" } + }, + { + "alert": "PrometheusTargetLimitHit", + "annotations": { + "description": "Prometheus {{$labels.instance}} has dropped {{ printf \"%.0f\" $value }} targets because the number of targets exceeded the configured target_limit.", + "summary": "Prometheus has dropped targets because some scrape configs have exceeded the targets limit." + }, + "expr": "increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job=\"prometheus\"}[5m]) > 0\n", + "for": "15m", + "labels": { + "severity": "warning" + } } ] }