From d770393dbc819c3f04a8b3d72b2e5e2b1e93b234 Mon Sep 17 00:00:00 2001 From: Dalton Hubble Date: Wed, 28 Mar 2018 21:45:24 -0700 Subject: [PATCH] Add etcd metrics, Prometheus scrapes, and Grafana dash * Use etcd v3.3 --listen-metrics-urls to expose only metrics data via http://0.0.0.0:2381 on controllers * Add Prometheus discovery for etcd peers on controller nodes * Temporarily drop two noisy Prometheus alerts --- CHANGES.md | 10 ++++++++++ addons/prometheus/config.yaml | 16 +++++++++++++++ addons/prometheus/deployment.yaml | 6 ++++++ addons/prometheus/rules.yaml | 20 ------------------- .../kubernetes/cl/controller.yaml.tmpl | 1 + aws/container-linux/kubernetes/security.tf | 20 +++++++++++++++++++ .../kubernetes/cl/controller.yaml.tmpl | 1 + .../kubernetes/cl/controller.yaml.tmpl | 1 + .../controllers/cl/controller.yaml.tmpl | 1 + .../container-linux/kubernetes/network.tf | 4 ++-- 10 files changed, 58 insertions(+), 22 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index fb3a77cb..22bbbe82 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -4,6 +4,16 @@ Notable changes between versions. 
## Latest +* Enable etcd v3.3 metrics endpoint ([#175](https://github.com/poseidon/typhoon/pull/175)) + +#### Addons + +* Add Prometheus discovery for etcd peers on controller nodes ([#175](https://github.com/poseidon/typhoon/pull/175)) + * Scrape etcd v3.3 `--listen-metrics-urls` for metrics + * Enable etcd alerts and populate the etcd Grafana dashboard + +## v1.10.0 + * Kubernetes [v1.10.0](https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG-1.10.md#v1100) * Remove unused, unmaintained `pxe-worker` internal module diff --git a/addons/prometheus/config.yaml b/addons/prometheus/config.yaml index ec04e772..9832d1fd 100644 --- a/addons/prometheus/config.yaml +++ b/addons/prometheus/config.yaml @@ -112,6 +112,22 @@ data: target_label: __metrics_path__ replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor + # Scrape etcd metrics from controllers + - job_name: 'etcd' + kubernetes_sd_configs: + - role: node + scheme: http + relabel_configs: + - source_labels: [__meta_kubernetes_node_label_node_role_kubernetes_io_controller] + action: keep + regex: 'true' + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - source_labels: [__meta_kubernetes_node_name] + action: replace + target_label: __address__ + replacement: '${1}:2381' + # Scrape config for service endpoints. 
# # The relabeling allows the actual service scrape endpoint to be configured diff --git a/addons/prometheus/deployment.yaml b/addons/prometheus/deployment.yaml index 82c6981d..f416de4a 100644 --- a/addons/prometheus/deployment.yaml +++ b/addons/prometheus/deployment.yaml @@ -15,6 +15,12 @@ spec: name: prometheus phase: prod spec: + nodeSelector: + node-role.kubernetes.io/master: "" + tolerations: + - key: node-role.kubernetes.io/master + operator: Exists + effect: NoSchedule serviceAccountName: prometheus containers: - name: prometheus diff --git a/addons/prometheus/rules.yaml b/addons/prometheus/rules.yaml index 4bdcde24..c76c00d2 100644 --- a/addons/prometheus/rules.yaml +++ b/addons/prometheus/rules.yaml @@ -63,26 +63,6 @@ data: description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour summary: a high number of leader changes within the etcd cluster are happening - - alert: HighNumberOfFailedGRPCRequests - expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method) - / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.01 - for: 10m - labels: - severity: warning - annotations: - description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed - on etcd instance {{ $labels.instance }}' - summary: a high number of gRPC requests are failing - - alert: HighNumberOfFailedGRPCRequests - expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method) - / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.05 - for: 5m - labels: - severity: critical - annotations: - description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed - on etcd instance {{ $labels.instance }}' - summary: a high number of gRPC requests are failing - alert: GRPCRequestsSlow expr: histogram_quantile(0.99, 
sum(rate(grpc_server_handling_seconds_bucket{job="etcd",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le)) > 0.15 diff --git a/aws/container-linux/kubernetes/cl/controller.yaml.tmpl b/aws/container-linux/kubernetes/cl/controller.yaml.tmpl index 52be6687..c5aad985 100644 --- a/aws/container-linux/kubernetes/cl/controller.yaml.tmpl +++ b/aws/container-linux/kubernetes/cl/controller.yaml.tmpl @@ -13,6 +13,7 @@ systemd: Environment="ETCD_INITIAL_ADVERTISE_PEER_URLS=https://${etcd_domain}:2380" Environment="ETCD_LISTEN_CLIENT_URLS=https://0.0.0.0:2379" Environment="ETCD_LISTEN_PEER_URLS=https://0.0.0.0:2380" + Environment="ETCD_LISTEN_METRICS_URLS=http://0.0.0.0:2381" Environment="ETCD_INITIAL_CLUSTER=${etcd_initial_cluster}" Environment="ETCD_STRICT_RECONFIG_CHECK=true" Environment="ETCD_SSL_DIR=/etc/ssl/etcd" diff --git a/aws/container-linux/kubernetes/security.tf b/aws/container-linux/kubernetes/security.tf index 8c71da6b..79fa1cc7 100644 --- a/aws/container-linux/kubernetes/security.tf +++ b/aws/container-linux/kubernetes/security.tf @@ -81,6 +81,16 @@ resource "aws_security_group_rule" "controller-node-exporter" { source_security_group_id = "${aws_security_group.worker.id}" } +resource "aws_security_group_rule" "controller-node-exporter-self" { + security_group_id = "${aws_security_group.controller.id}" + + type = "ingress" + protocol = "tcp" + from_port = 9100 + to_port = 9100 + self = true +} + resource "aws_security_group_rule" "controller-kubelet-self" { security_group_id = "${aws_security_group.controller.id}" @@ -256,6 +266,16 @@ resource "aws_security_group_rule" "worker-flannel-self" { resource "aws_security_group_rule" "worker-node-exporter" { security_group_id = "${aws_security_group.worker.id}" + type = "ingress" + protocol = "tcp" + from_port = 9100 + to_port = 9100 + source_security_group_id = "${aws_security_group.controller.id}" +} + +resource "aws_security_group_rule" "worker-node-exporter-self" { + security_group_id = 
"${aws_security_group.worker.id}" + type = "ingress" protocol = "tcp" from_port = 9100 diff --git a/bare-metal/container-linux/kubernetes/cl/controller.yaml.tmpl b/bare-metal/container-linux/kubernetes/cl/controller.yaml.tmpl index 56d77699..cbd53792 100644 --- a/bare-metal/container-linux/kubernetes/cl/controller.yaml.tmpl +++ b/bare-metal/container-linux/kubernetes/cl/controller.yaml.tmpl @@ -13,6 +13,7 @@ systemd: Environment="ETCD_INITIAL_ADVERTISE_PEER_URLS=https://${domain_name}:2380" Environment="ETCD_LISTEN_CLIENT_URLS=https://0.0.0.0:2379" Environment="ETCD_LISTEN_PEER_URLS=https://0.0.0.0:2380" + Environment="ETCD_LISTEN_METRICS_URLS=http://0.0.0.0:2381" Environment="ETCD_INITIAL_CLUSTER=${etcd_initial_cluster}" Environment="ETCD_STRICT_RECONFIG_CHECK=true" Environment="ETCD_SSL_DIR=/etc/ssl/etcd" diff --git a/digital-ocean/container-linux/kubernetes/cl/controller.yaml.tmpl b/digital-ocean/container-linux/kubernetes/cl/controller.yaml.tmpl index 5d90b83c..d98bba08 100644 --- a/digital-ocean/container-linux/kubernetes/cl/controller.yaml.tmpl +++ b/digital-ocean/container-linux/kubernetes/cl/controller.yaml.tmpl @@ -13,6 +13,7 @@ systemd: Environment="ETCD_INITIAL_ADVERTISE_PEER_URLS=https://${etcd_domain}:2380" Environment="ETCD_LISTEN_CLIENT_URLS=https://0.0.0.0:2379" Environment="ETCD_LISTEN_PEER_URLS=https://0.0.0.0:2380" + Environment="ETCD_LISTEN_METRICS_URLS=http://0.0.0.0:2381" Environment="ETCD_INITIAL_CLUSTER=${etcd_initial_cluster}" Environment="ETCD_STRICT_RECONFIG_CHECK=true" Environment="ETCD_SSL_DIR=/etc/ssl/etcd" diff --git a/google-cloud/container-linux/kubernetes/controllers/cl/controller.yaml.tmpl b/google-cloud/container-linux/kubernetes/controllers/cl/controller.yaml.tmpl index 0957943d..d0b4f867 100644 --- a/google-cloud/container-linux/kubernetes/controllers/cl/controller.yaml.tmpl +++ b/google-cloud/container-linux/kubernetes/controllers/cl/controller.yaml.tmpl @@ -13,6 +13,7 @@ systemd: 
Environment="ETCD_INITIAL_ADVERTISE_PEER_URLS=https://${etcd_domain}:2380" Environment="ETCD_LISTEN_CLIENT_URLS=https://0.0.0.0:2379" Environment="ETCD_LISTEN_PEER_URLS=https://0.0.0.0:2380" + Environment="ETCD_LISTEN_METRICS_URLS=http://0.0.0.0:2381" Environment="ETCD_INITIAL_CLUSTER=${etcd_initial_cluster}" Environment="ETCD_STRICT_RECONFIG_CHECK=true" Environment="ETCD_SSL_DIR=/etc/ssl/etcd" diff --git a/google-cloud/container-linux/kubernetes/network.tf b/google-cloud/container-linux/kubernetes/network.tf index 228b2b07..74b07ab3 100644 --- a/google-cloud/container-linux/kubernetes/network.tf +++ b/google-cloud/container-linux/kubernetes/network.tf @@ -93,7 +93,7 @@ resource "google_compute_firewall" "internal-flannel" { target_tags = ["${var.cluster_name}-controller", "${var.cluster_name}-worker"] } -# Allow prometheus (workload) to scrape node-exporter daemonset +# Allow Prometheus to scrape node-exporter daemonset resource "google_compute_firewall" "internal-node-exporter" { name = "${var.cluster_name}-internal-node-exporter" network = "${google_compute_network.network.name}" @@ -103,7 +103,7 @@ resource "google_compute_firewall" "internal-node-exporter" { ports = [9100] } - source_tags = ["${var.cluster_name}-worker"] + source_tags = ["${var.cluster_name}-controller", "${var.cluster_name}-worker"] target_tags = ["${var.cluster_name}-controller", "${var.cluster_name}-worker"] }