From c2b719dc753c17db45c094e4101689f9caac27fd Mon Sep 17 00:00:00 2001
From: Dalton Hubble
Date: Sun, 13 May 2018 23:49:45 -0700
Subject: [PATCH] Configure Prometheus to scrape Kubelets directly

* Use Kubelet bearer token authn/authz to scrape metrics
* Drop RBAC permission from nodes/proxy to nodes/metrics
* Stop proxying kubelet scrapes through the apiserver, since this
  required higher privilege (nodes/proxy) and can add load to the
  apiserver on large clusters
---
 CHANGES.md                                 |  5 ++-
 addons/prometheus/config.yaml              | 43 +++++--------------
 addons/prometheus/rbac/cluster-role.yaml   |  2 +-
 aws/container-linux/kubernetes/security.tf | 10 +++++
 aws/fedora-atomic/kubernetes/security.tf   | 10 +++++
 .../container-linux/kubernetes/network.tf  |  4 +-
 .../fedora-atomic/kubernetes/network.tf    |  4 +-
 7 files changed, 40 insertions(+), 38 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index 3298e9e5..fdbb64ec 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -6,7 +6,7 @@ Notable changes between versions.
 
 * Update etcd from v3.3.4 to v3.3.5 ([#213](https://github.com/poseidon/typhoon/pull/213))
 * Require Terraform v0.11.x and drop support for v0.10.x ([migration guide](https://typhoon.psdn.io/topics/maintenance/#terraform-v011x))
-* Allow bearer token authentication to the Kubelet ([#216](https://github.com/poseidon/typhoon/issues/215))
+* Allow bearer token authentication to the Kubelet ([#216](https://github.com/poseidon/typhoon/issues/216))
   * Require Webhook authorization to the Kubelet
   * Switch apiserver X509 client cert org to satisfy new authorization requirement
 
@@ -22,6 +22,9 @@ Notable changes between versions.
 
 #### Addons
 
 * Fix Prometheus data directory location ([#203](https://github.com/poseidon/typhoon/pull/203))
+* Configure Prometheus to scrape Kubelets directly with bearer token auth instead of proxying through the apiserver ([#217](https://github.com/poseidon/typhoon/pull/217))
+  * Security improvement: Drop RBAC permission from `nodes/proxy` to `nodes/metrics`
+  * Scale: Remove per-node proxied scrape load from the apiserver
 * Update Grafana from v5.04 to v5.1.2 ([#208](https://github.com/poseidon/typhoon/pull/208))
 * Disable Grafana Google Analytics by default ([#214](https://github.com/poseidon/typhoon/issues/214))
diff --git a/addons/prometheus/config.yaml b/addons/prometheus/config.yaml
index 9832d1fd..e910e804 100644
--- a/addons/prometheus/config.yaml
+++ b/addons/prometheus/config.yaml
@@ -56,12 +56,7 @@ data:
           target_label: job
 
       # Scrape config for node (i.e. kubelet) /metrics (e.g. 'kubelet_'). Explore
-      # metrics from a node by scraping kubelet (127.0.0.1:10255/metrics).
-      #
-      # Rather than connecting directly to the node, the scrape is proxied though the
-      # Kubernetes apiserver. This means it will work if Prometheus is running out of
-      # cluster, or can't connect to nodes for some other reason (e.g. because of
-      # firewalling).
+      # metrics from a node by scraping kubelet (127.0.0.1:10250/metrics).
       - job_name: 'kubelet'
         kubernetes_sd_configs:
         - role: node
@@ -69,50 +64,34 @@ data:
         scheme: https
         tls_config:
           ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+          # Kubelet certs don't have any fixed IP SANs
+          insecure_skip_verify: true
         bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
         relabel_configs:
         - action: labelmap
           regex: __meta_kubernetes_node_label_(.+)
-        - target_label: __address__
-          replacement: kubernetes.default.svc:443
-        - source_labels: [__meta_kubernetes_node_name]
-          regex: (.+)
-          target_label: __metrics_path__
-          replacement: /api/v1/nodes/${1}/proxy/metrics
 
       # Scrape config for Kubelet cAdvisor. Explore metrics from a node by
-      # scraping kubelet (127.0.0.1:10255/metrics/cadvisor).
-      #
-      # This is required for Kubernetes 1.7.3 and later, where cAdvisor metrics
-      # (those whose names begin with 'container_') have been removed from the
-      # Kubelet metrics endpoint. This job scrapes the cAdvisor endpoint to
-      # retrieve those metrics.
-      #
-      # Rather than connecting directly to the node, the scrape is proxied though the
-      # Kubernetes apiserver. This means it will work if Prometheus is running out of
-      # cluster, or can't connect to nodes for some other reason (e.g. because of
-      # firewalling).
+      # scraping kubelet (127.0.0.1:10250/metrics/cadvisor).
       - job_name: 'kubernetes-cadvisor'
         kubernetes_sd_configs:
         - role: node
-
+        scheme: https
+        metrics_path: /metrics/cadvisor
         tls_config:
+          # Kubelet certs don't have any fixed IP SANs
+          insecure_skip_verify: true
           ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
         bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
         relabel_configs:
         - action: labelmap
           regex: __meta_kubernetes_node_label_(.+)
-        - target_label: __address__
-          replacement: kubernetes.default.svc:443
-        - source_labels: [__meta_kubernetes_node_name]
-          regex: (.+)
-          target_label: __metrics_path__
-          replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
-
-      # Scrap etcd metrics from controllers
+
+
+      # Scrape etcd metrics from controllers via listen-metrics-urls
       - job_name: 'etcd'
         kubernetes_sd_configs:
         - role: node
diff --git a/addons/prometheus/rbac/cluster-role.yaml b/addons/prometheus/rbac/cluster-role.yaml
index 0390a92b..6f6cee0f 100644
--- a/addons/prometheus/rbac/cluster-role.yaml
+++ b/addons/prometheus/rbac/cluster-role.yaml
@@ -6,7 +6,7 @@ rules:
   - apiGroups: [""]
     resources:
       - nodes
-      - nodes/proxy
+      - nodes/metrics
       - services
       - endpoints
       - pods
diff --git a/aws/container-linux/kubernetes/security.tf b/aws/container-linux/kubernetes/security.tf
index 9c729c95..47186f7f 100644
--- a/aws/container-linux/kubernetes/security.tf
+++ b/aws/container-linux/kubernetes/security.tf
@@ -91,6 +91,16 @@ resource "aws_security_group_rule" "controller-node-exporter" {
   source_security_group_id = "${aws_security_group.worker.id}"
 }
 
+resource "aws_security_group_rule" "controller-kubelet" {
+  security_group_id = "${aws_security_group.controller.id}"
+
+  type                     = "ingress"
+  protocol                 = "tcp"
+  from_port                = 10250
+  to_port                  = 10250
+  source_security_group_id = "${aws_security_group.worker.id}"
+}
+
 resource "aws_security_group_rule" "controller-kubelet-self" {
   security_group_id = "${aws_security_group.controller.id}"
 
diff --git a/aws/fedora-atomic/kubernetes/security.tf b/aws/fedora-atomic/kubernetes/security.tf
index 9c729c95..47186f7f 100644
--- a/aws/fedora-atomic/kubernetes/security.tf
+++ b/aws/fedora-atomic/kubernetes/security.tf
@@ -91,6 +91,16 @@ resource "aws_security_group_rule" "controller-node-exporter" {
   source_security_group_id = "${aws_security_group.worker.id}"
 }
 
+resource "aws_security_group_rule" "controller-kubelet" {
+  security_group_id = "${aws_security_group.controller.id}"
+
+  type                     = "ingress"
+  protocol                 = "tcp"
+  from_port                = 10250
+  to_port                  = 10250
+  source_security_group_id = "${aws_security_group.worker.id}"
+}
+
 resource "aws_security_group_rule" "controller-kubelet-self" {
   security_group_id = "${aws_security_group.controller.id}"
 
diff --git a/google-cloud/container-linux/kubernetes/network.tf b/google-cloud/container-linux/kubernetes/network.tf
index 619faa8b..628b0fcc 100644
--- a/google-cloud/container-linux/kubernetes/network.tf
+++ b/google-cloud/container-linux/kubernetes/network.tf
@@ -121,7 +121,7 @@ resource "google_compute_firewall" "internal-node-exporter" {
   target_tags = ["${var.cluster_name}-controller", "${var.cluster_name}-worker"]
 }
 
-# kubelet API to allow kubectl exec and log
+# kubelet API to allow apiserver exec/logs and Prometheus metrics scraping
 resource "google_compute_firewall" "internal-kubelet" {
   name    = "${var.cluster_name}-internal-kubelet"
   network = "${google_compute_network.network.name}"
@@ -131,7 +131,7 @@ resource "google_compute_firewall" "internal-kubelet" {
     ports    = [10250]
   }
 
-  source_tags = ["${var.cluster_name}-controller"]
+  source_tags = ["${var.cluster_name}-controller", "${var.cluster_name}-worker"]
   target_tags = ["${var.cluster_name}-controller", "${var.cluster_name}-worker"]
 }
diff --git a/google-cloud/fedora-atomic/kubernetes/network.tf b/google-cloud/fedora-atomic/kubernetes/network.tf
index 619faa8b..628b0fcc 100644
--- a/google-cloud/fedora-atomic/kubernetes/network.tf
+++ b/google-cloud/fedora-atomic/kubernetes/network.tf
@@ -121,7 +121,7 @@ resource "google_compute_firewall" "internal-node-exporter" {
   target_tags = ["${var.cluster_name}-controller", "${var.cluster_name}-worker"]
 }
 
-# kubelet API to allow kubectl exec and log
+# kubelet API to allow apiserver exec/logs and Prometheus metrics scraping
 resource "google_compute_firewall" "internal-kubelet" {
   name    = "${var.cluster_name}-internal-kubelet"
   network = "${google_compute_network.network.name}"
@@ -131,7 +131,7 @@ resource "google_compute_firewall" "internal-kubelet" {
     ports    = [10250]
   }
 
-  source_tags = ["${var.cluster_name}-controller"]
+  source_tags = ["${var.cluster_name}-controller", "${var.cluster_name}-worker"]
   target_tags = ["${var.cluster_name}-controller", "${var.cluster_name}-worker"]
 }
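
Net result, for reference: after this patch the kubelet scrape job in addons/prometheus/config.yaml reads as below. This is assembled from the added and unchanged lines above; indentation is approximate since the job sits inside the ConfigMap's prometheus.yaml block.

      - job_name: 'kubelet'
        kubernetes_sd_configs:
        - role: node
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          # Kubelet certs don't have any fixed IP SANs
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
        - action: labelmap
          regex: __meta_kubernetes_node_label_(.+)

The kubernetes-cadvisor job is the same apart from metrics_path: /metrics/cadvisor. Prometheus authenticates to each kubelet's secure port (10250) with its service account bearer token; with Webhook authorization enabled on the Kubelet, a scrape of /metrics is authorized as a get on nodes/metrics, which the ClusterRole change above grants in place of nodes/proxy.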