Configure Prometheus to scrape Kubelets directly

* Use Kubelet bearer token authn/authz to scrape metrics
* Drop RBAC permission from nodes/proxy to nodes/metrics
* Stop proxying kubelet scrapes through the apiserver, since
  proxying requires higher privilege (nodes/proxy) and can add
  load to the apiserver on large clusters
Dalton Hubble 2018-05-13 23:49:45 -07:00
parent 37981f9fb1
commit c2b719dc75
7 changed files with 40 additions and 38 deletions
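
Direct scraping only works because the Kubelet authenticates ServiceAccount bearer tokens against the apiserver and delegates authorization to it, so Prometheus needs only `get` on `nodes/metrics` to read `/metrics`. Typhoon configures the Kubelet via flags; the block below is a rough, non-authoritative sketch of the equivalent settings expressed as a KubeletConfiguration, with the CA path as a placeholder:

```yaml
# Sketch only: roughly the Kubelet authn/authz settings this change relies on.
# Typhoon sets these via kubelet flags; the CA path here is a placeholder.
apiVersion: kubelet.config.k8s.io/v1beta1
kind: KubeletConfiguration
authentication:
  x509:
    clientCAFile: /etc/kubernetes/ca.crt  # verify client certificates (e.g. the apiserver's)
  webhook:
    enabled: true                         # validate bearer tokens via the TokenReview API
authorization:
  mode: Webhook                           # delegate authorization (e.g. nodes/metrics) via SubjectAccessReview
```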

View File

@@ -6,7 +6,7 @@ Notable changes between versions.
 * Update etcd from v3.3.4 to v3.3.5 ([#213](https://github.com/poseidon/typhoon/pull/213))
 * Require Terraform v0.11.x and drop support for v0.10.x ([migration guide](https://typhoon.psdn.io/topics/maintenance/#terraform-v011x))
-* Allow bearer token authentication to the Kubelet ([#216](https://github.com/poseidon/typhoon/issues/215))
+* Allow bearer token authentication to the Kubelet ([#216](https://github.com/poseidon/typhoon/issues/216))
 * Require Webhook authorization to the Kubelet
 * Switch apiserver X509 client cert org to satisfy new authorization requirement
@@ -22,6 +22,9 @@ Notable changes between versions.
 #### Addons
 * Fix Prometheus data directory location ([#203](https://github.com/poseidon/typhoon/pull/203))
+* Configure Prometheus to scrape Kubelets directly with bearer token auth instead of proxying through the apiserver ([#217](https://github.com/poseidon/typhoon/pull/217))
+  * Security improvement: Drop RBAC permission from `nodes/proxy` to `nodes/metrics`
+  * Scale: Remove per-node proxied scrape load from the apiserver
 * Update Grafana from v5.04 to v5.1.2 ([#208](https://github.com/poseidon/typhoon/pull/208))
 * Disable Grafana Google Analytics by default ([#214](https://github.com/poseidon/typhoon/issues/214))

View File

@@ -56,12 +56,7 @@ data:
     target_label: job
 # Scrape config for node (i.e. kubelet) /metrics (e.g. 'kubelet_'). Explore
-# metrics from a node by scraping kubelet (127.0.0.1:10255/metrics).
-#
-# Rather than connecting directly to the node, the scrape is proxied though the
-# Kubernetes apiserver. This means it will work if Prometheus is running out of
-# cluster, or can't connect to nodes for some other reason (e.g. because of
-# firewalling).
+# metrics from a node by scraping kubelet (127.0.0.1:10250/metrics).
 - job_name: 'kubelet'
   kubernetes_sd_configs:
   - role: node
@@ -69,50 +64,34 @@ data:
   scheme: https
   tls_config:
     ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+    # Kubelet certs don't have any fixed IP SANs
+    insecure_skip_verify: true
   bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
   relabel_configs:
   - action: labelmap
     regex: __meta_kubernetes_node_label_(.+)
-  - target_label: __address__
-    replacement: kubernetes.default.svc:443
-  - source_labels: [__meta_kubernetes_node_name]
-    regex: (.+)
-    target_label: __metrics_path__
-    replacement: /api/v1/nodes/${1}/proxy/metrics
 # Scrape config for Kubelet cAdvisor. Explore metrics from a node by
-# scraping kubelet (127.0.0.1:10255/metrics/cadvisor).
-#
-# This is required for Kubernetes 1.7.3 and later, where cAdvisor metrics
-# (those whose names begin with 'container_') have been removed from the
-# Kubelet metrics endpoint. This job scrapes the cAdvisor endpoint to
-# retrieve those metrics.
-#
-# Rather than connecting directly to the node, the scrape is proxied though the
-# Kubernetes apiserver. This means it will work if Prometheus is running out of
-# cluster, or can't connect to nodes for some other reason (e.g. because of
-# firewalling).
+# scraping kubelet (127.0.0.1:10250/metrics/cadvisor).
 - job_name: 'kubernetes-cadvisor'
   kubernetes_sd_configs:
   - role: node
   scheme: https
+  metrics_path: /metrics/cadvisor
   tls_config:
+    # Kubelet certs don't have any fixed IP SANs
+    insecure_skip_verify: true
     ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
   bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
   relabel_configs:
   - action: labelmap
     regex: __meta_kubernetes_node_label_(.+)
-  - target_label: __address__
-    replacement: kubernetes.default.svc:443
-  - source_labels: [__meta_kubernetes_node_name]
-    regex: (.+)
-    target_label: __metrics_path__
-    replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
-# Scrap etcd metrics from controllers
+# Scrap etcd metrics from controllers via listen-metrics-urls
 - job_name: 'etcd'
   kubernetes_sd_configs:
   - role: node
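
Reassembled from the additions above (indentation approximated), the resulting 'kubelet' job can rely on the node role's defaults: the target address is the Kubelet's port (10250) and the path defaults to /metrics, so no __address__ or __metrics_path__ relabeling is needed:

```yaml
# Resulting 'kubelet' scrape job, reassembled from the diff above (indentation approximated).
- job_name: 'kubelet'
  kubernetes_sd_configs:
  - role: node
  scheme: https
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    # Kubelet certs don't have any fixed IP SANs
    insecure_skip_verify: true
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  relabel_configs:
  - action: labelmap
    regex: __meta_kubernetes_node_label_(.+)
```

The 'kubernetes-cadvisor' job is identical apart from `metrics_path: /metrics/cadvisor`.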

View File

@@ -6,7 +6,7 @@ rules:
 - apiGroups: [""]
   resources:
   - nodes
-  - nodes/proxy
+  - nodes/metrics
   - services
   - endpoints
   - pods
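
Read access is all Prometheus needs here; a sketch of the full ClusterRole rule after this change is shown below. The verbs are the conventional read-only set, assumed rather than taken from the diff (the Kubelet's Webhook authorization checks `get` on `nodes/metrics`):

```yaml
# Sketch of the Prometheus ClusterRole rule after the change; verbs assumed.
rules:
- apiGroups: [""]
  resources:
  - nodes
  - nodes/metrics
  - services
  - endpoints
  - pods
  verbs: ["get", "list", "watch"]
```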

View File

@@ -91,6 +91,16 @@ resource "aws_security_group_rule" "controller-node-exporter" {
   source_security_group_id = "${aws_security_group.worker.id}"
 }
 
+resource "aws_security_group_rule" "controller-kubelet" {
+  security_group_id = "${aws_security_group.controller.id}"
+
+  type = "ingress"
+  protocol = "tcp"
+  from_port = 10250
+  to_port = 10250
+  source_security_group_id = "${aws_security_group.worker.id}"
+}
+
 resource "aws_security_group_rule" "controller-kubelet-self" {
   security_group_id = "${aws_security_group.controller.id}"

View File

@@ -91,6 +91,16 @@ resource "aws_security_group_rule" "controller-node-exporter" {
   source_security_group_id = "${aws_security_group.worker.id}"
 }
 
+resource "aws_security_group_rule" "controller-kubelet" {
+  security_group_id = "${aws_security_group.controller.id}"
+
+  type = "ingress"
+  protocol = "tcp"
+  from_port = 10250
+  to_port = 10250
+  source_security_group_id = "${aws_security_group.worker.id}"
+}
+
 resource "aws_security_group_rule" "controller-kubelet-self" {
   security_group_id = "${aws_security_group.controller.id}"

View File

@@ -121,7 +121,7 @@ resource "google_compute_firewall" "internal-node-exporter" {
   target_tags = ["${var.cluster_name}-controller", "${var.cluster_name}-worker"]
 }
 
-# kubelet API to allow kubectl exec and log
+# kubelet API to allow apiserver exec and log or metrics scraping
 resource "google_compute_firewall" "internal-kubelet" {
   name = "${var.cluster_name}-internal-kubelet"
   network = "${google_compute_network.network.name}"
@@ -131,7 +131,7 @@ resource "google_compute_firewall" "internal-kubelet" {
     ports = [10250]
   }
 
-  source_tags = ["${var.cluster_name}-controller"]
+  source_tags = ["${var.cluster_name}-controller", "${var.cluster_name}-worker"]
   target_tags = ["${var.cluster_name}-controller", "${var.cluster_name}-worker"]
 }

View File

@@ -121,7 +121,7 @@ resource "google_compute_firewall" "internal-node-exporter" {
   target_tags = ["${var.cluster_name}-controller", "${var.cluster_name}-worker"]
 }
 
-# kubelet API to allow kubectl exec and log
+# kubelet API to allow apiserver exec and log or metrics scraping
 resource "google_compute_firewall" "internal-kubelet" {
   name = "${var.cluster_name}-internal-kubelet"
   network = "${google_compute_network.network.name}"
@@ -131,7 +131,7 @@ resource "google_compute_firewall" "internal-kubelet" {
     ports = [10250]
   }
 
-  source_tags = ["${var.cluster_name}-controller"]
+  source_tags = ["${var.cluster_name}-controller", "${var.cluster_name}-worker"]
   target_tags = ["${var.cluster_name}-controller", "${var.cluster_name}-worker"]
 }