Configure Prometheus to scrape Kubelets directly
* Use Kubelet bearer token authn/authz to scrape metrics
* Drop RBAC permission from nodes/proxy to nodes/metrics
* Stop proxying kubelet scrapes through the apiserver, since proxying requires the higher-privilege nodes/proxy permission and can add load to the apiserver on large clusters
commit c2b719dc75 (parent 37981f9fb1)
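For reference, here is the kubelet scrape job as it looks after this change, assembled from the unchanged and added lines in the Prometheus config diff below; indentation and the surrounding `scrape_configs` context are approximate:

```yaml
# Sketch of the post-change kubelet job, pieced together from the diff below.
# Prometheus discovers nodes and scrapes each Kubelet's HTTPS port directly,
# authenticating with the pod's service account token.
- job_name: 'kubelet'
  kubernetes_sd_configs:
  - role: node

  scheme: https
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    # Kubelet certs don't have any fixed IP SANs
    insecure_skip_verify: true
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token

  relabel_configs:
  - action: labelmap
    regex: __meta_kubernetes_node_label_(.+)
```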
@@ -6,7 +6,7 @@ Notable changes between versions.
 * Update etcd from v3.3.4 to v3.3.5 ([#213](https://github.com/poseidon/typhoon/pull/213))
 * Require Terraform v0.11.x and drop support for v0.10.x ([migration guide](https://typhoon.psdn.io/topics/maintenance/#terraform-v011x))
-* Allow bearer token authentication to the Kubelet ([#216](https://github.com/poseidon/typhoon/issues/215))
+* Allow bearer token authentication to the Kubelet ([#216](https://github.com/poseidon/typhoon/issues/216))
 * Require Webhook authorization to the Kubelet
 * Switch apiserver X509 client cert org to satisfy new authorization requirement

@@ -22,6 +22,9 @@ Notable changes between versions.
 #### Addons

 * Fix Prometheus data directory location ([#203](https://github.com/poseidon/typhoon/pull/203))
+* Configure Prometheus to scrape Kubelets directly with bearer token auth instead of proxying through the apiserver ([#217](https://github.com/poseidon/typhoon/pull/217))
+  * Security improvement: Drop RBAC permission from `nodes/proxy` to `nodes/metrics`
+  * Scale: Remove per-node proxied scrape load from the apiserver
 * Update Grafana from v5.04 to v5.1.2 ([#208](https://github.com/poseidon/typhoon/pull/208))
 * Disable Grafana Google Analytics by default ([#214](https://github.com/poseidon/typhoon/issues/214))

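The "Allow bearer token authentication to the Kubelet" and "Require Webhook authorization to the Kubelet" entries describe the Kubelet-side prerequisites for direct scraping. As a rough illustration only (this commit may set equivalent Kubelet flags instead), the corresponding KubeletConfiguration fields look roughly like the following; the CA file path is an assumption:

```yaml
# Hypothetical KubeletConfiguration fragment; values are illustrative, not
# copied from this commit.
apiVersion: kubelet.config.k8s.io/v1beta1
kind: KubeletConfiguration
authentication:
  anonymous:
    enabled: false             # reject unauthenticated requests
  webhook:
    enabled: true              # verify bearer tokens via the TokenReview API
  x509:
    clientCAFile: /etc/kubernetes/ca.crt   # assumed CA path
authorization:
  mode: Webhook                # authorize requests via the SubjectAccessReview API
```

With Webhook authorization, a GET of the Kubelet's /metrics endpoint is authorized as a `get` on the `nodes/metrics` subresource, which is why the Prometheus RBAC rule below narrows from `nodes/proxy` to `nodes/metrics`.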
@@ -56,12 +56,7 @@ data:
     target_label: job

 # Scrape config for node (i.e. kubelet) /metrics (e.g. 'kubelet_'). Explore
-# metrics from a node by scraping kubelet (127.0.0.1:10255/metrics).
-#
-# Rather than connecting directly to the node, the scrape is proxied though the
-# Kubernetes apiserver. This means it will work if Prometheus is running out of
-# cluster, or can't connect to nodes for some other reason (e.g. because of
-# firewalling).
+# metrics from a node by scraping kubelet (127.0.0.1:10250/metrics).
 - job_name: 'kubelet'
   kubernetes_sd_configs:
   - role: node

@@ -69,50 +64,34 @@ data:
   scheme: https
   tls_config:
     ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+    # Kubelet certs don't have any fixed IP SANs
+    insecure_skip_verify: true
   bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token

   relabel_configs:
   - action: labelmap
     regex: __meta_kubernetes_node_label_(.+)
-  - target_label: __address__
-    replacement: kubernetes.default.svc:443
-  - source_labels: [__meta_kubernetes_node_name]
-    regex: (.+)
-    target_label: __metrics_path__
-    replacement: /api/v1/nodes/${1}/proxy/metrics

 # Scrape config for Kubelet cAdvisor. Explore metrics from a node by
-# scraping kubelet (127.0.0.1:10255/metrics/cadvisor).
-#
-# This is required for Kubernetes 1.7.3 and later, where cAdvisor metrics
-# (those whose names begin with 'container_') have been removed from the
-# Kubelet metrics endpoint. This job scrapes the cAdvisor endpoint to
-# retrieve those metrics.
-#
-# Rather than connecting directly to the node, the scrape is proxied though the
-# Kubernetes apiserver. This means it will work if Prometheus is running out of
-# cluster, or can't connect to nodes for some other reason (e.g. because of
-# firewalling).
+# scraping kubelet (127.0.0.1:10250/metrics/cadvisor).
 - job_name: 'kubernetes-cadvisor'
   kubernetes_sd_configs:
   - role: node

   scheme: https
+  metrics_path: /metrics/cadvisor
   tls_config:
+    # Kubelet certs don't have any fixed IP SANs
+    insecure_skip_verify: true
     ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
   bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token

   relabel_configs:
   - action: labelmap
     regex: __meta_kubernetes_node_label_(.+)
-  - target_label: __address__
-    replacement: kubernetes.default.svc:443
-  - source_labels: [__meta_kubernetes_node_name]
-    regex: (.+)
-    target_label: __metrics_path__
-    replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor

-# Scrap etcd metrics from controllers
+# Scrap etcd metrics from controllers via listen-metrics-urls
 - job_name: 'etcd'
   kubernetes_sd_configs:
   - role: node

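Assembled from the unchanged and added lines above, the cAdvisor job after this change differs from the kubelet job only by its metrics path; the `__metrics_path__` relabel to `/api/v1/nodes/${1}/proxy/metrics/cadvisor` on the apiserver is replaced by a plain `metrics_path` on the Kubelet itself (indentation approximate):

```yaml
# Sketch of the post-change kubernetes-cadvisor job, pieced together from the
# diff above.
- job_name: 'kubernetes-cadvisor'
  kubernetes_sd_configs:
  - role: node

  scheme: https
  metrics_path: /metrics/cadvisor
  tls_config:
    # Kubelet certs don't have any fixed IP SANs
    insecure_skip_verify: true
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token

  relabel_configs:
  - action: labelmap
    regex: __meta_kubernetes_node_label_(.+)
```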
@@ -6,7 +6,7 @@ rules:
 - apiGroups: [""]
   resources:
   - nodes
-  - nodes/proxy
+  - nodes/metrics
   - services
   - endpoints
   - pods

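After this hunk, the Prometheus ClusterRole grants the narrower `nodes/metrics` subresource instead of the broad `nodes/proxy`. Reassembled from the lines above (verbs and any other rules sit outside this hunk), the rule reads:

```yaml
# Post-change resources list, taken from the hunk above.
rules:
- apiGroups: [""]
  resources:
  - nodes
  - nodes/metrics
  - services
  - endpoints
  - pods
```

The grant can be checked with `kubectl auth can-i get nodes/metrics --as=system:serviceaccount:<namespace>:<serviceaccount>`, substituting the namespace and service account the Prometheus deployment actually uses.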
@@ -91,6 +91,16 @@ resource "aws_security_group_rule" "controller-node-exporter" {
   source_security_group_id = "${aws_security_group.worker.id}"
 }

+resource "aws_security_group_rule" "controller-kubelet" {
+  security_group_id = "${aws_security_group.controller.id}"
+
+  type                     = "ingress"
+  protocol                 = "tcp"
+  from_port                = 10250
+  to_port                  = 10250
+  source_security_group_id = "${aws_security_group.worker.id}"
+}
+
 resource "aws_security_group_rule" "controller-kubelet-self" {
   security_group_id = "${aws_security_group.controller.id}"

@@ -91,6 +91,16 @@ resource "aws_security_group_rule" "controller-node-exporter" {
   source_security_group_id = "${aws_security_group.worker.id}"
 }

+resource "aws_security_group_rule" "controller-kubelet" {
+  security_group_id = "${aws_security_group.controller.id}"
+
+  type                     = "ingress"
+  protocol                 = "tcp"
+  from_port                = 10250
+  to_port                  = 10250
+  source_security_group_id = "${aws_security_group.worker.id}"
+}
+
 resource "aws_security_group_rule" "controller-kubelet-self" {
   security_group_id = "${aws_security_group.controller.id}"

@@ -121,7 +121,7 @@ resource "google_compute_firewall" "internal-node-exporter" {
   target_tags = ["${var.cluster_name}-controller", "${var.cluster_name}-worker"]
 }

-# kubelet API to allow kubectl exec and log
+# kubelet API to allow apiserver exec and log or metrics scraping
 resource "google_compute_firewall" "internal-kubelet" {
   name    = "${var.cluster_name}-internal-kubelet"
   network = "${google_compute_network.network.name}"
@@ -131,7 +131,7 @@ resource "google_compute_firewall" "internal-kubelet" {
     ports    = [10250]
   }

-  source_tags = ["${var.cluster_name}-controller"]
+  source_tags = ["${var.cluster_name}-controller", "${var.cluster_name}-worker"]
   target_tags = ["${var.cluster_name}-controller", "${var.cluster_name}-worker"]
 }

@@ -121,7 +121,7 @@ resource "google_compute_firewall" "internal-node-exporter" {
   target_tags = ["${var.cluster_name}-controller", "${var.cluster_name}-worker"]
 }

-# kubelet API to allow kubectl exec and log
+# kubelet API to allow apiserver exec and log or metrics scraping
 resource "google_compute_firewall" "internal-kubelet" {
   name    = "${var.cluster_name}-internal-kubelet"
   network = "${google_compute_network.network.name}"
@@ -131,7 +131,7 @@ resource "google_compute_firewall" "internal-kubelet" {
     ports    = [10250]
   }

-  source_tags = ["${var.cluster_name}-controller"]
+  source_tags = ["${var.cluster_name}-controller", "${var.cluster_name}-worker"]
   target_tags = ["${var.cluster_name}-controller", "${var.cluster_name}-worker"]
 }