Add etcd metrics, Prometheus scrapes, and Grafana dash
* Use etcd v3.3 `--listen-metrics-urls` to expose only metrics data via http://0.0.0.0:2381 on controllers
* Add Prometheus discovery for etcd peers on controller nodes
* Temporarily drop two noisy Prometheus alerts
parent 642f7ec22f
commit d770393dbc
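In effect, each controller's etcd serves plaintext Prometheus metrics on a dedicated port (2381), separate from the TLS client port (2379), so Prometheus needs no etcd client certificates to scrape. A minimal static-config sketch of what the discovery-based job below resolves to, assuming a hypothetical controller node named `controller-0`:

```yaml
# Hypothetical static equivalent of the 'etcd' scrape job added below.
scrape_configs:
  - job_name: 'etcd'
    scheme: http                          # plaintext metrics listener
    static_configs:
      - targets: ['controller-0:2381']    # etcd --listen-metrics-urls port
```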
CHANGES.md | 10 ++++++++++
@@ -4,6 +4,16 @@ Notable changes between versions.

 ## Latest

+* Enable etcd v3.3 metrics endpoint ([#175](https://github.com/poseidon/typhoon/pull/175))
+
+#### Addons
+
+* Add Prometheus discovery for etcd peers on controller nodes ([#175](https://github.com/poseidon/typhoon/pull/175))
+* Scrape etcd v3.3 `--listen-metrics-urls` for metrics
+* Enable etcd alerts and populate the etcd Grafana dashboard
+
+## v1.10.0
+
 * Kubernetes [v1.10.0](https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG-1.10.md#v1100)
 * Remove unused, unmaintained `pxe-worker` internal module
@@ -112,6 +112,22 @@ data:
         target_label: __metrics_path__
         replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor

+    # Scrape etcd metrics from controllers
+    - job_name: 'etcd'
+      kubernetes_sd_configs:
+      - role: node
+      scheme: http
+      relabel_configs:
+      - source_labels: [__meta_kubernetes_node_label_node_role_kubernetes_io_controller]
+        action: keep
+        regex: 'true'
+      - action: labelmap
+        regex: __meta_kubernetes_node_label_(.+)
+      - source_labels: [__meta_kubernetes_node_name]
+        action: replace
+        target_label: __address__
+        replacement: '${1}:2381'
+
     # Scrape config for service endpoints.
     #
     # The relabeling allows the actual service scrape endpoint to be configured
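The relabel rules above keep only targets whose node carries the controller role label, copy Kubernetes node labels onto the target, and rewrite the scrape address to the node name plus the etcd metrics port. A hedged trace of a single discovered target (addresses and node name hypothetical):

```yaml
# Before relabeling (one target from node discovery):
#   __address__: "10.0.12.34:10250"        # default kubelet address
#   __meta_kubernetes_node_name: "ip-10-0-12-34"
#   __meta_kubernetes_node_label_node_role_kubernetes_io_controller: "true"
#
# After relabeling:
#   kept: controller label matched regex 'true'
#   __address__: "ip-10-0-12-34:2381"      # scraped as http://ip-10-0-12-34:2381/metrics
```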
@@ -15,6 +15,12 @@ spec:
         name: prometheus
         phase: prod
     spec:
+      nodeSelector:
+        node-role.kubernetes.io/master: ""
+      tolerations:
+        - key: node-role.kubernetes.io/master
+          operator: Exists
+          effect: NoSchedule
       serviceAccountName: prometheus
       containers:
         - name: prometheus
@@ -63,26 +63,6 @@ data:
           description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader
             changes within the last hour
           summary: a high number of leader changes within the etcd cluster are happening
-      - alert: HighNumberOfFailedGRPCRequests
-        expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
-          / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.01
-        for: 10m
-        labels:
-          severity: warning
-        annotations:
-          description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
-            on etcd instance {{ $labels.instance }}'
-          summary: a high number of gRPC requests are failing
-      - alert: HighNumberOfFailedGRPCRequests
-        expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
-          / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.05
-        for: 5m
-        labels:
-          severity: critical
-        annotations:
-          description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
-            on etcd instance {{ $labels.instance }}'
-          summary: a high number of gRPC requests are failing
       - alert: GRPCRequestsSlow
         expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le))
           > 0.15
@@ -13,6 +13,7 @@ systemd:
         Environment="ETCD_INITIAL_ADVERTISE_PEER_URLS=https://${etcd_domain}:2380"
         Environment="ETCD_LISTEN_CLIENT_URLS=https://0.0.0.0:2379"
         Environment="ETCD_LISTEN_PEER_URLS=https://0.0.0.0:2380"
+        Environment="ETCD_LISTEN_METRICS_URLS=http://0.0.0.0:2381"
         Environment="ETCD_INITIAL_CLUSTER=${etcd_initial_cluster}"
         Environment="ETCD_STRICT_RECONFIG_CHECK=true"
         Environment="ETCD_SSL_DIR=/etc/ssl/etcd"
@@ -81,6 +81,16 @@ resource "aws_security_group_rule" "controller-node-exporter" {
   source_security_group_id = "${aws_security_group.worker.id}"
 }

+resource "aws_security_group_rule" "controller-node-exporter-self" {
+  security_group_id = "${aws_security_group.controller.id}"
+
+  type      = "ingress"
+  protocol  = "tcp"
+  from_port = 9100
+  to_port   = 9100
+  self      = true
+}
+
 resource "aws_security_group_rule" "controller-kubelet-self" {
   security_group_id = "${aws_security_group.controller.id}"
@@ -256,6 +266,16 @@ resource "aws_security_group_rule" "worker-flannel-self" {
 resource "aws_security_group_rule" "worker-node-exporter" {
   security_group_id = "${aws_security_group.worker.id}"

+  type                     = "ingress"
+  protocol                 = "tcp"
+  from_port                = 9100
+  to_port                  = 9100
+  source_security_group_id = "${aws_security_group.controller.id}"
+}
+
+resource "aws_security_group_rule" "worker-node-exporter-self" {
+  security_group_id = "${aws_security_group.worker.id}"
+
   type      = "ingress"
   protocol  = "tcp"
   from_port = 9100
@@ -13,6 +13,7 @@ systemd:
         Environment="ETCD_INITIAL_ADVERTISE_PEER_URLS=https://${domain_name}:2380"
         Environment="ETCD_LISTEN_CLIENT_URLS=https://0.0.0.0:2379"
        Environment="ETCD_LISTEN_PEER_URLS=https://0.0.0.0:2380"
+        Environment="ETCD_LISTEN_METRICS_URLS=http://0.0.0.0:2381"
         Environment="ETCD_INITIAL_CLUSTER=${etcd_initial_cluster}"
         Environment="ETCD_STRICT_RECONFIG_CHECK=true"
         Environment="ETCD_SSL_DIR=/etc/ssl/etcd"
@@ -13,6 +13,7 @@ systemd:
         Environment="ETCD_INITIAL_ADVERTISE_PEER_URLS=https://${etcd_domain}:2380"
         Environment="ETCD_LISTEN_CLIENT_URLS=https://0.0.0.0:2379"
         Environment="ETCD_LISTEN_PEER_URLS=https://0.0.0.0:2380"
+        Environment="ETCD_LISTEN_METRICS_URLS=http://0.0.0.0:2381"
         Environment="ETCD_INITIAL_CLUSTER=${etcd_initial_cluster}"
         Environment="ETCD_STRICT_RECONFIG_CHECK=true"
         Environment="ETCD_SSL_DIR=/etc/ssl/etcd"
@@ -13,6 +13,7 @@ systemd:
         Environment="ETCD_INITIAL_ADVERTISE_PEER_URLS=https://${etcd_domain}:2380"
         Environment="ETCD_LISTEN_CLIENT_URLS=https://0.0.0.0:2379"
         Environment="ETCD_LISTEN_PEER_URLS=https://0.0.0.0:2380"
+        Environment="ETCD_LISTEN_METRICS_URLS=http://0.0.0.0:2381"
         Environment="ETCD_INITIAL_CLUSTER=${etcd_initial_cluster}"
         Environment="ETCD_STRICT_RECONFIG_CHECK=true"
         Environment="ETCD_SSL_DIR=/etc/ssl/etcd"
@@ -93,7 +93,7 @@ resource "google_compute_firewall" "internal-flannel" {
   target_tags = ["${var.cluster_name}-controller", "${var.cluster_name}-worker"]
 }

-# Allow prometheus (workload) to scrape node-exporter daemonset
+# Allow Prometheus to scrape node-exporter daemonset
 resource "google_compute_firewall" "internal-node-exporter" {
   name    = "${var.cluster_name}-internal-node-exporter"
   network = "${google_compute_network.network.name}"
@@ -103,7 +103,7 @@ resource "google_compute_firewall" "internal-node-exporter" {
     ports    = [9100]
   }

-  source_tags = ["${var.cluster_name}-worker"]
+  source_tags = ["${var.cluster_name}-controller", "${var.cluster_name}-worker"]
   target_tags = ["${var.cluster_name}-controller", "${var.cluster_name}-worker"]
 }