mirror of
https://github.com/puppetmaster/typhoon.git
synced 2025-07-01 08:34:35 +02:00
Add etcd metrics, Prometheus scrapes, and Grafana dash
* Use etcd v3.3 --listen-metrics-urls to expose only metrics data via http://0.0.0.0:2381 on controllers
* Add Prometheus discovery for etcd peers on controller nodes
* Temporarily drop two noisy Prometheus alerts
This commit is contained in:
@ -112,6 +112,22 @@ data:
|
||||
target_label: __metrics_path__
|
||||
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
|
||||
|
||||
# Scrape etcd metrics from controllers
|
||||
- job_name: 'etcd'
|
||||
kubernetes_sd_configs:
|
||||
- role: node
|
||||
scheme: http
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_kubernetes_node_label_node_role_kubernetes_io_controller]
|
||||
action: keep
|
||||
regex: 'true'
|
||||
- action: labelmap
|
||||
regex: __meta_kubernetes_node_label_(.+)
|
||||
- source_labels: [__meta_kubernetes_node_name]
|
||||
action: replace
|
||||
target_label: __address__
|
||||
replacement: '${1}:2381'
|
||||
|
||||
# Scrape config for service endpoints.
|
||||
#
|
||||
# The relabeling allows the actual service scrape endpoint to be configured
|
||||
|
@ -15,6 +15,12 @@ spec:
|
||||
name: prometheus
|
||||
phase: prod
|
||||
spec:
|
||||
nodeSelector:
|
||||
node-role.kubernetes.io/master: ""
|
||||
tolerations:
|
||||
- key: node-role.kubernetes.io/master
|
||||
operator: Exists
|
||||
effect: NoSchedule
|
||||
serviceAccountName: prometheus
|
||||
containers:
|
||||
- name: prometheus
|
||||
|
@ -63,26 +63,6 @@ data:
|
||||
description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader
|
||||
changes within the last hour
|
||||
summary: a high number of leader changes within the etcd cluster are happening
|
||||
- alert: HighNumberOfFailedGRPCRequests
|
||||
expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
|
||||
/ sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.01
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
|
||||
on etcd instance {{ $labels.instance }}'
|
||||
summary: a high number of gRPC requests are failing
|
||||
- alert: HighNumberOfFailedGRPCRequests
|
||||
expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
|
||||
/ sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.05
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
|
||||
on etcd instance {{ $labels.instance }}'
|
||||
summary: a high number of gRPC requests are failing
|
||||
- alert: GRPCRequestsSlow
|
||||
expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le))
|
||||
> 0.15
|
||||
|
Reference in New Issue
Block a user