Discard Prometheus etcd gRPC failure alert
* Kubernetes watch expiry is not a gRPC code we care about
* Background: This rule is typically removed, but was added back in
This commit is contained in:
parent
1b3a0f6ebc
commit
f884de847e
|
@ -50,28 +50,6 @@ data:
|
||||||
"severity": "warning"
|
"severity": "warning"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"alert": "etcdHighNumberOfFailedGRPCRequests",
|
|
||||||
"annotations": {
|
|
||||||
"message": "etcd cluster \"{{ $labels.job }}\": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}."
|
|
||||||
},
|
|
||||||
"expr": "100 * sum(rate(grpc_server_handled_total{job=~\".*etcd.*\", grpc_code!=\"OK\"}[5m])) without (grpc_type, grpc_code)\n /\nsum(rate(grpc_server_handled_total{job=~\".*etcd.*\"}[5m])) without (grpc_type, grpc_code)\n > 1\n",
|
|
||||||
"for": "10m",
|
|
||||||
"labels": {
|
|
||||||
"severity": "warning"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"alert": "etcdHighNumberOfFailedGRPCRequests",
|
|
||||||
"annotations": {
|
|
||||||
"message": "etcd cluster \"{{ $labels.job }}\": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}."
|
|
||||||
},
|
|
||||||
"expr": "100 * sum(rate(grpc_server_handled_total{job=~\".*etcd.*\", grpc_code!=\"OK\"}[5m])) without (grpc_type, grpc_code)\n /\nsum(rate(grpc_server_handled_total{job=~\".*etcd.*\"}[5m])) without (grpc_type, grpc_code)\n > 5\n",
|
|
||||||
"for": "5m",
|
|
||||||
"labels": {
|
|
||||||
"severity": "critical"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"alert": "etcdGRPCRequestsSlow",
|
"alert": "etcdGRPCRequestsSlow",
|
||||||
"annotations": {
|
"annotations": {
|
||||||
|
|
Loading…
Reference in New Issue