---
# Prometheus rule group for etcd. It contains alerting rules for member
# availability, HTTP request failures and latency, file-descriptor
# exhaustion, failed proposals and WAL fsync latency, plus the recording
# rule that the file-descriptor alerts read from.
groups:
  - name: etcd_alert.rules
    rules:
      # Fires when the number of targets with up == 0 exceeds
      # (total targets / 2 - 1) for 3 minutes. Per the description, this is
      # the point where losing one more member makes the cluster unavailable.
      - alert: InsufficientMembers
        expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
        for: 3m
        labels:
          severity: critical
        annotations:
          description: If one more etcd member goes down the cluster will be unavailable
          summary: etcd cluster insufficient members

      # Warning: for 10 minutes, more than 1% of requests for a method failed
      # with a non-4xx code (5-minute rates). The ratio is aggregated
      # BY (method), so `method` is the only label the annotations can use;
      # $value is a 0-1 ratio and is rendered with humanizePercentage.
      - alert: HighNumberOfFailedHTTPRequests
        expr: >-
          sum(rate(etcd_http_failed_total{code!~"^(?:4[0-9]{2})$",job="etcd"}[5m])) BY (method)
          / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method)
          > 0.01
        for: 10m
        labels:
          severity: warning
        annotations:
          description: >-
            {{ $value | humanizePercentage }} of requests for
            {{ $labels.method }} failed on etcd
          summary: a high number of HTTP requests are failing

      # Critical variant of the rule above: same non-4xx failure ratio, but
      # the threshold is 5% and it only has to hold for 5 minutes.
      - alert: HighNumberOfFailedHTTPRequests
        expr: >-
          sum(rate(etcd_http_failed_total{code!~"^(?:4[0-9]{2})$",job="etcd"}[5m])) BY (method)
          / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method)
          > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          description: >-
            {{ $value | humanizePercentage }} of requests for
            {{ $labels.method }} failed on etcd
          summary: a high number of HTTP requests are failing

      # Critical: for 10 minutes, more than 50% of requests for a method
      # failed with a 4xx code (note code=~ here, versus code!~ above).
      - alert: HighNumberOfFailedHTTPRequests
        expr: >-
          sum(rate(etcd_http_failed_total{code=~"^(?:4[0-9]{2})$",job="etcd"}[5m])) BY (method)
          / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method)
          > 0.5
        for: 10m
        labels:
          severity: critical
        annotations:
          description: >-
            {{ $value | humanizePercentage }} of requests for
            {{ $labels.method }} failed with 4xx responses on etcd
          summary: a high number of HTTP requests are failing

      # Warning: the 0.99 quantile of the successful-request duration
      # histogram stayed above 0.15 for 10 minutes. The expression is not
      # aggregated, so the series' own labels reach the annotations.
      # NOTE(review): the metric name ends in "_second_bucket" (singular);
      # confirm it matches what the monitored etcd version exports.
      - alert: HTTPRequestsSlow
        expr: >-
          histogram_quantile(0.99, rate(etcd_http_successful_duration_second_bucket[5m]))
          > 0.15
        for: 10m
        labels:
          severity: warning
        annotations:
          description: >-
            on etcd instance {{ $labels.instance }} HTTP requests to
            {{ $labels.method }} are slow
          summary: slow HTTP requests

      # Recording rule: open file descriptors divided by the process limit.
      # The two FdExhaustionClose alerts below read this series.
      - record: instance:fd_utilization
        expr: process_open_fds / process_max_fds

      # Warning: a linear fit over the last hour of fd utilization predicts
      # a value above 1 (the limit) four hours ahead.
      - alert: FdExhaustionClose
        expr: predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
        for: 10m
        labels:
          severity: warning
        annotations:
          description: >-
            {{ $labels.job }} instance {{ $labels.instance }} will exhaust
            its file descriptors soon
          summary: file descriptors soon exhausted

      # Critical: a linear fit over the last 10 minutes of fd utilization
      # predicts a value above 1 (the limit) one hour ahead.
      - alert: FdExhaustionClose
        expr: predict_linear(instance:fd_utilization[10m], 3600) > 1
        for: 10m
        labels:
          severity: critical
        annotations:
          description: >-
            {{ $labels.job }} instance {{ $labels.instance }} will exhaust
            its file descriptors soon
          summary: file descriptors soon exhausted

      # Warning: the failed-proposal counter increased by more than 5 within
      # the last hour. There is no `for:` clause on this rule.
      - alert: HighNumberOfFailedProposals
        expr: increase(etcd_server_proposal_failed_total{job="etcd"}[1h]) > 5
        labels:
          severity: warning
        annotations:
          description: >-
            etcd instance {{ $labels.instance }} has seen {{ $value }} proposal
            failures within the last hour
          summary: a high number of proposals within the etcd cluster are failing

      # Warning: the 0.99 quantile of the WAL fsync duration histogram
      # stayed above 0.5 for 10 minutes.
      # NOTE(review): confirm this metric name against the monitored etcd
      # version; fsync metric names may differ between releases.
      - alert: HighFsyncDurations
        expr: >-
          histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m]))
          > 0.5
        for: 10m
        labels:
          severity: warning
        annotations:
          description: etcd instance {{ $labels.instance }} fsync durations are high
          summary: high fsync durations