# general cluster availability
# ============================
| |
| # alert if another failed member will result in an unavailable cluster |
| ALERT InsufficientMembers |
| IF count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1) |
| FOR 3m |
| LABELS { |
| severity = "critical" |
| } |
| ANNOTATIONS { |
| summary = "etcd cluster insufficient members", |
description = "If one more etcd member goes down, the cluster will be unavailable",
| } |
| |
| # etcd leader alerts |
| # ================== |
| |
| # alert if any etcd instance has no leader |
| ALERT NoLeader |
| IF etcd_server_has_leader{job="etcd"} == 0 |
| FOR 1m |
| LABELS { |
| severity = "critical" |
| } |
| ANNOTATIONS { |
| summary = "etcd member has no leader", |
| description = "etcd member {{ $labels.instance }} has no leader", |
| } |
| |
# alert if there are more than 3 leader changes within the last hour
| ALERT HighNumberOfLeaderChanges |
| IF increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3 |
| LABELS { |
| severity = "warning" |
| } |
| ANNOTATIONS { |
summary = "a high number of leader changes are happening within the etcd cluster",
| description = "etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour", |
| } |
| |
| # gRPC request alerts |
| # =================== |
| |
| # alert if more than 1% of gRPC method calls have failed within the last 5 minutes |
| ALERT HighNumberOfFailedGRPCRequests |
IF 100 * (sum by(grpc_method, instance) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
  / sum by(grpc_method, instance) (rate(etcd_grpc_total{job="etcd"}[5m]))) > 1
| FOR 10m |
| LABELS { |
| severity = "warning" |
| } |
| ANNOTATIONS { |
| summary = "a high number of gRPC requests are failing", |
| description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}", |
| } |
| |
| # alert if more than 5% of gRPC method calls have failed within the last 5 minutes |
| ALERT HighNumberOfFailedGRPCRequests |
IF 100 * (sum by(grpc_method, instance) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
  / sum by(grpc_method, instance) (rate(etcd_grpc_total{job="etcd"}[5m]))) > 5
| FOR 5m |
| LABELS { |
| severity = "critical" |
| } |
| ANNOTATIONS { |
| summary = "a high number of gRPC requests are failing", |
| description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}", |
| } |
| |
| # alert if the 99th percentile of gRPC method calls take more than 150ms |
| ALERT GRPCRequestsSlow |
IF histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd",grpc_type="unary"}[5m])) by (instance, grpc_service, grpc_method, le)) > 0.15
| FOR 10m |
| LABELS { |
| severity = "critical" |
| } |
| ANNOTATIONS { |
| summary = "slow gRPC requests", |
| description = "on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow", |
| } |
| |
| # HTTP requests alerts |
| # ==================== |
| |
| # alert if more than 1% of requests to an HTTP endpoint have failed within the last 5 minutes |
| ALERT HighNumberOfFailedHTTPRequests |
IF 100 * (sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method, instance)
  / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method, instance)) > 1
| FOR 10m |
| LABELS { |
| severity = "warning" |
| } |
| ANNOTATIONS { |
| summary = "a high number of HTTP requests are failing", |
| description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", |
| } |
| |
| # alert if more than 5% of requests to an HTTP endpoint have failed within the last 5 minutes |
| ALERT HighNumberOfFailedHTTPRequests |
IF 100 * (sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method, instance)
  / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method, instance)) > 5
| FOR 5m |
| LABELS { |
| severity = "critical" |
| } |
| ANNOTATIONS { |
| summary = "a high number of HTTP requests are failing", |
| description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", |
| } |
| |
| # alert if the 99th percentile of HTTP requests take more than 150ms |
| ALERT HTTPRequestsSlow |
| IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15 |
| FOR 10m |
| LABELS { |
| severity = "warning" |
| } |
| ANNOTATIONS { |
| summary = "slow HTTP requests", |
| description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow", |
| } |
| |
| # file descriptor alerts |
| # ====================== |
| |
| instance:fd_utilization = process_open_fds / process_max_fds |
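# e.g. with 900 of 1024 descriptors open the recorded utilization is ~0.88;
# predict_linear() below fits a linear trend over the sampled range and
# extrapolates it forward (4h and 1h respectively), so a predicted value
# above 1 means the descriptor limit is projected to be exceeded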
| |
| # alert if file descriptors are likely to exhaust within the next 4 hours |
| ALERT FdExhaustionClose |
| IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 |
| FOR 10m |
| LABELS { |
| severity = "warning" |
| } |
| ANNOTATIONS { |
| summary = "file descriptors soon exhausted", |
| description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors soon", |
| } |
| |
| # alert if file descriptors are likely to exhaust within the next hour |
| ALERT FdExhaustionClose |
| IF predict_linear(instance:fd_utilization[10m], 3600) > 1 |
| FOR 10m |
| LABELS { |
| severity = "critical" |
| } |
| ANNOTATIONS { |
| summary = "file descriptors soon exhausted", |
| description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors soon", |
| } |
| |
| # etcd member communication alerts |
| # ================================ |
| |
# alert if the 99th percentile of round-trip times between members is more than 150ms
| ALERT EtcdMemberCommunicationSlow |
| IF histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) > 0.15 |
| FOR 10m |
| LABELS { |
| severity = "warning" |
| } |
| ANNOTATIONS { |
| summary = "etcd member communication is slow", |
| description = "etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow", |
| } |
| |
| # etcd proposal alerts |
| # ==================== |
| |
# alert if there are more than 5 failed proposals within the last hour
| ALERT HighNumberOfFailedProposals |
| IF increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5 |
| LABELS { |
| severity = "warning" |
| } |
| ANNOTATIONS { |
| summary = "a high number of proposals within the etcd cluster are failing", |
| description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour", |
| } |
| |
| # etcd disk io latency alerts |
| # =========================== |
| |
| # alert if 99th percentile of fsync durations is higher than 500ms |
| ALERT HighFsyncDurations |
| IF histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5 |
| FOR 10m |
| LABELS { |
| severity = "warning" |
| } |
| ANNOTATIONS { |
| summary = "high fsync durations", |
description = "etcd instance {{ $labels.instance }} fsync durations are high",
| } |
| |
| # alert if 99th percentile of commit durations is higher than 250ms |
| ALERT HighCommitDurations |
| IF histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25 |
| FOR 10m |
| LABELS { |
| severity = "warning" |
| } |
| ANNOTATIONS { |
| summary = "high commit durations", |
| description = "etcd instance {{ $labels.instance }} commit durations are high", |
| } |