# dist-material/alarm-settings.yml - skywalking - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 # Sample alarm rules.
 rules:
   # Every rule key must be unique and must end with `_rule`.
   service_resp_time_rule:
     # MQE expression. Its result type has to be `SINGLE_VALUE`, and its root
     # operation has to be a Compare Operation that yields `1` (true) or
     # `0` (false). The alarm is triggered when the result is `1` (true).
     expression: 'sum(service_resp_time > 1000) >= 3'
     # Text attached to the alarm; `{name}` is a placeholder.
     message: 'Response time of service {name} is more than 1000ms in 3 minutes of last 10 minutes.'
     period: 10
     silence-period: 5
 #  service_resp_time_rule:
 #    expression: avg(service_resp_time) > 1000
 #    period: 10
 #    silence-period: 5
 #    message: Avg response time of service {name} is more than 1000ms in last 10 minutes.
   service_sla_rule:
     # Alarm condition.
     # NOTE(review): the message reports this threshold as 80%, so
     # `service_sla` is presumably scaled by 100 - confirm against the metric.
     expression: 'sum(service_sla < 8000) >= 2'
     message: 'Successful rate of service {name} is lower than 80% in 2 minutes of last 10 minutes'
     # Length of the time window over which the metrics are evaluated.
     period: 10
     # Number of checks during which the alarm stays silent after it has been
     # triggered; defaults to the value of `period` when omitted.
     silence-period: 3
   service_resp_time_percentile_rule:
     # Condition over the percentile labels listed in `p`; the message below
     # names them p50, p75, p90, p95 and p99.
     # Double-quoted because the expression itself contains single quotes.
     expression: "sum(service_percentile{p='50,75,90,95,99'} > 1000) >= 3"
     message: 'Percentile response time of service {name} alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 1000, p75 > 1000, p90 > 1000, p95 > 1000, p99 > 1000'
     period: 10
     silence-period: 5
   service_instance_resp_time_rule:
     # Alarm condition on the `service_instance_resp_time` metric.
     expression: 'sum(service_instance_resp_time > 1000) >= 2'
     message: 'Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes'
     period: 10
     silence-period: 5
   database_access_resp_time_rule:
     # Alarm condition on the `database_access_resp_time` metric.
     expression: 'sum(database_access_resp_time > 1000) >= 2'
     message: 'Response time of database access {name} is more than 1000ms in 2 minutes of last 10 minutes'
     # No `silence-period` is set for this rule, so its default applies
     # (documented in this file as the same value as `period`).
     period: 10
   endpoint_relation_resp_time_rule:
     # Alarm condition on the `endpoint_relation_resp_time` metric.
     expression: 'sum(endpoint_relation_resp_time > 1000) >= 2'
     message: 'Response time of endpoint relation {name} is more than 1000ms in 2 minutes of last 10 minutes'
     # No `silence-period` is set for this rule, so its default applies
     # (documented in this file as the same value as `period`).
     period: 10
 #  Enabling alarms on endpoint metrics costs more memory than alarms on service and service instance metrics,
 #  because there are far more endpoints than services and service instances.
 #
 #  endpoint_resp_time_rule:
 #    expression: sum(endpoint_resp_time > 1000) >= 2
 #    period: 10
 #    silence-period: 5
 #    message: Response time of endpoint {name} is more than 1000ms in 2 minutes of last 10 minutes

 #hooks:
 #  webhook:
 #    default:
 #      is-default: true
 #      urls:
 #        - http://127.0.0.1/notify/
 #        - http://127.0.0.1/go-wechat/
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 # NOTE(review): this section repeats the license header and the `rules`
 # mapping that already appear earlier in this file, with identical content.
 # A YAML mapping must not contain the same key twice, so keep only one copy.
 # Indentation uses spaces only; tabs are not valid YAML indentation.

 # Sample alarm rules.
 rules:
   # Each rule needs a unique name, and the name must end with `_rule`.
   service_resp_time_rule:
     # An MQE expression. The result type must be `SINGLE_VALUE` and the root operation of the expression must be a
     # Compare Operation, which yields `1` (true) or `0` (false). The alarm is triggered when the result is `1` (true).
     expression: sum(service_resp_time > 1000) >= 3
     period: 10
     silence-period: 5
     message: Response time of service {name} is more than 1000ms in 3 minutes of last 10 minutes.
 #  service_resp_time_rule:
 #    expression: avg(service_resp_time) > 1000
 #    period: 10
 #    silence-period: 5
 #    message: Avg response time of service {name} is more than 1000ms in last 10 minutes.
   service_sla_rule:
     expression: sum(service_sla < 8000) >= 2
     # The length of time over which the metrics are evaluated.
     period: 10
     # Number of checks during which the alarm stays silent after it has been triggered; defaults to `period`.
     silence-period: 3
     message: Successful rate of service {name} is lower than 80% in 2 minutes of last 10 minutes
   service_resp_time_percentile_rule:
     expression: sum(service_percentile{p='50,75,90,95,99'} > 1000) >= 3
     period: 10
     silence-period: 5
     message: Percentile response time of service {name} alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 1000, p75 > 1000, p90 > 1000, p95 > 1000, p99 > 1000
   service_instance_resp_time_rule:
     expression: sum(service_instance_resp_time > 1000) >= 2
     period: 10
     silence-period: 5
     message: Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes
   database_access_resp_time_rule:
     expression: sum(database_access_resp_time > 1000) >= 2
     period: 10
     message: Response time of database access {name} is more than 1000ms in 2 minutes of last 10 minutes
   endpoint_relation_resp_time_rule:
     expression: sum(endpoint_relation_resp_time > 1000) >= 2
     period: 10
     message: Response time of endpoint relation {name} is more than 1000ms in 2 minutes of last 10 minutes
 #  Enabling alarms on endpoint metrics costs more memory than alarms on service and service instance metrics,
 #  because there are far more endpoints than services and service instances.
 #
 #  endpoint_resp_time_rule:
 #    expression: sum(endpoint_resp_time > 1000) >= 2
 #    period: 10
 #    silence-period: 5
 #    message: Response time of endpoint {name} is more than 1000ms in 2 minutes of last 10 minutes

 #hooks:
 #  webhook:
 #    default:
 #      is-default: true
 #      urls:
 #        - http://127.0.0.1/notify/
 #        - http://127.0.0.1/go-wechat/