# heron/config/src/yaml/conf/kubernetes/metrics_sinks.yaml - incubator-heron - Git at Google

 #  Licensed to the Apache Software Foundation (ASF) under one
 #  or more contributor license agreements.  See the NOTICE file
 #  distributed with this work for additional information
 #  regarding copyright ownership.  The ASF licenses this file
 #  to you under the Apache License, Version 2.0 (the
 #  "License"); you may not use this file except in compliance
 #  with the License.  You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
 #  Unless required by applicable law or agreed to in writing,
 #  software distributed under the License is distributed on an
 #  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 #  KIND, either express or implied.  See the License for the
 #  specific language governing permissions and limitations
 #  under the License.

 ########### These all have default values as shown

 # We would specify the unique sink-id first
 # Unique ids of the sinks; every id listed here has a same-named
 # configuration stanza further down in this file.
 sinks: [file-sink, tmanager-sink, prometheus-sink, metricscache-sink]

 ########### Now we would specify the detailed configuration for every unique sink
 ########### Syntax: sink-id: - option(s)

 ########### option class is required as we need to instantiate a new instance by reflection
 ########### option flush-frequency-ms is required to invoke flush() at interval
 ########### option sink-restart-attempts represents the # of times to restart a sink when it throws exceptions and dies.
 ###########   If this option is omitted, the default value 0 is supplied; a negative value means restart it forever.

 ########### Other options would be constructed as an immutable map passed to IMetricsSink's init(Map conf) as argument,
 ########### We would be able to fetch value by conf.get(options), for instance:
 ########### We could get "org.apache.heron.metricsmgr.sink.FileSink" if conf.get("class") is called inside file-sink's instance

 ### Config for file-sink
 file-sink:
   # Sink implementation class
   class: 'org.apache.heron.metricsmgr.sink.FileSink'
   flush-frequency-ms: 60000  # 1 min
   sink-restart-attempts: -1  # negative value: restart forever
   filename-output: 'metrics.json'  # file the metrics are written to
   file-maximum: 5  # maximum number of files kept on disk

 ### Config for tmanager-sink
 tmanager-sink:
   # Sink implementation class
   class: 'org.apache.heron.metricsmgr.sink.tmanager.TManagerSink'
   flush-frequency-ms: 60000  # 1 min
   sink-restart-attempts: -1  # negative value: restart forever
   tmanager-location-check-interval-sec: 5
   # Settings for the client that talks to TManager
   tmanager-client:
     reconnect-interval-second: 5  # re-connect interval to TManager from TManagerClient
     # Packets written to TManager are bounded by whichever limit is reached
     # first: the size-based one or the time-based one.
     network-write-batch-size-bytes: 32768  # size based: max batch size in bytes per write
     network-write-batch-time-ms: 16  # time based: max batch time in ms per write attempt
     # NOTE(review): the previous comments on the two read settings were copies
     # of the write ones; presumably they bound reads the same way - confirm.
     network-read-batch-size-bytes: 32768
     network-read-batch-time-ms: 16
     socket-send-buffer-size-bytes: 6553600  # max socket send buffer size in bytes
     socket-received-buffer-size-bytes: 8738000  # max socket receive buffer size in bytes
   # Metric name -> type (SUM, AVG or LAST); presumably the aggregation
   # applied to that metric - confirm against TManagerSink.
   tmanager-metrics-type:
     '__emit-count': SUM
     '__execute-count': SUM
     '__fail-count': SUM
     '__ack-count': SUM
     '__complete-latency': AVG
     '__execute-latency': AVG
     '__process-latency': AVG
     '__jvm-uptime-secs': LAST
     '__jvm-process-cpu-load': LAST
     '__jvm-memory-used-mb': LAST
     '__jvm-memory-mb-total': LAST
     '__jvm-gc-collection-time-ms': LAST
     '__server/__time_spent_back_pressure_initiated': SUM
     '__time_spent_back_pressure_by_compid': SUM

 prometheus-sink:
   # Sink implementation class
   class: "org.apache.heron.metricsmgr.sink.PrometheusSink"
   port: 8080 # The port on which to run (either port or port-file are mandatory)
   path: /metrics # The path on which to publish the metrics (mandatory)
   flat-metrics: true # Publish a flat "name -> value" map
   include-topology-name: true # Include topology name in metric name (default false)
   metrics-cache-max-size: 1000000 # Max number of metrics cached and published (default 1000000)
   metrics-cache-ttl-sec: 600 # Time in seconds after which a collected metric stops being published (default 600)
   # Rewrite rules: each entry pairs a regex `pattern` with the exported
   # `name` and optional `labels`; $N refers to the N-th capture group of
   # the pattern. The comment above each rule shows a sample metric name.
   rules:
   # "__jvm-peak-usage/G1-Survivor-Space-committed": "9",
   - pattern: '__jvm-(.+)/(.+)'
     name: jvm_$1_$2
     attrNameSnakeCase: true
     type: COUNTER
   # "__execute-time-ns/pulsar-prod-4/default": "418764",
   - pattern: '__(?!jvm-+)(.+-count|.+-latency|.+-time-ns)/(.+)/(.+)'
     name: $1
     attrNameSnakeCase: true
     type: COUNTER
     labels:
       dest_component: "$2"
       context: "$3"
   # "__execute-time-ns/pulsar-prod-4": "418764",
   - pattern: '__(?!jvm-+)(.+-count|.+-latency|.+-time-ns)/(.+)'
     name: $1
     attrNameSnakeCase: true
     type: COUNTER
     labels:
       context: "$2"
   # StreamManager
   # "__client_stmgr-17/__bytes_to_stmgrs": "7841039",
   - pattern: '__(client_stmgr-.+)/__(.+_to_stmgrs)'
     name: $2
     attrNameSnakeCase: true
     type: COUNTER
     labels:
       dest_component: "$1"
   # "__connection_buffer_by_instanceid/container_1_pulsar-prod-9_201/bytes": "0.000000",
   - pattern: '__(connection_buffer_by_instanceid)/container_(.+)_(.+)/(.+)'
     name: $1_$4
     attrNameSnakeCase: true
     type: COUNTER
     labels:
       dest_container: "$2"
       dest_task: "$3"
   # "__time_spent_back_pressure_by_compid/container_1_pulsar-prod-5_151": "0",
   - pattern: '__(time_spent_back_pressure_by_compid)/container_(.+)_(.+)'
     name: $1
     attrNameSnakeCase: true
     type: COUNTER
     labels:
       dest_container: "$2"
       dest_task: "$3"
   # PulsarSpoutMetrics of PulsarSpout 'PulsarSpoutMetrics/pulsar-prod-4-0/consumerThroughput'
   - pattern: 'PulsarSpout/(.+)/(.+)'
     name: pulsar_spout_$2
     attrNameSnakeCase: true
     type: COUNTER
     labels:
       component: "$1"
   - pattern: 'PulsarBolt/(.+)/(.+)'
     name: pulsar_bolt_$2
     attrNameSnakeCase: true
     type: COUNTER
     labels:
       component: "$1"
   # name: "kafkaConsumer-request-size-max/consumer-node-metrics/client-id-spout/node-id-node-1"
   - pattern: 'kafkaConsumer-(.+)/consumer-(node)-metrics/client-id-(.+)/node-id-(.+)'
     name: kafka_consumer_$2_$1
     attrNameSnakeCase: true
     type: COUNTER
     labels:
       client_id: "$3"
       node_id: "$4"
   # name: "kafkaConsumer-commit-rate/consumer-coordinator-metrics/client-id-spout"
   - pattern: 'kafkaConsumer-(.+)/consumer-(coordinator)-metrics/client-id-(.+)'
     name: kafka_consumer_$2_$1
     attrNameSnakeCase: true
     type: COUNTER
     labels:
       client_id: "$3"
   # name: "kafkaConsumer-records-lag-max/consumer-fetch-manager-metrics/client-id-spout/topic-nginx-lad-es/partition-1"
   - pattern: 'kafkaConsumer-(.+)/consumer-(fetch-manager)-metrics/client-id-(.+)/topic-(.+)/partition-(.+)'
     name: kafka_consumer_$2_$1
     attrNameSnakeCase: true
     type: COUNTER
     labels:
       client_id: "$3"
       topic: "$4"
       partition: "$5"
   # name: "kafkaConsumer-records-per-request-avg/consumer-fetch-manager-metrics/client-id-spout/topic-nginx-adp-cms-api"
   - pattern: 'kafkaConsumer-(.+)/consumer-(fetch-manager)-metrics/client-id-(.+)/topic-(.+)'
     name: kafka_consumer_$2_$1
     attrNameSnakeCase: true
     type: COUNTER
     labels:
       client_id: "$3"
       topic: "$4"
   # name: "kafkaConsumer-bytes-consumed-total/consumer-fetch-manager-metrics/client-id-consumer-1"
   # The group must read "fetch-manager" (it was misspelled "feath-manager"),
   # otherwise this rule cannot match the sample name above.
   - pattern: 'kafkaConsumer-(.+)/consumer-(fetch-manager)-metrics/client-id-(.+)'
     name: kafka_consumer_$2_$1
     attrNameSnakeCase: true
     type: COUNTER
     labels:
       client_id: "$3"
   - pattern: 'kafkaConsumer-(.+)/consumer-metrics/client-id-(.+)/node-id-(.+)'
     name: kafka_consumer_$1
     attrNameSnakeCase: true
     type: COUNTER
     labels:
       client_id: "$2"
       node_id: "$3"
   - pattern: 'kafkaConsumer-(.+)/consumer-metrics/client-id-(.+)'
     name: kafka_consumer_$1
     attrNameSnakeCase: true
     type: COUNTER
     labels:
       client_id: "$2"
   - pattern: 'kafkaConsumer-(.+)/app-info/client-id-(.+)'
     name: kafka_consumer_$1
     attrNameSnakeCase: true
     type: COUNTER
     labels:
       client_id: "$2"
   # kafkaOffset of KafkaSpout 'kafkaOffset/topicName/partition_2/spoutLag'
   - pattern: 'kafkaOffset/(.+)/partition_([0-9]+)/(.+)'
     name: kafka_offset_$3
     attrNameSnakeCase: true
     type: COUNTER
     labels:
       topic: "$1"
       partition: "$2"
   # kafkaOffset of KafkaSpout 'kafkaOffset/topicName/totalSpoutLag'
   - pattern: 'kafkaOffset/(.+)/(.+)'
     name: kafka_offset_$2
     attrNameSnakeCase: true
     type: COUNTER
     labels:
       topic: "$1"

 ### Config for metricscache-sink
 metricscache-sink:
   # Sink implementation class
   class: 'org.apache.heron.metricsmgr.sink.metricscache.MetricsCacheSink'
   flush-frequency-ms: 60000  # 1 min
   sink-restart-attempts: -1  # negative value: restart forever
   metricscache-location-check-interval-sec: 5
   # NOTE(review): the previous comments in this stanza named TManager and
   # TManagerClient, apparently copied from tmanager-sink; presumably the
   # peer of this client is the MetricsCache - confirm.
   metricscache-client:
     reconnect-interval-second: 5  # re-connect interval of the client
     # Written packets are bounded by whichever limit is reached first:
     # the size-based one or the time-based one.
     network-write-batch-size-bytes: 32768  # size based: max batch size in bytes per write
     network-write-batch-time-ms: 16  # time based: max batch time in ms per write attempt
     # Presumably the read-side counterparts of the two limits above - confirm.
     network-read-batch-size-bytes: 32768
     network-read-batch-time-ms: 16
     socket-send-buffer-size-bytes: 6553600  # max socket send buffer size in bytes
     socket-received-buffer-size-bytes: 8738000  # max socket receive buffer size in bytes
   # Metric name -> type (SUM, AVG or LAST); presumably the aggregation
   # applied to that metric - confirm against MetricsCacheSink.
   metricscache-metrics-type:
     '__emit-count': SUM
     '__execute-count': SUM
     '__fail-count': SUM
     '__ack-count': SUM
     '__complete-latency': AVG
     '__execute-latency': AVG
     '__process-latency': AVG
     '__jvm-uptime-secs': LAST
     '__jvm-process-cpu-load': LAST
     '__jvm-memory-used-mb': LAST
     '__jvm-memory-mb-total': LAST
     '__jvm-gc-collection-time-ms': LAST
     '__server/__time_spent_back_pressure_initiated': SUM
     '__time_spent_back_pressure_by_compid': SUM

 ### Config for graphite-sink
 ### Currently the graphite-sink is disabled
 # graphite-sink:
 #   class: "org.apache.heron.metricsmgr.sink.GraphiteSink"
 #   flush-frequency-ms: 60000
 #   graphite_host: "127.0.0.1" # The host of graphite to be exported metrics to
 #   graphite_port: 2004 # The port of graphite to be exported metrics to
 #   metrics_prefix: "heron" # The prefix of every metrics
 #   server_max_reconnect-attempts: 20 # The max reconnect attempts when failing to connect to graphite server
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	########### These all have default values as shown

	# We would specify the unique sink-id first
 # Unique ids of the sinks. Indented with spaces: tabs are not valid
 # YAML indentation.
 sinks:
   - file-sink
   - tmanager-sink
   - prometheus-sink
   - metricscache-sink

	########### Now we would specify the detailed configuration for every unique sink
	########### Syntax: sink-id: - option(s)

	########### option class is required as we need to instantiate a new instance by reflection
	########### option flush-frequency-ms is required to invoke flush() at interval
	########### option sink-restart-attempts represents the # of times to restart a sink when it throws exceptions and dies.
	########### If this option is omitted, the default value 0 is supplied; a negative value means restart it forever.

	########### Other options would be constructed as an immutable map passed to IMetricsSink's init(Map conf) as argument,
	########### We would be able to fetch value by conf.get(options), for instance:
	########### We could get "org.apache.heron.metricsmgr.sink.FileSink" if conf.get("class") is called inside file-sink's instance

	### Config for file-sink
 file-sink:
   # Options nested under the sink id (space indentation restored).
   class: "org.apache.heron.metricsmgr.sink.FileSink"
   flush-frequency-ms: 60000 # 1 min
   sink-restart-attempts: -1 # Forever
   filename-output: "metrics.json" # File for metrics to write to
   file-maximum: 5 # maximum number of files saved on disk

	### Config for tmanager-sink
 tmanager-sink:
   # Options nested under the sink id (space indentation restored).
   class: "org.apache.heron.metricsmgr.sink.tmanager.TManagerSink"
   flush-frequency-ms: 60000
   sink-restart-attempts: -1 # Forever
   tmanager-location-check-interval-sec: 5
   tmanager-client:
     reconnect-interval-second: 5 # The re-connect interval to TManager from TManagerClient
     # The size of packets written to TManager will be determined by the minimal of: (a) time based (b) size based
     network-write-batch-size-bytes: 32768 # Size based, the maximum batch size in bytes to write to TManager
     network-write-batch-time-ms: 16 # Time based, the maximum batch time in ms for Metrics Manager to write to TManager per attempt
     # NOTE(review): presumably the read-side counterparts of the two limits above - confirm.
     network-read-batch-size-bytes: 32768
     network-read-batch-time-ms: 16
     socket-send-buffer-size-bytes: 6553600 # The maximum socket's send buffer size in bytes
     socket-received-buffer-size-bytes: 8738000 # The maximum socket's received buffer size in bytes
   tmanager-metrics-type:
     "__emit-count": SUM
     "__execute-count": SUM
     "__fail-count": SUM
     "__ack-count": SUM
     "__complete-latency": AVG
     "__execute-latency": AVG
     "__process-latency": AVG
     "__jvm-uptime-secs": LAST
     "__jvm-process-cpu-load": LAST
     "__jvm-memory-used-mb": LAST
     "__jvm-memory-mb-total": LAST
     "__jvm-gc-collection-time-ms": LAST
     "__server/__time_spent_back_pressure_initiated": SUM
     "__time_spent_back_pressure_by_compid": SUM

 prometheus-sink:
   # Options nested under the sink id (space indentation restored).
   class: "org.apache.heron.metricsmgr.sink.PrometheusSink"
   port: 8080 # The port on which to run (either port or port-file are mandatory)
   path: /metrics # The path on which to publish the metrics (mandatory)
   flat-metrics: true # Publish a flat "name -> value" map
   include-topology-name: true # Include topology name in metric name (default false)
   metrics-cache-max-size: 1000000 # Max number of metrics cached and published (default 1000000)
   metrics-cache-ttl-sec: 600 # Time in seconds after which a collected metric stops being published (default 600)
   # Rewrite rules: each entry pairs a regex `pattern` with the exported
   # `name` and optional `labels`; $N refers to the N-th capture group of
   # the pattern. The comment above each rule shows a sample metric name.
   rules:
   # "__jvm-peak-usage/G1-Survivor-Space-committed": "9",
   - pattern: '__jvm-(.+)/(.+)'
     name: jvm_$1_$2
     attrNameSnakeCase: true
     type: COUNTER
   # "__execute-time-ns/pulsar-prod-4/default": "418764",
   # Alternation uses a plain "|": the escaped "\|" would match a literal pipe.
   - pattern: '__(?!jvm-+)(.+-count|.+-latency|.+-time-ns)/(.+)/(.+)'
     name: $1
     attrNameSnakeCase: true
     type: COUNTER
     labels:
       dest_component: "$2"
       context: "$3"
   # "__execute-time-ns/pulsar-prod-4": "418764",
   - pattern: '__(?!jvm-+)(.+-count|.+-latency|.+-time-ns)/(.+)'
     name: $1
     attrNameSnakeCase: true
     type: COUNTER
     labels:
       context: "$2"
   # StreamManager
   # "__client_stmgr-17/__bytes_to_stmgrs": "7841039",
   - pattern: '__(client_stmgr-.+)/__(.+_to_stmgrs)'
     name: $2
     attrNameSnakeCase: true
     type: COUNTER
     labels:
       dest_component: "$1"
   # "__connection_buffer_by_instanceid/container_1_pulsar-prod-9_201/bytes": "0.000000",
   - pattern: '__(connection_buffer_by_instanceid)/container_(.+)_(.+)/(.+)'
     name: $1_$4
     attrNameSnakeCase: true
     type: COUNTER
     labels:
       dest_container: "$2"
       dest_task: "$3"
   # "__time_spent_back_pressure_by_compid/container_1_pulsar-prod-5_151": "0",
   - pattern: '__(time_spent_back_pressure_by_compid)/container_(.+)_(.+)'
     name: $1
     attrNameSnakeCase: true
     type: COUNTER
     labels:
       dest_container: "$2"
       dest_task: "$3"
   # PulsarSpoutMetrics of PulsarSpout 'PulsarSpoutMetrics/pulsar-prod-4-0/consumerThroughput'
   - pattern: 'PulsarSpout/(.+)/(.+)'
     name: pulsar_spout_$2
     attrNameSnakeCase: true
     type: COUNTER
     labels:
       component: "$1"
   - pattern: 'PulsarBolt/(.+)/(.+)'
     name: pulsar_bolt_$2
     attrNameSnakeCase: true
     type: COUNTER
     labels:
       component: "$1"
   # name: "kafkaConsumer-request-size-max/consumer-node-metrics/client-id-spout/node-id-node-1"
   - pattern: 'kafkaConsumer-(.+)/consumer-(node)-metrics/client-id-(.+)/node-id-(.+)'
     name: kafka_consumer_$2_$1
     attrNameSnakeCase: true
     type: COUNTER
     labels:
       client_id: "$3"
       node_id: "$4"
   # name: "kafkaConsumer-commit-rate/consumer-coordinator-metrics/client-id-spout"
   - pattern: 'kafkaConsumer-(.+)/consumer-(coordinator)-metrics/client-id-(.+)'
     name: kafka_consumer_$2_$1
     attrNameSnakeCase: true
     type: COUNTER
     labels:
       client_id: "$3"
   # name: "kafkaConsumer-records-lag-max/consumer-fetch-manager-metrics/client-id-spout/topic-nginx-lad-es/partition-1"
   - pattern: 'kafkaConsumer-(.+)/consumer-(fetch-manager)-metrics/client-id-(.+)/topic-(.+)/partition-(.+)'
     name: kafka_consumer_$2_$1
     attrNameSnakeCase: true
     type: COUNTER
     labels:
       client_id: "$3"
       topic: "$4"
       partition: "$5"
   # name: "kafkaConsumer-records-per-request-avg/consumer-fetch-manager-metrics/client-id-spout/topic-nginx-adp-cms-api"
   - pattern: 'kafkaConsumer-(.+)/consumer-(fetch-manager)-metrics/client-id-(.+)/topic-(.+)'
     name: kafka_consumer_$2_$1
     attrNameSnakeCase: true
     type: COUNTER
     labels:
       client_id: "$3"
       topic: "$4"
   # name: "kafkaConsumer-bytes-consumed-total/consumer-fetch-manager-metrics/client-id-consumer-1"
   # The group must read "fetch-manager" (it was misspelled "feath-manager"),
   # otherwise this rule cannot match the sample name above.
   - pattern: 'kafkaConsumer-(.+)/consumer-(fetch-manager)-metrics/client-id-(.+)'
     name: kafka_consumer_$2_$1
     attrNameSnakeCase: true
     type: COUNTER
     labels:
       client_id: "$3"
   - pattern: 'kafkaConsumer-(.+)/consumer-metrics/client-id-(.+)/node-id-(.+)'
     name: kafka_consumer_$1
     attrNameSnakeCase: true
     type: COUNTER
     labels:
       client_id: "$2"
       node_id: "$3"
   - pattern: 'kafkaConsumer-(.+)/consumer-metrics/client-id-(.+)'
     name: kafka_consumer_$1
     attrNameSnakeCase: true
     type: COUNTER
     labels:
       client_id: "$2"
   - pattern: 'kafkaConsumer-(.+)/app-info/client-id-(.+)'
     name: kafka_consumer_$1
     attrNameSnakeCase: true
     type: COUNTER
     labels:
       client_id: "$2"
   # kafkaOffset of KafkaSpout 'kafkaOffset/topicName/partition_2/spoutLag'
   - pattern: 'kafkaOffset/(.+)/partition_([0-9]+)/(.+)'
     name: kafka_offset_$3
     attrNameSnakeCase: true
     type: COUNTER
     labels:
       topic: "$1"
       partition: "$2"
   # kafkaOffset of KafkaSpout 'kafkaOffset/topicName/totalSpoutLag'
   - pattern: 'kafkaOffset/(.+)/(.+)'
     name: kafka_offset_$2
     attrNameSnakeCase: true
     type: COUNTER
     labels:
       topic: "$1"

	### Config for metricscache-sink
 metricscache-sink:
   # Options nested under the sink id (space indentation restored).
   class: "org.apache.heron.metricsmgr.sink.metricscache.MetricsCacheSink"
   flush-frequency-ms: 60000
   sink-restart-attempts: -1 # Forever
   metricscache-location-check-interval-sec: 5
   # NOTE(review): the previous comments in this stanza named TManager and
   # TManagerClient, apparently copied from tmanager-sink; presumably the
   # peer of this client is the MetricsCache - confirm.
   metricscache-client:
     reconnect-interval-second: 5 # The re-connect interval of the client
     # The size of written packets will be determined by the minimal of: (a) time based (b) size based
     network-write-batch-size-bytes: 32768 # Size based, the maximum batch size in bytes per write
     network-write-batch-time-ms: 16 # Time based, the maximum batch time in ms per write attempt
     # Presumably the read-side counterparts of the two limits above - confirm.
     network-read-batch-size-bytes: 32768
     network-read-batch-time-ms: 16
     socket-send-buffer-size-bytes: 6553600 # The maximum socket's send buffer size in bytes
     socket-received-buffer-size-bytes: 8738000 # The maximum socket's received buffer size in bytes
   metricscache-metrics-type:
     "__emit-count": SUM
     "__execute-count": SUM
     "__fail-count": SUM
     "__ack-count": SUM
     "__complete-latency": AVG
     "__execute-latency": AVG
     "__process-latency": AVG
     "__jvm-uptime-secs": LAST
     "__jvm-process-cpu-load": LAST
     "__jvm-memory-used-mb": LAST
     "__jvm-memory-mb-total": LAST
     "__jvm-gc-collection-time-ms": LAST
     "__server/__time_spent_back_pressure_initiated": SUM
     "__time_spent_back_pressure_by_compid": SUM

	### Config for graphite-sink
	### Currently the graphite-sink is disabled
	# graphite-sink:
	# class: "org.apache.heron.metricsmgr.sink.GraphiteSink"
	# flush-frequency-ms: 60000
	# graphite_host: "127.0.0.1" # The host of graphite to be exported metrics to
	# graphite_port: 2004 # The port of graphite to be exported metrics to
	# metrics_prefix: "heron" # The prefix of every metrics
	# server_max_reconnect-attempts: 20 # The max reconnect attempts when failing to connect to graphite server