blob: 081f5c7f5740535ceb4c3cfa84149b3986dd4a05 [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
########### These all have default values as shown
# We would specify the unique sink-id first
sinks:
  # Each entry is a unique sink-id; the detailed options for a sink live
  # under the top-level key with the same name.
  - file-sink
  - tmanager-sink
  - prometheus-sink
  - metricscache-sink
########### Now we would specify the detailed configuration for every unique sink
########### Syntax: sink-id: - option(s)
########### option class is required as we need to instantiate a new instance by reflection
########### option flush-frequency-ms is required to invoke flush() at interval
########### option sink-restart-attempts represents the # of times to restart a sink when it throws exceptions and dies.
########### If this option is missing, the default value 0 is supplied; a negative value means restart it forever.
########### Other options would be constructed as an immutable map passed to IMetricsSink's init(Map conf) as argument,
########### We would be able to fetch value by conf.get(options), for instance:
########### We could get "org.apache.heron.metricsmgr.sink.FileSink" if conf.get("class") is called inside file-sink's instance
### Config for file-sink
file-sink:
  # Sink implementation class (instantiated by reflection).
  class: 'org.apache.heron.metricsmgr.sink.FileSink'
  # Interval at which flush() is invoked: 1 min.
  flush-frequency-ms: 60000
  # Negative value: restart the sink forever.
  sink-restart-attempts: -1
  # File for metrics to write to.
  filename-output: 'metrics.json'
  # Maximum number of files saved on disk.
  file-maximum: 5
### Config for tmanager-sink
tmanager-sink:
  # Sink implementation class (instantiated by reflection).
  class: 'org.apache.heron.metricsmgr.sink.tmanager.TManagerSink'
  flush-frequency-ms: 60000
  # Negative value: restart the sink forever.
  sink-restart-attempts: -1
  tmanager-location-check-interval-sec: 5
  # NOTE(review): nesting of tmanager-client / tmanager-metrics-type was
  # reconstructed from the key names -- confirm against the deployed file.
  tmanager-client:
    # The re-connect interval to TManager from TManagerClient.
    reconnect-interval-second: 5
    # The size of packets written to TManager is determined by the minimum
    # of: (a) time based, (b) size based.
    # Size based: the maximum batch size in bytes to write to TManager.
    network-write-batch-size-bytes: 32768
    # Time based: the maximum batch time in ms for Metrics Manager to write
    # to TManager per attempt.
    network-write-batch-time-ms: 16
    # Size based: the maximum batch size in bytes to read.
    # NOTE(review): the previous comment said "write"; presumably this is the
    # read-side counterpart -- confirm.
    network-read-batch-size-bytes: 32768
    # Time based: the maximum batch time in ms to read per attempt.
    # NOTE(review): the previous comment said "write" -- confirm.
    network-read-batch-time-ms: 16
    # The maximum socket's send buffer size in bytes.
    socket-send-buffer-size-bytes: 6553600
    # The maximum socket's received buffer size in bytes.
    socket-received-buffer-size-bytes: 8738000
  # Maps each metric name to one of SUM, AVG or LAST.
  tmanager-metrics-type:
    '__emit-count': SUM
    '__execute-count': SUM
    '__fail-count': SUM
    '__ack-count': SUM
    '__complete-latency': AVG
    '__execute-latency': AVG
    '__process-latency': AVG
    '__jvm-uptime-secs': LAST
    '__jvm-process-cpu-load': LAST
    '__jvm-memory-used-mb': LAST
    '__jvm-memory-mb-total': LAST
    '__jvm-gc-collection-time-ms': LAST
    '__server/__time_spent_back_pressure_initiated': SUM
    '__time_spent_back_pressure_by_compid': SUM
### Config for prometheus-sink
prometheus-sink:
  class: "org.apache.heron.metricsmgr.sink.PrometheusSink"
  port: 8080  # The port on which to run (either port or port-file are mandatory)
  path: /metrics  # The path on which to publish the metrics (mandatory)
  # Publish a flat "name -> value" map.
  # NOTE(review): the previous comment referred to the web-sink; confirm this
  # option is honoured by PrometheusSink.
  flat-metrics: true
  include-topology-name: true  # Include topology name in metric name (default false)
  metrics-cache-max-size: 1000000  # Max number of metrics cached and published (default 1000000)
  # Time in seconds after which a collected metric stops being published (default 600)
  metrics-cache-ttl-sec: 600
  # Rewrite rules: each rule has a regex `pattern`; `name` and `labels` refer
  # to its capture groups as $1, $2, ...
  # Patterns are single-quoted so regex metacharacters are never interpreted
  # as YAML syntax.
  # NOTE(review): more specific patterns are listed before more general ones,
  # which presumably relies on first-match-wins evaluation -- confirm.
  rules:
    # "__jvm-peak-usage/G1-Survivor-Space-committed": "9",
    - pattern: '__jvm-(.+)/(.+)'
      name: jvm_$1_$2
      attrNameSnakeCase: true
      type: COUNTER
    # "__execute-time-ns/pulsar-prod-4/default": "418764",
    - pattern: '__(?!jvm-+)(.+-count|.+-latency|.+-time-ns)/(.+)/(.+)'
      name: $1
      attrNameSnakeCase: true
      type: COUNTER
      labels:
        dest_component: "$2"
        context: "$3"
    # "__execute-time-ns/pulsar-prod-4": "418764",
    - pattern: '__(?!jvm-+)(.+-count|.+-latency|.+-time-ns)/(.+)'
      name: $1
      attrNameSnakeCase: true
      type: COUNTER
      labels:
        context: "$2"
    # StreamManager
    # "__client_stmgr-17/__bytes_to_stmgrs": "7841039",
    - pattern: '__(client_stmgr-.+)/__(.+_to_stmgrs)'
      name: $2
      attrNameSnakeCase: true
      type: COUNTER
      labels:
        dest_component: "$1"
    # "__connection_buffer_by_instanceid/container_1_pulsar-prod-9_201/bytes": "0.000000",
    - pattern: '__(connection_buffer_by_instanceid)/container_(.+)_(.+)/(.+)'
      name: $1_$4
      attrNameSnakeCase: true
      type: COUNTER
      labels:
        dest_container: "$2"
        dest_task: "$3"
    # "__time_spent_back_pressure_by_compid/container_1_pulsar-prod-5_151": "0",
    - pattern: '__(time_spent_back_pressure_by_compid)/container_(.+)_(.+)'
      name: $1
      attrNameSnakeCase: true
      type: COUNTER
      labels:
        dest_container: "$2"
        dest_task: "$3"
    # PulsarSpoutMetrics of PulsarSpout 'PulsarSpoutMetrics/pulsar-prod-4-0/consumerThroughput'
    # NOTE(review): the example name starts with "PulsarSpoutMetrics/" while
    # the pattern expects "PulsarSpout/" -- confirm which prefix is emitted.
    - pattern: 'PulsarSpout/(.+)/(.+)'
      name: pulsar_spout_$2
      attrNameSnakeCase: true
      type: COUNTER
      labels:
        component: "$1"
    - pattern: 'PulsarBolt/(.+)/(.+)'
      name: pulsar_bolt_$2
      attrNameSnakeCase: true
      type: COUNTER
      labels:
        component: "$1"
    # name: "kafkaConsumer-request-size-max/consumer-node-metrics/client-id-spout/node-id-node-1"
    - pattern: 'kafkaConsumer-(.+)/consumer-(node)-metrics/client-id-(.+)/node-id-(.+)'
      name: kafka_consumer_$2_$1
      attrNameSnakeCase: true
      type: COUNTER
      labels:
        client_id: "$3"
        node_id: "$4"
    # name: "kafkaConsumer-commit-rate/consumer-coordinator-metrics/client-id-spout"
    - pattern: 'kafkaConsumer-(.+)/consumer-(coordinator)-metrics/client-id-(.+)'
      name: kafka_consumer_$2_$1
      attrNameSnakeCase: true
      type: COUNTER
      labels:
        client_id: "$3"
    # name: "kafkaConsumer-records-lag-max/consumer-fetch-manager-metrics/client-id-spout/topic-nginx-lad-es/partition-1"
    - pattern: 'kafkaConsumer-(.+)/consumer-(fetch-manager)-metrics/client-id-(.+)/topic-(.+)/partition-(.+)'
      name: kafka_consumer_$2_$1
      attrNameSnakeCase: true
      type: COUNTER
      labels:
        client_id: "$3"
        topic: "$4"
        partition: "$5"
    # name: "kafkaConsumer-records-per-request-avg/consumer-fetch-manager-metrics/client-id-spout/topic-nginx-adp-cms-api"
    - pattern: 'kafkaConsumer-(.+)/consumer-(fetch-manager)-metrics/client-id-(.+)/topic-(.+)'
      name: kafka_consumer_$2_$1
      attrNameSnakeCase: true
      type: COUNTER
      labels:
        client_id: "$3"
        topic: "$4"
    # name: "kafkaConsumer-bytes-consumed-total/consumer-fetch-manager-metrics/client-id-consumer-1"
    # The group must read "fetch-manager" to match the example above; the
    # earlier spelling "feath-manager" could never match it.
    - pattern: 'kafkaConsumer-(.+)/consumer-(fetch-manager)-metrics/client-id-(.+)'
      name: kafka_consumer_$2_$1
      attrNameSnakeCase: true
      type: COUNTER
      labels:
        client_id: "$3"
    - pattern: 'kafkaConsumer-(.+)/consumer-metrics/client-id-(.+)/node-id-(.+)'
      name: kafka_consumer_$1
      attrNameSnakeCase: true
      type: COUNTER
      labels:
        client_id: "$2"
        node_id: "$3"
    - pattern: 'kafkaConsumer-(.+)/consumer-metrics/client-id-(.+)'
      name: kafka_consumer_$1
      attrNameSnakeCase: true
      type: COUNTER
      labels:
        client_id: "$2"
    - pattern: 'kafkaConsumer-(.+)/app-info/client-id-(.+)'
      name: kafka_consumer_$1
      attrNameSnakeCase: true
      type: COUNTER
      labels:
        client_id: "$2"
    # kafkaOffset of KafkaSpout 'kafkaOffset/topicName/partition_2/spoutLag'
    - pattern: 'kafkaOffset/(.+)/partition_([0-9]+)/(.+)'
      name: kafka_offset_$3
      attrNameSnakeCase: true
      type: COUNTER
      labels:
        topic: "$1"
        partition: "$2"
    # kafkaOffset of KafkaSpout 'kafkaOffset/topicName/totalSpoutLag'
    - pattern: 'kafkaOffset/(.+)/(.+)'
      name: kafka_offset_$2
      attrNameSnakeCase: true
      type: COUNTER
      labels:
        topic: "$1"
### Config for metricscache-sink
metricscache-sink:
  # Sink implementation class (instantiated by reflection).
  class: 'org.apache.heron.metricsmgr.sink.metricscache.MetricsCacheSink'
  flush-frequency-ms: 60000
  # Negative value: restart the sink forever.
  sink-restart-attempts: -1
  metricscache-location-check-interval-sec: 5
  # NOTE(review): nesting of metricscache-client / metricscache-metrics-type
  # was reconstructed from the key names -- confirm against the deployed file.
  # NOTE(review): the previous comments in this section referred to
  # "TManager", presumably copied from tmanager-sink; they are reworded for
  # MetricsCache below -- confirm.
  metricscache-client:
    # The re-connect interval to MetricsCache from its client.
    reconnect-interval-second: 5
    # The size of packets written to MetricsCache is determined by the
    # minimum of: (a) time based, (b) size based.
    # Size based: the maximum batch size in bytes to write.
    network-write-batch-size-bytes: 32768
    # Time based: the maximum batch time in ms for Metrics Manager to write
    # per attempt.
    network-write-batch-time-ms: 16
    # Size based: the maximum batch size in bytes to read
    # (the previous comment said "write" -- confirm).
    network-read-batch-size-bytes: 32768
    # Time based: the maximum batch time in ms to read per attempt
    # (the previous comment said "write" -- confirm).
    network-read-batch-time-ms: 16
    # The maximum socket's send buffer size in bytes.
    socket-send-buffer-size-bytes: 6553600
    # The maximum socket's received buffer size in bytes.
    socket-received-buffer-size-bytes: 8738000
  # Maps each metric name to one of SUM, AVG or LAST.
  metricscache-metrics-type:
    '__emit-count': SUM
    '__execute-count': SUM
    '__fail-count': SUM
    '__ack-count': SUM
    '__complete-latency': AVG
    '__execute-latency': AVG
    '__process-latency': AVG
    '__jvm-uptime-secs': LAST
    '__jvm-process-cpu-load': LAST
    '__jvm-memory-used-mb': LAST
    '__jvm-memory-mb-total': LAST
    '__jvm-gc-collection-time-ms': LAST
    '__server/__time_spent_back_pressure_initiated': SUM
    '__time_spent_back_pressure_by_compid': SUM
### Config for graphite-sink
### Currently the graphite-sink is disabled
# graphite-sink:
# class: "org.apache.heron.metricsmgr.sink.GraphiteSink"
# flush-frequency-ms: 60000
# graphite_host: "127.0.0.1" # The host of graphite to be exported metrics to
# graphite_port: 2004 # The port of graphite to be exported metrics to
# metrics_prefix: "heron" # The prefix of every metrics
# server_max_reconnect-attempts: 20 # The max reconnect attempts when failing to connect to graphite server