On this page, you can find the following guides to set up an observability stack based on the instructions in the Flink quickstart guide:
We provide a minimal quickstart configuration for application observability with Prometheus (metric aggregation system), Loki (log aggregation system) and Grafana (dashboard system).
The quickstart configuration comes with 2 metric dashboards.
- Fluss – overview: Selected metrics to observe the overall cluster status
- Fluss – detail: Majority of metrics listed in the metrics list

Follow the instructions below to add observability capabilities to your setup.
├── docker-compose.yml               # docker compose manifest from quickstart guide
└── fluss-quickstart-observability   # downloaded and extracted ZIP archive
    ├── grafana
    │   ├── grafana.ini
    │   └── provisioning
    │       ├── dashboards
    │       │   ├── default.yml
    │       │   └── fluss
    │       │       └── ...
    │       └── datasources
    │           └── default.yml
    ├── prometheus
    │   └── prometheus.yml
    └── slf4j
        └── ...
# fluss-slf4j-logback.Dockerfile — save this file in your working directory.
#
# Builds a Fluss image that replaces the default log4j SLF4J binding with
# logback and adds the loki4j logback appender to the classpath.

# Build argument; the compose manifest passes it via build.args.FLUSS_VERSION.
ARG FLUSS_VERSION
# NOTE(review): FLUSS_VERSION is declared but not referenced below, and
# $FLUSS_DOCKER_VERSION$ looks like a documentation placeholder — confirm.
FROM apache/fluss:$FLUSS_DOCKER_VERSION$

# remove default logging backend from classpath and add logback to classpath
RUN rm -rf ${FLUSS_HOME}/lib/log4j-slf4j-impl-*.jar && \
    wget https://repo1.maven.org/maven2/ch/qos/logback/logback-classic/1.2.13/logback-classic-1.2.13.jar -P ${FLUSS_HOME}/lib/ && \
    wget https://repo1.maven.org/maven2/ch/qos/logback/logback-core/1.2.13/logback-core-1.2.13.jar -P ${FLUSS_HOME}/lib/

# add loki4j logback appender to classpath
RUN wget https://repo1.maven.org/maven2/com/github/loki4j/loki-logback-appender/1.4.2/loki-logback-appender-1.4.2.jar -P ${FLUSS_HOME}/lib/

# logback configuration that ships logs to loki
# NOTE(review): the file is installed as logback-console.xml, while the compose
# manifest sets logback.configurationFile to logback-loki-console.xml — confirm
# which name is actually read.
COPY fluss-quickstart-observability/slf4j/logback-loki-console.xml ${FLUSS_HOME}/conf/logback-console.xml
:::note
Detailed configuration instructions for Fluss and Logback can be found here.
:::
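Next, adapt your setup (docker-compose.yml and fluss-slf4j-logback.Dockerfile): add the Prometheus, Loki and Grafana containers, build and use the new Fluss image, enable the Prometheus metrics reporter, and set an application name for each Fluss service (APP_NAME).

To do this, you can simply copy the manifest below into your docker-compose.yml.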
# Docker Compose manifest for the quickstart with observability enabled:
# a Fluss cluster built from fluss-slf4j-logback.Dockerfile, a Flink cluster,
# and Prometheus, Loki and Grafana.
services:
  #begin Fluss cluster
  coordinator-server:
    image: fluss-slf4j-logback:$FLUSS_DOCKER_VERSION$
    # Built locally from the Dockerfile in the working directory.
    build:
      args:
        FLUSS_VERSION: "$FLUSS_VERSION$"
      dockerfile: fluss-slf4j-logback.Dockerfile
    command: coordinatorServer
    depends_on:
      - zookeeper
    environment:
      # Literal block scalar: each property must stay on its own line.
      - |
        FLUSS_PROPERTIES=
        zookeeper.address: zookeeper:2181
        bind.listeners: FLUSS://coordinator-server:9123
        remote.data.dir: /tmp/fluss/remote-data
        datalake.format: paimon
        datalake.paimon.metastore: filesystem
        datalake.paimon.warehouse: /tmp/paimon
        metrics.reporters: prometheus
        metrics.reporter.prometheus.port: 9250
        logback.configurationFile: logback-loki-console.xml
      # presumably read by the logback configuration to label logs — confirm
      - APP_NAME=coordinator-server
  tablet-server:
    image: fluss-slf4j-logback:$FLUSS_DOCKER_VERSION$
    build:
      args:
        FLUSS_VERSION: "$FLUSS_VERSION$"
      dockerfile: fluss-slf4j-logback.Dockerfile
    command: tabletServer
    depends_on:
      - coordinator-server
    environment:
      - |
        FLUSS_PROPERTIES=
        zookeeper.address: zookeeper:2181
        bind.listeners: FLUSS://tablet-server:9123
        data.dir: /tmp/fluss/data
        remote.data.dir: /tmp/fluss/remote-data
        kv.snapshot.interval: 0s
        datalake.format: paimon
        datalake.paimon.metastore: filesystem
        datalake.paimon.warehouse: /tmp/paimon
        metrics.reporters: prometheus
        metrics.reporter.prometheus.port: 9250
        logback.configurationFile: logback-loki-console.xml
      - APP_NAME=tablet-server
  zookeeper:
    restart: always
    image: zookeeper:3.9.2
  #end
  #begin Flink cluster
  jobmanager:
    image: apache/fluss-quickstart-flink:1.20-$FLUSS_DOCKER_VERSION$
    ports:
      - "8083:8081"
    command: jobmanager
    environment:
      - |
        FLINK_PROPERTIES=
        jobmanager.rpc.address: jobmanager
        metrics.reporter.prom.factory.class: org.apache.flink.metrics.prometheus.PrometheusReporterFactory
        metrics.reporter.prom.port: 9250
    volumes:
      - shared-tmpfs:/tmp/paimon
  taskmanager:
    image: apache/fluss-quickstart-flink:1.20-$FLUSS_DOCKER_VERSION$
    depends_on:
      - jobmanager
    command: taskmanager
    environment:
      - |
        FLINK_PROPERTIES=
        jobmanager.rpc.address: jobmanager
        taskmanager.numberOfTaskSlots: 10
        taskmanager.memory.process.size: 2048m
        taskmanager.memory.framework.off-heap.size: 256m
        metrics.reporter.prom.factory.class: org.apache.flink.metrics.prometheus.PrometheusReporterFactory
        metrics.reporter.prom.port: 9250
    volumes:
      - shared-tmpfs:/tmp/paimon
  #end
  #begin observability
  prometheus:
    image: bitnami/prometheus:2.55.1-debian-12-r0
    ports:
      - "9092:9090"
    volumes:
      # Scrape configuration from the extracted archive, mounted read-only.
      - ./fluss-quickstart-observability/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
  loki:
    image: grafana/loki:3.3.2
    ports:
      - "3102:3100"
  grafana:
    image: grafana/grafana:11.4.0
    ports:
      - "3002:3000"
    depends_on:
      - prometheus
      - loki
    volumes:
      # grafana.ini and provisioning files from the archive, mounted read-only.
      - ./fluss-quickstart-observability/grafana:/etc/grafana:ro
  #end

volumes:
  # tmpfs-backed volume shared by the Flink containers at /tmp/paimon.
  shared-tmpfs:
    driver: local
    driver_opts:
      type: "tmpfs"
      device: "tmpfs"
Then run
# note the --build flag!
docker compose up -d --build
to apply the changes.
:::warning
This recreates shared-tmpfs, and all data is lost (created tables, running jobs, etc.).
:::
Make sure that the modified and added containers are up and running using
# list all containers, including stopped ones (-a), to check their status
docker container ls -a