---
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
name: texera-single-node
services:
  # Part1: Specification of the storage services used by Texera
  # MinIO is an S3-compatible object storage used to store datasets and files.
  minio:
    image: minio/minio:RELEASE.2025-02-28T09-55-16Z
    container_name: texera-minio
    env_file:
      - .env
    environment:
      # Root credentials come from the shared .env so every service that
      # talks to S3 uses the same pair.
      - MINIO_ROOT_USER=${STORAGE_S3_AUTH_USERNAME}
      - MINIO_ROOT_PASSWORD=${STORAGE_S3_AUTH_PASSWORD}
    ports:
      # S3 API port; overridable via MINIO_PORT in .env.
      - "${MINIO_PORT:-9000}:9000"
    volumes:
      - minio_data:/data
    # The web console listens on 9001 inside the container (not published).
    command: server --console-address ":9001" /data
    healthcheck:
      test: ["CMD", "curl", "-sf", "http://localhost:9000/minio/health/live"]
      interval: 5s
      timeout: 3s
      retries: 10
  # This job creates the S3 bucket used by the Iceberg warehouse that the
  # Lakekeeper service manages.
  minio-init:
    image: minio/mc:RELEASE.2025-05-21T01-59-54Z
    container_name: texera-minio-init
    # Wait until MinIO's health endpoint answers before creating the bucket.
    depends_on:
      minio:
        condition: service_healthy
    env_file:
      - .env
    # One-shot job: run once, never restart.
    restart: "no"
    entrypoint: ["/bin/sh", "-c"]
    # NOTE: "$$VAR" is Compose escaping — the container shell (not Compose)
    # expands these at runtime from the variables loaded via env_file.
    command:
      - |
        set -e
        mc alias set local "$$STORAGE_S3_ENDPOINT" "$$STORAGE_S3_AUTH_USERNAME" "$$STORAGE_S3_AUTH_PASSWORD"
        mc mb --ignore-existing "local/$$STORAGE_ICEBERG_CATALOG_REST_S3_BUCKET"
        echo "MinIO bucket '$$STORAGE_ICEBERG_CATALOG_REST_S3_BUCKET' is ready."
# PostgreSQL with PGroonga extension for full-text search.
# Used by lakeFS and Texera's metadata storage.
postgres:
image: groonga/pgroonga:4.0.1-debian-15
container_name: texera-postgres
restart: always
env_file:
- .env
healthcheck:
test: ["CMD", "pg_isready", "-U", "texera", "-d", "texera_db"]
interval: 10s
retries: 5
start_period: 5s
volumes:
- postgres_data:/var/lib/postgresql/data
# mount the sql files for initializing the postgres
- ../../sql:/docker-entrypoint-initdb.d
# lakeFS is the underlying storage of Texera's dataset service
lakefs:
image: treeverse/lakefs:1.51
container_name: texera-lakefs
restart: always
depends_on:
postgres:
condition: service_healthy
minio:
condition: service_started
env_file:
- .env
environment:
# This port also need to be changed if the port of MinIO service is changed
- LAKEFS_BLOCKSTORE_S3_PRE_SIGNED_ENDPOINT=${TEXERA_HOST}:${MINIO_PORT:-9000}
- LAKEFS_BLOCKSTORE_S3_CREDENTIALS_ACCESS_KEY_ID=${STORAGE_S3_AUTH_USERNAME}
- LAKEFS_BLOCKSTORE_S3_CREDENTIALS_SECRET_ACCESS_KEY=${STORAGE_S3_AUTH_PASSWORD}
entrypoint: ["/bin/sh", "-c"]
command:
- |
lakefs setup --user-name "$LAKEFS_INSTALLATION_USER_NAME" --access-key-id "$LAKEFS_INSTALLATION_ACCESS_KEY_ID" --secret-access-key "$LAKEFS_INSTALLATION_SECRET_ACCESS_KEY" || true
lakefs run &
wait
healthcheck:
test: ["CMD", "wget", "--spider", "-q", "http://localhost:8000/api/v1/healthcheck"]
interval: 10s
timeout: 5s
retries: 10
# This job applies Lakekeeper's database schema to the Postgres instance —
# creating or upgrading the tables that Lakekeeper uses to track its catalog metadata.
lakekeeper-migrate:
image: vakamo/lakekeeper:v0.11.0
container_name: texera-lakekeeper-migrate
depends_on:
postgres:
condition: service_healthy
env_file:
- .env
restart: "no"
entrypoint: ["/home/nonroot/lakekeeper"]
command: ["migrate"]
# Lakekeeper is the Iceberg REST catalog service
lakekeeper:
image: vakamo/lakekeeper:v0.11.0
container_name: texera-lakekeeper
restart: always
depends_on:
postgres:
condition: service_healthy
minio:
condition: service_started
lakekeeper-migrate:
condition: service_completed_successfully
env_file:
- .env
entrypoint: ["/home/nonroot/lakekeeper"]
command: ["serve"]
healthcheck:
test: ["CMD", "/home/nonroot/lakekeeper", "healthcheck"]
interval: 10s
timeout: 5s
retries: 10
start_period: 10s
  # One-shot init container that creates the Lakekeeper default project and
  # the Iceberg warehouse pointing at the MinIO bucket prepared by minio-init.
  lakekeeper-init:
    image: alpine:3.19
    container_name: texera-lakekeeper-init
    depends_on:
      lakekeeper:
        condition: service_healthy
      minio-init:
        # The target bucket must already exist before the warehouse is created.
        condition: service_completed_successfully
    env_file:
      - .env
    # One-shot job: run once, never restart.
    restart: "no"
    entrypoint: [ "/bin/sh", "-c" ]
    # NOTE: "$$VAR" / "$$(...)" are Compose escapes — the container shell
    # expands them at runtime from the env_file variables. The heredoc body
    # and its EOF terminator sit at the block scalar's base indent so the
    # shell sees the terminator at column 0, as <<EOF requires.
    command:
      - |
        set -e
        echo "Installing curl (to call Lakekeeper's management API) and jq (to parse its list responses)..."
        apk add --no-cache curl jq
        # Lakekeeper organizes warehouses under "projects" (its top-level tenant
        # abstraction). Texera is single-tenant, so we use the NIL UUID (all
        # zeros) as the project ID — with LAKEKEEPER__ENABLE_DEFAULT_PROJECT=true
        # (Lakekeeper's default), this is the project that any client request
        # without a project-id is routed to.
        echo "Step 1: Ensuring Lakekeeper's default project exists (top-level tenant that will hold the Iceberg warehouse)..."
        echo "Checking whether Lakekeeper's default project already exists..."
        PROJECT_GET_CODE=$$(curl -s -o /tmp/project.txt -w "%{http_code}" \
          "$$LAKEKEEPER_BASE_URI/management/v1/project" || echo "000")
        if [ "$$PROJECT_GET_CODE" = "200" ]; then
          echo "Lakekeeper's default project already exists. Skipping creation."
        elif [ "$$PROJECT_GET_CODE" = "404" ]; then
          echo "Default project not found. Creating..."
          PROJECT_PAYLOAD='{"project-id": "00000000-0000-0000-0000-000000000000", "project-name": "default"}'
          PROJECT_CODE=$$(curl -s -o /tmp/response.txt -w "%{http_code}" \
            -X POST \
            -H "Content-Type: application/json" \
            -d "$$PROJECT_PAYLOAD" \
            "$$LAKEKEEPER_BASE_URI/management/v1/project" || echo "000")
          if [ "$$PROJECT_CODE" -lt 200 ] || [ "$$PROJECT_CODE" -ge 300 ]; then
            echo "Failed to create Lakekeeper's default project. HTTP Code: $$PROJECT_CODE"
            echo "ERROR RESPONSE:"
            if [ -f /tmp/response.txt ]; then cat /tmp/response.txt; fi
            echo ""
            exit 1
          fi
          echo "Created Lakekeeper's default project successfully (HTTP $$PROJECT_CODE)."
        else
          echo "Failed to check Lakekeeper's default project. HTTP Code: $$PROJECT_GET_CODE"
          echo "ERROR RESPONSE:"
          if [ -f /tmp/project.txt ]; then cat /tmp/project.txt; fi
          echo ""
          exit 1
        fi
        echo "Step 2: Ensuring Warehouse '$$STORAGE_ICEBERG_CATALOG_REST_WAREHOUSE_NAME' exists..."
        # Detect existing warehouse first so we only POST when it's actually
        # missing — keeps this init idempotent across re-runs.
        echo "Checking whether Lakekeeper Warehouse '$$STORAGE_ICEBERG_CATALOG_REST_WAREHOUSE_NAME' already exists..."
        WAREHOUSE_LIST_CODE=$$(curl -s -o /tmp/warehouses.txt -w "%{http_code}" \
          "$$LAKEKEEPER_BASE_URI/management/v1/warehouse" || echo "000")
        if [ "$$WAREHOUSE_LIST_CODE" -lt 200 ] || [ "$$WAREHOUSE_LIST_CODE" -ge 300 ]; then
          echo "Failed to list Lakekeeper warehouses. HTTP Code: $$WAREHOUSE_LIST_CODE"
          echo "ERROR RESPONSE:"
          if [ -f /tmp/warehouses.txt ]; then cat /tmp/warehouses.txt; fi
          echo ""
          exit 1
        fi
        if jq -e --arg name "$$STORAGE_ICEBERG_CATALOG_REST_WAREHOUSE_NAME" '.warehouses[]? | select(.name == $$name)' /tmp/warehouses.txt >/dev/null; then
          echo "Lakekeeper Warehouse '$$STORAGE_ICEBERG_CATALOG_REST_WAREHOUSE_NAME' already exists. Skipping creation."
        else
          echo "Warehouse not found. Creating '$$STORAGE_ICEBERG_CATALOG_REST_WAREHOUSE_NAME'..."
          CREATE_PAYLOAD=$$(cat <<EOF
        {
          "warehouse-name": "$$STORAGE_ICEBERG_CATALOG_REST_WAREHOUSE_NAME",
          "project-id": "00000000-0000-0000-0000-000000000000",
          "storage-profile": {
            "type": "s3",
            "bucket": "$$STORAGE_ICEBERG_CATALOG_REST_S3_BUCKET",
            "region": "$$STORAGE_S3_REGION",
            "endpoint": "$$STORAGE_S3_ENDPOINT",
            "flavor": "s3-compat",
            "path-style-access": true,
            "sts-enabled": false
          },
          "storage-credential": {
            "type": "s3",
            "credential-type": "access-key",
            "aws-access-key-id": "$$STORAGE_S3_AUTH_USERNAME",
            "aws-secret-access-key": "$$STORAGE_S3_AUTH_PASSWORD"
          }
        }
        EOF
        )
          WAREHOUSE_CODE=$$(curl -s -o /tmp/response.txt -w "%{http_code}" \
            -X POST \
            -H "Content-Type: application/json" \
            -d "$$CREATE_PAYLOAD" \
            "$$LAKEKEEPER_BASE_URI/management/v1/warehouse" || echo "000")
          if [ "$$WAREHOUSE_CODE" -lt 200 ] || [ "$$WAREHOUSE_CODE" -ge 300 ]; then
            echo "Failed to create Lakekeeper Warehouse. HTTP Code: $$WAREHOUSE_CODE"
            echo "ERROR RESPONSE:"
            if [ -f /tmp/response.txt ]; then cat /tmp/response.txt; fi
            echo ""
            exit 1
          fi
          echo "Created Lakekeeper Warehouse successfully (HTTP $$WAREHOUSE_CODE)."
        fi
        echo "Initialization sequence completed successfully!"
# Part2: Specification of Texera's micro-services
# FileService provides endpoints for Texera's dataset management
file-service:
image: ${IMAGE_REGISTRY:-ghcr.io/apache}/texera-file-service:${IMAGE_TAG:-latest}
container_name: file-service
restart: always
depends_on:
minio:
condition: service_started
lakefs:
condition: service_healthy
env_file:
- .env
healthcheck:
test: ["CMD", "curl", "-sf", "http://localhost:9092/api/healthcheck"]
interval: 5s
timeout: 3s
retries: 10
# ConfigService provides endpoints for configuration management
config-service:
image: ${IMAGE_REGISTRY:-ghcr.io/apache}/texera-config-service:${IMAGE_TAG:-latest}
container_name: config-service
restart: always
depends_on:
postgres:
condition: service_healthy
env_file:
- .env
healthcheck:
test: ["CMD", "curl", "-sf", "http://localhost:9094/api/healthcheck"]
interval: 5s
timeout: 3s
retries: 10
# AccessControlService handles user permissions and access control
access-control-service:
image: ${IMAGE_REGISTRY:-ghcr.io/apache}/texera-access-control-service:${IMAGE_TAG:-latest}
container_name: access-control-service
restart: always
depends_on:
postgres:
condition: service_healthy
env_file:
- .env
healthcheck:
test: ["CMD", "curl", "-sf", "http://localhost:9096/api/healthcheck"]
interval: 5s
timeout: 3s
retries: 10
# WorkflowComputingUnitManagingService provides endpoints for managing computing units
workflow-computing-unit-managing-service:
image: ${IMAGE_REGISTRY:-ghcr.io/apache}/texera-workflow-computing-unit-managing-service:${IMAGE_TAG:-latest}
container_name: workflow-computing-unit-managing-service
restart: always
depends_on:
postgres:
condition: service_healthy
env_file:
- .env
healthcheck:
test: ["CMD", "curl", "-sf", "http://localhost:8888/api/healthcheck"]
interval: 5s
timeout: 3s
retries: 10
# WorkflowCompilingService provides endpoints for sanity check and schema propagation while workflows are being edited
workflow-compiling-service:
image: ${IMAGE_REGISTRY:-ghcr.io/apache}/texera-workflow-compiling-service:${IMAGE_TAG:-latest}
container_name: workflow-compiling-service
restart: always
depends_on:
file-service:
condition: service_started
env_file:
- .env
healthcheck:
test: ["CMD", "curl", "-sf", "http://localhost:9090/api/healthcheck"]
interval: 5s
timeout: 3s
retries: 10
# WorkflowRuntimeCoordinatorService provides endpoints for executing workflows and interactions during executions.
workflow-runtime-coordinator-service:
image: ${IMAGE_REGISTRY:-ghcr.io/apache}/texera-workflow-execution-coordinator:${IMAGE_TAG:-latest}
container_name: workflow-runtime-coordinator-service
restart: always
depends_on:
workflow-compiling-service:
condition: service_started
lakekeeper:
condition: service_healthy
lakekeeper-init:
condition: service_completed_successfully
env_file:
- .env
volumes:
- workflow_result_data:/amber/user-resources
# DashboardService provides endpoints for hub resource management.
dashboard-service:
image: ${IMAGE_REGISTRY:-ghcr.io/apache}/texera-dashboard-service:${IMAGE_TAG:-latest}
container_name: dashboard-service
restart: always
depends_on:
workflow-runtime-coordinator-service:
condition: service_started
workflow-compiling-service:
condition: service_healthy
file-service:
condition: service_healthy
env_file:
- .env
volumes:
- workflow_result_data:/amber/user-resources
healthcheck:
test: ["CMD", "curl", "-sf", "http://localhost:8080/api/healthcheck"]
interval: 5s
timeout: 3s
retries: 10
# Part 3: reverse proxy service for Texera's micro services
nginx:
image: nginx:alpine
container_name: texera-nginx
depends_on:
- workflow-compiling-service
- file-service
- dashboard-service
- workflow-runtime-coordinator-service
- config-service
- access-control-service
- workflow-computing-unit-managing-service
volumes:
- ./nginx.conf:/etc/nginx/nginx.conf:ro
ports:
- "${TEXERA_PORT:-8080}:8080"
  # Prints a banner once the proxy is up so users know where to connect.
  startup-message:
    # NOTE(review): "latest" is an unpinned tag — consider pinning it like
    # the other images in this file.
    image: alpine:latest
    depends_on:
      nginx:
        condition: service_started
    environment:
      - TEXERA_PORT=${TEXERA_PORT:-8080}
    # One-shot job: print and exit.
    restart: "no"
    # "$$TEXERA_PORT" is Compose escaping; the container shell expands it
    # from the environment entry above at runtime.
    command: >
      sh -c '
      echo "";
      echo "=========================================";
      echo " Texera has started successfully!";
      echo " Access at: http://localhost:$$TEXERA_PORT";
      echo "=========================================";
      echo "";
      '
  # Part 4: Optional one-shot jobs
  # Loads example datasets and workflows into Texera on first startup.
  # Only runs when the "examples" profile is activated:
  # docker compose --profile examples up
  example-data-loader:
    image: alpine:latest
    depends_on:
      dashboard-service:
        condition: service_healthy
      file-service:
        condition: service_healthy
    volumes:
      # Example payloads and the loader script, mounted read-only.
      - ./examples:/examples:ro
    environment:
      # Internal service URLs (container-to-container on the compose network).
      - TEXERA_DASHBOARD_SERVICE_URL=http://dashboard-service:8080/api
      - TEXERA_FILE_SERVICE_URL=http://file-service:9092/api
      # Admin credentials from .env, used by the loader script.
      - TEXERA_EXAMPLE_USERNAME=${USER_SYS_ADMIN_USERNAME}
      - TEXERA_EXAMPLE_PASSWORD=${USER_SYS_ADMIN_PASSWORD}
    # One-shot job: run once, never restart.
    restart: "no"
    profiles:
      - examples
    command: >
      sh -c 'apk add --no-cache curl jq bash > /dev/null 2>&1 && bash /examples/load-examples.sh'
networks:
  # Give the default network a fixed name so external tooling can reference
  # it predictably.
  default:
    name: texera-single-node
# persistent volumes
volumes:
  minio_data:
  postgres_data:
  workflow_result_data: