#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
set -euo pipefail
# Remove SUCCESS file from previous run to ensure fresh initialization
SUCCESS_FILE="/opt/hudi-scripts/SUCCESS"
if [[ -f "${SUCCESS_FILE}" ]]; then
  echo "Removing previous SUCCESS file to ensure fresh initialization..."
  rm -f "${SUCCESS_FILE}"
fi
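# Required environment variables -- all of them are expected to be injected by the
# container environment (e.g. the docker-compose service definition); because of
# `set -u`, the script aborts at the first expansion of any variable that is missing:
#   HIVE_METASTORE_URIS        thrift://<host>:<port> of the external Hive Metastore
#                              (e.g. thrift://hive-metastore:9083 -- illustrative value only)
#   S3_ENDPOINT                MinIO endpoint for S3A (e.g. http://minio:9000 -- illustrative only)
#   MINIO_ROOT_USER / MINIO_ROOT_PASSWORD   MinIO credentials used as the S3A access/secret key
#   HUDI_BUCKET                bucket that holds the warehouse directory
#   HUDI_BUNDLE_VERSION / HUDI_BUNDLE_URL
#   HADOOP_AWS_VERSION / HADOOP_AWS_URL
#   AWS_SDK_BUNDLE_VERSION / AWS_SDK_BUNDLE_URL
#   POSTGRESQL_JDBC_VERSION / POSTGRESQL_JDBC_URL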
SPARK_HOME=/opt/spark
CONF_DIR="${SPARK_HOME}/conf"
JARS_DIR="${SPARK_HOME}/jars"
CACHE_DIR=/opt/hudi-cache
mkdir -p "${CONF_DIR}" "${CACHE_DIR}"
# Function to download a JAR file if it doesn't exist
download_jar() {
  local jar_name="$1"
  local version="$2"
  local url="$3"
  local jar_file="${CACHE_DIR}/${jar_name}-${version}.jar"
  if [[ ! -f "${jar_file}" ]]; then
    echo "Downloading ${jar_name} JAR ${version} from ${url} ..." >&2
    local download_success=false
    if command -v curl >/dev/null 2>&1; then
      if curl -sSfL "${url}" -o "${jar_file}"; then
        download_success=true
      else
        echo "Error: Failed to download ${jar_name} from ${url}" >&2
      fi
    elif command -v wget >/dev/null 2>&1; then
      if wget -qO "${jar_file}" "${url}"; then
        download_success=true
      else
        echo "Error: Failed to download ${jar_name} from ${url}" >&2
      fi
    else
      echo "Error: Neither curl nor wget is available in the hudi-spark container." >&2
      exit 1
    fi
    if [[ "${download_success}" == "false" ]]; then
      # Drop any partial download so the next run does not treat it as a valid cached JAR
      rm -f "${jar_file}"
      echo "Error: Failed to download ${jar_name} JAR. Please check the URL: ${url}" >&2
      exit 1
    fi
    if [[ ! -f "${jar_file}" ]]; then
      echo "Error: Downloaded file ${jar_file} does not exist" >&2
      exit 1
    fi
  fi
  echo "${jar_file}"
}
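# Usage sketch (the values below are illustrative only, not the versions this environment
# actually pins -- those come from the *_VERSION / *_URL variables):
#   jar=$(download_jar "postgresql" "42.7.3" \
#     "https://repo1.maven.org/maven2/org/postgresql/postgresql/42.7.3/postgresql-42.7.3.jar")
# Progress and error messages go to stderr so that the captured stdout is exactly the path
# of the cached JAR.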
# Function to link a JAR file to Spark jars directory
link_jar() {
  local jar_file="$1"
  local jar_name="$2"
  local version="$3"
  ln -sf "${jar_file}" "${JARS_DIR}/${jar_name}-${version}.jar"
}
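# `ln -sf` overwrites an existing link of the same name, so repeated runs of this script stay
# idempotent: the symlink in ${JARS_DIR} always points at the current cached copy of the JAR.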
# Wait for Hive Metastore to be ready
echo "Waiting for Hive Metastore to be ready..."
METASTORE_HOST=$(echo "${HIVE_METASTORE_URIS}" | sed 's|thrift://||' | cut -d: -f1)
METASTORE_PORT=$(echo "${HIVE_METASTORE_URIS}" | sed 's|thrift://||' | cut -d: -f2)
MAX_RETRIES=120
RETRY_COUNT=0
while [ $RETRY_COUNT -lt $MAX_RETRIES ]; do
  if command -v nc >/dev/null 2>&1; then
    if nc -z "${METASTORE_HOST}" "${METASTORE_PORT}" 2>/dev/null; then
      echo "Hive Metastore is ready at ${METASTORE_HOST}:${METASTORE_PORT}"
      break
    fi
  elif command -v timeout >/dev/null 2>&1; then
    if timeout 1 bash -c "cat < /dev/null > /dev/tcp/${METASTORE_HOST}/${METASTORE_PORT}" 2>/dev/null; then
      echo "Hive Metastore is ready at ${METASTORE_HOST}:${METASTORE_PORT}"
      break
    fi
  else
    # Fallback: just wait a bit and assume it's ready
    if [ $RETRY_COUNT -eq 0 ]; then
      echo "Warning: nc or timeout command not available, skipping metastore readiness check"
      sleep 10
      break
    fi
  fi
  RETRY_COUNT=$((RETRY_COUNT + 1))
  if [ $((RETRY_COUNT % 10)) -eq 0 ]; then
    echo "Waiting for Hive Metastore... (${RETRY_COUNT}/${MAX_RETRIES})"
  fi
  sleep 2
done
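# Both probes above are plain TCP connect checks: `nc -z` from netcat, and the
# /dev/tcp/<host>/<port> form, which is a bash redirection built-in rather than a real device
# file, wrapped in `timeout 1` so a hanging connect attempt cannot stall the loop.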
if [ $RETRY_COUNT -ge $MAX_RETRIES ]; then
  echo "Error: Hive Metastore did not become ready within $((MAX_RETRIES * 2)) seconds"
  exit 1
fi
# Write core-site for MinIO (S3A)
cat >"${CONF_DIR}/core-site.xml" <<EOF
<configuration>
<property>
<name>fs.s3a.endpoint</name>
<value>${S3_ENDPOINT}</value>
</property>
<property>
<name>fs.s3a.access.key</name>
<value>${MINIO_ROOT_USER}</value>
</property>
<property>
<name>fs.s3a.secret.key</name>
<value>${MINIO_ROOT_PASSWORD}</value>
</property>
<property>
<name>fs.s3a.path.style.access</name>
<value>true</value>
</property>
<property>
<name>fs.s3a.impl</name>
<value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>
</property>
</configuration>
EOF
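# Hadoop's Configuration loads core-site.xml from the classpath, and ${SPARK_HOME}/conf is on
# Spark's classpath, so every Spark job in this container picks up the S3A settings above.
# Path-style access is needed because MinIO serves buckets under the endpoint path
# (http://<endpoint>/<bucket>/...) rather than as virtual-hosted <bucket>.<endpoint> names.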
# hive-site to point Spark to the external metastore
cat >"${CONF_DIR}/hive-site.xml" <<EOF
<configuration>
<property>
<name>hive.metastore.uris</name>
<value>${HIVE_METASTORE_URIS}</value>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>org.postgresql.Driver</value>
</property>
<property>
<name>datanucleus.schema.autoCreateAll</name>
<value>true</value>
</property>
<property>
<name>hive.metastore.warehouse.dir</name>
<value>s3a://${HUDI_BUCKET}/warehouse</value>
</property>
</configuration>
EOF
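# With spark.sql.catalogImplementation=hive (set on the spark-sql invocation below), Spark reads
# this hive-site.xml from ${SPARK_HOME}/conf and connects to the external metastore through
# hive.metastore.uris instead of spinning up an embedded Derby metastore of its own.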
# Download Hudi bundle
HUDI_BUNDLE_JAR_FILE=$(download_jar "hudi-spark3.5-bundle_2.12" "${HUDI_BUNDLE_VERSION}" "${HUDI_BUNDLE_URL}")
link_jar "${HUDI_BUNDLE_JAR_FILE}" "hudi-spark3.5-bundle_2.12" "${HUDI_BUNDLE_VERSION}"
# Download Hadoop AWS S3A filesystem JAR (required for S3A support)
# Note: hadoop-common is already provided by Spark's bundled Hadoop, so there is no need to download it separately
HADOOP_AWS_JAR=$(download_jar "hadoop-aws" "${HADOOP_AWS_VERSION}" "${HADOOP_AWS_URL}")
link_jar "${HADOOP_AWS_JAR}" "hadoop-aws" "${HADOOP_AWS_VERSION}"
# Download AWS Java SDK Bundle v1 (required for Hadoop 3.3.6 S3A support)
# Note: Hadoop 3.3.x uses AWS SDK v1; version 1.12.262 is recommended
AWS_SDK_BUNDLE_JAR=$(download_jar "aws-java-sdk-bundle" "${AWS_SDK_BUNDLE_VERSION}" "${AWS_SDK_BUNDLE_URL}")
link_jar "${AWS_SDK_BUNDLE_JAR}" "aws-java-sdk-bundle" "${AWS_SDK_BUNDLE_VERSION}"
# Download PostgreSQL JDBC driver (required for Hive Metastore connection)
POSTGRESQL_JDBC_JAR=$(download_jar "postgresql" "${POSTGRESQL_JDBC_VERSION}" "${POSTGRESQL_JDBC_URL}")
link_jar "${POSTGRESQL_JDBC_JAR}" "postgresql" "${POSTGRESQL_JDBC_VERSION}"
# Process SQL files with environment variable substitution and execute them
# Similar to the Iceberg setup: group SQL files together to reduce client creation overhead
SCRIPTS_DIR="/opt/hudi-scripts/create_preinstalled_scripts/hudi"
TEMP_SQL_DIR="/tmp/hudi_sql"
if [[ -d "${SCRIPTS_DIR}" ]]; then
  mkdir -p "${TEMP_SQL_DIR}"
  # Start from an empty combined file in case a previous (failed) run left one behind
  rm -f "${TEMP_SQL_DIR}/hudi_total.sql"
  # Process each SQL file: substitute environment variables and combine them
  echo "Processing Hudi SQL scripts..."
  for sql_file in $(find "${SCRIPTS_DIR}" -name '*.sql' | sort); do
    echo "Processing ${sql_file}..."
    # Use sed to replace environment variables in SQL files
    # (the ${HIVE_METASTORE_URIS} and ${HUDI_BUCKET} placeholders become actual values)
    sed "s|\${HIVE_METASTORE_URIS}|${HIVE_METASTORE_URIS}|g; s|\${HUDI_BUCKET}|${HUDI_BUCKET}|g" "${sql_file}" >> "${TEMP_SQL_DIR}/hudi_total.sql"
    echo "" >> "${TEMP_SQL_DIR}/hudi_total.sql"
  done
  # Run Spark SQL to execute all SQL scripts
  echo "Executing Hudi SQL scripts..."
  START_TIME=$(date +%s)
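  # The combined hudi_total.sql is expected to contain ordinary Spark SQL / Hudi DDL and DML after
  # placeholder substitution; a hypothetical statement, just to illustrate the shape:
  #   CREATE TABLE demo_hudi_table (id INT, name STRING) USING hudi
  #     LOCATION 's3a://<bucket>/warehouse/demo_hudi_table' TBLPROPERTIES (primaryKey = 'id');
  # The three Hudi-specific confs below follow the standard Hudi-on-Spark setup: Kryo serialization
  # is required by Hudi, HoodieSparkSessionExtension enables Hudi's SQL extensions, and HoodieCatalog
  # lets the default spark_catalog manage Hudi tables.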
  "${SPARK_HOME}/bin/spark-sql" \
    --master local[*] \
    --name hudi-init \
    --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
    --conf spark.sql.catalogImplementation=hive \
    --conf spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension \
    --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog \
    -f "${TEMP_SQL_DIR}/hudi_total.sql"
  END_TIME=$(date +%s)
  EXECUTION_TIME=$((END_TIME - START_TIME))
  echo "Hudi SQL scripts executed in ${EXECUTION_TIME} seconds"
  # Clean up temporary SQL file
  rm -f "${TEMP_SQL_DIR}/hudi_total.sql"
else
  echo "Warning: SQL scripts directory ${SCRIPTS_DIR} not found, skipping table initialization."
fi
# Create success marker file to indicate initialization is complete
# This file is used by docker healthcheck to verify container is ready
touch "${SUCCESS_FILE}"
echo "Hudi demo data initialized."
echo "Initialization completed successfully."
# Keep container running for healthcheck and potential future use
# Similar to the Iceberg setup: tail -f /dev/null
tail -f /dev/null