Starting with version 0.11, Hudi provides hudi-utilities-slim-bundle, which excludes the hudi-spark-datasource modules. This bundle is intended to be used together with the Hudi Spark bundle when using hudi-utilities-bundle on its own causes problems for a specific Spark version.
# Example for Spark 2.4 / Scala 2.11: build Hudi, then run HoodieDeltaStreamer
# with the slim utilities bundle as the application jar and the Hudi Spark
# bundle supplied through --jars.

# Build Hudi without any Spark/Scala version flags, skipping tests. The jar
# names used below carry the _2.11 suffix.
mvn clean install -DskipTests

# Launch the delta streamer.
# - bin/spark-submit is a relative path, so run this from a directory that
#   contains it (presumably the Spark installation root).
# - /path/to/hudi looks like a placeholder for the Hudi source checkout;
#   replace it with the real location.
# - Jar and properties paths are passed directly and quoted rather than through
#   `ls`, so a path with spaces stays one argument and a missing file is
#   reported by spark-submit instead of silently becoming an empty argument.
bin/spark-submit \
  --driver-memory 4g --executor-memory 2g --num-executors 3 --executor-cores 1 \
  --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
  --conf spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar \
  --conf spark.sql.catalogImplementation=hive \
  --conf spark.driver.maxResultSize=1g \
  --conf spark.ui.port=6679 \
  --packages org.apache.spark:spark-avro_2.11:2.4.7 \
  --jars "/path/to/hudi/packaging/hudi-spark-bundle/target/hudi-spark-bundle_2.11-0.12.0-SNAPSHOT.jar" \
  --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer \
  "/path/to/hudi/packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.11-0.12.0-SNAPSHOT.jar" \
  --props "/path/to/hudi/dfs-source.properties" \
  --source-class org.apache.hudi.utilities.sources.ParquetDFSSource \
  --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
  --source-ordering-field tpep_dropoff_datetime \
  --table-type COPY_ON_WRITE \
  --target-base-path file:///tmp/hudi-ny-taxi-spark24/ \
  --target-table ny_hudi_tbl \
  --op UPSERT \
  --continuous \
  --source-limit 5000000 \
  --min-sync-interval-seconds 60
# Example for Spark 3.1 / Scala 2.12: build Hudi, then run HoodieDeltaStreamer
# with the slim utilities bundle as the application jar and the Hudi Spark 3.1
# bundle supplied through --jars.

# Build Hudi with the spark3.1 and scala-2.12 build properties, skipping tests.
mvn clean install -DskipTests -Dspark3.1 -Dscala-2.12

# Launch the delta streamer.
# - bin/spark-submit is a relative path, so run this from a directory that
#   contains it (presumably the Spark installation root).
# - /path/to/hudi looks like a placeholder for the Hudi source checkout;
#   replace it with the real location.
# - Jar and properties paths are passed directly and quoted rather than through
#   `ls`, so a path with spaces stays one argument and a missing file is
#   reported by spark-submit instead of silently becoming an empty argument.
bin/spark-submit \
  --driver-memory 4g --executor-memory 2g --num-executors 3 --executor-cores 1 \
  --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
  --conf spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar \
  --conf spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension \
  --conf spark.sql.catalogImplementation=hive \
  --conf spark.driver.maxResultSize=1g \
  --conf spark.ui.port=6679 \
  --packages org.apache.spark:spark-avro_2.12:3.1.2 \
  --jars "/path/to/hudi/packaging/hudi-spark-bundle/target/hudi-spark3.1-bundle_2.12-0.12.0-SNAPSHOT.jar" \
  --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer \
  "/path/to/hudi/packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.12-0.12.0-SNAPSHOT.jar" \
  --props "/path/to/hudi/dfs-source.properties" \
  --source-class org.apache.hudi.utilities.sources.ParquetDFSSource \
  --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
  --source-ordering-field tpep_dropoff_datetime \
  --table-type COPY_ON_WRITE \
  --target-base-path file:///tmp/hudi-ny-taxi-spark31/ \
  --target-table ny_hudi_tbl \
  --op UPSERT \
  --continuous \
  --source-limit 5000000 \
  --min-sync-interval-seconds 60
# Example for Spark 3.2 / Scala 2.12: build Hudi, then run HoodieDeltaStreamer
# with the slim utilities bundle as the application jar and the Hudi Spark 3.2
# bundle supplied through --jars.

# Build Hudi with the spark3.2 and scala-2.12 build properties, skipping tests.
mvn clean install -DskipTests -Dspark3.2 -Dscala-2.12

# Launch the delta streamer.
# - bin/spark-submit is a relative path, so run this from a directory that
#   contains it (presumably the Spark installation root).
# - /path/to/hudi looks like a placeholder for the Hudi source checkout;
#   replace it with the real location.
# - Jar and properties paths are passed directly and quoted rather than through
#   `ls`, so a path with spaces stays one argument and a missing file is
#   reported by spark-submit instead of silently becoming an empty argument.
bin/spark-submit \
  --driver-memory 4g --executor-memory 2g --num-executors 3 --executor-cores 1 \
  --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
  --conf spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar \
  --conf spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension \
  --conf spark.sql.catalogImplementation=hive \
  --conf spark.driver.maxResultSize=1g \
  --conf spark.ui.port=6679 \
  --packages org.apache.spark:spark-avro_2.12:3.2.0 \
  --jars "/path/to/hudi/packaging/hudi-spark-bundle/target/hudi-spark3.2-bundle_2.12-0.12.0-SNAPSHOT.jar" \
  --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer \
  "/path/to/hudi/packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.12-0.12.0-SNAPSHOT.jar" \
  --props "/path/to/hudi/dfs-source.properties" \
  --source-class org.apache.hudi.utilities.sources.ParquetDFSSource \
  --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
  --source-ordering-field tpep_dropoff_datetime \
  --table-type COPY_ON_WRITE \
  --target-base-path file:///tmp/hudi-ny-taxi-spark32/ \
  --target-table ny_hudi_tbl \
  --op UPSERT \
  --continuous \
  --source-limit 5000000 \
  --min-sync-interval-seconds 60