/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.hudi.DataSourceReadOptions;
import org.apache.hudi.DataSourceWriteOptions;
import org.apache.hudi.HoodieDataSourceHelpers;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.config.HoodieCompactionConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.TableNotFoundException;
import org.apache.hudi.hive.MultiPartKeysValueExtractor;

import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.streaming.DataStreamWriter;
import org.apache.spark.sql.streaming.OutputMode;
import org.apache.spark.sql.streaming.ProcessingTime;
import org.apache.spark.sql.streaming.StreamingQuery;

import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.stream.Collectors;

import static org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrings;
/**
* Sample program that writes to and reads from Hudi tables via the Spark datasource, using structured streaming for the writes.
*/
public class HoodieJavaStreamingApp {
@Parameter(names = {"--table-path", "-p"}, description = "path for Hoodie sample table")
private String tablePath = "/tmp/hoodie/streaming/sample-table";
@Parameter(names = {"--streaming-source-path", "-ssp"}, description = "path for streaming source file folder")
private String streamingSourcePath = "/tmp/hoodie/streaming/source";
@Parameter(names = {"--streaming-checkpointing-path", "-scp"},
description = "path for the streaming checkpointing folder")
private String streamingCheckpointingPath = "/tmp/hoodie/streaming/checkpoint";
@Parameter(names = {"--streaming-duration-in-ms", "-sdm"},
description = "streaming duration in milliseconds")
private Long streamingDurationInMs = 15000L;
@Parameter(names = {"--table-name", "-n"}, description = "table name for Hoodie sample table")
private String tableName = "hoodie_test";
@Parameter(names = {"--table-type", "-t"}, description = "One of COPY_ON_WRITE or MERGE_ON_READ")
private String tableType = HoodieTableType.MERGE_ON_READ.name();
@Parameter(names = {"--hive-sync", "-hv"}, description = "Enable syncing to hive")
private Boolean enableHiveSync = false;
@Parameter(names = {"--hive-db", "-hd"}, description = "hive database")
private String hiveDB = "default";
@Parameter(names = {"--hive-table", "-ht"}, description = "hive table")
private String hiveTable = "hoodie_sample_test";
@Parameter(names = {"--hive-user", "-hu"}, description = "hive username")
private String hiveUser = "hive";
@Parameter(names = {"--hive-password", "-hp"}, description = "hive password")
private String hivePass = "hive";
@Parameter(names = {"--hive-url", "-hl"}, description = "hive JDBC URL")
private String hiveJdbcUrl = "jdbc:hive2://localhost:10000";
@Parameter(names = {"--use-multi-partition-keys", "-mp"}, description = "Use Multiple Partition Keys")
private Boolean useMultiPartitionKeys = false;
@Parameter(names = {"--help", "-h"}, help = true)
public Boolean help = false;
private static final Logger LOG = LogManager.getLogger(HoodieJavaStreamingApp.class);
public static void main(String[] args) throws Exception {
HoodieJavaStreamingApp cli = new HoodieJavaStreamingApp();
JCommander cmd = new JCommander(cli, null, args);
if (cli.help) {
cmd.usage();
System.exit(1);
}
int errStatus = 0;
try {
cli.run();
} catch (Exception ex) {
LOG.error("Got error running app ", ex);
errStatus = -1;
} finally {
System.exit(errStatus);
}
}
/**
* Runs the end-to-end flow: an upsert stream with validation, followed by a delete stream with validation.
*
* @throws Exception on any ingestion or validation failure
*/
public void run() throws Exception {
// Spark session setup.
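// Kryo serialization is configured because Hudi's Spark writers rely on it; local[1] keeps this sample lightweight.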
SparkSession spark = SparkSession.builder().appName("Hoodie Spark Streaming APP")
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer").master("local[1]").getOrCreate();
JavaSparkContext jssc = new JavaSparkContext(spark.sparkContext());
// folder path clean up and creation, preparing the environment
FileSystem fs = FileSystem.get(jssc.hadoopConfiguration());
fs.delete(new Path(streamingSourcePath), true);
fs.delete(new Path(streamingCheckpointingPath), true);
fs.delete(new Path(tablePath), true);
fs.mkdirs(new Path(streamingSourcePath));
// Generator of some records to be loaded in.
HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
List<String> records1 = recordsToStrings(dataGen.generateInserts("001", 100));
Dataset<Row> inputDF1 = spark.read().json(jssc.parallelize(records1, 2));
List<String> records2 = recordsToStrings(dataGen.generateUpdatesForAllRecords("002"));
Dataset<Row> inputDF2 = spark.read().json(jssc.parallelize(records2, 2));
String ckptPath = streamingCheckpointingPath + "/stream1";
String srcPath = streamingSourcePath + "/stream1";
fs.mkdirs(new Path(ckptPath));
fs.mkdirs(new Path(srcPath));
// setup the input for streaming
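// A streaming file source needs an explicit schema, so reuse the schema inferred from the first batch above.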
Dataset<Row> streamingInput = spark.readStream().schema(inputDF1.schema()).json(srcPath + "/*");
// start streaming and showing
ExecutorService executor = Executors.newFixedThreadPool(2);
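// One thread runs the (time-bounded) streaming write, the other drops files into the source folder and validates results.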
int numInitialCommits = 0;
// thread for Spark structured streaming
try {
Future<Void> streamFuture = executor.submit(() -> {
LOG.info("===== Streaming Starting =====");
stream(streamingInput, DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL(), ckptPath);
LOG.info("===== Streaming Ends =====");
return null;
});
// thread for adding data to the streaming source and showing results over time
Future<Integer> showFuture = executor.submit(() -> {
LOG.info("===== Showing Starting =====");
int numCommits = addInputAndValidateIngestion(spark, fs, srcPath, 0, 100, inputDF1, inputDF2, true);
LOG.info("===== Showing Ends =====");
return numCommits;
});
// let the threads run
streamFuture.get();
numInitialCommits = showFuture.get();
} finally {
executor.shutdownNow();
}
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jssc.hadoopConfiguration(), tablePath);
if (tableType.equals(HoodieTableType.MERGE_ON_READ.name())) {
// Ensure we have successfully completed one compaction commit
ValidationUtils.checkArgument(metaClient.getActiveTimeline().getCommitTimeline().getInstants().count() == 1);
} else {
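// For COPY_ON_WRITE, every successful write produces a commit on the commit timeline, so expect at least one.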
ValidationUtils.checkArgument(metaClient.getActiveTimeline().getCommitTimeline().getInstants().count() >= 1);
}
// Delete stream
// Restart the application (fresh SparkSession) so Spark does not assume there are multiple streams active.
spark.close();
SparkSession newSpark = SparkSession.builder().appName("Hoodie Spark Streaming APP")
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer").master("local[1]").getOrCreate();
jssc = new JavaSparkContext(newSpark.sparkContext());
String ckptPath2 = streamingCheckpointingPath + "/stream2";
String srcPath2 = srcPath + "/stream2";
fs.mkdirs(new Path(ckptPath2));
fs.mkdirs(new Path(srcPath2));
Dataset<Row> delStreamingInput = newSpark.readStream().schema(inputDF1.schema()).json(srcPath2 + "/*");
List<String> deletes = recordsToStrings(dataGen.generateUniqueUpdates("002", 20));
Dataset<Row> inputDF3 = newSpark.read().json(jssc.parallelize(deletes, 2));
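// Delete 20 of the 100 inserted records; the validation below therefore expects 80 records to remain.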
executor = Executors.newFixedThreadPool(2);
// thread for Spark structured streaming
try {
Future<Void> streamFuture = executor.submit(() -> {
LOG.info("===== Streaming Starting =====");
stream(delStreamingInput, DataSourceWriteOptions.DELETE_OPERATION_OPT_VAL(), ckptPath2);
LOG.info("===== Streaming Ends =====");
return null;
});
final int numCommits = numInitialCommits;
// thread for adding data to the streaming source and showing results over time
Future<Void> showFuture = executor.submit(() -> {
LOG.info("===== Showing Starting =====");
addInputAndValidateIngestion(newSpark, fs, srcPath2, numCommits, 80, inputDF3, null, false);
LOG.info("===== Showing Ends =====");
return null;
});
// let the threads run
streamFuture.get();
showFuture.get();
} finally {
executor.shutdown();
}
}
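/**
* Polls the table's completed commits/compactions timeline until at least {@code numCommits} instants appear,
* failing if the timeout elapses first.
*/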
private void waitTillNCommits(FileSystem fs, int numCommits, int timeoutSecs, int sleepSecsAfterEachRun)
throws InterruptedException {
long beginTime = System.currentTimeMillis();
long currTime = beginTime;
long timeoutMsecs = timeoutSecs * 1000;
while ((currTime - beginTime) < timeoutMsecs) {
try {
HoodieTimeline timeline = HoodieDataSourceHelpers.allCompletedCommitsCompactions(fs, tablePath);
LOG.info("Timeline :" + timeline.getInstants().collect(Collectors.toList()));
if (timeline.countInstants() >= numCommits) {
return;
}
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), tablePath, true);
LOG.info("Instants :" + metaClient.getActiveTimeline().getInstants().collect(Collectors.toList()));
} catch (TableNotFoundException te) {
LOG.info("Got table not found exception. Retrying");
} finally {
Thread.sleep(sleepSecsAfterEachRun * 1000);
currTime = System.currentTimeMillis();
}
}
throw new IllegalStateException("Timed out waiting for " + numCommits + " commits to appear in " + tablePath);
}
/**
* Adds data to the streaming source and validates the ingested results over time.
*
* @param spark active Spark session
* @param fs filesystem used to inspect the Hudi timeline
* @param srcPath folder the streaming source reads newly arriving JSON files from
* @param initialCommits number of commits already present before this round
* @param expRecords expected number of records after ingestion
* @param inputDF1 first batch written to the source folder
* @param inputDF2 optional second batch (may be null)
* @param instantTimeValidation whether to also validate the record count at the latest instant time
* @return the total number of commits expected so far
* @throws Exception on validation failure or timeout
*/
public int addInputAndValidateIngestion(SparkSession spark, FileSystem fs, String srcPath,
int initialCommits, int expRecords,
Dataset<Row> inputDF1, Dataset<Row> inputDF2, boolean instantTimeValidation) throws Exception {
// Ensure we always write only one file. This is very important to ensure a single batch is reliably read
// atomically by one iteration of spark streaming.
inputDF1.coalesce(1).write().mode(SaveMode.Append).json(srcPath);
int numExpCommits = initialCommits + 1;
// wait for spark streaming to process one microbatch
waitTillNCommits(fs, numExpCommits, 180, 3);
String commitInstantTime1 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
LOG.info("First commit at instant time :" + commitInstantTime1);
String commitInstantTime2 = commitInstantTime1;
if (null != inputDF2) {
numExpCommits += 1;
inputDF2.write().mode(SaveMode.Append).json(srcPath);
// wait for spark streaming to process one microbatch
Thread.sleep(3000);
waitTillNCommits(fs, numExpCommits, 180, 3);
commitInstantTime2 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
LOG.info("Second commit at instant time :" + commitInstantTime2);
}
if (tableType.equals(HoodieTableType.MERGE_ON_READ.name())) {
numExpCommits += 1;
// Wait for compaction to also finish and track latest timestamp as commit timestamp
waitTillNCommits(fs, numExpCommits, 180, 3);
commitInstantTime2 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
LOG.info("Compaction commit at instant time :" + commitInstantTime2);
}
// Read & do some queries
Dataset<Row> hoodieROViewDF = spark.read().format("hudi")
// pass any path glob, can include hoodie & non-hoodie
// datasets
.load(tablePath + "/*/*/*/*");
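// The four-level glob matches the year/month/day partition folders created by HoodieTestDataGenerator and the files inside them.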
hoodieROViewDF.createOrReplaceTempView("hoodie_ro");
spark.sql("describe hoodie_ro").show();
// all trips whose fare amount was greater than 2.
spark.sql("select fare.amount, begin_lon, begin_lat, timestamp from hoodie_ro where fare.amount > 2.0").show();
if (instantTimeValidation) {
LOG.info("Showing all records. Latest Instant Time =" + commitInstantTime2);
spark.sql("select * from hoodie_ro").show(200, false);
long numRecordsAtInstant2 =
spark.sql("select * from hoodie_ro where _hoodie_commit_time = '" + commitInstantTime2 + "'").count();
ValidationUtils.checkArgument(numRecordsAtInstant2 == expRecords,
"Expecting " + expRecords + " records, Got " + numRecordsAtInstant2);
}
long numRecords = spark.sql("select * from hoodie_ro").count();
ValidationUtils.checkArgument(numRecords == expRecords,
"Expecting " + expRecords + " records, Got " + numRecords);
if (tableType.equals(HoodieTableType.COPY_ON_WRITE.name())) {
// Consume incrementally, only changes in commit 2 above. Currently only supported for COPY_ON_WRITE tables.
Dataset<Row> hoodieIncViewDF = spark.read().format("hudi")
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL())
// Only changes in write 2 above
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), commitInstantTime1)
// For incremental view, pass in the root/base path of dataset
.load(tablePath);
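// BEGIN_INSTANTTIME is exclusive, so only records committed after the first commit (i.e. the second write) are returned.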
LOG.info("You will only see records from : " + commitInstantTime2);
hoodieIncViewDF.groupBy(hoodieIncViewDF.col("_hoodie_commit_time")).count().show();
}
return numExpCommits;
}
/**
* Hoodie Spark structured streaming job.
*
* @param streamingInput streaming dataset to write to the Hudi table
* @param operationType write operation to use (e.g. upsert or delete)
* @param checkpointLocation checkpoint folder for the streaming query
* @throws Exception if the streaming query fails
*/
public void stream(Dataset<Row> streamingInput, String operationType, String checkpointLocation) throws Exception {
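// Build a Hudi streaming sink; record key, partition path and precombine fields match the test data schema.
// With async compaction enabled and the delta-commit threshold set to 1, compaction of a MERGE_ON_READ table
// is scheduled after every delta commit and runs alongside ingestion.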
DataStreamWriter<Row> writer = streamingInput.writeStream().format("org.apache.hudi")
.option("hoodie.insert.shuffle.parallelism", "2").option("hoodie.upsert.shuffle.parallelism", "2")
.option(DataSourceWriteOptions.OPERATION_OPT_KEY(), operationType)
.option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY(), tableType)
.option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key")
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition")
.option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp")
.option(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS_PROP, "1")
.option(DataSourceWriteOptions.ASYNC_COMPACT_ENABLE_OPT_KEY(), "true")
.option(HoodieWriteConfig.TABLE_NAME, tableName).option("checkpointLocation", checkpointLocation)
.outputMode(OutputMode.Append());
writer = updateHiveSyncConfig(writer);
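// Trigger a micro-batch every 500 ms and stop the query after the configured streaming duration so the app terminates.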
StreamingQuery query = writer.trigger(new ProcessingTime(500)).start(tablePath);
query.awaitTermination(streamingDurationInMs);
}
/**
* Setup configs for syncing to Hive.
*
* @param writer streaming writer to configure
* @return the writer with Hive sync options applied, or unchanged if Hive sync is disabled
*/
private DataStreamWriter<Row> updateHiveSyncConfig(DataStreamWriter<Row> writer) {
if (enableHiveSync) {
LOG.info("Enabling Hive sync to " + hiveJdbcUrl);
writer = writer.option(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY(), hiveTable)
.option(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY(), hiveDB)
.option(DataSourceWriteOptions.HIVE_URL_OPT_KEY(), hiveJdbcUrl)
.option(DataSourceWriteOptions.HIVE_USER_OPT_KEY(), hiveUser)
.option(DataSourceWriteOptions.HIVE_PASS_OPT_KEY(), hivePass)
.option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY(), "true");
if (useMultiPartitionKeys) {
writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "year,month,day").option(
DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(),
MultiPartKeysValueExtractor.class.getCanonicalName());
} else {
writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "dateStr");
}
}
return writer;
}
}