blob: 8abc8771a4e8eaaed3256a9a0aa565c9373eeecf [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.beam.runners.spark.structuredstreaming.translation.streaming;
import static org.apache.beam.runners.spark.structuredstreaming.Constants.BEAM_SOURCE_OPTION;
import static org.apache.beam.runners.spark.structuredstreaming.Constants.DEFAULT_PARALLELISM;
import static org.apache.beam.runners.spark.structuredstreaming.Constants.PIPELINE_OPTIONS;
import java.io.IOException;
import org.apache.beam.runners.core.construction.ReadTranslation;
import org.apache.beam.runners.core.serialization.Base64Serializer;
import org.apache.beam.runners.spark.structuredstreaming.translation.AbstractTranslationContext;
import org.apache.beam.runners.spark.structuredstreaming.translation.TransformTranslator;
import org.apache.beam.runners.spark.structuredstreaming.translation.helpers.EncoderHelpers;
import org.apache.beam.runners.spark.structuredstreaming.translation.helpers.RowHelpers;
import org.apache.beam.sdk.io.UnboundedSource;
import org.apache.beam.sdk.runners.AppliedPTransform;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
import org.apache.beam.sdk.util.WindowedValue;
import org.apache.beam.sdk.values.PBegin;
import org.apache.beam.sdk.values.PCollection;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
/**
 * Translates a Beam unbounded-source read ({@code Read.Unbounded}) into a Spark
 * structured-streaming {@link Dataset} of {@link WindowedValue WindowedValues} and registers it
 * in the translation context.
 *
 * @param <T> element type produced by the source
 */
class ReadSourceTranslatorStreaming<T>
    implements TransformTranslator<PTransform<PBegin, PCollection<T>>> {

  /** Fully-qualified provider class name handed to Spark's {@code readStream().format(...)}. */
  private static final String SOURCE_PROVIDER_CLASS =
      DatasetSourceStreaming.class.getCanonicalName();

  /**
   * Translates {@code transform} by wiring the serialized Beam source into the custom Spark
   * streaming source provider, then mapping the resulting {@link Row}s back to
   * {@link WindowedValue}s.
   *
   * @param transform the root read transform being translated
   * @param context translation context holding the Spark session and pipeline state
   * @throws RuntimeException if the unbounded source cannot be extracted from the transform
   */
  @Override
  public void translateTransform(
      PTransform<PBegin, PCollection<T>> transform, AbstractTranslationContext context) {
    // Safe: the current transform of a root read is known to be PBegin -> PCollection<T>.
    @SuppressWarnings("unchecked")
    AppliedPTransform<PBegin, PCollection<T>, PTransform<PBegin, PCollection<T>>> rootTransform =
        (AppliedPTransform<PBegin, PCollection<T>, PTransform<PBegin, PCollection<T>>>)
            context.getCurrentTransform();

    UnboundedSource<T, UnboundedSource.CheckpointMark> source;
    try {
      source = ReadTranslation.unboundedSourceFromTransform(rootTransform);
    } catch (IOException e) {
      throw new RuntimeException(
          "Failed to extract unbounded source from transform " + transform.getName(), e);
    }
    SparkSession sparkSession = context.getSparkSession();

    // The source cannot cross the Spark DataSource API as an object, so it is passed to the
    // provider base64-serialized via a string option.
    String serializedSource = Base64Serializer.serializeUnchecked(source);
    Dataset<Row> rowDataset =
        sparkSession
            .readStream()
            .format(SOURCE_PROVIDER_CLASS)
            .option(BEAM_SOURCE_OPTION, serializedSource)
            .option(
                DEFAULT_PARALLELISM,
                String.valueOf(sparkSession.sparkContext().defaultParallelism()))
            .option(PIPELINE_OPTIONS, context.getSerializableOptions().toString())
            .load();

    // Decode each Row back into a WindowedValue<T>; streaming reads are windowed into the
    // global window at this point.
    WindowedValue.FullWindowedValueCoder<T> windowedValueCoder =
        WindowedValue.FullWindowedValueCoder.of(
            source.getOutputCoder(), GlobalWindow.Coder.INSTANCE);
    Dataset<WindowedValue<T>> dataset =
        rowDataset.map(
            RowHelpers.extractWindowedValueFromRowMapFunction(windowedValueCoder),
            EncoderHelpers.fromBeamCoder(windowedValueCoder));

    // Safe: the output of a read transform is the PCollection<T> produced by the source.
    @SuppressWarnings("unchecked")
    PCollection<T> output = (PCollection<T>) context.getOutput();
    context.putDataset(output, dataset);
  }
}