/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.streamer;

import org.apache.hudi.common.config.DFSPropertiesConfiguration;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.configuration.FlinkOptions;
import org.apache.hudi.configuration.OptionsResolver;
import org.apache.hudi.sink.transform.Transformer;
import org.apache.hudi.sink.utils.Pipelines;
import org.apache.hudi.util.AvroSchemaConverter;
import org.apache.hudi.util.StreamerUtil;

import com.beust.jcommander.JCommander;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.formats.common.TimestampFormat;
import org.apache.flink.formats.json.JsonRowDataDeserializationSchema;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.runtime.typeutils.InternalTypeInfo;
import org.apache.flink.table.types.logical.RowType;

/**
 * A utility which incrementally consumes data from Kafka and applies it to the target table.
 * It offers similar functionality to the SQL data source, except that the source is bound to Kafka
 * and the format is bound to JSON.
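 *
 * <p>Illustrative submission (a sketch, not authoritative usage: the bundle jar name and the
 * option values are placeholders; the option names are declared in {@link FlinkStreamerConfig}):
 * <pre>
 *   flink run -c org.apache.hudi.streamer.HoodieFlinkStreamer hudi-flink-bundle.jar \
 *       --kafka-topic source_topic \
 *       --kafka-bootstrap-servers localhost:9092 \
 *       --target-base-path file:///tmp/hudi/target_table \
 *       --target-table target_table
 * </pre>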
 */
public class HoodieFlinkStreamer {
  public static void main(String[] args) throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

    final FlinkStreamerConfig cfg = new FlinkStreamerConfig();
    JCommander cmd = new JCommander(cfg, null, args);
    if (cfg.help || args.length == 0) {
      cmd.usage();
      System.exit(1);
    }
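
    // Enable checkpointing at the configured interval and expose the streamer config to all
    // operators through the global job parameters.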
    env.enableCheckpointing(cfg.checkpointInterval);
    env.getConfig().setGlobalJobParameters(cfg);
    // Checkpoints trigger the write operations, including instant generation and committing,
    // so there can only be one checkpoint in flight at a time.
    env.getCheckpointConfig().setMaxConcurrentCheckpoints(1);

    env.setStateBackend(cfg.stateBackend);
    if (cfg.flinkCheckPointPath != null) {
      env.getCheckpointConfig().setCheckpointStorage(cfg.flinkCheckPointPath);
    }
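
    // Combine the global Hudi properties with the Kafka consumer properties built from the
    // streamer config.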
    TypedProperties kafkaProps = DFSPropertiesConfiguration.getGlobalProps();
    kafkaProps.putAll(StreamerUtil.appendKafkaProps(cfg));
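
    // Translate the streamer options into the Flink configuration consumed by the Hudi write pipelines.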
    Configuration conf = FlinkStreamerConfig.toFlinkConfig(cfg);
    // Read from kafka source
    RowType rowType =
        (RowType) AvroSchemaConverter.convertToDataType(StreamerUtil.getSourceSchema(conf))
            .getLogicalType();
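
    // Bound the commit ack timeout by the checkpoint timeout and reuse the environment
    // parallelism for the Hudi operators.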
    long ckpTimeout = env.getCheckpointConfig().getCheckpointTimeout();
    int parallelism = env.getParallelism();
    conf.setLong(FlinkOptions.WRITE_COMMIT_ACK_TIMEOUT, ckpTimeout);

    DataStream<RowData> dataStream = env.addSource(new FlinkKafkaConsumer<>(
            cfg.kafkaTopic,
            new JsonRowDataDeserializationSchema(
                rowType,
                InternalTypeInfo.of(rowType),
                false,
                true,
                TimestampFormat.ISO_8601
            ), kafkaProps))
        .name("kafka_source")
        .uid("uid_kafka_source");
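
    // Optionally apply the user-specified transformer chain to the source stream before writing.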
    if (cfg.transformerClassNames != null && !cfg.transformerClassNames.isEmpty()) {
      Option<Transformer> transformer = StreamerUtil.createTransformer(cfg.transformerClassNames);
      if (transformer.isPresent()) {
        dataStream = transformer.get().apply(dataStream);
      }
    }
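
    // Convert the row stream into HoodieRecords and write them out through the Hudi stream write pipeline.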
    DataStream<HoodieRecord> hoodieRecordDataStream = Pipelines.bootstrap(conf, rowType, parallelism, dataStream);
    DataStream<Object> pipeline = Pipelines.hoodieStreamWrite(conf, parallelism, hoodieRecordDataStream);
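
    // Attach the async compaction pipeline when it is needed, otherwise attach the cleaning task.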
    if (OptionsResolver.needsAsyncCompaction(conf)) {
      Pipelines.compact(conf, pipeline);
    } else {
      Pipelines.clean(conf, pipeline);
    }
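
    // Submit the job; it is named after the target table.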
    env.execute(cfg.targetTableName);
  }
}