// common/thrift/DataSinks.thrift - impala - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 namespace cpp impala
 namespace java org.apache.impala.thrift

 include "ExecStats.thrift"
 include "Exprs.thrift"
 include "Types.thrift"
 include "Descriptors.thrift"
 include "Partitions.thrift"
 include "ResourceProfile.thrift"

 // Discriminator for the kind of sink a TDataSink describes (stored in TDataSink.type).
 // NOTE(review): each value presumably corresponds to the like-named optional sink
 // struct in TDataSink -- confirm against the code that fills in TDataSink.
 enum TDataSinkType {
   DATA_STREAM_SINK = 0
   TABLE_SINK = 1
   JOIN_BUILD_SINK = 2
   PLAN_ROOT_SINK = 3
 }

 // Action requested of a table sink (stored in TTableSink.action).
 enum TSinkAction {
   INSERT = 0
   UPDATE = 1
   UPSERT = 2
   DELETE = 3
 }

 // Discriminator for the kind of table sink (stored in TTableSink.type).
 // Note: TTableSink carries sink-specific option structs only for HDFS and KUDU.
 enum TTableSinkType {
   HDFS = 0
   HBASE = 1
   KUDU = 2
 }

 // Sink which forwards data to a remote plan fragment,
 // according to the given output partition specification
 // (i.e., the m:1 part of an m:n data stream).
 struct TDataStreamSink {
   // Id of the destination plan node.
   1: required Types.TPlanNodeId dest_node_id

   // Specification of how the output of a fragment is partitioned.
   // If the partitioning type is UNPARTITIONED, the output is broadcast
   // to each destination host.
   2: required Partitions.TDataPartition output_partition
 }

 // Creates new Hdfs files according to the evaluation of 'partition_key_exprs',
 // and materializes all its input RowBatches as Hdfs files.
 struct THdfsTableSink {
   // Partition key exprs; their evaluation determines which Hdfs files are created.
   1: required list<Exprs.TExpr> partition_key_exprs

   // NOTE(review): presumably true when existing data in the target should be replaced
   // rather than appended to -- confirm with the sink implementation.
   2: required bool overwrite

   // The 'skip.header.line.count' property of the target Hdfs table. We will insert this
   // many empty lines at the beginning of new text files, which will be skipped by the
   // scanners while reading from the files.
   3: optional i32 skip_header_line_count

   // This property indicates to the table sink whether the input is ordered by the
   // partition keys, meaning partitions can be opened, written, and closed one by one.
   4: required bool input_is_clustered

   // Stores the indices into the list of non-clustering columns of the target table that
   // are stored in the 'sort.columns' table property. This is used in the backend to
   // populate the RowGroup::sorting_columns list in parquet files.
   5: optional list<i32> sort_columns

   // Stores the allocated ACID write id if the target table is transactional.
   6: optional i64 write_id

   // Sorting order. If not lexical, the backend should not populate the
   // RowGroup::sorting_columns list in parquet files.
   7: required Types.TSortingOrder sorting_order
 }

 // Structure to encapsulate specific options that are passed down to the KuduTableSink.
 struct TKuduTableSink {
   // The position in this vector is equal to the position in the output expressions of the
   // sink and holds the index of the corresponding column in the Kudu schema,
   // e.g. 'exprs[i]' references 'kudu_table.column(referenced_columns[i])'.
   1: optional list<i32> referenced_columns

   // Defines if duplicate or not found keys should be ignored.
   2: optional bool ignore_not_found_or_duplicate
 }

 // Sink to create the build side of a JoinNode.
 struct TJoinBuildSink {
   // Id of the join table associated with this build sink.
   1: required Types.TJoinTableId join_table_id

   // Only set for hash join build sinks.
   // NOTE(review): the field is declared 'required' even though the comment above says
   // it is only set for some sinks -- presumably other sinks send an empty list; confirm.
   2: required list<Exprs.TExpr> build_exprs
 }

 // Options for a plan root sink (see TDataSinkType.PLAN_ROOT_SINK).
 struct TPlanRootSink {
   // Backend resource profile for this sink.
   // NOTE(review): presumably the resources the sink needs at runtime -- confirm.
   1: required ResourceProfile.TBackendResourceProfile resource_profile
 }

 // Union type of all table sinks.
 struct TTableSink {
   // Id of the table this sink targets.
   1: required Types.TTableId target_table_id
   // Discriminator for the kind of table sink.
   // NOTE(review): presumably determines which of the optional structs below is set.
   2: required TTableSinkType type
   // Action the sink carries out (see TSinkAction).
   3: required TSinkAction action
   // Hdfs-specific sink options.
   4: optional THdfsTableSink hdfs_table_sink
   // Kudu-specific sink options.
   5: optional TKuduTableSink kudu_table_sink
 }

 // Describes a single data sink: a type tag, the optional per-type sink structs, and
 // optional metadata (label, estimated stats, output exprs).
 struct TDataSink {
   // Tagged union of the different types of data sink.
   // 'type' is the tag; fields 2-5 are the optional per-type payloads.
   1: required TDataSinkType type
   2: optional TDataStreamSink stream_sink
   3: optional TTableSink table_sink
   4: optional TJoinBuildSink join_build_sink
   5: optional TPlanRootSink plan_root_sink

   // A human-readable label for the sink.
   6: optional string label

   // Estimated execution stats generated by the planner.
   7: optional ExecStats.TExecStats estimated_stats

   // Exprs that produce values for slots of output tuple (one expr per slot).
   // Only set by the DataSink implementations that require it.
   8: optional list<Exprs.TExpr> output_exprs
 }
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.

	namespace cpp impala
	namespace java org.apache.impala.thrift

	include "ExecStats.thrift"
	include "Exprs.thrift"
	include "Types.thrift"
	include "Descriptors.thrift"
	include "Partitions.thrift"
	include "ResourceProfile.thrift"

	// Discriminator for the kind of sink a TDataSink describes (stored in TDataSink.type).
	// NOTE(review): each value presumably corresponds to the like-named optional sink
	// struct in TDataSink -- confirm against the code that fills in TDataSink.
	enum TDataSinkType {
	DATA_STREAM_SINK = 0
	TABLE_SINK = 1
	JOIN_BUILD_SINK = 2
	PLAN_ROOT_SINK = 3
	}

	// Action requested of a table sink (stored in TTableSink.action).
	enum TSinkAction {
	INSERT = 0
	UPDATE = 1
	UPSERT = 2
	DELETE = 3
	}

	// Discriminator for the kind of table sink (stored in TTableSink.type).
	// Note: TTableSink carries sink-specific option structs only for HDFS and KUDU.
	enum TTableSinkType {
	HDFS = 0
	HBASE = 1
	KUDU = 2
	}

	// Sink which forwards data to a remote plan fragment,
	// according to the given output partition specification
	// (i.e., the m:1 part of an m:n data stream).
	struct TDataStreamSink {
	// Id of the destination plan node.
	1: required Types.TPlanNodeId dest_node_id

	// Specification of how the output of a fragment is partitioned.
	// If the partitioning type is UNPARTITIONED, the output is broadcast
	// to each destination host.
	2: required Partitions.TDataPartition output_partition
	}

	// Creates new Hdfs files according to the evaluation of 'partition_key_exprs',
	// and materializes all its input RowBatches as Hdfs files.
	struct THdfsTableSink {
	// Partition key exprs; their evaluation determines which Hdfs files are created.
	1: required list<Exprs.TExpr> partition_key_exprs

	// NOTE(review): presumably true when existing data in the target should be replaced
	// rather than appended to -- confirm with the sink implementation.
	2: required bool overwrite

	// The 'skip.header.line.count' property of the target Hdfs table. We will insert this
	// many empty lines at the beginning of new text files, which will be skipped by the
	// scanners while reading from the files.
	3: optional i32 skip_header_line_count

	// This property indicates to the table sink whether the input is ordered by the
	// partition keys, meaning partitions can be opened, written, and closed one by one.
	4: required bool input_is_clustered

	// Stores the indices into the list of non-clustering columns of the target table that
	// are stored in the 'sort.columns' table property. This is used in the backend to
	// populate the RowGroup::sorting_columns list in parquet files.
	5: optional list<i32> sort_columns

	// Stores the allocated ACID write id if the target table is transactional.
	6: optional i64 write_id

	// Sorting order. If not lexical, the backend should not populate the
	// RowGroup::sorting_columns list in parquet files.
	7: required Types.TSortingOrder sorting_order
	}

	// Structure to encapsulate specific options that are passed down to the KuduTableSink.
	struct TKuduTableSink {
	// The position in this vector is equal to the position in the output expressions of the
	// sink and holds the index of the corresponding column in the Kudu schema,
	// e.g. 'exprs[i]' references 'kudu_table.column(referenced_columns[i])'.
	1: optional list<i32> referenced_columns

	// Defines if duplicate or not found keys should be ignored.
	2: optional bool ignore_not_found_or_duplicate
	}

	// Sink to create the build side of a JoinNode.
	struct TJoinBuildSink {
	// Id of the join table associated with this build sink.
	1: required Types.TJoinTableId join_table_id

	// Only set for hash join build sinks.
	// NOTE(review): the field is declared 'required' even though the comment above says
	// it is only set for some sinks -- presumably other sinks send an empty list; confirm.
	2: required list<Exprs.TExpr> build_exprs
	}

	// Options for a plan root sink (see TDataSinkType.PLAN_ROOT_SINK).
	struct TPlanRootSink {
	// Backend resource profile for this sink.
	// NOTE(review): presumably the resources the sink needs at runtime -- confirm.
	1: required ResourceProfile.TBackendResourceProfile resource_profile
	}

	// Union type of all table sinks.
	struct TTableSink {
	// Id of the table this sink targets.
	1: required Types.TTableId target_table_id
	// Discriminator for the kind of table sink.
	// NOTE(review): presumably determines which of the optional structs below is set.
	2: required TTableSinkType type
	// Action the sink carries out (see TSinkAction).
	3: required TSinkAction action
	// Hdfs-specific sink options.
	4: optional THdfsTableSink hdfs_table_sink
	// Kudu-specific sink options.
	5: optional TKuduTableSink kudu_table_sink
	}

	// Describes a single data sink: a type tag, the optional per-type sink structs, and
	// optional metadata (label, estimated stats, output exprs).
	struct TDataSink {
	// Tagged union of the different types of data sink.
	// 'type' is the tag; fields 2-5 are the optional per-type payloads.
	1: required TDataSinkType type
	2: optional TDataStreamSink stream_sink
	3: optional TTableSink table_sink
	4: optional TJoinBuildSink join_build_sink
	5: optional TPlanRootSink plan_root_sink

	// A human-readable label for the sink.
	6: optional string label

	// Estimated execution stats generated by the planner.
	7: optional ExecStats.TExecStats estimated_stats

	// Exprs that produce values for slots of output tuple (one expr per slot).
	// Only set by the DataSink implementations that require it.
	8: optional list<Exprs.TExpr> output_exprs
	}