format/src/main/thrift/schema.thrift - carbondata - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *    http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 /**
  * File format description for CarbonData schema file
  */
 namespace java org.apache.carbondata.format

 /**
  * Column data types that can be declared in a CarbonData schema.
  *
  * Thrift serializes an enum member by its numeric id, so the ids below are
  * part of the file format. Ids 0-8 and 20-25 are assigned in this
  * definition; 9-19 are not used.
  */
 enum DataType {
   STRING = 0;
   SHORT = 1;
   INT = 2;
   LONG = 3;
   DOUBLE = 4;
   DECIMAL = 5;
   TIMESTAMP = 6;
   DATE = 7;
   BOOLEAN = 8;

   ARRAY = 20;
   STRUCT = 21;
   VARCHAR = 22;
   MAP = 23;
   FLOAT = 24;
   BYTE = 25;
 }

 /**
  * Encodings supported by CarbonData. Not every encoding is valid for every
  * data type. Certain encodings can be chained.
  */
 enum Encoding {
   // Column is dictionary encoded.
   DICTIONARY = 0;
   // Column is delta encoded.
   DELTA = 1;
   // Column is run length encoded.
   RLE = 2;
   // Column is encoded using an inverted index; can be used only along with
   // dictionary encoding.
   INVERTED_INDEX = 3;
   // Column is encoded using bit packing; can be used only along with
   // dictionary encoding.
   BIT_PACKED = 4;
   // Column is direct dictionary encoded.
   DIRECT_DICTIONARY = 5;
   // Column is encoded using DirectCompressCodec.
   DIRECT_COMPRESS = 6;
   // Column is encoded using AdaptiveIntegralCodec.
   ADAPTIVE_INTEGRAL = 7;
   // Column is encoded using AdaptiveDeltaIntegralCodec.
   ADAPTIVE_DELTA_INTEGRAL = 8;
   // Column is encoded using RLECodec.
   RLE_INTEGRAL = 9;
   // String value and string length are stored separately in page data.
   DIRECT_STRING = 10;
   // Column is encoded using AdaptiveFloatingCodec.
   ADAPTIVE_FLOATING = 11;
   // Column is encoded using BooleanPageCodec.
   BOOL_BYTE = 12;
   // Column is encoded using AdaptiveDeltaFloatingCodec.
   ADAPTIVE_DELTA_FLOATING = 13;
   // Column is encoded using DirectCompressCodec; used for long string columns.
   DIRECT_COMPRESS_VARCHAR = 14;
 }

 /**
  * Partitioning schemes; referenced by PartitionInfo.partition_type.
  */
 enum PartitionType {
   RANGE = 0;
   RANGE_INTERVAL = 1;
   LIST = 2;
   HASH = 3;
   // Uses the standard partition features of spark/hive.
   NATIVE_HIVE = 4;
 }

 /**
  * Describes a single column; used for both dimensions and measures.
  */
 // TODO: where should the mapping between CSV column names and carbon table
 // column names live? It should not be kept in the schema.
 struct ColumnSchema {
   1: required DataType data_type;

   /**
    * Name of the column. For a complex data type the naming rule is
    * grand_parent_column.parent_column.child_column. For Array types two
    * columns are stored: one for the array type and one for the primitive
    * type, the latter named parent_column.value.
    */
   2: required string column_name;

   // Unique ID of the column. For a dimension this is the unique ID that is
   // used in the dictionary.
   3: required string column_id;

   // Whether the column is stored in columnar format or row format.
   4: required bool columnar;

   // Encoders that are chained to encode the data of this column.
   5: required list<Encoding> encoders;

   // Whether the column is a dimension or a measure.
   6: required bool dimension;

   // Group ID used for row format columns; the columns of each group are
   // chunked together.
   7: optional i32 column_group_id;

   /**
    * Used when this column contains mantissa data.
    */
   8: optional i32 scale;
   9: optional i32 precision;

   /**
    * Nested fields. Since thrift does not support nested fields, the nesting
    * is flattened to a single list by a depth-first traversal, and the
    * children count is used to construct the nested relationship.
    * Not set when the element is a primitive type.
    */
   10: optional i32 num_child;

   /**
    * Used when this column is part of an aggregate table.
    */
   11: optional string aggregate_function;

   12: optional binary default_value;

   13: optional map<string, string> columnProperties;

   /**
    * Specifies the visibility of the column; false by default.
    */
   14: optional bool invisible;

   /**
    * Column reference id.
    */
   15: optional string columnReferenceId;

   /**
    * Column order as provided by the user.
    */
   16: optional i32 schemaOrdinal;

   /**
    * Maintains the relation of this column with the parent table;
    * useful in case of pre-aggregate.
    */
   17: optional list<ParentColumnTableRelation> parentColumnTableRelations;
 }

 /**
  * One schema change: holds the list of columns that were added and the list
  * of columns that were removed.
  */
 struct SchemaEvolutionEntry {
   1: required i64 time_stamp;
   2: optional list<ColumnSchema> added;
   3: optional list<ColumnSchema> removed;
   4: optional string tableName;
 }

 /**
  * Schema evolution history, kept as a list of SchemaEvolutionEntry.
  */
 struct SchemaEvolution {
   1: required list<SchemaEvolutionEntry> schema_evolution_history;
 }

 /**
  * Partition information of a table.
  */
 struct PartitionInfo {
   1: required list<ColumnSchema> partition_columns;
   2: required PartitionType partition_type;
   // Value lists of a list partition table.
   3: optional list<list<string>> list_info;
   // Range values of a range partition table.
   4: optional list<string> range_info;
   // Partition ids.
   5: optional list<i32> partition_ids;
   // Total partition count.
   6: optional i32 num_partitions;
   // Max partition id for now.
   7: optional i32 max_partition;
 }

 /**
  * Bucketing information of the fields of a table.
  */
 struct BucketingInfo {
   1: required list<ColumnSchema> table_columns;
   2: required i32 number_of_buckets;
 }

 /**
  * Description of a table schema.
  */
 struct TableSchema {
   // Table ID. NOTE(review): the original comment was cut off ("ID used to");
   // what the ID is used for is not stated here.
   1: required string table_id;
   // Columns in the table.
   2: required list<ColumnSchema> table_columns;
   // History of the schema evolution of this table.
   3: required SchemaEvolution schema_evolution;
   // Table properties configured by the user.
   4: optional map<string, string> tableProperties;
   // Bucketing information.
   5: optional BucketingInfo bucketingInfo;
   // Partition information.
   6: optional PartitionInfo partitionInfo;
   // Long string columns in the table.
   7: optional list<string> long_string_columns;
 }

 /**
  * Identifies a table by database name (optional), table name and table id.
  */
 struct RelationIdentifier {
   1: optional string databaseName;
   2: required string tableName;
   3: required string tableId;
 }

 /**
  * Relation of a column to a table, given by the table's RelationIdentifier
  * together with a column id and a column name. ColumnSchema holds a list of
  * these to maintain the column relation with the parent table.
  */
 struct ParentColumnTableRelation {
   1: required RelationIdentifier relationIdentifier;
   2: required string columnId;
   3: required string columnName;
 }

 /**
  * Schema of a datamap.
  */
 struct DataMapSchema {
   // DataMap name.
   1: required string dataMapName;
   // Class name.
   2: required string className;
   // Properties mentioned in DMPROPERTIES of the DDL. In case of
   // preaggregate/timeseries it also stores properties of the select query,
   // such as the query type (groupby, join).
   3: optional map<string, string> properties;
   // Relation identifier of the table that stores the data of datamaps like
   // preaggregate/timeseries.
   4: optional RelationIdentifier childTableIdentifier;
   // For a preaggregate/timeseries datamap, maintains the child schema, which
   // is useful in case of query and data load.
   5: optional TableSchema childTableSchema;
 }

 /**
  * A fact table schema together with the list of aggregate table schemas and,
  * optionally, a list of datamap schemas.
  */
 struct TableInfo {
   1: required TableSchema fact_table;
   2: required list<TableSchema> aggregate_table_list;
   // childSchema information.
   3: optional list<DataMapSchema> dataMapSchemas;
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/**
	* File format description for CarbonData schema file
	*/
	namespace java org.apache.carbondata.format

	/**
	 * Column data types that can be declared in a CarbonData schema.
	 *
	 * Thrift serializes an enum member by its numeric id, so the ids below are
	 * part of the file format. Ids 0-8 and 20-25 are assigned in this
	 * definition; 9-19 are not used.
	 */
	enum DataType {
		STRING = 0;
		SHORT = 1;
		INT = 2;
		LONG = 3;
		DOUBLE = 4;
		DECIMAL = 5;
		TIMESTAMP = 6;
		DATE = 7;
		BOOLEAN = 8;

		ARRAY = 20;
		STRUCT = 21;
		VARCHAR = 22;
		MAP = 23;
		FLOAT = 24;
		BYTE = 25;
	}

	/**
	 * Encodings supported by CarbonData. Not every encoding is valid for every
	 * data type. Certain encodings can be chained.
	 */
	enum Encoding {
		// Column is dictionary encoded.
		DICTIONARY = 0;
		// Column is delta encoded.
		DELTA = 1;
		// Column is run length encoded.
		RLE = 2;
		// Column is encoded using an inverted index; can be used only along with
		// dictionary encoding.
		INVERTED_INDEX = 3;
		// Column is encoded using bit packing; can be used only along with
		// dictionary encoding.
		BIT_PACKED = 4;
		// Column is direct dictionary encoded.
		DIRECT_DICTIONARY = 5;
		// Column is encoded using DirectCompressCodec.
		DIRECT_COMPRESS = 6;
		// Column is encoded using AdaptiveIntegralCodec.
		ADAPTIVE_INTEGRAL = 7;
		// Column is encoded using AdaptiveDeltaIntegralCodec.
		ADAPTIVE_DELTA_INTEGRAL = 8;
		// Column is encoded using RLECodec.
		RLE_INTEGRAL = 9;
		// String value and string length are stored separately in page data.
		DIRECT_STRING = 10;
		// Column is encoded using AdaptiveFloatingCodec.
		ADAPTIVE_FLOATING = 11;
		// Column is encoded using BooleanPageCodec.
		BOOL_BYTE = 12;
		// Column is encoded using AdaptiveDeltaFloatingCodec.
		ADAPTIVE_DELTA_FLOATING = 13;
		// Column is encoded using DirectCompressCodec; used for long string columns.
		DIRECT_COMPRESS_VARCHAR = 14;
	}

	/**
	 * Partitioning schemes; referenced by PartitionInfo.partition_type.
	 */
	enum PartitionType {
		RANGE = 0;
		RANGE_INTERVAL = 1;
		LIST = 2;
		HASH = 3;
		// Uses the standard partition features of spark/hive.
		NATIVE_HIVE = 4;
	}

	/**
	 * Describes a single column; used for both dimensions and measures.
	 */
	// TODO: where should the mapping between CSV column names and carbon table
	// column names live? It should not be kept in the schema.
	struct ColumnSchema {
		1: required DataType data_type;

		/**
		 * Name of the column. For a complex data type the naming rule is
		 * grand_parent_column.parent_column.child_column. For Array types two
		 * columns are stored: one for the array type and one for the primitive
		 * type, the latter named parent_column.value.
		 */
		2: required string column_name;

		// Unique ID of the column. For a dimension this is the unique ID that is
		// used in the dictionary.
		3: required string column_id;

		// Whether the column is stored in columnar format or row format.
		4: required bool columnar;

		// Encoders that are chained to encode the data of this column.
		5: required list<Encoding> encoders;

		// Whether the column is a dimension or a measure.
		6: required bool dimension;

		// Group ID used for row format columns; the columns of each group are
		// chunked together.
		7: optional i32 column_group_id;

		/**
		 * Used when this column contains mantissa data.
		 */
		8: optional i32 scale;
		9: optional i32 precision;

		/**
		 * Nested fields. Since thrift does not support nested fields, the nesting
		 * is flattened to a single list by a depth-first traversal, and the
		 * children count is used to construct the nested relationship.
		 * Not set when the element is a primitive type.
		 */
		10: optional i32 num_child;

		/**
		 * Used when this column is part of an aggregate table.
		 */
		11: optional string aggregate_function;

		12: optional binary default_value;

		13: optional map<string, string> columnProperties;

		/**
		 * Specifies the visibility of the column; false by default.
		 */
		14: optional bool invisible;

		/**
		 * Column reference id.
		 */
		15: optional string columnReferenceId;

		/**
		 * Column order as provided by the user.
		 */
		16: optional i32 schemaOrdinal;

		/**
		 * Maintains the relation of this column with the parent table;
		 * useful in case of pre-aggregate.
		 */
		17: optional list<ParentColumnTableRelation> parentColumnTableRelations;
	}

	/**
	 * One schema change: holds the list of columns that were added and the list
	 * of columns that were removed.
	 */
	struct SchemaEvolutionEntry {
		1: required i64 time_stamp;
		2: optional list<ColumnSchema> added;
		3: optional list<ColumnSchema> removed;
		4: optional string tableName;
	}

	/**
	 * Schema evolution history, kept as a list of SchemaEvolutionEntry.
	 */
	struct SchemaEvolution {
		1: required list<SchemaEvolutionEntry> schema_evolution_history;
	}

	/**
	 * Partition information of a table.
	 */
	struct PartitionInfo {
		1: required list<ColumnSchema> partition_columns;
		2: required PartitionType partition_type;
		// Value lists of a list partition table.
		3: optional list<list<string>> list_info;
		// Range values of a range partition table.
		4: optional list<string> range_info;
		// Partition ids.
		5: optional list<i32> partition_ids;
		// Total partition count.
		6: optional i32 num_partitions;
		// Max partition id for now.
		7: optional i32 max_partition;
	}

	/**
	 * Bucketing information of the fields of a table.
	 */
	struct BucketingInfo {
		1: required list<ColumnSchema> table_columns;
		2: required i32 number_of_buckets;
	}

	/**
	 * Description of a table schema.
	 */
	struct TableSchema {
		// Table ID. NOTE(review): the original comment was cut off ("ID used to");
		// what the ID is used for is not stated here.
		1: required string table_id;
		// Columns in the table.
		2: required list<ColumnSchema> table_columns;
		// History of the schema evolution of this table.
		3: required SchemaEvolution schema_evolution;
		// Table properties configured by the user.
		4: optional map<string, string> tableProperties;
		// Bucketing information.
		5: optional BucketingInfo bucketingInfo;
		// Partition information.
		6: optional PartitionInfo partitionInfo;
		// Long string columns in the table.
		7: optional list<string> long_string_columns;
	}

	/**
	 * Identifies a table by database name (optional), table name and table id.
	 */
	struct RelationIdentifier {
		1: optional string databaseName;
		2: required string tableName;
		3: required string tableId;
	}

	/**
	 * Relation of a column to a table, given by the table's RelationIdentifier
	 * together with a column id and a column name. ColumnSchema holds a list of
	 * these to maintain the column relation with the parent table.
	 */
	struct ParentColumnTableRelation {
		1: required RelationIdentifier relationIdentifier;
		2: required string columnId;
		3: required string columnName;
	}

	/**
	 * Schema of a datamap.
	 */
	struct DataMapSchema {
		// DataMap name.
		1: required string dataMapName;
		// Class name.
		2: required string className;
		// Properties mentioned in DMPROPERTIES of the DDL. In case of
		// preaggregate/timeseries it also stores properties of the select query,
		// such as the query type (groupby, join).
		3: optional map<string, string> properties;
		// Relation identifier of the table that stores the data of datamaps like
		// preaggregate/timeseries.
		4: optional RelationIdentifier childTableIdentifier;
		// For a preaggregate/timeseries datamap, maintains the child schema, which
		// is useful in case of query and data load.
		5: optional TableSchema childTableSchema;
	}

	/**
	 * A fact table schema together with the list of aggregate table schemas and,
	 * optionally, a list of datamap schemas.
	 */
	struct TableInfo {
		1: required TableSchema fact_table;
		2: required list<TableSchema> aggregate_table_list;
		// childSchema information.
		3: optional list<DataMapSchema> dataMapSchemas;
	}