blob: d39e54803b9ae30eb4522a4b0fd4544f573e3bab [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* File format description for CarbonData schema file
*/
namespace java org.apache.carbondata.format
/**
 * Data types that a Carbon Data column can be declared with.
 */
enum DataType {
  STRING = 0,
  SHORT = 1,
  INT = 2,
  LONG = 3,
  DOUBLE = 4,
  DECIMAL = 5,
  TIMESTAMP = 6,
  DATE = 7,
  BOOLEAN = 8,
  // Ids 9-19 are not assigned; numbering resumes at 20.
  ARRAY = 20,
  STRUCT = 21,
  VARCHAR = 22,
  MAP = 23,
  FLOAT = 24,
  BYTE = 25
}
/**
 * Encodings supported by Carbon Data. Not every encoding is valid for every
 * data type, and certain encodings can be chained.
 */
enum Encoding {
  // The column is dictionary encoded.
  DICTIONARY = 0;
  // The column is delta encoded.
  DELTA = 1;
  // The column is run length encoded.
  RLE = 2;
  // The column is encoded using an inverted index; can be used only along
  // with dictionary encoding.
  INVERTED_INDEX = 3;
  // The column is encoded using bit packing; can be used only along with
  // dictionary encoding.
  BIT_PACKED = 4;
  // The column is direct dictionary encoded.
  DIRECT_DICTIONARY = 5;
  // The column is encoded using DirectCompressCodec.
  DIRECT_COMPRESS = 6;
  // The column is encoded using AdaptiveIntegralCodec.
  ADAPTIVE_INTEGRAL = 7;
  // The column is encoded using AdaptiveDeltaIntegralCodec.
  ADAPTIVE_DELTA_INTEGRAL = 8;
  // The column is encoded using RLECodec.
  RLE_INTEGRAL = 9;
  // String value and string length are stored separately in page data.
  DIRECT_STRING = 10;
  // The column is encoded using AdaptiveFloatingCodec.
  ADAPTIVE_FLOATING = 11;
  // The column is encoded using BooleanPageCodec.
  BOOL_BYTE = 12;
  // The column is encoded using AdaptiveDeltaFloatingCodec.
  ADAPTIVE_DELTA_FLOATING = 13;
  // The column is encoded using DirectCompressCodec; used for long string
  // columns.
  DIRECT_COMPRESS_VARCHAR = 14;
}
/**
 * Kinds of partitioning that a table's PartitionInfo can declare.
 */
enum PartitionType {
  RANGE = 0;
  RANGE_INTERVAL = 1;
  LIST = 2;
  HASH = 3;
  // Uses the standard partition features of spark/hive.
  NATIVE_HIVE = 4;
}
/**
 * Description of a column, used for both dimensions and measures.
 */
// TODO: decide where the mapping between CSV column names and carbon table
// column names should be kept; it should not be kept in the schema.
struct ColumnSchema {
  // Data type of the column.
  1: required DataType data_type;

  /**
   * Name of the column. For a complex data type the name follows the rule
   * grand_parent_column.parent_column.child_column.
   * For Array types two columns are stored: one for the array type and one
   * for the primitive type, the latter named parent_column.value.
   */
  2: required string column_name;

  // Unique ID of the column. For a dimension, this is the unique ID that is
  // used in the dictionary.
  3: required string column_id;

  // Whether the column is stored in columnar format or row format.
  4: required bool columnar;

  // Encoders that are chained to encode the data of this column.
  5: required list<Encoding> encoders;

  // Whether the column is a dimension or a measure.
  6: required bool dimension;

  // Group ID of the column, used for row format columns, where the columns
  // of each group are chunked together.
  7: optional i32 column_group_id;

  /**
   * Used when this column contains mantissa data.
   */
  8: optional i32 scale;

  // NOTE(review): presumably applies to the same columns as scale - confirm.
  9: optional i32 precision;

  /**
   * Nested fields. Since thrift does not support nested fields, the nesting
   * is flattened to a single list by a depth-first traversal, and the
   * children count is used to reconstruct the nested relationship.
   * This field is not set when the element is a primitive type.
   */
  10: optional i32 num_child;

  /**
   * Used when this column is part of an aggregate table.
   */
  11: optional string aggregate_function;

  12: optional binary default_value;

  13: optional map<string,string> columnProperties;

  /**
   * Specifies the visibility of the column; false by default.
   */
  14: optional bool invisible;

  /**
   * Column reference id.
   */
  15: optional string columnReferenceId;

  /**
   * Column order as provided by the user.
   */
  16: optional i32 schemaOrdinal;

  /**
   * Maintains the relation of this column with the parent table.
   * Useful in the pre-aggregate case.
   */
  17: optional list<ParentColumnTableRelation> parentColumnTableRelations;
}
/**
 * Description of one schema change: holds the list of added columns and the
 * list of removed columns.
 */
struct SchemaEvolutionEntry {
  1: required i64 time_stamp;
  // Columns added by this change.
  2: optional list<ColumnSchema> added;
  // Columns removed by this change.
  3: optional list<ColumnSchema> removed;
  4: optional string tableName;
}
/**
 * History of schema evolution, kept as a list of individual schema change
 * entries.
 */
struct SchemaEvolution {
  1: required list<SchemaEvolutionEntry> schema_evolution_history;
}
/**
 * Partition information of a table.
 */
struct PartitionInfo {
  1: required list<ColumnSchema> partition_columns;
  2: required PartitionType partition_type;
  // Value list of a list partition table.
  3: optional list<list<string>> list_info;
  // Range value list of a range partition table.
  4: optional list<string> range_info;
  // Partition id list.
  5: optional list<i32> partition_ids;
  // Total partition count.
  6: optional i32 num_partitions;
  // Max partition id for now.
  7: optional i32 max_partition;
}
/**
 * Bucketing information of the fields of a table: a list of columns together
 * with the number of buckets.
 */
struct BucketingInfo {
  1: required list<ColumnSchema> table_columns;
  2: required i32 number_of_buckets;
}
/**
 * Description of a table schema.
 */
struct TableSchema {
  // ID of the table.
  // TODO: the earlier note on this field was left unfinished ("ID used to");
  // complete it once the intended meaning is confirmed.
  1: required string table_id;
  // Columns in the table.
  2: required list<ColumnSchema> table_columns;
  // History of schema evolution of this table.
  3: required SchemaEvolution schema_evolution;
  // Table properties configured by the user.
  4: optional map<string,string> tableProperties;
  // Bucketing information.
  5: optional BucketingInfo bucketingInfo;
  // Partition information.
  6: optional PartitionInfo partitionInfo;
  // Long string columns in the table.
  7: optional list<string> long_string_columns;
}
/**
 * Identifies a table by name and id, optionally qualified by a database name.
 */
struct RelationIdentifier {
  1: optional string databaseName;
  2: required string tableName;
  3: required string tableId;
}
/**
 * Refers to a single column (by id and name) of the table identified by
 * relationIdentifier.
 */
struct ParentColumnTableRelation {
  1: required RelationIdentifier relationIdentifier;
  2: required string columnId;
  3: required string columnName;
}
/**
 * Schema of a DataMap.
 */
struct DataMapSchema {
  // DataMap name.
  1: required string dataMapName;

  // Class name.
  2: required string className;

  // Holds the properties mentioned in DMPROPERTIES of the DDL. It also stores
  // properties of the select query (query type such as groupby, join) in the
  // case of preaggregate/timeseries.
  3: optional map<string, string> properties;

  // Relation identifier of the table that stores the data of datamaps such as
  // preaggregate/timeseries.
  4: optional RelationIdentifier childTableIdentifier;

  // For a preaggregate/timeseries datamap, maintains the child schema, which
  // is useful during query and data load.
  5: optional TableSchema childTableSchema;
}
/**
 * Top-level table description: the fact table schema, the list of aggregate
 * table schemas and, optionally, the datamap schemas.
 */
struct TableInfo {
  1: required TableSchema fact_table;
  2: required list<TableSchema> aggregate_table_list;
  // childSchema information.
  3: optional list<DataMapSchema> dataMapSchemas;
}