| /** |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
// This file is written against the proto2 dialect.
syntax = "proto2";

// Proto package that scopes every message and enum declared in this file.
package orc.proto;

// Java package into which the generated classes are placed.
option java_package = "org.apache.orc";
| |
// Integer statistics: a minimum, a maximum and a sum.
// Each field is sint64 (ZigZag-encoded varint), so negative values stay
// compact on the wire.
message IntegerStatistics {
  optional sint64 minimum = 1;
  optional sint64 maximum = 2;
  optional sint64 sum = 3;
}
| |
// Floating-point statistics: a minimum, a maximum and a sum, each stored as
// a 64-bit double.
message DoubleStatistics {
  optional double minimum = 1;
  optional double maximum = 2;
  optional double sum = 3;
}
| |
// String statistics: the minimum and maximum strings plus a length total.
message StringStatistics {
  optional string minimum = 1;
  optional string maximum = 2;

  // Total length of all strings in a stripe.
  optional sint64 sum = 3;
}
| |
// Bucket statistics: a list of unsigned counts.
// NOTE(review): presumably one count per bucket value; which column types
// populate this message is not visible here -- confirm against the writer.
message BucketStatistics {
  // Written with packed encoding (one length-delimited record).
  repeated uint64 count = 1 [packed = true];
}
| |
// Decimal statistics: minimum, maximum and sum, all carried as strings.
// NOTE(review): presumably the textual form of the decimal value -- confirm
// the exact format against the reader/writer.
message DecimalStatistics {
  optional string minimum = 1;
  optional string maximum = 2;
  optional string sum = 3;
}
| |
// Date statistics: minimum and maximum only (no sum field).
message DateStatistics {
  // Both bounds are saved as days since epoch.
  optional sint32 minimum = 1;
  optional sint32 maximum = 2;
}
| |
// Timestamp statistics: two minimum/maximum pairs.
message TimestampStatistics {
  // min,max values saved as milliseconds since epoch
  optional sint64 minimum = 1;
  optional sint64 maximum = 2;

  // NOTE(review): presumably the same bounds expressed in UTC (PostScript
  // writerVersion 6 is described as "timestamp statistics use utc") --
  // confirm how these relate to minimum/maximum above.
  optional sint64 minimumUtc = 3;
  optional sint64 maximumUtc = 4;
}
| |
// Binary statistics: only a length total is recorded.
message BinaryStatistics {
  // Total binary blob length in a stripe.
  optional sint64 sum = 1;
}
| |
// Statistics record for one column: a value count, a null flag, an on-disk
// byte count, and one optional sub-message per kind of typed statistics.
message ColumnStatistics {
  optional uint64 numberOfValues = 1;

  // Typed statistics; each sub-message is independently optional.
  // NOTE(review): presumably at most one of these is populated for a given
  // column -- the schema itself does not enforce that.
  optional IntegerStatistics intStatistics = 2;
  optional DoubleStatistics doubleStatistics = 3;
  optional StringStatistics stringStatistics = 4;
  optional BucketStatistics bucketStatistics = 5;
  optional DecimalStatistics decimalStatistics = 6;
  optional DateStatistics dateStatistics = 7;
  optional BinaryStatistics binaryStatistics = 8;
  optional TimestampStatistics timestampStatistics = 9;

  optional bool hasNull = 10;
  optional uint64 bytesOnDisk = 11;
}
| |
// One row-index entry: a packed list of positions plus optional column
// statistics.
// NOTE(review): the meaning of each position is defined by the reader and
// writer, not by this schema.
message RowIndexEntry {
  repeated uint64 positions = 1 [packed = true];
  optional ColumnStatistics statistics = 2;
}
| |
// A row index: an ordered list of RowIndexEntry records.
message RowIndex {
  repeated RowIndexEntry entry = 1;
}
| |
// A single bloom filter: the number of hash functions and the filter bits,
// available in two representations.
message BloomFilter {
  optional uint32 numHashFunctions = 1;

  // Bit set as a list of fixed 64-bit words. This field carries no packed
  // option, so under proto2 it is written unpacked.
  repeated fixed64 bitset = 2;

  // Bit set as a single byte string.
  // NOTE(review): presumably paired with the BLOOM_FILTER_UTF8 stream kind;
  // confirm which of bitset/utf8bitset a writer is expected to fill.
  optional bytes utf8bitset = 3;
}
| |
// A bloom-filter index: an ordered list of BloomFilter records.
message BloomFilterIndex {
  repeated BloomFilter bloomFilter = 1;
}
| |
// Describes one stream by its kind, its column and its length.
message Stream {
  // Stream kinds.
  //
  // If you add new index stream kinds, you need to make sure to update
  // StreamName to ensure it is added to the stripe in the right area.
  enum Kind {
    PRESENT = 0;
    DATA = 1;
    LENGTH = 2;
    DICTIONARY_DATA = 3;
    DICTIONARY_COUNT = 4;
    SECONDARY = 5;
    ROW_INDEX = 6;
    BLOOM_FILTER = 7;
    BLOOM_FILTER_UTF8 = 8;
  }

  optional Kind kind = 1;

  // NOTE(review): presumably the id of the column this stream belongs to.
  optional uint32 column = 2;

  // NOTE(review): presumably the stream length in bytes -- confirm.
  optional uint64 length = 3;
}
| |
// Encoding information for one column.
message ColumnEncoding {
  // Available encodings: direct or dictionary, each with a V2 variant.
  enum Kind {
    DIRECT = 0;
    DICTIONARY = 1;
    DIRECT_V2 = 2;
    DICTIONARY_V2 = 3;
  }

  optional Kind kind = 1;
  optional uint32 dictionarySize = 2;

  // The encoding of the bloom filters for this column:
  //   0 or missing = none or original
  //   1            = ORC-135 (utc for timestamps)
  optional uint32 bloomEncoding = 3;
}
| |
// Stripe footer: the stripe's stream descriptions, its per-column encodings
// and the writer's timezone.
message StripeFooter {
  repeated Stream streams = 1;
  repeated ColumnEncoding columns = 2;

  // NOTE(review): the timezone string format (e.g. a zone id) is not
  // specified here -- confirm against the writer.
  optional string writerTimezone = 3;
}
| |
// One type node of the schema: its kind, child types, field names and
// optional size attributes.
message Type {
  // The supported type kinds.
  enum Kind {
    BOOLEAN = 0;
    BYTE = 1;
    SHORT = 2;
    INT = 3;
    LONG = 4;
    FLOAT = 5;
    DOUBLE = 6;
    STRING = 7;
    BINARY = 8;
    TIMESTAMP = 9;
    LIST = 10;
    MAP = 11;
    STRUCT = 12;
    UNION = 13;
    DECIMAL = 14;
    DATE = 15;
    VARCHAR = 16;
    CHAR = 17;
  }

  optional Kind kind = 1;

  // NOTE(review): presumably indexes of the child types in the enclosing
  // type list (Footer.types) -- confirm. Written with packed encoding.
  repeated uint32 subtypes = 2 [packed = true];

  repeated string fieldNames = 3;

  // NOTE(review): presumably maximumLength applies to VARCHAR/CHAR and
  // precision/scale to DECIMAL -- confirm against the reader.
  optional uint32 maximumLength = 4;
  optional uint32 precision = 5;
  optional uint32 scale = 6;
}
| |
// Location and size information for one stripe.
// NOTE(review): offset and the *Length fields are presumably byte counts;
// the schema only states that they are unsigned 64-bit integers.
message StripeInformation {
  optional uint64 offset = 1;
  optional uint64 indexLength = 2;
  optional uint64 dataLength = 3;
  optional uint64 footerLength = 4;
  optional uint64 numberOfRows = 5;
}
| |
// A single user metadata entry: a string name mapped to an opaque byte
// value.
message UserMetadataItem {
  optional string name = 1;
  optional bytes value = 2;
}
| |
// Statistics for one stripe: a list of ColumnStatistics records.
message StripeStatistics {
  repeated ColumnStatistics colStats = 1;
}
| |
// File metadata section: a list of StripeStatistics records.
message Metadata {
  repeated StripeStatistics stripeStats = 1;
}
| |
// File footer: header/content lengths, the stripe list, the type list, user
// metadata, the row count, column statistics, the row index stride and the
// writer code.
message Footer {
  optional uint64 headerLength = 1;
  optional uint64 contentLength = 2;
  repeated StripeInformation stripes = 3;
  repeated Type types = 4;
  repeated UserMetadataItem metadata = 5;
  optional uint64 numberOfRows = 6;
  repeated ColumnStatistics statistics = 7;
  optional uint32 rowIndexStride = 8;

  // Each implementation that writes ORC files should register for a code:
  //   0 = ORC Java
  //   1 = ORC C++
  //   2 = Presto
  //   3 = Scritchley Go from https://github.com/scritchley/orc
  optional uint32 writer = 9;
}
| |
// Compression codecs that PostScript.compression can name.
enum CompressionKind {
  NONE = 0;
  ZLIB = 1;
  SNAPPY = 2;
  LZO = 3;
  LZ4 = 4;
  ZSTD = 5;
}
| |
// Serialized length must be less than 255 bytes.
message PostScript {
  optional uint64 footerLength = 1;
  optional CompressionKind compression = 2;
  optional uint64 compressionBlockSize = 3;

  // The version of the file format:
  //   [0, 11] = Hive 0.11
  //   [0, 12] = Hive 0.12
  repeated uint32 version = 4 [packed = true];

  optional uint64 metadataLength = 5;

  // The version of the writer that wrote the file. This number is
  // updated when we make fixes or large changes to the writer so that
  // readers can detect whether a given bug is present in the data.
  //
  // Only the Java ORC writer may use values under 6 (or missing) so that
  // readers that predate ORC-202 treat the new writers correctly. Each
  // writer should assign their own sequence of versions starting from 6.
  //
  // Version of the ORC Java writer:
  //   0 = original
  //   1 = HIVE-8732 fixed (fixed stripe/file maximum statistics &
  //                        string statistics use utf8 for min/max)
  //   2 = HIVE-4243 fixed (use real column names from Hive tables)
  //   3 = HIVE-12055 fixed (vectorized writer implementation)
  //   4 = HIVE-13083 fixed (decimals write present stream correctly)
  //   5 = ORC-101 fixed (bloom filters use utf8 consistently)
  //   6 = ORC-135 fixed (timestamp statistics use utc)
  //
  // Version of the ORC C++ writer:
  //   6 = original
  //
  // Version of the Presto writer:
  //   6 = original
  //
  // Version of the Scritchley Go writer:
  //   6 = original
  //
  optional uint32 writerVersion = 6;

  // Leave this last in the record.
  optional string magic = 8000;
}
| |
// The contents of the file tail that must be serialized.
// This gets serialized as part of OrcSplit, and is also used by the footer
// cache.
message FileTail {
  optional PostScript postscript = 1;
  optional Footer footer = 2;
  optional uint64 fileLength = 3;
  optional uint64 postscriptLength = 4;
}