| /** |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
// This schema is written in proto2 syntax; every singular field in this file
// carries an explicit `optional` label.
syntax = "proto2";

// Protobuf package that scopes every message and enum declared in this file.
package orc.proto;

// Java package into which the generated classes are placed.
option java_package = "org.apache.orc";
| |
// Minimum, maximum and sum carried as signed 64-bit integers.
// Referenced from ColumnStatistics.intStatistics.
// NOTE(review): presumably populated for the integer-typed columns; confirm
// against the writer.
message IntegerStatistics {
  // sint64 is ZigZag-encoded on the wire, so negative values stay compact.
  optional sint64 minimum = 1;
  optional sint64 maximum = 2;
  optional sint64 sum = 3;
}
| |
// Minimum, maximum and sum carried as double-precision floating point values.
// Referenced from ColumnStatistics.doubleStatistics.
// NOTE(review): presumably populated for the floating point columns; confirm
// against the writer.
message DoubleStatistics {
  optional double minimum = 1;
  optional double maximum = 2;
  optional double sum = 3;
}
| |
// Statistics for string values. Referenced from
// ColumnStatistics.stringStatistics.
message StringStatistics {
  optional string minimum = 1;
  optional string maximum = 2;

  // The total length of all strings in a stripe.
  optional sint64 sum = 3;

  // When the minimum or maximum value was longer than 1024 bytes, a lower or
  // upper bound is stored here instead of the minimum or maximum values
  // above.
  optional string lowerBound = 4;
  optional string upperBound = 5;
}
| |
// A list of unsigned counts. Referenced from
// ColumnStatistics.bucketStatistics.
// NOTE(review): what each bucket represents is not stated in this file;
// confirm against the writer before relying on element order.
message BucketStatistics {
  // Packed: all elements are written as one length-delimited record.
  repeated uint64 count = 1 [packed = true];
}
| |
// Minimum, maximum and sum carried as strings. Referenced from
// ColumnStatistics.decimalStatistics.
// NOTE(review): presumably the textual form of the decimal values; the exact
// string format is not defined in this file.
message DecimalStatistics {
  optional string minimum = 1;
  optional string maximum = 2;
  optional string sum = 3;
}
| |
// Statistics for date values. Referenced from ColumnStatistics.dateStatistics.
message DateStatistics {
  // Both bounds are saved as days since epoch.
  optional sint32 minimum = 1;
  optional sint32 maximum = 2;
}
| |
// Statistics for timestamp values. Referenced from
// ColumnStatistics.timestampStatistics.
message TimestampStatistics {
  // min and max values are saved as milliseconds since epoch.
  optional sint64 minimum = 1;
  optional sint64 maximum = 2;

  // NOTE(review): presumably the same bounds expressed in UTC (the writer
  // version notes in PostScript mention ORC-135, "timestamp statistics use
  // utc"); confirm which pair a given reader should prefer.
  optional sint64 minimumUtc = 3;
  optional sint64 maximumUtc = 4;
}
| |
// Statistics for binary values. Referenced from
// ColumnStatistics.binaryStatistics.
message BinaryStatistics {
  // The total binary blob length in a stripe.
  optional sint64 sum = 1;
}
| |
// Statistics for list and map columns.
// NOTE(review): the field names suggest the smallest, largest and total
// number of child elements; confirm the exact definition against the writer.
message CollectionStatistics {
  optional uint64 minChildren = 1;
  optional uint64 maxChildren = 2;
  optional uint64 totalChildren = 3;
}
| |
// Statistics for one column: a few general fields plus one optional
// sub-message per kind of value.
// NOTE(review): presumably at most one of the type-specific sub-messages is
// set for any given column; this file does not enforce that (no oneof).
message ColumnStatistics {
  // NOTE(review): presumably the count of values covered by these
  // statistics; whether nulls are included is not stated here.
  optional uint64 numberOfValues = 1;

  // Type-specific statistics.
  optional IntegerStatistics intStatistics = 2;
  optional DoubleStatistics doubleStatistics = 3;
  optional StringStatistics stringStatistics = 4;
  optional BucketStatistics bucketStatistics = 5;
  optional DecimalStatistics decimalStatistics = 6;
  optional DateStatistics dateStatistics = 7;
  optional BinaryStatistics binaryStatistics = 8;
  optional TimestampStatistics timestampStatistics = 9;

  // NOTE(review): presumably true when the covered range contains a null.
  optional bool hasNull = 10;

  // NOTE(review): presumably the stored size of the column in bytes.
  optional uint64 bytesOnDisk = 11;

  // Statistics for list and map columns.
  optional CollectionStatistics collectionStatistics = 12;
}
| |
// One entry of a RowIndex: a list of positions plus column statistics.
// NOTE(review): how the positions are interpreted is not defined in this
// file; presumably they locate the start of a row group inside the column's
// streams.
message RowIndexEntry {
  // Packed: all elements are written as one length-delimited record.
  repeated uint64 positions = 1 [packed = true];
  optional ColumnStatistics statistics = 2;
}
| |
// A sequence of RowIndexEntry messages.
// NOTE(review): presumably this is the payload of a Stream of kind
// ROW_INDEX; confirm against the reader.
message RowIndex {
  repeated RowIndexEntry entry = 1;
}
| |
// A single bloom filter: the number of hash functions plus the bit set, which
// can be carried either as 64-bit words or as raw bytes.
// NOTE(review): presumably a writer fills only one of bitset / utf8bitset
// (compare Stream.Kind BLOOM_FILTER and BLOOM_FILTER_UTF8); confirm.
message BloomFilter {
  optional uint32 numHashFunctions = 1;

  // Declared without the packed option, so in proto2 each element is written
  // as its own fixed64 record.
  repeated fixed64 bitset = 2;

  optional bytes utf8bitset = 3;
}
| |
// A sequence of BloomFilter messages.
// NOTE(review): presumably one filter per row index entry; confirm against
// the writer.
message BloomFilterIndex {
  repeated BloomFilter bloomFilter = 1;
}
| |
// Describes one stream: its kind, the column it belongs to and its length.
message Stream {
  // If you add new index stream kinds, make sure to update StreamName as
  // well, so that the new kind is added to the stripe in the right area.
  enum Kind {
    PRESENT = 0;
    DATA = 1;
    LENGTH = 2;
    DICTIONARY_DATA = 3;
    DICTIONARY_COUNT = 4;
    SECONDARY = 5;
    ROW_INDEX = 6;
    BLOOM_FILTER = 7;
    BLOOM_FILTER_UTF8 = 8;

    // Virtual stream kinds to allocate space for encrypted index and data.
    ENCRYPTED_INDEX = 9;
    ENCRYPTED_DATA = 10;

    // Stripe statistics streams.
    STRIPE_STATISTICS = 100;

    // A virtual stream kind that is used for setting the encryption IV.
    FILE_STATISTICS = 101;
  }

  optional Kind kind = 1;

  // NOTE(review): presumably the id of the column this stream belongs to.
  optional uint32 column = 2;

  // NOTE(review): presumably the length of the stream in bytes.
  optional uint64 length = 3;
}
| |
// How a column is encoded.
message ColumnEncoding {
  enum Kind {
    DIRECT = 0;
    DICTIONARY = 1;
    DIRECT_V2 = 2;
    DICTIONARY_V2 = 3;
  }

  optional Kind kind = 1;

  // NOTE(review): presumably only meaningful for the DICTIONARY kinds.
  optional uint32 dictionarySize = 2;

  // The encoding of the bloom filters for this column:
  //   0 or missing = none or original
  //   1            = ORC-135 (utc for timestamps)
  optional uint32 bloomEncoding = 3;
}
| |
// The streams and column encodings of one encryption variant within a stripe.
// StripeFooter.encryption holds one of these for each column encryption
// variant.
message StripeEncryptionVariant {
  repeated Stream streams = 1;
  repeated ColumnEncoding encoding = 2;
}
| |
// Each stripe looks like:
//   index streams
//     unencrypted
//     variant 1..N
//   data streams
//     unencrypted
//     variant 1..N
//   footer

// The footer of a stripe: its streams, its column encodings, the writer's
// timezone and the per-variant encryption information.
message StripeFooter {
  repeated Stream streams = 1;
  repeated ColumnEncoding columns = 2;

  // NOTE(review): presumably a timezone identifier string; the expected
  // format is not stated in this file.
  optional string writerTimezone = 3;

  // One for each column encryption variant.
  repeated StripeEncryptionVariant encryption = 4;
}
| |
// The file tail looks like:
//   encrypted stripe statistics: ColumnarStripeStatistics (ordered by variant)
//   stripe statistics: Metadata
//   footer: Footer
//   postscript: PostScript
//   psLen: byte
| |
// A key/value pair of strings. Used by Type.attributes.
message StringPair {
  optional string key = 1;
  optional string value = 2;
}
| |
// One type in the file schema. Footer.types holds the full list.
message Type {
  enum Kind {
    BOOLEAN = 0;
    BYTE = 1;
    SHORT = 2;
    INT = 3;
    LONG = 4;
    FLOAT = 5;
    DOUBLE = 6;
    STRING = 7;
    BINARY = 8;
    TIMESTAMP = 9;
    LIST = 10;
    MAP = 11;
    STRUCT = 12;
    UNION = 13;
    DECIMAL = 14;
    DATE = 15;
    VARCHAR = 16;
    CHAR = 17;
    TIMESTAMP_INSTANT = 18;
  }

  optional Kind kind = 1;

  // NOTE(review): presumably the ids of the child types (indexes into
  // Footer.types); confirm against the reader. Packed: all elements are
  // written as one length-delimited record.
  repeated uint32 subtypes = 2 [packed = true];

  // NOTE(review): presumably parallel to subtypes and only used for STRUCT.
  repeated string fieldNames = 3;

  // NOTE(review): presumably only meaningful for VARCHAR and CHAR.
  optional uint32 maximumLength = 4;

  // NOTE(review): presumably only meaningful for DECIMAL.
  optional uint32 precision = 5;
  optional uint32 scale = 6;

  // Free-form key/value attributes attached to this type.
  repeated StringPair attributes = 7;
}
| |
// Location, size and encryption details of one stripe. Footer.stripes holds
// one of these per stripe.
message StripeInformation {
  // The global file offset of the start of the stripe.
  optional uint64 offset = 1;

  // The number of bytes of index.
  optional uint64 indexLength = 2;

  // The number of bytes of data.
  optional uint64 dataLength = 3;

  // The number of bytes in the stripe footer.
  optional uint64 footerLength = 4;

  // The number of rows in this stripe.
  optional uint64 numberOfRows = 5;

  // If this is present, the reader should use this value for the encryption
  // stripe id for setting the encryption IV. Otherwise, the reader should
  // use one larger than the previous stripe's encryptStripeId.
  // For unmerged ORC files, the first stripe will use 1 and the rest of the
  // stripes won't have it set. For merged files, the stripe information
  // will be copied from their original files and thus the first stripe of
  // each of the input files will reset it to 1.
  // Note that 1 was chosen because protobuf v3 doesn't serialize
  // primitive types that are the default (e.g. 0).
  optional uint64 encryptStripeId = 6;

  // For each encryption variant, the new encrypted local key to use
  // until we find a replacement.
  repeated bytes encryptedLocalKeys = 7;
}
| |
// One named metadata entry with an opaque byte value. Footer.metadata holds
// the list of these.
message UserMetadataItem {
  optional string name = 1;
  optional bytes value = 2;
}
| |
// StripeStatistics (one per stripe), each of which contains the
// ColumnStatistics for each column.
// This message type is only used in ORC v0 and v1.
message StripeStatistics {
  repeated ColumnStatistics colStats = 1;
}
| |
// The list of StripeStatistics for the file.
// This message type is only used in ORC v0 and v1.
message Metadata {
  repeated StripeStatistics stripeStats = 1;
}
| |
// In ORC v2 (and for encrypted columns in v1), each column has
// its column statistics written separately.
message ColumnarStripeStatistics {
  // One value for each stripe in the file.
  repeated ColumnStatistics colStats = 1;
}
| |
// The encryption algorithm of a key. Referenced from EncryptionKey.algorithm.
enum EncryptionAlgorithm {
  // Used for detecting future algorithms.
  UNKNOWN_ENCRYPTION = 0;
  AES_CTR_128 = 1;
  AES_CTR_256 = 2;
}
| |
// A list of ColumnStatistics. EncryptionVariant.fileStatistics carries the
// encrypted form of this message as bytes.
// NOTE(review): presumably one entry per column of the variant; confirm.
message FileStatistics {
  repeated ColumnStatistics column = 1;
}
| |
// How was the data masked? This isn't necessary for reading the file, but
// it documents how the file was written.
message DataMask {
  // The kind of masking, which may include third-party masks.
  optional string name = 1;

  // Parameters for the mask.
  repeated string maskParameters = 2;

  // The unencrypted column roots this mask was applied to.
  repeated uint32 columns = 3 [packed = true];
}
| |
// Information about one encryption key: its name, version and algorithm.
// Encryption.key holds the list of keys used in the file.
message EncryptionKey {
  optional string keyName = 1;
  optional uint32 keyVersion = 2;
  optional EncryptionAlgorithm algorithm = 3;
}
| |
// The description of an encryption variant.
// Each variant is a single subtype that is encrypted with a single key.
message EncryptionVariant {
  // The column id of the root.
  optional uint32 root = 1;

  // The master key that was used to encrypt the local key, referenced as
  // an index into the Encryption.key list.
  optional uint32 key = 2;

  // The encrypted key for the file footer.
  optional bytes encryptedKey = 3;

  // The stripe statistics for this variant.
  repeated Stream stripeStatistics = 4;

  // Encrypted file statistics, serialized as a FileStatistics message.
  optional bytes fileStatistics = 5;
}
| |
// Which KeyProvider encrypted the local keys. Referenced from
// Encryption.keyProvider.
enum KeyProviderKind {
  UNKNOWN = 0;
  HADOOP = 1;
  AWS = 2;
  GCP = 3;
  AZURE = 4;
}
| |
// Information about the encryption in a file. Referenced from
// Footer.encryption.
message Encryption {
  // All of the masks used in this file.
  repeated DataMask mask = 1;

  // All of the keys used in this file.
  repeated EncryptionKey key = 2;

  // The encrypted variants.
  // Readers should prefer the first variant for which the user has access to
  // the corresponding key. If they don't have access to any of the keys,
  // they should get the unencrypted masked data.
  repeated EncryptionVariant variants = 3;

  // How are the local keys encrypted?
  optional KeyProviderKind keyProvider = 4;
}
| |
// The calendar of a file. Referenced from Footer.calendar.
enum CalendarKind {
  UNKNOWN_CALENDAR = 0;

  // The Java default calendar changes from Julian to Gregorian
  // in 1583.
  JULIAN_GREGORIAN = 1;

  // A calendar that extends the Gregorian calendar back forever.
  PROLEPTIC_GREGORIAN = 2;
}
| |
// The file footer: lengths, the stripe list, the type list, user metadata,
// statistics, writer identification, encryption and calendar.
message Footer {
  optional uint64 headerLength = 1;
  optional uint64 contentLength = 2;
  repeated StripeInformation stripes = 3;
  repeated Type types = 4;
  repeated UserMetadataItem metadata = 5;
  optional uint64 numberOfRows = 6;
  repeated ColumnStatistics statistics = 7;

  // NOTE(review): presumably the number of rows between row index entries;
  // confirm against the writer.
  optional uint32 rowIndexStride = 8;

  // Each implementation that writes ORC files should register for a code:
  //   0 = ORC Java
  //   1 = ORC C++
  //   2 = Presto
  //   3 = Scritchley Go from https://github.com/scritchley/orc
  optional uint32 writer = 9;

  // Information about the encryption in this file.
  optional Encryption encryption = 10;

  optional CalendarKind calendar = 11;
}
| |
// The compression codec named in the postscript. Referenced from
// PostScript.compression.
enum CompressionKind {
  NONE = 0;
  ZLIB = 1;
  SNAPPY = 2;
  LZO = 3;
  LZ4 = 4;
  ZSTD = 5;
}
| |
// Serialized length must be less than 255 bytes.
message PostScript {
  optional uint64 footerLength = 1;
  optional CompressionKind compression = 2;
  optional uint64 compressionBlockSize = 3;

  // The version of the file format:
  //   [0, 11] = Hive 0.11
  //   [0, 12] = Hive 0.12
  repeated uint32 version = 4 [packed = true];

  optional uint64 metadataLength = 5;

  // The version of the writer that wrote the file. This number is
  // updated when we make fixes or large changes to the writer so that
  // readers can detect whether a given bug is present in the data.
  //
  // Only the Java ORC writer may use values under 6 (or missing) so that
  // readers that predate ORC-202 treat the new writers correctly. Each
  // writer should assign their own sequence of versions starting from 6.
  //
  // Version of the ORC Java writer:
  //   0 = original
  //   1 = HIVE-8732 fixed (fixed stripe/file maximum statistics &
  //                        string statistics use utf8 for min/max)
  //   2 = HIVE-4243 fixed (use real column names from Hive tables)
  //   3 = HIVE-12055 added (vectorized writer implementation)
  //   4 = HIVE-13083 fixed (decimals write present stream correctly)
  //   5 = ORC-101 fixed (bloom filters use utf8 consistently)
  //   6 = ORC-135 fixed (timestamp statistics use utc)
  //   7 = ORC-517 fixed (decimal64 min/max incorrect)
  //   8 = ORC-203 added (trim very long string statistics)
  //   9 = ORC-14 added (column encryption)
  //
  // Version of the ORC C++ writer:
  //   6 = original
  //
  // Version of the Presto writer:
  //   6 = original
  //
  // Version of the Scritchley Go writer:
  //   6 = original
  //
  optional uint32 writerVersion = 6;

  // The number of bytes in the encrypted stripe statistics.
  optional uint64 stripeStatisticsLength = 7;

  // Leave this last in the record.
  optional string magic = 8000;
}
| |
// The contents of the file tail that must be serialized.
// This gets serialized as part of OrcSplit and is also used by the footer
// cache.
message FileTail {
  optional PostScript postscript = 1;
  optional Footer footer = 2;
  optional uint64 fileLength = 3;
  optional uint64 postscriptLength = 4;
}