Merge branch 'delta_encoding'
Conflicts:
Encodings.md
src/thrift/parquet.thrift
diff --git a/Encodings.md b/Encodings.md
index d39ac47..75ed475 100644
--- a/Encodings.md
+++ b/Encodings.md
@@ -1,4 +1,3 @@
-
Parquet encoding definitions
====
@@ -130,4 +129,34 @@
block
0 (minimum delta), 2 (bitwidth), 000000111111b (0,0,0,3,3,3 packed on 2 bits)
+### Delta-length byte array:
+Supported Types: BYTE_ARRAY
+
+This encoding is always preferred over PLAIN for byte array columns.
+
+For this encoding, we will take all the byte array lengths and encode them using delta
+encoding. The byte array data follows all of the length data just concatenated back to
+back. The expected savings comes from the cost of encoding the lengths and possibly
+better compression in the data (it is no longer interleaved with the lengths).
+
+The data stream looks like:
+
+<Delta Encoded Lengths> <Byte Array Data>
+
+For example, if the data was "Hello", "World", "Foobar", "ABCDEF":
+
+The encoded data would be DeltaEncoding(5, 5, 6, 6) "HelloWorldFoobarABCDEF"
+
+### Delta Strings:
+
+Supported Types: BYTE_ARRAY
+
+This is also known as incremental encoding or front compression: for each element in a
+sorted sequence of strings, store the length of the prefix shared with the previous
+entry, followed by the suffix.
+
+For a longer description, see http://en.wikipedia.org/wiki/Incremental_encoding.
+
+This is stored as a sequence of delta-encoded prefix lengths (DELTA_BINARY_PACKED), followed by
+the suffixes encoded as delta length byte arrays (DELTA_LENGTH_BYTE_ARRAY).
diff --git a/src/thrift/parquet.thrift b/src/thrift/parquet.thrift
index 6762578..71807f9 100644
--- a/src/thrift/parquet.thrift
+++ b/src/thrift/parquet.thrift
@@ -54,7 +54,7 @@
/** a key/value pair is converted into a group of two fields */
MAP_KEY_VALUE = 2;
- /** a list is converted into an optional field containing a repeated field for its
+ /** a list is converted into an optional field containing a repeated field for its
* values */
LIST = 3;
@@ -62,7 +62,7 @@
ENUM = 4;
}
-/**
+/**
* Representation of Schemas
*/
enum FieldRepetitionType {
@@ -125,7 +125,7 @@
* INT64 - 8 bytes per value. Stored as little-endian.
* FLOAT - 4 bytes per value. IEEE. Stored as little-endian.
* DOUBLE - 8 bytes per value. IEEE. Stored as little-endian.
- * BYTE_ARRAY - 4 byte length stored as little endian, followed by bytes.
+ * BYTE_ARRAY - 4 byte length stored as little endian, followed by bytes.
* FIXED_LEN_BYTE_ARRAY - Just the bytes.
*/
PLAIN = 0;
@@ -133,8 +133,8 @@
/** Group VarInt encoding for INT32/INT64. */
GROUP_VAR_INT = 1;
- /** Dictionary encoding. The values in the dictionary are encoded in the
- * plain type.
+ /** Dictionary encoding. The values in the dictionary are encoded in the
+ * plain type.
*/
PLAIN_DICTIONARY = 2;
@@ -149,10 +149,18 @@
/** Delta encoding for integers. This can be used for int columns and works best
* on sorted data */
DELTA_BINARY_PACKED = 5;
+
+  /** Encoding for byte arrays to separate the length values and the data. The lengths
+   * are encoded using DELTA_BINARY_PACKED **/
+ DELTA_LENGTH_BYTE_ARRAY = 6;
+
+ /** Delta-encoded sorted strings.
+ */
+ DELTA_STRINGS = 7;
}
/**
- * Supported compression algorithms.
+ * Supported compression algorithms.
*/
enum CompressionCodec {
UNCOMPRESSED = 0;
@@ -205,7 +213,7 @@
3: required i32 compressed_page_size
/** 32bit crc for the data below. This allows for disabling checksumming in HDFS
- * if only a few pages needs to be read
+ * if only a few pages needs to be read
**/
4: optional i32 crc
@@ -215,7 +223,7 @@
7: optional DictionaryPageHeader dictionary_page_header;
}
-/**
+/**
* Wrapper struct to store key values
*/
struct KeyValue {
@@ -245,7 +253,7 @@
/** Type of this column **/
1: required Type type
- /** Set of all encodings used for this column. The purpose is to validate
+ /** Set of all encodings used for this column. The purpose is to validate
* whether we can decode those pages. **/
2: required list<Encoding> encodings
@@ -278,7 +286,7 @@
}
struct ColumnChunk {
- /** File where column data is stored. If not set, assumed to be same file as
+ /** File where column data is stored. If not set, assumed to be same file as
* metadata. This path is relative to the current file.
**/
1: optional string file_path
@@ -288,11 +296,11 @@
/** Column metadata for this chunk. This is the same content as what is at
* file_path/file_offset. Having it here has it replicated in the file
- * metadata.
+ * metadata.
**/
3: optional ColumnMetaData meta_data
}
-
+
struct RowGroup {
1: required list<ColumnChunk> columns
@@ -333,7 +341,7 @@
5: optional list<KeyValue> key_value_metadata
/** String for application that wrote this file. This should be in the format
- * <Application> version <App Version> (build <App Build Hash>).
+ * <Application> version <App Version> (build <App Build Hash>).
* e.g. impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55)
**/
6: optional string created_by