Merge branch 'delta_encoding'
Conflicts:
Encodings.md
src/thrift/parquet.thrift
diff --git a/Encodings.md b/Encodings.md
index d39ac47..75ed475 100644
--- a/Encodings.md
+++ b/Encodings.md
@@ -1,4 +1,3 @@
-
Parquet encoding definitions
====
@@ -130,4 +129,34 @@
block
0 (minimum delta), 2 (bitwidth), 000000111111b (0,0,0,3,3,3 packed on 2 bits)
+### Delta-length byte array:
+Supported Types: BYTE_ARRAY
+
+This encoding is always preferred over PLAIN for byte array columns.
+
+For this encoding, we will take all the byte array lengths and encode them using delta
+encoding. The byte array data follows all of the length data just concatenated back to
+back. The expected savings comes from the cost of encoding the lengths and possibly
+better compression in the data (it is no longer interleaved with the lengths).
+
+The data stream looks like:
+
+<Delta Encoded Lengths> <Byte Array Data>
+
+For example, if the data was "Hello", "World", "Foobar", "ABCDEF":
+
+The encoded data would be DeltaEncoding(5, 5, 6, 6) "HelloWorldFoobarABCDEF"
+
+### Delta Strings:
+
+Supported Types: BYTE_ARRAY
+
+This is also known as incremental encoding or front compression: for each element in a
+sorted sequence of strings, store the length of the prefix shared with the previous
+entry, followed by the suffix.
+
+For a longer description, see http://en.wikipedia.org/wiki/Incremental_encoding.
+
+This is stored as a sequence of delta-encoded prefix lengths (DELTA_BINARY_PACKED), followed by
+the suffixes encoded as delta length byte arrays (DELTA_LENGTH_BYTE_ARRAY).
diff --git a/src/thrift/parquet.thrift b/src/thrift/parquet.thrift
index 6762578..71807f9 100644
--- a/src/thrift/parquet.thrift
+++ b/src/thrift/parquet.thrift
@@ -54,7 +54,7 @@
/** a key/value pair is converted into a group of two fields */
MAP_KEY_VALUE = 2;
- /** a list is converted into an optional field containing a repeated field for its
+ /** a list is converted into an optional field containing a repeated field for its
* values */
LIST = 3;
@@ -62,7 +62,7 @@
ENUM = 4;
}
-/**
+/**
* Representation of Schemas
*/
enum FieldRepetitionType {
@@ -125,7 +125,7 @@
* INT64 - 8 bytes per value. Stored as little-endian.
* FLOAT - 4 bytes per value. IEEE. Stored as little-endian.
* DOUBLE - 8 bytes per value. IEEE. Stored as little-endian.
- * BYTE_ARRAY - 4 byte length stored as little endian, followed by bytes.
+ * BYTE_ARRAY - 4 byte length stored as little endian, followed by bytes.
* FIXED_LEN_BYTE_ARRAY - Just the bytes.
*/
PLAIN = 0;
@@ -133,8 +133,8 @@
/** Group VarInt encoding for INT32/INT64. */
GROUP_VAR_INT = 1;
- /** Dictionary encoding. The values in the dictionary are encoded in the
- * plain type.
+ /** Dictionary encoding. The values in the dictionary are encoded in the
+ * plain type.
*/
PLAIN_DICTIONARY = 2;
@@ -149,10 +149,18 @@
/** Delta encoding for integers. This can be used for int columns and works best
* on sorted data */
DELTA_BINARY_PACKED = 5;
+
+  /** Encoding for byte arrays to separate the length values and the data. The lengths
+   * are encoded using DELTA_BINARY_PACKED **/
+ DELTA_LENGTH_BYTE_ARRAY = 6;
+
+ /** Delta-encoded sorted strings.
+ */
+ DELTA_STRINGS = 7;
}
/**
- * Supported compression algorithms.
+ * Supported compression algorithms.
*/
enum CompressionCodec {
UNCOMPRESSED = 0;
@@ -205,7 +213,7 @@
3: required i32 compressed_page_size
/** 32bit crc for the data below. This allows for disabling checksumming in HDFS
- * if only a few pages needs to be read
+ * if only a few pages needs to be read
**/
4: optional i32 crc
@@ -215,7 +223,7 @@
7: optional DictionaryPageHeader dictionary_page_header;
}
-/**
+/**
* Wrapper struct to store key values
*/
struct KeyValue {
@@ -245,7 +253,7 @@
/** Type of this column **/
1: required Type type
- /** Set of all encodings used for this column. The purpose is to validate
+ /** Set of all encodings used for this column. The purpose is to validate
* whether we can decode those pages. **/
2: required list<Encoding> encodings
@@ -278,7 +286,7 @@
}
struct ColumnChunk {
- /** File where column data is stored. If not set, assumed to be same file as
+ /** File where column data is stored. If not set, assumed to be same file as
* metadata. This path is relative to the current file.
**/
1: optional string file_path
@@ -288,11 +296,11 @@
/** Column metadata for this chunk. This is the same content as what is at
* file_path/file_offset. Having it here has it replicated in the file
- * metadata.
+ * metadata.
**/
3: optional ColumnMetaData meta_data
}
-
+
struct RowGroup {
1: required list<ColumnChunk> columns
@@ -333,7 +341,7 @@
5: optional list<KeyValue> key_value_metadata
/** String for application that wrote this file. This should be in the format
- * <Application> version <App Version> (build <App Build Hash>).
+ * <Application> version <App Version> (build <App Build Hash>).
* e.g. impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55)
**/
6: optional string created_by