PARQUET-2261: add statistics for better estimating unencoded/uncompressed sizes and finer grained filtering (#197) Adds histograms of repetition/definition level and unencoded byte_array sizes to column chunks and to page indices. --------- Co-authored-by: Gang Wu <ustcwg@gmail.com> Co-authored-by: Gabor Szadovszky <gabor@apache.org> Co-authored-by: Antoine Pitrou <pitrou@free.fr> Co-authored-by: mwish <1506118561@qq.com> Co-authored-by: JFinis <jan_finis@gmx.de>

commit: 40699d05bd24181de6b1457babbee2c16dce3803 [log] [tgz]
author: emkornfield <emkornfield@gmail.com> Tue Nov 14 12:48:23 2023 -0800
committer: GitHub <noreply@github.com> Tue Nov 14 12:48:23 2023 -0800
tree: 52387a14ac48fbc0226c9abbdc5fa9a33ea460f4
parent: 6f3f909ef410852713cc3865aa72e65eb21f9323 [diff]
diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift
index e52675e..9decb5a 100644
--- a/src/main/thrift/parquet.thrift
+++ b/src/main/thrift/parquet.thrift

@@ -192,6 +192,52 @@
 }
 
 /**
+ * A structure for capturing metadata for estimating the unencoded,
+ * uncompressed size of data written. This is useful for readers to estimate
+ * how much memory is needed to reconstruct data in their memory model and for
+ * fine grained filter pushdown on nested structures (the histograms contained
+ * in this structure can help determine the number of nulls at a particular
+ * nesting level and maximum length of lists).
+ */
+struct SizeStatistics {
+   /**
+    * The number of physical bytes stored for BYTE_ARRAY data values assuming
+    * no encoding. This is exclusive of the bytes needed to store the length of
+    * each byte array. In other words, this field is equivalent to the `(size
+    * of PLAIN-ENCODING the byte array values) - (4 bytes * number of values
+    * written)`. To determine unencoded sizes of other types readers can use
+    * schema information multiplied by the number of non-null and null values.
+    * The number of null/non-null values can be inferred from the histograms
+    * below.
+    *
+    * For example, if a column chunk is dictionary-encoded with dictionary
+    * ["a", "bc", "cde"], and a data page contains the indices [0, 0, 1, 2],
+    * then this value for that data page should be 7 (1 + 1 + 2 + 3).
+    *
+    * This field should only be set for types that use BYTE_ARRAY as their
+    * physical type.
+    */
+   1: optional i64 unencoded_byte_array_data_bytes;
+   /**
+    * When present, there is expected to be one element corresponding to each
+    * repetition (i.e. size=max repetition_level+1) where each element
+    * represents the number of times the repetition level was observed in the
+    * data.
+    *
+    * This field may be omitted if max_repetition_level is 0 without loss
+    * of information.
+    **/
+   2: optional list<i64> repetition_level_histogram;
+   /**
+    * Same as repetition_level_histogram except for definition levels.
+    *
+    * This field may be omitted if max_definition_level is 0 or 1 without
+    * loss of information.
+    **/
+   3: optional list<i64> definition_level_histogram;
+}
+
+/**
  * Statistics per row group and per page
  * All fields are optional.
  */
@@ -541,7 +587,7 @@
   /** Encoding used for repetition levels **/
   4: required Encoding repetition_level_encoding;
 
-  /** Optional statistics for the data in this page**/
+  /** Optional statistics for the data in this page **/
   5: optional Statistics statistics;
 }
 
@@ -583,19 +629,19 @@
 
   // repetition levels and definition levels are always using RLE (without size in it)
 
-  /** length of the definition levels */
+  /** Length of the definition levels */
   5: required i32 definition_levels_byte_length;
-  /** length of the repetition levels */
+  /** Length of the repetition levels */
   6: required i32 repetition_levels_byte_length;
 
-  /**  whether the values are compressed.
+  /**  Whether the values are compressed.
   Which means the section of the page between
   definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included)
   is compressed with the compression_codec.
   If missing it is considered compressed */
   7: optional bool is_compressed = true;
 
-  /** optional statistics for the data in this page **/
+  /** Optional statistics for the data in this page **/
   8: optional Statistics statistics;
 }
 
@@ -608,11 +654,11 @@
 }
 
 /** Hash strategy type annotation. xxHash is an extremely fast non-cryptographic hash
- * algorithm. It uses 64 bits version of xxHash. 
+ * algorithm. It uses 64 bits version of xxHash.
  **/
 struct XxHash {}
 
-/** 
+/**
  * The hash function used in Bloom filter. This function takes the hash of a column value
  * using plain encoding.
  **/
@@ -776,6 +822,14 @@
    * in a single I/O.
    */
   15: optional i32 bloom_filter_length;
+
+  /**
+   * Optional statistics to help estimate total memory when converted to in-memory
+   * representations. The histograms contained in these statistics can
+   * also be useful in some cases for more fine-grained nullability/list length
+   * filter pushdown.
+   */
+  16: optional SizeStatistics size_statistics;
 }
 
 struct EncryptionWithFooterKey {
@@ -784,7 +838,7 @@
 struct EncryptionWithColumnKey {
   /** Column path in schema **/
   1: required list<string> path_in_schema
-  
+
   /** Retrieval metadata of column encryption key **/
   2: optional binary key_metadata
 }
@@ -823,7 +877,7 @@
 
   /** Crypto metadata of encrypted columns **/
   8: optional ColumnCryptoMetaData crypto_metadata
-  
+
   /** Encrypted column metadata for this chunk **/
   9: optional binary encrypted_column_metadata
 }
@@ -950,6 +1004,13 @@
    * that page_locations[i].first_row_index < page_locations[i+1].first_row_index.
    */
   1: required list<PageLocation> page_locations
+  /**
+   * Unencoded/uncompressed size for BYTE_ARRAY types.
+   *
+   * See documention for unencoded_byte_array_data_bytes in SizeStatistics for
+   * more details on this field.
+   */
+  2: optional list<i64> unencoded_byte_array_data_bytes
 }
 
 /**
@@ -989,6 +1050,25 @@
 
   /** A list containing the number of null values for each page **/
   5: optional list<i64> null_counts
+
+  /**
+   * Contains repetition level histograms for each page
+   * concatenated together.  The repetition_level_histogram field on
+   * SizeStatistics contains more details.
+   *
+   * When present the length should always be (number of pages *
+   * (max_repetition_level + 1)) elements.
+   *
+   * Element 0 is the first element of the histogram for the first page.
+   * Element (max_repetition_level + 1) is the first element of the histogram
+   * for the second page.
+   **/
+   6: optional list<i64> repetition_level_histograms;
+   /**
+    * Same as repetition_level_histograms except for definitions levels.
+    **/
+   7: optional list<i64> definition_level_histograms;
+
 }
 
 struct AesGcmV1 {
@@ -997,7 +1077,7 @@
 
   /** Unique file identifier part of AAD suffix **/
   2: optional binary aad_file_unique
-  
+
   /** In files encrypted with AAD prefix without storing it,
    * readers must supply the prefix **/
   3: optional bool supply_aad_prefix
@@ -1009,7 +1089,7 @@
 
   /** Unique file identifier part of AAD suffix **/
   2: optional binary aad_file_unique
-  
+
   /** In files encrypted with AAD prefix without storing it,
    * readers must supply the prefix **/
   3: optional bool supply_aad_prefix
commit	40699d05bd24181de6b1457babbee2c16dce3803	[log] [tgz]
author	emkornfield <emkornfield@gmail.com>	Tue Nov 14 12:48:23 2023 -0800
committer	GitHub <noreply@github.com>	Tue Nov 14 12:48:23 2023 -0800
tree	52387a14ac48fbc0226c9abbdc5fa9a33ea460f4
parent	6f3f909ef410852713cc3865aa72e65eb21f9323 [diff]