PARQUET-2261: add statistics for better estimating unencoded/uncompressed sizes and finer grained filtering (#197)
Adds histograms of repetition/definition level and unencoded byte_array sizes to column chunks and to page indices.
---------
Co-authored-by: Gang Wu <ustcwg@gmail.com>
Co-authored-by: Gabor Szadovszky <gabor@apache.org>
Co-authored-by: Antoine Pitrou <pitrou@free.fr>
Co-authored-by: mwish <1506118561@qq.com>
Co-authored-by: JFinis <jan_finis@gmx.de>
diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift
index e52675e..9decb5a 100644
--- a/src/main/thrift/parquet.thrift
+++ b/src/main/thrift/parquet.thrift
@@ -192,6 +192,52 @@
}
/**
+ * A structure for capturing metadata for estimating the unencoded,
+ * uncompressed size of data written. This is useful for readers to estimate
+ * how much memory is needed to reconstruct data in their memory model and for
+ * fine grained filter pushdown on nested structures (the histograms contained
+ * in this structure can help determine the number of nulls at a particular
+ * nesting level and maximum length of lists).
+ */
+struct SizeStatistics {
+ /**
+ * The number of physical bytes stored for BYTE_ARRAY data values assuming
+ * no encoding. This is exclusive of the bytes needed to store the length of
+ * each byte array. In other words, this field is equivalent to the `(size
+ * of PLAIN-ENCODING the byte array values) - (4 bytes * number of values
+ * written)`. To determine unencoded sizes of other types readers can use
+ * schema information multiplied by the number of non-null and null values.
+ * The number of null/non-null values can be inferred from the histograms
+ * below.
+ *
+ * For example, if a column chunk is dictionary-encoded with dictionary
+ * ["a", "bc", "cde"], and a data page contains the indices [0, 0, 1, 2],
+ * then this value for that data page should be 7 (1 + 1 + 2 + 3).
+ *
+ * This field should only be set for types that use BYTE_ARRAY as their
+ * physical type.
+ */
+ 1: optional i64 unencoded_byte_array_data_bytes;
+ /**
+ * When present, there is expected to be one element corresponding to each
+ * repetition (i.e. size=max repetition_level+1) where each element
+ * represents the number of times the repetition level was observed in the
+ * data.
+ *
+ * This field may be omitted if max_repetition_level is 0 without loss
+ * of information.
+ **/
+ 2: optional list<i64> repetition_level_histogram;
+ /**
+ * Same as repetition_level_histogram except for definition levels.
+ *
+ * This field may be omitted if max_definition_level is 0 or 1 without
+ * loss of information.
+ **/
+ 3: optional list<i64> definition_level_histogram;
+}
+
+/**
* Statistics per row group and per page
* All fields are optional.
*/
@@ -541,7 +587,7 @@
/** Encoding used for repetition levels **/
4: required Encoding repetition_level_encoding;
- /** Optional statistics for the data in this page**/
+ /** Optional statistics for the data in this page **/
5: optional Statistics statistics;
}
@@ -583,19 +629,19 @@
// repetition levels and definition levels are always using RLE (without size in it)
- /** length of the definition levels */
+ /** Length of the definition levels */
5: required i32 definition_levels_byte_length;
- /** length of the repetition levels */
+ /** Length of the repetition levels */
6: required i32 repetition_levels_byte_length;
- /** whether the values are compressed.
+ /** Whether the values are compressed.
Which means the section of the page between
definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included)
is compressed with the compression_codec.
If missing it is considered compressed */
7: optional bool is_compressed = true;
- /** optional statistics for the data in this page **/
+ /** Optional statistics for the data in this page **/
8: optional Statistics statistics;
}
@@ -608,11 +654,11 @@
}
/** Hash strategy type annotation. xxHash is an extremely fast non-cryptographic hash
- * algorithm. It uses 64 bits version of xxHash.
+ * algorithm. It uses 64 bits version of xxHash.
**/
struct XxHash {}
-/**
+/**
* The hash function used in Bloom filter. This function takes the hash of a column value
* using plain encoding.
**/
@@ -776,6 +822,14 @@
* in a single I/O.
*/
15: optional i32 bloom_filter_length;
+
+ /**
+ * Optional statistics to help estimate total memory when converted to in-memory
+ * representations. The histograms contained in these statistics can
+ * also be useful in some cases for more fine-grained nullability/list length
+ * filter pushdown.
+ */
+ 16: optional SizeStatistics size_statistics;
}
struct EncryptionWithFooterKey {
@@ -784,7 +838,7 @@
struct EncryptionWithColumnKey {
/** Column path in schema **/
1: required list<string> path_in_schema
-
+
/** Retrieval metadata of column encryption key **/
2: optional binary key_metadata
}
@@ -823,7 +877,7 @@
/** Crypto metadata of encrypted columns **/
8: optional ColumnCryptoMetaData crypto_metadata
-
+
/** Encrypted column metadata for this chunk **/
9: optional binary encrypted_column_metadata
}
@@ -950,6 +1004,13 @@
* that page_locations[i].first_row_index < page_locations[i+1].first_row_index.
*/
1: required list<PageLocation> page_locations
+ /**
+ * Unencoded/uncompressed size for BYTE_ARRAY types.
+ *
+ * See documention for unencoded_byte_array_data_bytes in SizeStatistics for
+ * more details on this field.
+ */
+ 2: optional list<i64> unencoded_byte_array_data_bytes
}
/**
@@ -989,6 +1050,25 @@
/** A list containing the number of null values for each page **/
5: optional list<i64> null_counts
+
+ /**
+ * Contains repetition level histograms for each page
+ * concatenated together. The repetition_level_histogram field on
+ * SizeStatistics contains more details.
+ *
+ * When present the length should always be (number of pages *
+ * (max_repetition_level + 1)) elements.
+ *
+ * Element 0 is the first element of the histogram for the first page.
+ * Element (max_repetition_level + 1) is the first element of the histogram
+ * for the second page.
+ **/
+ 6: optional list<i64> repetition_level_histograms;
+ /**
+ * Same as repetition_level_histograms except for definitions levels.
+ **/
+ 7: optional list<i64> definition_level_histograms;
+
}
struct AesGcmV1 {
@@ -997,7 +1077,7 @@
/** Unique file identifier part of AAD suffix **/
2: optional binary aad_file_unique
-
+
/** In files encrypted with AAD prefix without storing it,
* readers must supply the prefix **/
3: optional bool supply_aad_prefix
@@ -1009,7 +1089,7 @@
/** Unique file identifier part of AAD suffix **/
2: optional binary aad_file_unique
-
+
/** In files encrypted with AAD prefix without storing it,
* readers must supply the prefix **/
3: optional bool supply_aad_prefix