PARQUET-2414: Extend BYTE_STREAM_SPLIT to support INT32, INT64 and FIXED_LEN_BYTE_ARRAY data (#229)

commit: e517ac4dbe08d518eb5c2e58576d4c711973db94 [log] [tgz]
author: Antoine Pitrou <antoine@python.org> Mon Mar 18 11:41:22 2024 +0100
committer: GitHub <noreply@github.com> Mon Mar 18 11:41:22 2024 +0100
tree: 5d6e947ace4916038f6ecd4aa6f24b16af7a93a5
parent: 38f79c91dfbdd2aaee3f83cb023da3fa240ed9fc [diff]
diff --git a/CHANGES.md b/CHANGES.md
index 4002000..7bbce7c 100644
--- a/CHANGES.md
+++ b/CHANGES.md

@@ -19,6 +19,12 @@
 
 # Parquet #
 
+### Version 2.11.0 ###
+
+#### New Feature
+
+*   [PARQUET-2414](https://issues.apache.org/jira/browse/PARQUET-2414) - Extend BYTE_STREAM_SPLIT to support INT32, INT64 and FIXED_LEN_BYTE_ARRAY data
+
 ### Version 2.10.0 ###
 
 #### New Feature

diff --git a/Encodings.md b/Encodings.md
index 5040094..ea7e4e3 100644
--- a/Encodings.md
+++ b/Encodings.md

@@ -337,14 +337,15 @@
 
 ### Byte Stream Split: (BYTE_STREAM_SPLIT = 9)
 
-Supported Types: FLOAT, DOUBLE
+Supported Types: FLOAT, DOUBLE, INT32, INT64, FIXED_LEN_BYTE_ARRAY
 
 This encoding does not reduce the size of the data but can lead to a significantly better
 compression ratio and speed when a compression algorithm is used afterwards.
 
 This encoding creates K byte-streams of length N where K is the size in bytes of the data
-type and N is the number of elements in the data sequence. Specifically, K is 4 for FLOAT
+type and N is the number of elements in the data sequence. For example, K is 4 for FLOAT
 type and 8 for DOUBLE type.
+
 The bytes of each value are scattered to the corresponding streams. The 0-th byte goes to the
 0-th stream, the 1-st byte goes to the 1-st stream and so on.
 The streams are concatenated in the following order: 0-th stream, 1-st stream, etc.

diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift
index 2084ac6..27d4043 100644
--- a/src/main/thrift/parquet.thrift
+++ b/src/main/thrift/parquet.thrift

@@ -526,12 +526,15 @@
    */
   RLE_DICTIONARY = 8;
 
-  /** Encoding for floating-point data.
+  /** Encoding for fixed-width data (FLOAT, DOUBLE, INT32, INT64, FIXED_LEN_BYTE_ARRAY).
       K byte-streams are created where K is the size in bytes of the data type.
-      The individual bytes of an FP value are scattered to the corresponding stream and
+      The individual bytes of a value are scattered to the corresponding stream and
       the streams are concatenated.
       This itself does not reduce the size of the data but can lead to better compression
       afterwards.
+
+      Added in 2.8 for FLOAT and DOUBLE.
+      Support for INT32, INT64 and FIXED_LEN_BYTE_ARRAY added in 2.11.
    */
   BYTE_STREAM_SPLIT = 9;
 }
commit	e517ac4dbe08d518eb5c2e58576d4c711973db94	[log] [tgz]
author	Antoine Pitrou <antoine@python.org>	Mon Mar 18 11:41:22 2024 +0100
committer	GitHub <noreply@github.com>	Mon Mar 18 11:41:22 2024 +0100
tree	5d6e947ace4916038f6ecd4aa6f24b16af7a93a5
parent	38f79c91dfbdd2aaee3f83cb023da3fa240ed9fc [diff]