PARQUET-2480: Clarify what "page index" means in Parquet.thrift (#245)
diff --git a/PageIndex.md b/PageIndex.md
index f4a8f64..a371c42 100644
--- a/PageIndex.md
+++ b/PageIndex.md
@@ -17,11 +17,13 @@
- under the License.
-->
-# ColumnIndex Layout to Support Page Skipping
+# Parquet page index: Layout to Support Page Skipping
-This document describes the format for column index pages in the Parquet
-footer. These pages contain statistics for DataPages and can be used to skip
-pages when scanning data in ordered and unordered columns.
+In Parquet, a *page index* is optional metadata for a
+ColumnChunk, containing statistics for DataPages that can be used
+to skip those pages when scanning in ordered and unordered columns.
+The page index is stored using the OffsetIndex and ColumnIndex structures,
+defined in [`parquet.thrift`](src/main/thrift/parquet.thrift).
## Problem Statement
In previous versions of the format, Statistics are stored for ColumnChunks in
diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift
index 27d4043..c928ad6 100644
--- a/src/main/thrift/parquet.thrift
+++ b/src/main/thrift/parquet.thrift
@@ -738,10 +738,10 @@
}
/**
- * Wrapper struct to specify sort order
+ * Sort order within a RowGroup of a leaf column
*/
struct SortingColumn {
- /** The column index (in this row group) **/
+ /** The ordinal position of the column (in this row group) **/
1: required i32 column_idx
/** If true, indicates this column is sorted in descending order. **/
@@ -1001,6 +1001,13 @@
3: required i64 first_row_index
}
+/**
+ * Optional offsets for each data page in a ColumnChunk.
+ *
+ * Forms part of the page index, along with ColumnIndex.
+ *
+ * OffsetIndex may be present even if ColumnIndex is not.
+ */
struct OffsetIndex {
/**
* PageLocations, ordered by increasing PageLocation.offset. It is required
@@ -1017,8 +1024,14 @@
}
/**
- * Description for ColumnIndex.
- * Each <array-field>[i] refers to the page at OffsetIndex.page_locations[i]
+ * Optional statistics for each data page in a ColumnChunk.
+ *
+ * Forms part of the page index, along with OffsetIndex.
+ *
+ * If this structure is present, OffsetIndex must also be present.
+ *
+ * For each field in this structure, <field>[i] refers to the page at
+ * OffsetIndex.page_locations[i]
*/
struct ColumnIndex {
/**
@@ -1071,7 +1084,6 @@
* Same as repetition_level_histograms except for definitions levels.
**/
7: optional list<i64> definition_level_histograms;
-
}
struct AesGcmV1 {