blob: b1e30dd4590cdd14def0c2c48ceee8334d49855b [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//! [`OffsetIndexMetaData`] structure holding decoded [`OffsetIndex`] information
//!
//! [`OffsetIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
use std::io::Write;
use crate::parquet_thrift::{
ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol, ThriftCompactOutputProtocol,
WriteThrift, WriteThriftField, read_thrift_vec,
};
use crate::{
errors::{ParquetError, Result},
thrift_struct,
};
thrift_struct!(
/// Page location information for [`OffsetIndexMetaData`]
pub struct PageLocation {
/// Offset of the page in the file
1: required i64 offset
/// Size of the page, including header. Sum of compressed_page_size and header
2: required i32 compressed_page_size
/// Index within the RowGroup of the first row of the page. When an
/// OffsetIndex is present, pages must begin on row boundaries
/// (repetition_level = 0).
3: required i64 first_row_index
}
);
thrift_struct!(
/// [`OffsetIndex`] information for a column chunk. Contains offsets and sizes for each page
/// in the chunk. Optionally stores fully decoded page sizes for BYTE_ARRAY columns.
///
/// See [`ParquetOffsetIndex`] for more information.
///
/// [`ParquetOffsetIndex`]: crate::file::metadata::ParquetOffsetIndex
/// [`OffsetIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub struct OffsetIndexMetaData {
/// Vector of [`PageLocation`] objects, one per page in the chunk.
1: required list<PageLocation> page_locations
/// Optional vector of unencoded page sizes, one per page in the chunk.
/// Only defined for BYTE_ARRAY columns.
2: optional list<i64> unencoded_byte_array_data_bytes
}
);
impl OffsetIndexMetaData {
/// Vector of [`PageLocation`] objects, one per page in the chunk.
pub fn page_locations(&self) -> &Vec<PageLocation> {
&self.page_locations
}
/// Optional vector of unencoded page sizes, one per page in the chunk. Only defined
/// for BYTE_ARRAY columns.
pub fn unencoded_byte_array_data_bytes(&self) -> Option<&Vec<i64>> {
self.unencoded_byte_array_data_bytes.as_ref()
}
// Fast-path read of offset index. This works because we expect all field deltas to be 1,
// and there's no nesting beyond PageLocation, so no need to save the last field id. Like
// read_page_locations(), this will fail if absolute field id's are used.
pub(super) fn try_from_fast<'a, R: ThriftCompactInputProtocol<'a>>(
prot: &mut R,
) -> Result<Self> {
// Offset index is a struct with 2 fields. First field is an array of PageLocations,
// the second an optional array of i64.
// read field 1 header, then list header, then vec of PageLocations
let (field_type, delta) = prot.read_field_header()?;
if delta != 1 || field_type != FieldType::List as u8 {
return Err(general_err!("error reading OffsetIndex::page_locations"));
}
// we have to do this manually because we want to use the fast PageLocation decoder
let list_ident = prot.read_list_begin()?;
let mut page_locations = Vec::with_capacity(list_ident.size as usize);
for _ in 0..list_ident.size {
page_locations.push(read_page_location(prot)?);
}
let mut unencoded_byte_array_data_bytes: Option<Vec<i64>> = None;
// read second field...if it's Stop we're done
let (mut field_type, delta) = prot.read_field_header()?;
if field_type == FieldType::List as u8 {
if delta != 1 {
return Err(general_err!(
"encountered unknown field while reading OffsetIndex"
));
}
let vec = read_thrift_vec::<i64, R>(&mut *prot)?;
unencoded_byte_array_data_bytes = Some(vec);
// this one should be Stop
(field_type, _) = prot.read_field_header()?;
}
if field_type != FieldType::Stop as u8 {
return Err(general_err!(
"encountered unknown field while reading OffsetIndex"
));
}
Ok(Self {
page_locations,
unencoded_byte_array_data_bytes,
})
}
}
// hand coding this one because it is very time critical
// Note: this will fail if the fields are either out of order, or if a suboptimal
// encoder doesn't use field deltas.
fn read_page_location<'a, R: ThriftCompactInputProtocol<'a>>(prot: &mut R) -> Result<PageLocation> {
// there are 3 fields, all mandatory, so all field deltas should be 1
let (field_type, delta) = prot.read_field_header()?;
if delta != 1 || field_type != FieldType::I64 as u8 {
return Err(general_err!("error reading PageLocation::offset"));
}
let offset = prot.read_i64()?;
let (field_type, delta) = prot.read_field_header()?;
if delta != 1 || field_type != FieldType::I32 as u8 {
return Err(general_err!(
"error reading PageLocation::compressed_page_size"
));
}
let compressed_page_size = prot.read_i32()?;
let (field_type, delta) = prot.read_field_header()?;
if delta != 1 || field_type != FieldType::I64 as u8 {
return Err(general_err!("error reading PageLocation::first_row_index"));
}
let first_row_index = prot.read_i64()?;
// read end of struct...return error if there are unknown fields present
let (field_type, _) = prot.read_field_header()?;
if field_type != FieldType::Stop as u8 {
return Err(general_err!("unexpected field in PageLocation"));
}
Ok(PageLocation {
offset,
compressed_page_size,
first_row_index,
})
}
#[cfg(test)]
mod tests {
use super::*;
use crate::parquet_thrift::tests::test_roundtrip;
#[test]
fn test_offset_idx_roundtrip() {
let page_locations = [
PageLocation {
offset: 0,
compressed_page_size: 10,
first_row_index: 0,
},
PageLocation {
offset: 10,
compressed_page_size: 20,
first_row_index: 100,
},
]
.to_vec();
let unenc = [0i64, 100i64].to_vec();
test_roundtrip(OffsetIndexMetaData {
page_locations: page_locations.clone(),
unencoded_byte_array_data_bytes: Some(unenc),
});
test_roundtrip(OffsetIndexMetaData {
page_locations,
unencoded_byte_array_data_bytes: None,
});
}
}