// parquet/src/file/metadata.rs - arrow-rs - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 //! Contains information about available Parquet metadata.
 //!
 //! The hierarchy of metadata is as follows:
 //!
 //! [`ParquetMetaData`](struct.ParquetMetaData.html) contains
 //! [`FileMetaData`](struct.FileMetaData.html) and zero or more
 //! [`RowGroupMetaData`](struct.RowGroupMetaData.html) for each row group.
 //!
 //! [`FileMetaData`](struct.FileMetaData.html) includes the file version and
 //! application-specific metadata.
 //!
 //! Each [`RowGroupMetaData`](struct.RowGroupMetaData.html) contains information about row
 //! group and one or more [`ColumnChunkMetaData`](struct.ColumnChunkMetaData.html) for
 //! each column chunk.
 //!
 //! [`ColumnChunkMetaData`](struct.ColumnChunkMetaData.html) has information about column
 //! chunk (primitive leaf column), including encoding/compression, number of values, etc.

 use std::sync::Arc;

 use parquet_format::{ColumnChunk, ColumnMetaData, RowGroup};

 use crate::basic::{ColumnOrder, Compression, Encoding, Type};
 use crate::errors::{ParquetError, Result};
 use crate::file::statistics::{self, Statistics};
 use crate::schema::types::{
     ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor,
     Type as SchemaType,
 };

 /// Global Parquet metadata.
 ///
 /// Bundles the file-level metadata with the metadata of every row group that
 /// was handed to [`ParquetMetaData::new`].
 #[derive(Debug, Clone)]
 pub struct ParquetMetaData {
     // File-level metadata, exposed through `file_metadata()`.
     file_metadata: FileMetaData,
     // Per-row-group metadata, exposed through `row_group()` and `row_groups()`.
     row_groups: Vec<RowGroupMetaData>,
 }

 impl ParquetMetaData {
     /// Creates Parquet metadata from file metadata and a list of row group metadata `Arc`s
     /// for each available row group.
     pub fn new(file_metadata: FileMetaData, row_groups: Vec<RowGroupMetaData>) -> Self {
         ParquetMetaData {
             file_metadata,
             row_groups,
         }
     }

     /// Returns file metadata as reference.
     pub fn file_metadata(&self) -> &FileMetaData {
         &self.file_metadata
     }

     /// Returns number of row groups in this file.
     pub fn num_row_groups(&self) -> usize {
         self.row_groups.len()
     }

     /// Returns row group metadata for `i`th position.
     /// Position should be less than number of row groups `num_row_groups`.
     pub fn row_group(&self, i: usize) -> &RowGroupMetaData {
         &self.row_groups[i]
     }

     /// Returns slice of row groups in this file.
     pub fn row_groups(&self) -> &[RowGroupMetaData] {
         &self.row_groups
     }
 }

 /// Alias for the `KeyValue` type of the `parquet_format` crate; this is the
 /// element type of the file's key/value metadata.
 pub type KeyValue = parquet_format::KeyValue;

 /// Reference counted (`Arc`) pointer for [`FileMetaData`].
 pub type FileMetaDataPtr = Arc<FileMetaData>;

 /// Metadata for a Parquet file.
 #[derive(Debug, Clone)]
 pub struct FileMetaData {
     // Version of the file, as returned by `version()`.
     version: i32,
     // Number of rows in the file.
     num_rows: i64,
     // Optional string identifying the application that wrote the file.
     created_by: Option<String>,
     // Optional key/value metadata of the file.
     key_value_metadata: Option<Vec<KeyValue>>,
     // Shared schema descriptor for the file.
     schema_descr: SchemaDescPtr,
     // Optional per-column sort orders; when absent, `column_order()` reports
     // `ColumnOrder::UNDEFINED` for every column.
     column_orders: Option<Vec<ColumnOrder>>,
 }

 impl FileMetaData {
     /// Creates new file metadata.
     pub fn new(
         version: i32,
         num_rows: i64,
         created_by: Option<String>,
         key_value_metadata: Option<Vec<KeyValue>>,
         schema_descr: SchemaDescPtr,
         column_orders: Option<Vec<ColumnOrder>>,
     ) -> Self {
         FileMetaData {
             version,
             num_rows,
             created_by,
             key_value_metadata,
             schema_descr,
             column_orders,
         }
     }

     /// Returns version of this file.
     pub fn version(&self) -> i32 {
         self.version
     }

     /// Returns number of rows in the file.
     pub fn num_rows(&self) -> i64 {
         self.num_rows
     }

     /// String message for application that wrote this file.
     ///
     /// This should have the following format:
     /// `<application> version <application version> (build <application build hash>)`.
     ///
     /// ```shell
     /// parquet-mr version 1.8.0 (build 0fda28af84b9746396014ad6a415b90592a98b3b)
     /// ```
     pub fn created_by(&self) -> &Option<String> {
         &self.created_by
     }

     /// Returns key_value_metadata of this file.
     pub fn key_value_metadata(&self) -> &Option<Vec<KeyValue>> {
         &self.key_value_metadata
     }

     /// Returns Parquet ['Type`] that describes schema in this file.
     pub fn schema(&self) -> &SchemaType {
         self.schema_descr.root_schema()
     }

     /// Returns a reference to schema descriptor.
     pub fn schema_descr(&self) -> &SchemaDescriptor {
         &self.schema_descr
     }

     /// Returns reference counted clone for schema descriptor.
     pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
         self.schema_descr.clone()
     }

     /// Column (sort) order used for `min` and `max` values of each column in this file.
     ///
     /// Each column order corresponds to one column, determined by its position in the
     /// list, matching the position of the column in the schema.
     ///
     /// When `None` is returned, there are no column orders available, and each column
     /// should be assumed to have undefined (legacy) column order.
     pub fn column_orders(&self) -> Option<&Vec<ColumnOrder>> {
         self.column_orders.as_ref()
     }

     /// Returns column order for `i`th column in this file.
     /// If column orders are not available, returns undefined (legacy) column order.
     pub fn column_order(&self, i: usize) -> ColumnOrder {
         self.column_orders
             .as_ref()
             .map(|data| data[i])
             .unwrap_or(ColumnOrder::UNDEFINED)
     }
 }

 /// Reference counted (`Arc`) pointer for [`RowGroupMetaData`].
 pub type RowGroupMetaDataPtr = Arc<RowGroupMetaData>;

 /// Metadata for a row group.
 #[derive(Debug, Clone)]
 pub struct RowGroupMetaData {
     // Metadata of each column chunk in this row group.
     columns: Vec<ColumnChunkMetaData>,
     // Number of rows in this row group.
     num_rows: i64,
     // Total byte size of all uncompressed column data in this row group.
     total_byte_size: i64,
     // Shared schema descriptor the column chunks are matched against.
     schema_descr: SchemaDescPtr,
 }

 impl RowGroupMetaData {
     /// Returns builer for row group metadata.
     pub fn builder(schema_descr: SchemaDescPtr) -> RowGroupMetaDataBuilder {
         RowGroupMetaDataBuilder::new(schema_descr)
     }

     /// Number of columns in this row group.
     pub fn num_columns(&self) -> usize {
         self.columns.len()
     }

     /// Returns column chunk metadata for `i`th column.
     pub fn column(&self, i: usize) -> &ColumnChunkMetaData {
         &self.columns[i]
     }

     /// Returns slice of column chunk metadata.
     pub fn columns(&self) -> &[ColumnChunkMetaData] {
         &self.columns
     }

     /// Number of rows in this row group.
     pub fn num_rows(&self) -> i64 {
         self.num_rows
     }

     /// Total byte size of all uncompressed column data in this row group.
     pub fn total_byte_size(&self) -> i64 {
         self.total_byte_size
     }

     /// Total size of all compressed column data in this row group.
     pub fn compressed_size(&self) -> i64 {
         self.columns.iter().map(|c| c.total_compressed_size).sum()
     }

     /// Returns reference to a schema descriptor.
     pub fn schema_descr(&self) -> &SchemaDescriptor {
         self.schema_descr.as_ref()
     }

     /// Returns reference counted clone of schema descriptor.
     pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
         self.schema_descr.clone()
     }

     /// Method to convert from Thrift.
     pub fn from_thrift(
         schema_descr: SchemaDescPtr,
         mut rg: RowGroup,
     ) -> Result<RowGroupMetaData> {
         assert_eq!(schema_descr.num_columns(), rg.columns.len());
         let total_byte_size = rg.total_byte_size;
         let num_rows = rg.num_rows;
         let mut columns = vec![];
         for (c, d) in rg.columns.drain(0..).zip(schema_descr.columns()) {
             let cc = ColumnChunkMetaData::from_thrift(d.clone(), c)?;
             columns.push(cc);
         }
         Ok(RowGroupMetaData {
             columns,
             num_rows,
             total_byte_size,
             schema_descr,
         })
     }

     /// Method to convert to Thrift.
     pub fn to_thrift(&self) -> RowGroup {
         RowGroup {
             columns: self.columns().iter().map(|v| v.to_thrift()).collect(),
             total_byte_size: self.total_byte_size,
             num_rows: self.num_rows,
             sorting_columns: None,
         }
     }
 }

 /// Builder for row group metadata.
 ///
 /// Obtained through `RowGroupMetaData::builder`; `build()` fails unless the
 /// number of column chunks matches the number of columns in the schema.
 pub struct RowGroupMetaDataBuilder {
     // Column chunk metadata; starts out empty.
     columns: Vec<ColumnChunkMetaData>,
     // Schema descriptor the column count is validated against in `build()`.
     schema_descr: SchemaDescPtr,
     // Number of rows; defaults to 0.
     num_rows: i64,
     // Total size in bytes; defaults to 0.
     total_byte_size: i64,
 }

 impl RowGroupMetaDataBuilder {
     /// Starts a builder for the given schema descriptor, with no column
     /// metadata and zeroed row count and byte size.
     fn new(schema_descr: SchemaDescPtr) -> Self {
         let capacity = schema_descr.num_columns();
         Self {
             columns: Vec::with_capacity(capacity),
             schema_descr,
             num_rows: 0,
             total_byte_size: 0,
         }
     }

     /// Sets the number of rows in this row group.
     pub fn set_num_rows(self, value: i64) -> Self {
         Self {
             num_rows: value,
             ..self
         }
     }

     /// Sets the total size in bytes for this row group.
     pub fn set_total_byte_size(self, value: i64) -> Self {
         Self {
             total_byte_size: value,
             ..self
         }
     }

     /// Sets the column chunk metadata for this row group, replacing any
     /// previously set value.
     pub fn set_column_metadata(self, value: Vec<ColumnChunkMetaData>) -> Self {
         Self {
             columns: value,
             ..self
         }
     }

     /// Builds the row group metadata.
     ///
     /// # Errors
     ///
     /// Returns an error when the number of column chunks differs from the
     /// number of columns in the schema descriptor.
     pub fn build(self) -> Result<RowGroupMetaData> {
         let expected = self.schema_descr.num_columns();
         let actual = self.columns.len();
         if expected == actual {
             let Self {
                 columns,
                 schema_descr,
                 num_rows,
                 total_byte_size,
             } = self;
             Ok(RowGroupMetaData {
                 columns,
                 num_rows,
                 total_byte_size,
                 schema_descr,
             })
         } else {
             Err(general_err!(
                 "Column length mismatch: {} != {}",
                 expected,
                 actual
             ))
         }
     }
 }

 /// Metadata for a column chunk.
 #[derive(Debug, Clone)]
 pub struct ColumnChunkMetaData {
     // Type of this column.
     column_type: Type,
     // Path (or identifier) of this column.
     column_path: ColumnPath,
     // Shared descriptor for this column.
     column_descr: ColumnDescPtr,
     // All encodings used for this column.
     encodings: Vec<Encoding>,
     // File where the column chunk is stored; `None` means the same file as the
     // metadata.
     file_path: Option<String>,
     // Byte offset in `file_path`.
     file_offset: i64,
     // Total number of values in this column chunk.
     num_values: i64,
     // Compression used for this column.
     compression: Compression,
     // Total compressed data size of this column chunk.
     total_compressed_size: i64,
     // Total uncompressed data size of this column chunk.
     total_uncompressed_size: i64,
     // Offset of the column data.
     data_page_offset: i64,
     // Offset of the index page, if the chunk has one.
     index_page_offset: Option<i64>,
     // Offset of the dictionary page, if the chunk has one.
     dictionary_page_offset: Option<i64>,
     // Statistics for this column chunk, if available.
     statistics: Option<Statistics>,
 }

 /// Common operations on the metadata of a column chunk.
 impl ColumnChunkMetaData {
     /// Starts a builder for column chunk metadata of the given column.
     pub fn builder(column_descr: ColumnDescPtr) -> ColumnChunkMetaDataBuilder {
         ColumnChunkMetaDataBuilder::new(column_descr)
     }

     /// File where the column chunk is stored.
     ///
     /// When `None`, the chunk is assumed to live in the same file as the
     /// metadata. The path is relative to the current file.
     pub fn file_path(&self) -> Option<&String> {
         match &self.file_path {
             Some(path) => Some(path),
             None => None,
         }
     }

     /// Byte offset in `file_path()`.
     pub fn file_offset(&self) -> i64 {
         let offset = self.file_offset;
         offset
     }

     /// Type of this column. Must be primitive.
     pub fn column_type(&self) -> Type {
         let physical_type = self.column_type;
         physical_type
     }

     /// Path (or identifier) of this column.
     pub fn column_path(&self) -> &ColumnPath {
         let path = &self.column_path;
         path
     }

     /// Borrows the descriptor for this column.
     pub fn column_descr(&self) -> &ColumnDescriptor {
         let descr = &self.column_descr;
         descr.as_ref()
     }

     /// Returns a new reference counted handle to this column's descriptor.
     pub fn column_descr_ptr(&self) -> ColumnDescPtr {
         let descr = &self.column_descr;
         descr.clone()
     }

     /// All encodings used for this column.
     pub fn encodings(&self) -> &Vec<Encoding> {
         let encodings = &self.encodings;
         encodings
     }

     /// Total number of values in this column chunk.
     pub fn num_values(&self) -> i64 {
         let count = self.num_values;
         count
     }

     /// Compression used for this column.
     pub fn compression(&self) -> Compression {
         let codec = self.compression;
         codec
     }

     /// Total compressed data size of this column chunk.
     pub fn compressed_size(&self) -> i64 {
         let size = self.total_compressed_size;
         size
     }

     /// Total uncompressed data size of this column chunk.
     pub fn uncompressed_size(&self) -> i64 {
         let size = self.total_uncompressed_size;
         size
     }

     /// Offset of the column data.
     pub fn data_page_offset(&self) -> i64 {
         let offset = self.data_page_offset;
         offset
     }

     /// Whether this column chunk has an index page.
     pub fn has_index_page(&self) -> bool {
         self.index_page_offset().is_some()
     }

     /// Offset of the index page, if any.
     pub fn index_page_offset(&self) -> Option<i64> {
         let offset = self.index_page_offset;
         offset
     }

     /// Whether this column chunk has a dictionary page.
     pub fn has_dictionary_page(&self) -> bool {
         self.dictionary_page_offset().is_some()
     }

     /// Offset of the dictionary page, if any.
     pub fn dictionary_page_offset(&self) -> Option<i64> {
         let offset = self.dictionary_page_offset;
         offset
     }

     /// Offset and length in bytes of the column chunk within the file.
     ///
     /// The chunk starts at the dictionary page when there is one, and at the
     /// data page otherwise; its length is the compressed size.
     ///
     /// # Panics
     ///
     /// Panics if the start offset or the length is negative.
     pub fn byte_range(&self) -> (u64, u64) {
         let start = self
             .dictionary_page_offset
             .unwrap_or(self.data_page_offset);
         let length = self.total_compressed_size;
         assert!(
             start >= 0 && length >= 0,
             "column start and length should not be negative"
         );
         (start as u64, length as u64)
     }

     /// Statistics recorded for this column chunk, or `None` when no statistics
     /// are available.
     pub fn statistics(&self) -> Option<&Statistics> {
         match &self.statistics {
             Some(stats) => Some(stats),
             None => None,
         }
     }

     /// Method to convert from Thrift.
     ///
     /// # Errors
     ///
     /// Returns an error if `cc` carries no column metadata.
     pub fn from_thrift(column_descr: ColumnDescPtr, cc: ColumnChunk) -> Result<Self> {
         let col_metadata: ColumnMetaData = match cc.meta_data {
             Some(meta) => meta,
             None => return Err(general_err!("Expected to have column metadata")),
         };
         // The physical type is needed both as a field and to decode statistics.
         let column_type = Type::from(col_metadata.type_);
         Ok(ColumnChunkMetaData {
             column_type,
             column_path: ColumnPath::new(col_metadata.path_in_schema),
             column_descr,
             encodings: col_metadata
                 .encodings
                 .into_iter()
                 .map(Encoding::from)
                 .collect(),
             file_path: cc.file_path,
             file_offset: cc.file_offset,
             num_values: col_metadata.num_values,
             compression: Compression::from(col_metadata.codec),
             total_compressed_size: col_metadata.total_compressed_size,
             total_uncompressed_size: col_metadata.total_uncompressed_size,
             data_page_offset: col_metadata.data_page_offset,
             index_page_offset: col_metadata.index_page_offset,
             dictionary_page_offset: col_metadata.dictionary_page_offset,
             statistics: statistics::from_thrift(column_type, col_metadata.statistics),
         })
     }

     /// Method to convert to Thrift.
     ///
     /// Key/value metadata, encoding stats and the offset/column index fields
     /// are always written as `None`.
     pub fn to_thrift(&self) -> ColumnChunk {
         let meta_data = ColumnMetaData {
             type_: self.column_type.into(),
             encodings: self.encodings.iter().map(|&enc| enc.into()).collect(),
             path_in_schema: Vec::from(self.column_path.as_ref()),
             codec: self.compression.into(),
             num_values: self.num_values,
             total_uncompressed_size: self.total_uncompressed_size,
             total_compressed_size: self.total_compressed_size,
             key_value_metadata: None,
             data_page_offset: self.data_page_offset,
             index_page_offset: self.index_page_offset,
             dictionary_page_offset: self.dictionary_page_offset,
             statistics: statistics::to_thrift(self.statistics.as_ref()),
             encoding_stats: None,
         };

         ColumnChunk {
             file_path: self.file_path.clone(),
             file_offset: self.file_offset,
             meta_data: Some(meta_data),
             offset_index_offset: None,
             offset_index_length: None,
             column_index_offset: None,
             column_index_length: None,
         }
     }
 }

 /// Builder for column chunk metadata.
 ///
 /// Obtained through `ColumnChunkMetaData::builder`. The column type and path
 /// of the built metadata are taken from the column descriptor in `build()`.
 pub struct ColumnChunkMetaDataBuilder {
     // Descriptor of the column this chunk belongs to.
     column_descr: ColumnDescPtr,
     // Encodings used for the chunk; starts out empty.
     encodings: Vec<Encoding>,
     // Optional file path; defaults to `None`.
     file_path: Option<String>,
     // File offset in bytes; defaults to 0.
     file_offset: i64,
     // Number of values; defaults to 0.
     num_values: i64,
     // Compression; defaults to `Compression::UNCOMPRESSED`.
     compression: Compression,
     // Total compressed size in bytes; defaults to 0.
     total_compressed_size: i64,
     // Total uncompressed size in bytes; defaults to 0.
     total_uncompressed_size: i64,
     // Data page offset in bytes; defaults to 0.
     data_page_offset: i64,
     // Optional index page offset in bytes; defaults to `None`.
     index_page_offset: Option<i64>,
     // Optional dictionary page offset in bytes; defaults to `None`.
     dictionary_page_offset: Option<i64>,
     // Optional statistics; defaults to `None`.
     statistics: Option<Statistics>,
 }

 impl ColumnChunkMetaDataBuilder {
     /// Starts a builder for the given column with every other field at its
     /// default: no encodings, no file path, zero offsets/sizes/counts,
     /// uncompressed, and no optional page offsets or statistics.
     fn new(column_descr: ColumnDescPtr) -> Self {
         Self {
             column_descr,
             encodings: Vec::new(),
             file_path: None,
             file_offset: 0,
             num_values: 0,
             compression: Compression::UNCOMPRESSED,
             total_compressed_size: 0,
             total_uncompressed_size: 0,
             data_page_offset: 0,
             index_page_offset: None,
             dictionary_page_offset: None,
             statistics: None,
         }
     }

     /// Sets the list of encodings for this column chunk.
     pub fn set_encodings(self, encodings: Vec<Encoding>) -> Self {
         Self { encodings, ..self }
     }

     /// Sets the optional file path for this column chunk.
     pub fn set_file_path(self, value: String) -> Self {
         Self {
             file_path: Some(value),
             ..self
         }
     }

     /// Sets the file offset in bytes.
     pub fn set_file_offset(self, value: i64) -> Self {
         Self {
             file_offset: value,
             ..self
         }
     }

     /// Sets the number of values.
     pub fn set_num_values(self, value: i64) -> Self {
         Self {
             num_values: value,
             ..self
         }
     }

     /// Sets the compression.
     pub fn set_compression(self, value: Compression) -> Self {
         Self {
             compression: value,
             ..self
         }
     }

     /// Sets the total compressed size in bytes.
     pub fn set_total_compressed_size(self, value: i64) -> Self {
         Self {
             total_compressed_size: value,
             ..self
         }
     }

     /// Sets the total uncompressed size in bytes.
     pub fn set_total_uncompressed_size(self, value: i64) -> Self {
         Self {
             total_uncompressed_size: value,
             ..self
         }
     }

     /// Sets the data page offset in bytes.
     pub fn set_data_page_offset(self, value: i64) -> Self {
         Self {
             data_page_offset: value,
             ..self
         }
     }

     /// Sets the optional dictionary page offset in bytes.
     pub fn set_dictionary_page_offset(self, value: Option<i64>) -> Self {
         Self {
             dictionary_page_offset: value,
             ..self
         }
     }

     /// Sets the optional index page offset in bytes.
     pub fn set_index_page_offset(self, value: Option<i64>) -> Self {
         Self {
             index_page_offset: value,
             ..self
         }
     }

     /// Sets the statistics for this column chunk.
     pub fn set_statistics(self, value: Statistics) -> Self {
         Self {
             statistics: Some(value),
             ..self
         }
     }

     /// Builds the column chunk metadata.
     ///
     /// The column type and path are read from the column descriptor; every
     /// other field is taken from the builder as-is.
     pub fn build(self) -> Result<ColumnChunkMetaData> {
         // Read type and path before the descriptor is moved into the result.
         let column_type = self.column_descr.physical_type();
         let column_path = self.column_descr.path().clone();
         let metadata = ColumnChunkMetaData {
             column_type,
             column_path,
             column_descr: self.column_descr,
             encodings: self.encodings,
             file_path: self.file_path,
             file_offset: self.file_offset,
             num_values: self.num_values,
             compression: self.compression,
             total_compressed_size: self.total_compressed_size,
             total_uncompressed_size: self.total_uncompressed_size,
             data_page_offset: self.data_page_offset,
             index_page_offset: self.index_page_offset,
             dictionary_page_offset: self.dictionary_page_offset,
             statistics: self.statistics,
         };
         Ok(metadata)
     }
 }

 #[cfg(test)]
 mod tests {
     use super::*;

     /// A row group built from default column chunks survives a Thrift round trip.
     #[test]
     fn test_row_group_metadata_thrift_conversion() {
         let schema_descr = get_test_schema_descr();

         let columns: Vec<ColumnChunkMetaData> = schema_descr
             .columns()
             .iter()
             .map(|descr| ColumnChunkMetaData::builder(descr.clone()).build().unwrap())
             .collect();
         let builder = RowGroupMetaData::builder(schema_descr.clone())
             .set_num_rows(1000)
             .set_total_byte_size(2000)
             .set_column_metadata(columns);
         let row_group_meta = builder.build().unwrap();

         let row_group_exp = row_group_meta.to_thrift();
         let round_tripped =
             RowGroupMetaData::from_thrift(schema_descr, row_group_exp.clone()).unwrap();
         let row_group_res = round_tripped.to_thrift();

         assert_eq!(row_group_res, row_group_exp);
     }

     /// Building a row group without column metadata reports a length mismatch.
     #[test]
     fn test_row_group_metadata_thrift_conversion_empty() {
         let schema_descr = get_test_schema_descr();

         let row_group_meta = RowGroupMetaData::builder(schema_descr).build();

         let err = row_group_meta.expect_err("build without column metadata should fail");
         assert_eq!(
             err.to_string(),
             "Parquet error: Column length mismatch: 2 != 0"
         );
     }

     /// A fully populated column chunk survives a Thrift round trip.
     #[test]
     fn test_column_chunk_metadata_thrift_conversion() {
         let column_descr = get_test_schema_descr().column(0);

         let builder = ColumnChunkMetaData::builder(column_descr.clone())
             .set_encodings(vec![Encoding::PLAIN, Encoding::RLE])
             .set_file_path("file_path".to_owned())
             .set_file_offset(100)
             .set_num_values(1000)
             .set_compression(Compression::SNAPPY)
             .set_total_compressed_size(2000)
             .set_total_uncompressed_size(3000)
             .set_data_page_offset(4000)
             .set_dictionary_page_offset(Some(5000));
         let col_metadata = builder.build().unwrap();

         let col_chunk_exp = col_metadata.to_thrift();
         let round_tripped =
             ColumnChunkMetaData::from_thrift(column_descr, col_chunk_exp.clone()).unwrap();
         let col_chunk_res = round_tripped.to_thrift();

         assert_eq!(col_chunk_res, col_chunk_exp);
     }

     /// A column chunk with only default values survives a Thrift round trip.
     #[test]
     fn test_column_chunk_metadata_thrift_conversion_empty() {
         let column_descr = get_test_schema_descr().column(0);

         let builder = ColumnChunkMetaData::builder(column_descr.clone());
         let col_metadata = builder.build().unwrap();

         let col_chunk_exp = col_metadata.to_thrift();
         let round_tripped =
             ColumnChunkMetaData::from_thrift(column_descr, col_chunk_exp.clone()).unwrap();
         let col_chunk_res = round_tripped.to_thrift();

         assert_eq!(col_chunk_res, col_chunk_exp);
     }

     /// The row group's compressed size is the sum over its column chunks.
     #[test]
     fn test_compressed_size() {
         let schema_descr = get_test_schema_descr();

         let columns: Vec<ColumnChunkMetaData> = schema_descr
             .columns()
             .iter()
             .map(|column_descr| {
                 ColumnChunkMetaData::builder(column_descr.clone())
                     .set_total_compressed_size(500)
                     .set_total_uncompressed_size(700)
                     .build()
                     .unwrap()
             })
             .collect();
         let row_group_meta = RowGroupMetaData::builder(schema_descr)
             .set_num_rows(1000)
             .set_column_metadata(columns)
             .build()
             .unwrap();

         let compressed_size_exp: i64 = 1000;
         let compressed_size_res: i64 = row_group_meta.compressed_size();

         assert_eq!(compressed_size_res, compressed_size_exp);
     }

     /// Returns a sample schema descriptor (group "schema" with two INT32
     /// fields "a" and "b") so we can create column metadata.
     fn get_test_schema_descr() -> SchemaDescPtr {
         /// Builds a primitive INT32 field with the given name.
         fn int32_field(name: &str) -> Arc<SchemaType> {
             let field = SchemaType::primitive_type_builder(name, Type::INT32)
                 .build()
                 .unwrap();
             Arc::new(field)
         }

         let mut fields = vec![int32_field("a"), int32_field("b")];
         let schema = SchemaType::group_type_builder("schema")
             .with_fields(&mut fields)
             .build()
             .unwrap();

         let descriptor = SchemaDescriptor::new(Arc::new(schema));
         Arc::new(descriptor)
     }
 }