| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| #pragma once |
| |
| #include <cstdint> |
| #include <memory> |
| #include <string> |
| #include <vector> |
| |
| #include "arrow/io/caching.h" |
| #include "arrow/util/type_fwd.h" |
| #include "parquet/metadata.h" // IWYU pragma: keep |
| #include "parquet/platform.h" |
| #include "parquet/properties.h" |
| |
| namespace parquet { |
| |
// Forward declarations of types that this header only refers to through
// pointers, references or smart pointers.
class BloomFilterReader;
class ColumnReader;
class FileMetaData;
class PageIndexReader;
class PageReader;
class RowGroupMetaData;
| |
namespace internal {
// Forward declaration; only used via std::shared_ptr in this header.
class RecordReader;
}  // namespace internal
| |
| class PARQUET_EXPORT RowGroupReader { |
| public: |
| // Forward declare a virtual class 'Contents' to aid dependency injection and more |
| // easily create test fixtures |
| // An implementation of the Contents class is defined in the .cc file |
| struct Contents { |
| virtual ~Contents() {} |
| virtual std::unique_ptr<PageReader> GetColumnPageReader(int i) = 0; |
| virtual const RowGroupMetaData* metadata() const = 0; |
| virtual const ReaderProperties* properties() const = 0; |
| }; |
| |
| explicit RowGroupReader(std::unique_ptr<Contents> contents); |
| |
| // Returns the rowgroup metadata |
| const RowGroupMetaData* metadata() const; |
| |
| // Construct a ColumnReader for the indicated row group-relative |
| // column. Ownership is shared with the RowGroupReader. |
| std::shared_ptr<ColumnReader> Column(int i); |
| |
| // EXPERIMENTAL: Construct a RecordReader for the indicated column of the row group. |
| // Ownership is shared with the RowGroupReader. |
| std::shared_ptr<internal::RecordReader> RecordReader(int i, |
| bool read_dictionary = false); |
| |
| // Construct a ColumnReader, trying to enable exposed encoding. |
| // |
| // For dictionary encoding, currently we only support column chunks that are fully |
| // dictionary encoded, i.e., all data pages in the column chunk are dictionary encoded. |
| // If a column chunk uses dictionary encoding but then falls back to plain encoding, the |
| // encoding will not be exposed. |
| // |
| // The returned column reader provides an API GetExposedEncoding() for the |
| // users to check the exposed encoding and determine how to read the batches. |
| // |
| // \note API EXPERIMENTAL |
| std::shared_ptr<ColumnReader> ColumnWithExposeEncoding( |
| int i, ExposedEncoding encoding_to_expose); |
| |
| // Construct a RecordReader, trying to enable exposed encoding. |
| // |
| // For dictionary encoding, currently we only support column chunks that are |
| // fully dictionary encoded byte arrays. The caller should verify if the reader can read |
| // and expose the dictionary by checking the reader's read_dictionary(). If a column |
| // chunk uses dictionary encoding but then falls back to plain encoding, the returned |
| // reader will read decoded data without exposing the dictionary. |
| // |
| // \note API EXPERIMENTAL |
| std::shared_ptr<internal::RecordReader> RecordReaderWithExposeEncoding( |
| int i, ExposedEncoding encoding_to_expose); |
| |
| std::unique_ptr<PageReader> GetColumnPageReader(int i); |
| |
| private: |
| // Holds a pointer to an instance of Contents implementation |
| std::unique_ptr<Contents> contents_; |
| }; |
| |
| class PARQUET_EXPORT ParquetFileReader { |
| public: |
| // Declare a virtual class 'Contents' to aid dependency injection and more |
| // easily create test fixtures |
| // An implementation of the Contents class is defined in the .cc file |
| struct PARQUET_EXPORT Contents { |
| static std::unique_ptr<Contents> Open( |
| std::shared_ptr<::arrow::io::RandomAccessFile> source, |
| const ReaderProperties& props = default_reader_properties(), |
| std::shared_ptr<FileMetaData> metadata = NULLPTR); |
| |
| static ::arrow::Future<std::unique_ptr<Contents>> OpenAsync( |
| std::shared_ptr<::arrow::io::RandomAccessFile> source, |
| const ReaderProperties& props = default_reader_properties(), |
| std::shared_ptr<FileMetaData> metadata = NULLPTR); |
| |
| virtual ~Contents() = default; |
| // Perform any cleanup associated with the file contents |
| virtual void Close() = 0; |
| virtual std::shared_ptr<RowGroupReader> GetRowGroup(int i) = 0; |
| virtual std::shared_ptr<FileMetaData> metadata() const = 0; |
| virtual std::shared_ptr<PageIndexReader> GetPageIndexReader() = 0; |
| virtual BloomFilterReader& GetBloomFilterReader() = 0; |
| }; |
| |
| ParquetFileReader(); |
| ~ParquetFileReader(); |
| |
| // Create a file reader instance from an Arrow file object. Thread-safety is |
| // the responsibility of the file implementation |
| static std::unique_ptr<ParquetFileReader> Open( |
| std::shared_ptr<::arrow::io::RandomAccessFile> source, |
| const ReaderProperties& props = default_reader_properties(), |
| std::shared_ptr<FileMetaData> metadata = NULLPTR); |
| |
| // API Convenience to open a serialized Parquet file on disk, using Arrow IO |
| // interfaces. |
| static std::unique_ptr<ParquetFileReader> OpenFile( |
| const std::string& path, bool memory_map = false, |
| const ReaderProperties& props = default_reader_properties(), |
| std::shared_ptr<FileMetaData> metadata = NULLPTR); |
| |
| // Asynchronously open a file reader from an Arrow file object. |
| // Does not throw - all errors are reported through the Future. |
| static ::arrow::Future<std::unique_ptr<ParquetFileReader>> OpenAsync( |
| std::shared_ptr<::arrow::io::RandomAccessFile> source, |
| const ReaderProperties& props = default_reader_properties(), |
| std::shared_ptr<FileMetaData> metadata = NULLPTR); |
| |
| void Open(std::unique_ptr<Contents> contents); |
| void Close(); |
| |
| // The RowGroupReader is owned by the FileReader |
| std::shared_ptr<RowGroupReader> RowGroup(int i); |
| |
| // Returns the file metadata. Only one instance is ever created |
| std::shared_ptr<FileMetaData> metadata() const; |
| |
| /// Returns the PageIndexReader. Only one instance is ever created. |
| /// |
| /// If the file does not have the page index, nullptr may be returned. |
| /// Because it pays to check existence of page index in the file, it |
| /// is possible to return a non null value even if page index does |
| /// not exist. It is the caller's responsibility to check the return |
| /// value and follow-up calls to PageIndexReader. |
| /// |
| /// WARNING: The returned PageIndexReader must not outlive the ParquetFileReader. |
| /// Initialize GetPageIndexReader() is not thread-safety. |
| std::shared_ptr<PageIndexReader> GetPageIndexReader(); |
| |
| /// Returns the BloomFilterReader. Only one instance is ever created. |
| /// |
| /// WARNING: The returned BloomFilterReader must not outlive the ParquetFileReader. |
| /// Initialize GetBloomFilterReader() is not thread-safety. |
| BloomFilterReader& GetBloomFilterReader(); |
| |
| /// Pre-buffer the specified column indices in all row groups. |
| /// |
| /// Readers can optionally call this to cache the necessary slices |
| /// of the file in-memory before deserialization. Arrow readers can |
| /// automatically do this via an option. This is intended to |
| /// increase performance when reading from high-latency filesystems |
| /// (e.g. Amazon S3). |
| /// |
| /// After calling this, creating readers for row groups/column |
| /// indices that were not buffered may fail. Creating multiple |
| /// readers for the a subset of the buffered regions is |
| /// acceptable. This may be called again to buffer a different set |
| /// of row groups/columns. |
| /// |
| /// If memory usage is a concern, note that data will remain |
| /// buffered in memory until either \a PreBuffer() is called again, |
| /// or the reader itself is destructed. Reading - and buffering - |
| /// only one row group at a time may be useful. |
| /// |
| /// This method may throw. |
| void PreBuffer(const std::vector<int>& row_groups, |
| const std::vector<int>& column_indices, |
| const ::arrow::io::IOContext& ctx, |
| const ::arrow::io::CacheOptions& options); |
| |
| /// Retrieve the list of byte ranges that would need to be read to retrieve |
| /// the data for the specified row groups and column indices. |
| /// |
| /// A reader can optionally call this if they wish to handle their own |
| /// caching and management of file reads (or offload them to other readers). |
| /// Unlike PreBuffer, this method will not perform any actual caching or |
| /// reads, instead just using the file metadata to determine the byte ranges |
| /// that would need to be read if you were to consume the entirety of the column |
| /// chunks for the provided columns in the specified row groups. |
| /// |
| /// If row_groups or column_indices are empty, then the result of this will be empty. |
| /// |
| /// hole_size_limit represents the maximum distance, in bytes, between two |
| /// consecutive ranges; beyond this value, ranges will not be combined. The default |
| /// value is 1MB. |
| /// |
| /// range_size_limit is the maximum size in bytes of a combined range; if combining |
| /// two consecutive ranges would produce a range larger than this, they are not |
| /// combined. The default values is 64MB. This *must* be larger than hole_size_limit. |
| /// |
| /// This will not take into account page indexes or any other predicate push down |
| /// benefits that may be available. |
| ::arrow::Result<std::vector<::arrow::io::ReadRange>> GetReadRanges( |
| const std::vector<int>& row_groups, const std::vector<int>& column_indices, |
| int64_t hole_size_limit = 1024 * 1024, int64_t range_size_limit = 64 * 1024 * 1024); |
| |
| /// Wait for the specified row groups and column indices to be pre-buffered. |
| /// |
| /// After the returned Future completes, reading the specified row |
| /// groups/columns will not block. |
| /// |
| /// PreBuffer must be called first. This method does not throw. |
| ::arrow::Future<> WhenBuffered(const std::vector<int>& row_groups, |
| const std::vector<int>& column_indices) const; |
| |
| private: |
| // Holds a pointer to an instance of Contents implementation |
| std::unique_ptr<Contents> contents_; |
| }; |
| |
| // Read only Parquet file metadata |
| std::shared_ptr<FileMetaData> PARQUET_EXPORT |
| ReadMetaData(const std::shared_ptr<::arrow::io::RandomAccessFile>& source); |
| |
| /// \brief Scan all values in file. Useful for performance testing |
| /// \param[in] columns the column numbers to scan. If empty scans all |
| /// \param[in] column_batch_size number of values to read at a time when scanning column |
| /// \param[in] reader a ParquetFileReader instance |
| /// \return number of semantic rows in file |
| PARQUET_EXPORT |
| int64_t ScanFileContents(std::vector<int> columns, const int32_t column_batch_size, |
| ParquetFileReader* reader); |
| |
| } // namespace parquet |