src/parquet/arrow/reader.h - parquet-cpp - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 #ifndef PARQUET_ARROW_READER_H
 #define PARQUET_ARROW_READER_H

 #include <memory>
 #include <vector>

 #include "parquet/api/reader.h"
 #include "parquet/api/schema.h"

 #include "arrow/io/interfaces.h"

 // Forward declarations of the Arrow types named in the signatures declared
 // in this header.
 // NOTE(review): RowBatch is not used by any declaration visible in this
 // header -- confirm whether it is still needed.
 namespace arrow {

 class Array;
 class MemoryPool;
 class RowBatch;
 class Status;
 class Table;
 }  // namespace arrow

 namespace parquet {

 namespace arrow {

 // Forward declaration: FileReader::GetColumn() names ColumnReader before the
 // class is defined further down in this header.
 class ColumnReader;

 // Arrow read adapter class for deserializing Parquet files as Arrow row
 // batches.
 //
 // TODO(wesm): nested data does not always make sense with this user
 // interface unless you are only reading a single leaf node from a branch of
 // a table. For example:
 //
 // repeated group data {
 //   optional group record {
 //     optional int32 val1;
 //     optional byte_array val2;
 //     optional bool val3;
 //   }
 //   optional int32 val4;
 // }
 //
 // In the Parquet file, there are 4 leaf nodes:
 //
 // * data.record.val1
 // * data.record.val2
 // * data.record.val3
 // * data.val4
 //
 // When materializing this data in an Arrow array, we would have:
 //
 // data: list<struct<
 //   record: struct<
 //    val1: int32,
 //    val2: string (= list<uint8>),
 //    val3: bool,
 //   >,
 //   val4: int32
 // >>
 //
 // However, in the Parquet format, each leaf node has its own repetition and
 // definition levels describing the structure of the intermediate nodes in
 // this array structure. Thus, we will need to scan the leaf data for a group
 // of leaf nodes that are part of the same type tree to create a single
 // result Arrow nested array structure.
 //
 // This is additionally complicated by "chunky" repeated fields or very large
 // byte arrays.
 class PARQUET_EXPORT FileReader {
  public:
   // Construct from an Arrow memory pool and a ParquetFileReader whose
   // ownership is transferred to this object (unique_ptr taken by value).
   // NOTE(review): pool is presumably used to allocate the Arrow data that the
   // read methods produce -- confirm against the implementation.
   FileReader(::arrow::MemoryPool* pool, std::unique_ptr<ParquetFileReader> reader);

   // Since the distribution of columns amongst a Parquet file's row groups may
   // be uneven (the number of values in each column chunk can be different), we
   // provide a column-oriented read interface. The ColumnReader hides the
   // details of paging through the file's row groups and yielding
   // fully-materialized arrow::Array instances
   //
   // i   : index of the column to read
   // out : receives the ColumnReader for that column
   //
   // Returns error status if the column of interest is not flat.
   ::arrow::Status GetColumn(int i, std::unique_ptr<ColumnReader>* out);

   // Read column as a whole into an Array.
   //
   // i   : index of the column to read
   // out : receives the resulting Array
   ::arrow::Status ReadColumn(int i, std::shared_ptr<::arrow::Array>* out);

   // Read a table of flat columns into a Table.
   //
   // out : receives the resulting Table
   ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out);

   // Read a table of flat columns into a Table. Read only the indicated column
   // indices (relative to the schema)
   //
   // column_indices : the columns to read
   // out            : receives the resulting Table
   ::arrow::Status ReadTable(
       const std::vector<int>& column_indices, std::shared_ptr<::arrow::Table>* out);

   // Read one row group into a Table, restricted to the indicated columns.
   // NOTE(review): i is presumably the index of the row group (see
   // num_row_groups()) and column_indices presumably follows the same
   // convention as ReadTable() above -- confirm against the implementation.
   ::arrow::Status ReadRowGroup(int i, const std::vector<int>& column_indices,
       std::shared_ptr<::arrow::Table>* out);

   // Read one row group into a Table, without a column selection.
   // NOTE(review): presumably reads every column of row group i -- confirm.
   ::arrow::Status ReadRowGroup(int i, std::shared_ptr<::arrow::Table>* out);

   // Number of row groups.
   // NOTE(review): presumably those of the underlying Parquet file -- confirm.
   int num_row_groups() const;

   // Read-only, non-owning access to a ParquetFileReader.
   // NOTE(review): presumably the reader handed to the constructor -- confirm.
   const ParquetFileReader* parquet_reader() const;

   /// Set the number of threads to use during reads of multiple columns. By
   /// default only 1 thread is used
   void set_num_threads(int num_threads);

   // Declared here and defined out of line: Impl is only forward-declared in
   // this header.
   virtual ~FileReader();

  private:
   // Pointer-to-implementation: all state lives in Impl, which is not exported
   // and is defined outside this header.
   class PARQUET_NO_EXPORT Impl;
   std::unique_ptr<Impl> impl_;
 };

 // At this point, the column reader is a stream iterator. It only knows how to
 // read the next batch of values for a particular column from the file until it
 // runs out.
 //
 // We also do not expose any internal Parquet details, such as row groups. This
 // might change in the future.
 class PARQUET_EXPORT ColumnReader {
  public:
   // Declared here and defined out of line: Impl is only forward-declared in
   // this header.
   virtual ~ColumnReader();

   // Scan the next array of the indicated size. The actual size of the
   // returned array may be less than the passed size depending how much data is
   // available in the file.
   //
   // When all the data in the file has been exhausted, the result is set to
   // nullptr.
   //
   // batch_size : requested number of values for the returned array
   // out        : receives the array, or nullptr once the data is exhausted
   //
   // Returns Status::OK on a successful read, including if you have exhausted
   // the data available in the file.
   ::arrow::Status NextBatch(int batch_size, std::shared_ptr<::arrow::Array>* out);

  private:
   // Pointer-to-implementation: all state lives in Impl, which is not exported
   // and is defined outside this header.
   class PARQUET_NO_EXPORT Impl;
   std::unique_ptr<Impl> impl_;
   // Private constructor taking ownership of an Impl; only this class and its
   // friend FileReader can create instances.
   explicit ColumnReader(std::unique_ptr<Impl> impl);

   // Grants FileReader access to the private constructor above.
   friend class FileReader;
 };

 // Helper function to create a file reader from an implementation of an Arrow
 // readable file
 //
 // file       : the Arrow readable file to read from
 // allocator  : Arrow memory pool
 //              NOTE(review): presumably handed on to the created FileReader
 //              -- confirm against the implementation
 // properties : Parquet reader properties to use
 // metadata   : separately-computed file metadata, can be nullptr
 // reader     : receives the created FileReader
 PARQUET_EXPORT
 ::arrow::Status OpenFile(const std::shared_ptr<::arrow::io::ReadableFileInterface>& file,
     ::arrow::MemoryPool* allocator, const ReaderProperties& properties,
     const std::shared_ptr<FileMetaData>& metadata, std::unique_ptr<FileReader>* reader);

 // Overload of OpenFile that takes neither reader properties nor
 // separately-computed file metadata.
 // NOTE(review): presumably uses default reader properties and reads the
 // metadata from the file itself -- confirm against the implementation.
 //
 // file      : the Arrow readable file to read from
 // allocator : Arrow memory pool
 // reader    : receives the created FileReader
 PARQUET_EXPORT
 ::arrow::Status OpenFile(const std::shared_ptr<::arrow::io::ReadableFileInterface>& file,
     ::arrow::MemoryPool* allocator, std::unique_ptr<FileReader>* reader);

 }  // namespace arrow
 }  // namespace parquet

 #endif  // PARQUET_ARROW_READER_H
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.

	#ifndef PARQUET_ARROW_READER_H
	#define PARQUET_ARROW_READER_H

	#include <memory>
	#include <vector>

	#include "parquet/api/reader.h"
	#include "parquet/api/schema.h"

	#include "arrow/io/interfaces.h"

	// Forward declarations of the Arrow types named in the signatures declared
	// in this header.
	// NOTE(review): RowBatch is not used by any declaration visible in this
	// header -- confirm whether it is still needed.
	namespace arrow {

	class Array;
	class MemoryPool;
	class RowBatch;
	class Status;
	class Table;
	} // namespace arrow

	namespace parquet {

	namespace arrow {

	// Forward declaration: FileReader::GetColumn() names ColumnReader before the
	// class is defined further down in this header.
	class ColumnReader;

	// Arrow read adapter class for deserializing Parquet files as Arrow row
	// batches.
	//
	// TODO(wesm): nested data does not always make sense with this user
	// interface unless you are only reading a single leaf node from a branch of
	// a table. For example:
	//
	// repeated group data {
	//   optional group record {
	//     optional int32 val1;
	//     optional byte_array val2;
	//     optional bool val3;
	//   }
	//   optional int32 val4;
	// }
	//
	// In the Parquet file, there are 4 leaf nodes:
	//
	// * data.record.val1
	// * data.record.val2
	// * data.record.val3
	// * data.val4
	//
	// When materializing this data in an Arrow array, we would have:
	//
	// data: list<struct<
	//   record: struct<
	//     val1: int32,
	//     val2: string (= list<uint8>),
	//     val3: bool,
	//   >,
	//   val4: int32
	// >>
	//
	// However, in the Parquet format, each leaf node has its own repetition and
	// definition levels describing the structure of the intermediate nodes in
	// this array structure. Thus, we will need to scan the leaf data for a group
	// of leaf nodes that are part of the same type tree to create a single
	// result Arrow nested array structure.
	//
	// This is additionally complicated by "chunky" repeated fields or very large
	// byte arrays.
	class PARQUET_EXPORT FileReader {
	public:
	// Construct from an Arrow memory pool and a ParquetFileReader whose
	// ownership is transferred to this object (unique_ptr taken by value).
	// NOTE(review): pool is presumably used to allocate the Arrow data that the
	// read methods produce -- confirm against the implementation.
	FileReader(::arrow::MemoryPool* pool, std::unique_ptr<ParquetFileReader> reader);

	// Since the distribution of columns amongst a Parquet file's row groups may
	// be uneven (the number of values in each column chunk can be different), we
	// provide a column-oriented read interface. The ColumnReader hides the
	// details of paging through the file's row groups and yielding
	// fully-materialized arrow::Array instances
	//
	// i   : index of the column to read
	// out : receives the ColumnReader for that column
	//
	// Returns error status if the column of interest is not flat.
	::arrow::Status GetColumn(int i, std::unique_ptr<ColumnReader>* out);

	// Read column as a whole into an Array.
	//
	// i   : index of the column to read
	// out : receives the resulting Array
	::arrow::Status ReadColumn(int i, std::shared_ptr<::arrow::Array>* out);

	// Read a table of flat columns into a Table.
	//
	// out : receives the resulting Table
	::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out);

	// Read a table of flat columns into a Table. Read only the indicated column
	// indices (relative to the schema)
	//
	// column_indices : the columns to read
	// out            : receives the resulting Table
	::arrow::Status ReadTable(
	const std::vector<int>& column_indices, std::shared_ptr<::arrow::Table>* out);

	// Read one row group into a Table, restricted to the indicated columns.
	// NOTE(review): i is presumably the index of the row group (see
	// num_row_groups()) and column_indices presumably follows the same
	// convention as ReadTable() above -- confirm against the implementation.
	::arrow::Status ReadRowGroup(int i, const std::vector<int>& column_indices,
	std::shared_ptr<::arrow::Table>* out);

	// Read one row group into a Table, without a column selection.
	// NOTE(review): presumably reads every column of row group i -- confirm.
	::arrow::Status ReadRowGroup(int i, std::shared_ptr<::arrow::Table>* out);

	// Number of row groups.
	// NOTE(review): presumably those of the underlying Parquet file -- confirm.
	int num_row_groups() const;

	// Read-only, non-owning access to a ParquetFileReader.
	// NOTE(review): presumably the reader handed to the constructor -- confirm.
	const ParquetFileReader* parquet_reader() const;

	/// Set the number of threads to use during reads of multiple columns. By
	/// default only 1 thread is used
	void set_num_threads(int num_threads);

	// Declared here and defined out of line: Impl is only forward-declared in
	// this header.
	virtual ~FileReader();

	private:
	// Pointer-to-implementation: all state lives in Impl, which is not exported
	// and is defined outside this header.
	class PARQUET_NO_EXPORT Impl;
	std::unique_ptr<Impl> impl_;
	};

	// At this point, the column reader is a stream iterator. It only knows how to
	// read the next batch of values for a particular column from the file until it
	// runs out.
	//
	// We also do not expose any internal Parquet details, such as row groups. This
	// might change in the future.
	class PARQUET_EXPORT ColumnReader {
	public:
	// Declared here and defined out of line: Impl is only forward-declared in
	// this header.
	virtual ~ColumnReader();

	// Scan the next array of the indicated size. The actual size of the
	// returned array may be less than the passed size depending how much data is
	// available in the file.
	//
	// When all the data in the file has been exhausted, the result is set to
	// nullptr.
	//
	// batch_size : requested number of values for the returned array
	// out        : receives the array, or nullptr once the data is exhausted
	//
	// Returns Status::OK on a successful read, including if you have exhausted
	// the data available in the file.
	::arrow::Status NextBatch(int batch_size, std::shared_ptr<::arrow::Array>* out);

	private:
	// Pointer-to-implementation: all state lives in Impl, which is not exported
	// and is defined outside this header.
	class PARQUET_NO_EXPORT Impl;
	std::unique_ptr<Impl> impl_;
	// Private constructor taking ownership of an Impl; only this class and its
	// friend FileReader can create instances.
	explicit ColumnReader(std::unique_ptr<Impl> impl);

	// Grants FileReader access to the private constructor above.
	friend class FileReader;
	};

	// Helper function to create a file reader from an implementation of an Arrow
	// readable file
	//
	// file       : the Arrow readable file to read from
	// allocator  : Arrow memory pool
	//              NOTE(review): presumably handed on to the created FileReader
	//              -- confirm against the implementation
	// properties : Parquet reader properties to use
	// metadata   : separately-computed file metadata, can be nullptr
	// reader     : receives the created FileReader
	PARQUET_EXPORT
	::arrow::Status OpenFile(const std::shared_ptr<::arrow::io::ReadableFileInterface>& file,
	::arrow::MemoryPool* allocator, const ReaderProperties& properties,
	const std::shared_ptr<FileMetaData>& metadata, std::unique_ptr<FileReader>* reader);

	// Overload of OpenFile that takes neither reader properties nor
	// separately-computed file metadata.
	// NOTE(review): presumably uses default reader properties and reads the
	// metadata from the file itself -- confirm against the implementation.
	//
	// file      : the Arrow readable file to read from
	// allocator : Arrow memory pool
	// reader    : receives the created FileReader
	PARQUET_EXPORT
	::arrow::Status OpenFile(const std::shared_ptr<::arrow::io::ReadableFileInterface>& file,
	::arrow::MemoryPool* allocator, std::unique_ptr<FileReader>* reader);

	} // namespace arrow
	} // namespace parquet

	#endif // PARQUET_ARROW_READER_H