| diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc |
| index 03243e751..1eb9b2c81 100644 |
| --- a/cpp/src/arrow/adapters/orc/adapter.cc |
| +++ b/cpp/src/arrow/adapters/orc/adapter.cc |
| @@ -47,9 +47,6 @@ |
| #include "arrow/util/visibility.h" |
| #include "orc/Exceptions.hh" |
| |
| -// alias to not interfere with nested orc namespace |
| -namespace liborc = orc; |
| - |
| #define ORC_THROW_NOT_OK(s) \ |
| do { \ |
| Status _s = (s); \ |
| @@ -198,6 +195,8 @@ class ORCFileReader::Impl { |
| return Init(); |
| } |
| |
| + virtual liborc::Reader* GetRawORCReader() { return reader_.get(); } |
| + |
| Status Init() { |
| int64_t nstripes = reader_->getNumberOfStripes(); |
| stripes_.resize(nstripes); |
| @@ -504,6 +503,32 @@ class ORCFileReader::Impl { |
| return Status::OK(); |
| } |
| |
| + Result<std::shared_ptr<RecordBatchReader>> NextStripeReader( |
| + int64_t batch_size, const std::vector<std::string>& include_names) { |
| + if (current_row_ >= NumberOfRows()) { |
| + return nullptr; |
| + } |
| + |
| + liborc::RowReaderOptions opts; |
| + if (!include_names.empty()) { |
| + RETURN_NOT_OK(SelectNames(&opts, include_names)); |
| + } |
| + StripeInformation stripe_info({0, 0, 0, 0}); |
| + RETURN_NOT_OK(SelectStripeWithRowNumber(&opts, current_row_, &stripe_info)); |
| + std::shared_ptr<Schema> schema; |
| + RETURN_NOT_OK(ReadSchema(opts, &schema)); |
| + std::unique_ptr<liborc::RowReader> row_reader; |
| + |
| + ORC_BEGIN_CATCH_NOT_OK |
| + row_reader = reader_->createRowReader(opts); |
| + row_reader->seekToRow(current_row_); |
| + current_row_ = stripe_info.first_row_of_stripe + stripe_info.num_rows; |
| + ORC_END_CATCH_NOT_OK |
| + |
| + return std::make_shared<OrcStripeReader>(std::move(row_reader), schema, batch_size, |
| + pool_); |
| + } |
| + |
| Status NextStripeReader(int64_t batch_size, std::shared_ptr<RecordBatchReader>* out) { |
| return NextStripeReader(batch_size, {}, out); |
| } |
| @@ -531,6 +556,8 @@ Result<std::unique_ptr<ORCFileReader>> ORCFileReader::Open( |
| return std::move(result); |
| } |
| |
| +liborc::Reader* ORCFileReader::GetRawORCReader() { return impl_->GetRawORCReader(); } |
| + |
| Result<std::shared_ptr<const KeyValueMetadata>> ORCFileReader::ReadMetadata() { |
| return impl_->ReadMetadata(); |
| } |
| @@ -653,6 +680,11 @@ Result<std::shared_ptr<RecordBatchReader>> ORCFileReader::NextStripeReader( |
| return reader; |
| } |
| |
| +Result<std::shared_ptr<RecordBatchReader>> ORCFileReader::NextStripeReader( |
| + int64_t batch_size, const std::vector<std::string>& include_names) { |
| + return impl_->NextStripeReader(batch_size, include_names); |
| +} |
| + |
| int64_t ORCFileReader::NumberOfStripes() { return impl_->NumberOfStripes(); } |
| |
| int64_t ORCFileReader::NumberOfRows() { return impl_->NumberOfRows(); } |
| diff --git a/cpp/src/arrow/adapters/orc/adapter.h b/cpp/src/arrow/adapters/orc/adapter.h |
| index 223efa515..04e6b0612 100644 |
| --- a/cpp/src/arrow/adapters/orc/adapter.h |
| +++ b/cpp/src/arrow/adapters/orc/adapter.h |
| @@ -30,6 +30,10 @@ |
| #include "arrow/type_fwd.h" |
| #include "arrow/util/macros.h" |
| #include "arrow/util/visibility.h" |
| +#include "orc/Reader.hh" |
| + |
| +// alias to not interfere with nested orc namespace |
| +namespace liborc = orc; |
| |
| namespace arrow { |
| namespace adapters { |
| @@ -51,6 +55,9 @@ class ARROW_EXPORT ORCFileReader { |
| static Status Open(const std::shared_ptr<io::RandomAccessFile>& file, MemoryPool* pool, |
| std::unique_ptr<ORCFileReader>* reader); |
| |
| + /// \brief Get ORC reader from inside. |
| + liborc::Reader* GetRawORCReader(); |
| + |
| /// \brief Creates a new ORC reader |
| /// |
| /// \param[in] file the data source |
| @@ -240,6 +247,19 @@ class ARROW_EXPORT ORCFileReader { |
| Result<std::shared_ptr<RecordBatchReader>> NextStripeReader( |
| int64_t batch_size, const std::vector<int>& include_indices); |
| |
| + /// \brief Get a stripe level record batch iterator with specified row count |
| + /// in each record batch. NextStripeReader serves as a fine grain |
| + /// alternative to ReadStripe which may cause OOM issue by loading |
| + /// the whole stripes into memory. |
| + /// |
| + /// \param[in] batch_size Get a stripe level record batch iterator with specified row |
| + /// count in each record batch. |
| + /// |
| + /// \param[in] include_names the selected field names to read |
| + /// \return the returned stripe reader |
| + Result<std::shared_ptr<RecordBatchReader>> NextStripeReader( |
| + int64_t batch_size, const std::vector<std::string>& include_names); |
| + |
| /// \brief The number of stripes in the file |
| int64_t NumberOfStripes(); |
| |