blob: 072ea9b567a8c4e187ff84c37ac48338c9779073 [file] [log] [blame]
diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc
index 03243e751..1eb9b2c81 100644
--- a/cpp/src/arrow/adapters/orc/adapter.cc
+++ b/cpp/src/arrow/adapters/orc/adapter.cc
@@ -47,9 +47,6 @@
#include "arrow/util/visibility.h"
#include "orc/Exceptions.hh"
-// alias to not interfere with nested orc namespace
-namespace liborc = orc;
-
#define ORC_THROW_NOT_OK(s) \
do { \
Status _s = (s); \
@@ -198,6 +195,8 @@ class ORCFileReader::Impl {
return Init();
}
+ virtual liborc::Reader* GetRawORCReader() { return reader_.get(); }
+
Status Init() {
int64_t nstripes = reader_->getNumberOfStripes();
stripes_.resize(nstripes);
@@ -504,6 +503,32 @@ class ORCFileReader::Impl {
return Status::OK();
}
+ Result<std::shared_ptr<RecordBatchReader>> NextStripeReader(
+ int64_t batch_size, const std::vector<std::string>& include_names) {
+ if (current_row_ >= NumberOfRows()) {
+ return nullptr;
+ }
+
+ liborc::RowReaderOptions opts;
+ if (!include_names.empty()) {
+ RETURN_NOT_OK(SelectNames(&opts, include_names));
+ }
+ StripeInformation stripe_info({0, 0, 0, 0});
+ RETURN_NOT_OK(SelectStripeWithRowNumber(&opts, current_row_, &stripe_info));
+ std::shared_ptr<Schema> schema;
+ RETURN_NOT_OK(ReadSchema(opts, &schema));
+ std::unique_ptr<liborc::RowReader> row_reader;
+
+ ORC_BEGIN_CATCH_NOT_OK
+ row_reader = reader_->createRowReader(opts);
+ row_reader->seekToRow(current_row_);
+ current_row_ = stripe_info.first_row_of_stripe + stripe_info.num_rows;
+ ORC_END_CATCH_NOT_OK
+
+ return std::make_shared<OrcStripeReader>(std::move(row_reader), schema, batch_size,
+ pool_);
+ }
+
Status NextStripeReader(int64_t batch_size, std::shared_ptr<RecordBatchReader>* out) {
return NextStripeReader(batch_size, {}, out);
}
@@ -531,6 +556,8 @@ Result<std::unique_ptr<ORCFileReader>> ORCFileReader::Open(
return std::move(result);
}
+liborc::Reader* ORCFileReader::GetRawORCReader() { return impl_->GetRawORCReader(); }
+
Result<std::shared_ptr<const KeyValueMetadata>> ORCFileReader::ReadMetadata() {
return impl_->ReadMetadata();
}
@@ -653,6 +680,11 @@ Result<std::shared_ptr<RecordBatchReader>> ORCFileReader::NextStripeReader(
return reader;
}
+Result<std::shared_ptr<RecordBatchReader>> ORCFileReader::NextStripeReader(
+ int64_t batch_size, const std::vector<std::string>& include_names) {
+ return impl_->NextStripeReader(batch_size, include_names);
+}
+
int64_t ORCFileReader::NumberOfStripes() { return impl_->NumberOfStripes(); }
int64_t ORCFileReader::NumberOfRows() { return impl_->NumberOfRows(); }
diff --git a/cpp/src/arrow/adapters/orc/adapter.h b/cpp/src/arrow/adapters/orc/adapter.h
index 223efa515..04e6b0612 100644
--- a/cpp/src/arrow/adapters/orc/adapter.h
+++ b/cpp/src/arrow/adapters/orc/adapter.h
@@ -30,6 +30,10 @@
#include "arrow/type_fwd.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
+#include "orc/Reader.hh"
+
+// alias to not interfere with nested orc namespace
+namespace liborc = orc;
namespace arrow {
namespace adapters {
@@ -51,6 +55,9 @@ class ARROW_EXPORT ORCFileReader {
static Status Open(const std::shared_ptr<io::RandomAccessFile>& file, MemoryPool* pool,
std::unique_ptr<ORCFileReader>* reader);
+ /// \brief Get ORC reader from inside.
+ liborc::Reader* GetRawORCReader();
+
/// \brief Creates a new ORC reader
///
/// \param[in] file the data source
@@ -240,6 +247,19 @@ class ARROW_EXPORT ORCFileReader {
Result<std::shared_ptr<RecordBatchReader>> NextStripeReader(
int64_t batch_size, const std::vector<int>& include_indices);
+ /// \brief Get a stripe level record batch iterator with specified row count
+ /// in each record batch. NextStripeReader serves as a fine grain
+ /// alternative to ReadStripe which may cause OOM issue by loading
+ /// the whole stripes into memory.
+ ///
+ /// \param[in] batch_size Get a stripe level record batch iterator with specified row
+ /// count in each record batch.
+ ///
+ /// \param[in] include_names the selected field names to read
+ /// \return the returned stripe reader
+ Result<std::shared_ptr<RecordBatchReader>> NextStripeReader(
+ int64_t batch_size, const std::vector<std::string>& include_names);
+
/// \brief The number of stripes in the file
int64_t NumberOfStripes();