blob: f20e94e9ae6d43dba01d11f9d6bf486ec46dc631 [file]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cstdint>
#include <limits>
#include <memory>
#include <set>
#include <utility>
#include <vector>
#include "arrow/array.h"
#include "arrow/compute/api.h"
#include "arrow/dataset/file_parquet.h"
#include "arrow/record_batch.h"
#include "arrow/type.h"
#include "arrow/type_fwd.h"
#include "paimon/common/utils/arrow/status_utils.h"
#include "paimon/result.h"
#include "paimon/status.h"
#include "parquet/arrow/reader.h"
// Forward declaration of arrow::Schema, which appears in the return type of
// FileReaderWrapper::GetSchema() below.
namespace arrow {
class Schema;
} // namespace arrow
namespace paimon::parquet {
/// Decorator around `::parquet::arrow::FileReader` that adds seek support and exposes
/// read-position bookkeeping through `GetPreviousBatchFirstRowNumber()` and
/// `GetNextRowToRead()`.
///
/// Instances are obtained through `Create()`; the constructor is private.
///
/// NOTE(review): row ranges are passed around as `std::pair<uint64_t, uint64_t>`; whether the
/// second element is inclusive or exclusive is not visible in this header -- confirm against
/// the implementation file.
class FileReaderWrapper {
 public:
    /// Factory for the wrapper.
    /// @param reader Parquet file reader handed over as an rvalue `unique_ptr`.
    /// @return The new wrapper, or an error `Result` (construction logic is defined out of line).
    static Result<std::unique_ptr<FileReaderWrapper>> Create(
        std::unique_ptr<::parquet::arrow::FileReader>&& reader);

    /// Seeks to `row_number` (defined out of line).
    Status SeekToRow(uint64_t row_number);

    /// Produces the next record batch (defined out of line).
    Result<std::shared_ptr<arrow::RecordBatch>> Next();

    /// Returns the stored `previous_first_row_` value. That member is initialized to
    /// `std::numeric_limits<uint64_t>::max()` and is only updated by code outside this header.
    /// This inline body always yields a value; it never produces an error.
    Result<uint64_t> GetPreviousBatchFirstRowNumber() const {
        return previous_first_row_;
    }

    /// Returns the stored `next_row_to_read_` value, which is initialized to
    /// `std::numeric_limits<uint64_t>::max()`.
    uint64_t GetNextRowToRead() const {
        return next_row_to_read_;
    }

    /// Returns the row count supplied to the constructor.
    uint64_t GetNumberOfRows() const {
        return num_rows_;
    }

    /// Returns the row-group count reported by the wrapped file reader.
    int32_t GetNumberOfRowGroups() const {
        return file_reader_->num_row_groups();
    }

    /// Returns a non-owning pointer to the wrapped file reader; this object keeps ownership.
    ::parquet::arrow::FileReader* GetFileReader() const {
        return file_reader_.get();
    }

    /// Returns a reference to the stored list of row ranges for all row groups.
    const std::vector<std::pair<uint64_t, uint64_t>>& GetAllRowGroupRanges() const {
        return all_row_group_ranges_;
    }

    /// Asks the wrapped file reader for its Arrow schema.
    /// @return The schema, or the error produced when `FileReader::GetSchema` fails.
    Result<std::shared_ptr<arrow::Schema>> GetSchema() const {
        std::shared_ptr<arrow::Schema> schema;
        PAIMON_RETURN_NOT_OK_FROM_ARROW(file_reader_->GetSchema(&schema));
        return schema;
    }

    /// Closes the batch reader when one exists; otherwise does nothing.
    /// @return `Status::OK()` on success or when there is no batch reader; a failure from the
    ///         batch reader's `Close()` is propagated by `PAIMON_RETURN_NOT_OK_FROM_ARROW`.
    Status Close() {
        if (!batch_reader_) {
            return Status::OK();
        }
        PAIMON_RETURN_NOT_OK_FROM_ARROW(batch_reader_->Close());
        return Status::OK();
    }

    /// Returns the row ranges for the given row-group indices (defined out of line).
    Result<std::vector<std::pair<uint64_t, uint64_t>>> GetRowGroupRanges(
        const std::set<int32_t>& row_group_indices) const;

    /// Prepares reading of the given row groups and columns (defined out of line).
    /// NOTE(review): presumably defers creating the batch reader until it is needed, judging
    /// by the name -- confirm in the implementation file.
    Status PrepareForReadingLazy(const std::set<int32_t>& row_group_indices,
                                 const std::vector<int32_t>& column_indices);

    /// Prepares reading of the given row groups and columns (defined out of line).
    Status PrepareForReading(const std::set<int32_t>& row_group_indices,
                             const std::vector<int32_t>& column_indices);

    /// Selects row groups from `src_row_groups` based on `read_ranges` (defined out of line).
    Result<std::set<int32_t>> FilterRowGroupsByReadRanges(
        const std::vector<std::pair<uint64_t, uint64_t>>& read_ranges,
        const std::vector<int32_t>& src_row_groups) const;

 private:
    /// Private constructor; use `Create()` instead.
    FileReaderWrapper(std::unique_ptr<::parquet::arrow::FileReader>&& file_reader,
                      const std::vector<std::pair<uint64_t, uint64_t>>& all_row_group_ranges,
                      uint64_t num_rows);

    /// Maps read ranges to a set of row-group ids (defined out of line).
    Result<std::set<int32_t>> ReadRangesToRowGroupIds(
        const std::vector<std::pair<uint64_t, uint64_t>>& read_ranges) const;

    /// Looks up the row-group id for `target_range` (defined out of line).
    Result<int32_t> GetRowGroupId(std::pair<uint64_t, uint64_t> target_range) const;

    // Member order is significant: it fixes the initialization order used by the constructor.

    // Wrapped Parquet reader, owned by this object.
    std::unique_ptr<::parquet::arrow::FileReader> file_reader_;
    // Batch reader; may be null (Close() checks for that).
    std::unique_ptr<arrow::RecordBatchReader> batch_reader_;
    // Row ranges of all row groups, as passed to the constructor.
    std::vector<std::pair<uint64_t, uint64_t>> all_row_group_ranges_;
    // Current read selection; presumably filled by the PrepareForReading* methods -- confirm.
    std::set<int32_t> target_row_group_indices_;
    std::vector<std::pair<uint64_t, uint64_t>> target_row_groups_;
    std::vector<int32_t> target_column_indices_;
    // Row count passed to the constructor; immutable afterwards.
    const uint64_t num_rows_;
    // Position bookkeeping; both start at the uint64_t maximum.
    uint64_t next_row_to_read_{std::numeric_limits<uint64_t>::max()};
    uint64_t previous_first_row_{std::numeric_limits<uint64_t>::max()};
    uint64_t current_row_group_idx_{0};
    bool reader_initialized_{false};
};
} // namespace paimon::parquet