blob: acb7bd0e8ee0a1d15daf7c78c12a47b8865c877c [file] [log] [blame]
/*
* Copyright 2026-present Alibaba Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <memory>
#include <utility>
#include <vector>
#include "paimon/reader/file_batch_reader.h"
namespace paimon {
/// The prefetch file batch reader extends the basic FileBatchReader interface for prefetch read,
/// if a format implementation inherits from this class, it will automatically support the C++
/// Paimon prefetch capability and integrate with the Paimon prefetch framework.
class PAIMON_EXPORT PrefetchFileBatchReader : public FileBatchReader {
public:
/// Seeks to a specific row in the file.
/// @param row_number The row number to seek to.
/// @return The status of the operation.
virtual Status SeekToRow(uint64_t row_number) = 0;
/// Retrieves the row number of the next row to be read.
/// This method indicates the current read position within the file.
/// @return The row number of the next row to read.
virtual uint64_t GetNextRowToRead() const = 0;
/// Generates a list of row ranges to be read in batches.
/// Each range specifies the start and end row numbers for a batch,
/// allowing for efficient batch processing.
///
/// The underlying format layer (e.g., parquet) is responsible for determining
/// the most effective way to split the data. This could be by row groups, stripes,
/// or other internal data structures. The key principle is to split the data
/// into contiguous, seekable ranges to minimize read amplification.
///
/// For example:
/// - A parquet format could split by RowGroup directly, ensuring each range aligns
/// with a single RowGroup.
///
/// The smallest splittable unit must be seekable to its start position, and the
/// splitting strategy should aim to avoid read amplification.
///
/// @param need_prefetch A pointer to a boolean. The format layer sets this to indicate whether
/// prefetching is beneficial for the current scenario, to avoid performance regression in
/// certain cases.
/// @return A vector of pairs, where each pair represents a range with a start and end row
/// number.
virtual Result<std::vector<std::pair<uint64_t, uint64_t>>> GenReadRanges(
bool* need_prefetch) const = 0;
/// Sets the specific row ranges as a hint to be read from format file.
///
/// If the specific file format does not support explicit range-based reads, implementations may
/// gracefully ignore this hint and provide an empty (no-op) implementation.
///
/// @param read_ranges A vector of pairs, where each pair defines a half-open interval
/// `[start_row, end_row)`. The `start_row` is inclusive, and the `end_row` is exclusive.
virtual Status SetReadRanges(const std::vector<std::pair<uint64_t, uint64_t>>& read_ranges) = 0;
/// Returns a list of file offset/length ranges that should be prefetched for the current read
/// scenario.
///
/// This method should analyze the columns selected by the user and return the minimal set of
/// physical file ranges (offset, length) that need to be read, avoiding unnecessary IO
/// amplification. For example, if only a subset of columns is requested, the implementation
/// should only return the byte ranges corresponding to those columns, rather than the entire
/// row group or block.
///
/// This enables the cache to prefetch only the required data, reducing disk and network load
/// and improving performance for columnar formats and selective queries.
///
/// By default, returns an empty list (no prefetching). Format-specific implementations should
/// override this method to provide accurate offset/length hints for efficient IO.
/// @return A vector of pairs, where each pair contains the file offset and length to be
/// prefetched.
virtual Result<std::vector<std::pair<uint64_t, uint64_t>>> PreBufferRange() {
return std::vector<std::pair<uint64_t, uint64_t>>{};
}
};
} // namespace paimon