blob: 5bd23f710c19e7e8843952d26632403001cd428e [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Adapted from Apache ORC
// https://github.com/apache/orc/blob/main/c%2B%2B/src/io/Cache.hh
#pragma once
#include <cstdint>
#include <memory>
#include <vector>
#include "paimon/fs/file_system.h"
#include "paimon/memory/bytes.h"
#include "paimon/memory/memory_pool.h"
#include "paimon/result.h"
#include "paimon/status.h"
#include "paimon/visibility.h"
namespace paimon {
/// Switch that selects in which query scenarios cache prefetching is enabled.
///
/// Prefetching can be turned off selectively for queries that carry predicates
/// and/or make use of a bitmap index.
enum class PAIMON_EXPORT PrefetchCacheMode {
    /// Enable cache in all scenarios.
    ALWAYS = 1,
    /// Disable cache when the query has predicates.
    EXCLUDE_PREDICATE = 2,
    /// Disable cache when a bitmap index is used.
    EXCLUDE_BITMAP = 3,
    /// Disable cache if the query has predicates or uses a bitmap index.
    EXCLUDE_BITMAP_OR_PREDICATE = 4,
    /// Always disable cache.
    NEVER = 5,
};
/// Tunable limits that drive the behavior of ReadAheadCache.
///
/// The four limits below bound how much data is cached, how large a single
/// cached range may be, how big a gap between ranges may be merged, and how
/// far ahead data is pre-buffered. Together they trade memory usage against
/// I/O efficiency and latency hiding.
class PAIMON_EXPORT CacheConfig {
public:
    /// Creates a configuration; the initial values are chosen by the
    /// out-of-line definition of this constructor.
    CacheConfig();

    /// Creates a configuration from explicit limits.
    /// @param buffer_size_limit Maximum total size (in bytes) of cached data.
    /// @param range_size_limit Maximum size (in bytes) of a single cached range.
    /// @param hole_size_limit Maximum gap (in bytes) between adjacent ranges that may be merged.
    /// @param pre_buffer_limit Maximum size to pre-buffer ahead of the current read position.
    CacheConfig(uint64_t buffer_size_limit, uint64_t range_size_limit, uint64_t hole_size_limit,
                uint64_t pre_buffer_limit);

    // ---- Total cache budget ------------------------------------------------

    /// @return The maximum total size (in bytes) of cached data.
    uint64_t GetBufferSizeLimit() const { return buffer_size_limit_; }
    /// Updates the maximum total size (in bytes) of cached data.
    void SetBufferSizeLimit(uint64_t buffer_size_limit) { buffer_size_limit_ = buffer_size_limit; }

    // ---- Per-range size ----------------------------------------------------

    /// @return The maximum allowed size (in bytes) of a single cached range.
    uint64_t GetRangeSizeLimit() const { return range_size_limit_; }
    /// Updates the maximum allowed size (in bytes) of a single cached range.
    void SetRangeSizeLimit(uint64_t range_size_limit) { range_size_limit_ = range_size_limit; }

    // ---- Mergeable hole size -----------------------------------------------

    /// @return The largest gap (in bytes) between adjacent ranges that is still considered
    /// mergeable.
    uint64_t GetHoleSizeLimit() const { return hole_size_limit_; }
    /// Updates the largest gap (in bytes) between adjacent ranges that is still considered
    /// mergeable.
    void SetHoleSizeLimit(uint64_t hole_size_limit) { hole_size_limit_ = hole_size_limit; }

    // ---- Pre-buffering -----------------------------------------------------

    /// @return The maximum size to pre-buffer ahead of the current read position.
    uint64_t GetPreBufferLimit() const { return pre_buffer_limit_; }
    /// Updates the maximum size to pre-buffer ahead of the current read position.
    void SetPreBufferLimit(uint64_t pre_buffer_limit) { pre_buffer_limit_ = pre_buffer_limit; }

private:
    uint64_t buffer_size_limit_;
    uint64_t range_size_limit_;
    uint64_t hole_size_limit_;
    uint64_t pre_buffer_limit_;
};
/// A byte range described by a start offset and a length, i.e. the half-open
/// interval [offset, offset + length).
struct PAIMON_EXPORT ByteRange {
    /// Start of the range.
    uint64_t offset = 0;
    /// Number of bytes covered by the range.
    uint64_t length = 0;

    /// Creates the empty range {0, 0}. The fields are zero-initialized so a
    /// default-constructed range never carries indeterminate values.
    ByteRange() = default;

    /// Creates the range starting at @p offset and spanning @p length bytes.
    ByteRange(uint64_t offset, uint64_t length) : offset(offset), length(length) {}

    /// Two ranges are equal when both offset and length match.
    friend bool operator==(const ByteRange& left, const ByteRange& right) {
        return (left.offset == right.offset && left.length == right.length);
    }

    friend bool operator!=(const ByteRange& left, const ByteRange& right) {
        return !(left == right);
    }

    /// Checks whether @p other lies entirely inside this range.
    /// @param other The other byte range to check.
    /// @return true if this range contains the other range
    bool Contains(const ByteRange& other) const {
        if (other.offset < offset) {
            return false;
        }
        // Compare via differences instead of `offset + length`: the sums can wrap
        // around in uint64_t and make an out-of-bounds range look contained.
        const uint64_t lead = other.offset - offset;
        return lead <= length && other.length <= length - lead;
    }
};
/// A slice of bytes: a shared buffer plus an offset and a length.
/// A default-constructed slice has a null buffer and zero offset/length.
struct PAIMON_EXPORT ByteSlice {
    /// Buffer backing the slice; null when the slice holds no data.
    std::shared_ptr<Bytes> buffer{};
    /// Offset of the slice.
    /// NOTE(review): presumably relative to the start of `buffer` — confirm against the producer.
    uint64_t offset{0};
    /// Length of the slice in bytes.
    uint64_t length{0};
};
/// A read cache designed to hide IO latencies when reading.
/// Prefetching strategy: When a range is read, the cache will prefetch up to
/// `pre_buffer_range_count` additional adjacent ranges ahead of the requested offset. This helps
/// hide I/O latency for sequential access. Example: If you read range [0, 100), and
/// pre_buffer_range_count=2, the next two configured ranges will also be prefetched.
///
/// Eviction policy: The cache uses a simple FIFO eviction policy based on total cached byte size.
/// When adding new ranges would exceed `buffer_size_limit`, the oldest cached ranges are evicted
/// first until there is enough space for the new data.
class PAIMON_EXPORT ReadAheadCache {
public:
/// Construct a read cache with given options
ReadAheadCache(const std::shared_ptr<InputStream>& stream, const CacheConfig& config,
const std::shared_ptr<MemoryPool>& memory_pool);
~ReadAheadCache();
/// Initialize the cache with given byte ranges to be cached.
/// @param ranges The byte ranges to be cached.
/// @return Status of the operation.
/// @note This method must be called before any Read() calls. Ranges will be coalesced based
/// on the cache configuration.
Status Init(std::vector<ByteRange>&& ranges);
/// Read a range previously provided to Init().
/// @param range The byte range to read.
/// @return The byte slice containing the requested data. If the data is not yet cached
/// (cache miss), the returned `ByteSlice` will have a null buffer (`buffer == nullptr`)
Result<ByteSlice> Read(const ByteRange& range);
/// Reset the cache to its initial state, clearing all cached data and configuration.
///
/// This method waits for all ongoing asynchronous read operations to complete,
/// clears all cached entries, and resets the internal state so that Init() can be called again.
/// After calling Reset, the cache can be safely re-initialized with new ranges.
void Reset();
private:
class Impl;
std::unique_ptr<Impl> impl_;
};
} // namespace paimon