| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| #include "storage/segment/condition_cache.h" |
| |
| #include <gtest/gtest.h> |
| |
| #include <memory> |
| #include <string> |
| #include <vector> |
| |
| #include "common/status.h" |
| #include "format/generic_reader.h" |
| #include "format/orc/vorc_reader.h" |
| #include "format/parquet/vparquet_reader.h" |
| #include "format/table/transactional_hive_common.h" |
| |
| namespace doris::vectorized { |
| |
// File-wide shorthand for the condition-cache granule size (2048 rows per granule).
constexpr int GS = ConditionCacheContext::GRANULE_SIZE; // 2048
| |
// Fixture for RowGroupReader::filter_ranges_by_cache tests; holds no shared state.
class FilterRangesByCacheTest : public testing::Test {};
| |
| // Single contiguous range, first_row = 0, alternating true/false granules. |
| TEST_F(FilterRangesByCacheTest, SingleRangeAlternatingGranules) { |
| // 4 full granules = 8192 rows, range [0, 8192) |
| RowRanges ranges; |
| ranges.add(RowRange(0, 4 * GS)); |
| // granule 0=true, 1=false, 2=true, 3=false |
| std::vector<bool> cache = {true, false, true, false}; |
| |
| auto result = RowGroupReader::filter_ranges_by_cache(ranges, cache, /*first_row=*/0); |
| |
| // Expect granules 0 and 2 kept: [0, 2048), [4096, 6144) |
| EXPECT_EQ(result.range_size(), 2); |
| EXPECT_EQ(result.count(), 2 * GS); |
| EXPECT_EQ(result.get_range_from(0), 0); |
| EXPECT_EQ(result.get_range_to(0), GS); |
| EXPECT_EQ(result.get_range_from(1), 2 * GS); |
| EXPECT_EQ(result.get_range_to(1), 3 * GS); |
| } |
| |
| // All granules true -> ranges unchanged. |
| TEST_F(FilterRangesByCacheTest, AllGranulesTrue) { |
| RowRanges ranges; |
| ranges.add(RowRange(0, 3 * GS)); |
| std::vector<bool> cache = {true, true, true}; |
| |
| auto result = RowGroupReader::filter_ranges_by_cache(ranges, cache, 0); |
| |
| EXPECT_EQ(result.range_size(), 1); |
| EXPECT_EQ(result.count(), 3 * GS); |
| EXPECT_EQ(result.get_range_from(0), 0); |
| EXPECT_EQ(result.get_range_to(0), 3 * GS); |
| } |
| |
| // All granules false -> empty ranges. |
| TEST_F(FilterRangesByCacheTest, AllGranulesFalse) { |
| RowRanges ranges; |
| ranges.add(RowRange(0, 3 * GS)); |
| std::vector<bool> cache = {false, false, false}; |
| |
| auto result = RowGroupReader::filter_ranges_by_cache(ranges, cache, 0); |
| |
| EXPECT_EQ(result.range_size(), 0); |
| EXPECT_EQ(result.count(), 0); |
| } |
| |
| // first_row offset shifts granule boundaries. |
| TEST_F(FilterRangesByCacheTest, NonZeroFirstRow) { |
| // first_row = 1024, range [0, 4096) -> 4096 rows |
| // Sequential positions 0..4095, global_seq = 1024..5119 |
| // granule 0 (global 0..2047): seq 0..1023 -> range [0, 1024) |
| // granule 1 (global 2048..4095): seq 1024..3071 -> range [1024, 3072) |
| // granule 2 (global 4096..6143): seq 3072..4095 -> range [3072, 4096) |
| RowRanges ranges; |
| ranges.add(RowRange(0, 4096)); |
| // granule 0=false, 1=true, 2=false |
| std::vector<bool> cache = {false, true, false}; |
| |
| auto result = RowGroupReader::filter_ranges_by_cache(ranges, cache, /*first_row=*/1024); |
| |
| // Only granule 1 kept: rows with global_seq in [2048, 4096) -> range [1024, 3072) |
| EXPECT_EQ(result.range_size(), 1); |
| EXPECT_EQ(result.count(), 2048); |
| EXPECT_EQ(result.get_range_from(0), 1024); |
| EXPECT_EQ(result.get_range_to(0), 3072); |
| } |
| |
| // Range that doesn't start at 0 (from page index filtering). |
| TEST_F(FilterRangesByCacheTest, RangeNotStartingAtZero) { |
| // Range [2048, 6144) = 4096 rows, first_row = 0 |
| // Granule 0 (false): covers rg-relative [0, 2048) — no overlap with [2048, 6144) |
| // Granule 1 (true): covers rg-relative [2048, 4096) — kept |
| // Beyond cache: [4096, 6144) kept conservatively |
| RowRanges ranges; |
| ranges.add(RowRange(2048, 6144)); |
| std::vector<bool> cache = {false, true}; |
| |
| auto result = RowGroupReader::filter_ranges_by_cache(ranges, cache, 0); |
| |
| // False granule [0, 2048) doesn't overlap [2048, 6144), so nothing is filtered |
| EXPECT_EQ(result.range_size(), 1); |
| EXPECT_EQ(result.count(), 4096); |
| EXPECT_EQ(result.get_range_from(0), 2048); |
| EXPECT_EQ(result.get_range_to(0), 6144); |
| } |
| |
| // Multiple non-contiguous ranges (from page index filtering) with a single-entry cache. |
| TEST_F(FilterRangesByCacheTest, NonContiguousRangesGranuleSpansGap) { |
| // Ranges: [0, 1000), [5000, 6000) = 2000 total rows, first_row = 0 |
| // Granule 0 covers rg-relative [0, 2048) — only overlaps [0, 1000) |
| // [5000, 6000) is in granule 2 ([4096, 6144)) which is beyond cache -> kept conservatively |
| RowRanges ranges; |
| ranges.add(RowRange(0, 1000)); |
| ranges.add(RowRange(5000, 6000)); |
| |
| // Granule 0 = false -> discard [0, 1000); [5000, 6000) kept (beyond cache) |
| std::vector<bool> cache = {false}; |
| auto result = RowGroupReader::filter_ranges_by_cache(ranges, cache, 0); |
| EXPECT_EQ(result.count(), 1000); |
| EXPECT_EQ(result.range_size(), 1); |
| EXPECT_EQ(result.get_range_from(0), 5000); |
| EXPECT_EQ(result.get_range_to(0), 6000); |
| |
| // Granule 0 = true -> keep all |
| std::vector<bool> cache2 = {true}; |
| result = RowGroupReader::filter_ranges_by_cache(ranges, cache2, 0); |
| EXPECT_EQ(result.count(), 2000); |
| EXPECT_EQ(result.range_size(), 2); |
| } |
| |
| // Non-contiguous ranges where granule boundaries fall within ranges. |
| TEST_F(FilterRangesByCacheTest, NonContiguousRangesMultipleGranules) { |
| // Ranges: [0, 3000), [8000, 11000) = 6000 total rows, first_row = 0 |
| // Granule 0 (false): rg-relative [0, 2048) — overlaps [0, 2048) of first range |
| // Granule 1 (true): rg-relative [2048, 4096) — overlaps [2048, 3000) of first range |
| // Granule 2 (false): rg-relative [4096, 6144) — no overlap with either range |
| // [8000, 11000) is in granules 3-5, all beyond cache -> kept conservatively |
| RowRanges ranges; |
| ranges.add(RowRange(0, 3000)); |
| ranges.add(RowRange(8000, 11000)); |
| |
| // granule 0=false, 1=true, 2=false |
| std::vector<bool> cache = {false, true, false}; |
| auto result = RowGroupReader::filter_ranges_by_cache(ranges, cache, 0); |
| |
| // Granule 0 removes [0, 2048) from [0, 3000) -> [2048, 3000) kept |
| // Granule 2 [4096, 6144) doesn't overlap [8000, 11000) -> [8000, 11000) kept |
| EXPECT_EQ(result.range_size(), 2); |
| EXPECT_EQ(result.get_range_from(0), 2048); |
| EXPECT_EQ(result.get_range_to(0), 3000); |
| EXPECT_EQ(result.get_range_from(1), 8000); |
| EXPECT_EQ(result.get_range_to(1), 11000); |
| EXPECT_EQ(result.count(), (3000 - 2048) + (11000 - 8000)); // 952 + 3000 = 3952 |
| } |
| |
| // Cache smaller than the actual row range -> out-of-range granules kept conservatively. |
| TEST_F(FilterRangesByCacheTest, CacheSmallerThanRange) { |
| // 4 granules of rows, cache only covers 2 |
| RowRanges ranges; |
| ranges.add(RowRange(0, 4 * GS)); |
| std::vector<bool> cache = {false, true}; // only 2 entries |
| |
| auto result = RowGroupReader::filter_ranges_by_cache(ranges, cache, 0); |
| |
| // Granule 0 = false -> skip; granule 1 = true -> keep |
| // Granule 2, 3 beyond cache -> kept conservatively |
| EXPECT_EQ(result.range_size(), 1); // [GS, 4*GS) merged since granules 1,2,3 all kept |
| EXPECT_EQ(result.count(), 3 * GS); |
| EXPECT_EQ(result.get_range_from(0), GS); |
| EXPECT_EQ(result.get_range_to(0), 4 * GS); |
| } |
| |
| // Partial granule at the end of a range. |
| TEST_F(FilterRangesByCacheTest, PartialGranuleAtEnd) { |
| // Range [0, 3000) = 3000 rows, first_row = 0 |
| // Granule 0: seq [0, 2048) -> [0, 2048) |
| // Granule 1: seq [2048, 3000) -> [2048, 3000) (partial, only 952 rows) |
| RowRanges ranges; |
| ranges.add(RowRange(0, 3000)); |
| std::vector<bool> cache = {true, false}; |
| |
| auto result = RowGroupReader::filter_ranges_by_cache(ranges, cache, 0); |
| |
| // Only granule 0 kept |
| EXPECT_EQ(result.range_size(), 1); |
| EXPECT_EQ(result.count(), GS); |
| EXPECT_EQ(result.get_range_from(0), 0); |
| EXPECT_EQ(result.get_range_to(0), GS); |
| } |
| |
| // Empty ranges input. |
| TEST_F(FilterRangesByCacheTest, EmptyRanges) { |
| RowRanges ranges; |
| std::vector<bool> cache = {true, false, true}; |
| |
| auto result = RowGroupReader::filter_ranges_by_cache(ranges, cache, 0); |
| |
| EXPECT_EQ(result.range_size(), 0); |
| EXPECT_EQ(result.count(), 0); |
| } |
| |
| // Large first_row offset (simulating second row group in file). |
| TEST_F(FilterRangesByCacheTest, LargeFirstRowOffset) { |
| // first_row = 100000 (second row group starts here) |
| // Range [0, 2048) = one full granule |
| // global_seq = 100000 + 0 = 100000, granule = 100000/2048 = 48 |
| int64_t first_row = 100000; |
| RowRanges ranges; |
| ranges.add(RowRange(0, GS)); |
| std::vector<bool> cache(50, false); // 50 granules, all false |
| cache[48] = true; // granule 48 = true |
| |
| auto result = RowGroupReader::filter_ranges_by_cache(ranges, cache, first_row); |
| |
| // global_seq for first chunk: 100000, granule 48 = true -> keep |
| // But first chunk may not be the full range if 100000 is not aligned... |
| // 100000 / 2048 = 48, 49*2048 = 100352, so rows_to_granule_end = 100352 - 100000 = 352 |
| // chunk 1: [0, 352) -> granule 48 = true -> keep |
| // chunk 2: [352, 2048) -> global_seq = 100352 -> granule 49 = false -> discard |
| EXPECT_EQ(result.count(), 352); |
| EXPECT_EQ(result.range_size(), 1); |
| EXPECT_EQ(result.get_range_from(0), 0); |
| EXPECT_EQ(result.get_range_to(0), 352); |
| } |
| |
| // ============================================================ |
| // Tests for cache vector pre-allocation with +1 safety margin |
| // when first_row is not aligned to granule boundary. |
| // ============================================================ |
| |
// Fixture for the "+1 pre-allocation" coverage tests; holds no shared state.
class CachePreAllocTest : public testing::Test {};
| |
| // When first_row is not aligned to granule boundary, pre-allocating |
| // ceil(total_rows / GS) + 1 guarantees coverage of all granules. |
| TEST_F(CachePreAllocTest, PlusOneCoversUnalignedFirstRow) { |
| // Simulates: Scanner reads RG2+RG3 of a file: |
| // RG2: 5000 rows (row 20000~24999), RG3: 5000 rows (row 25000~29999) |
| // first_assigned_row = 20000, total_rows = 10000 |
| // |
| // Pre-allocation: ceil(10000 / 2048) + 1 = 5 + 1 = 6 |
| // base_granule = 20000 / 2048 = 9 |
| // last_granule = ceil(30000 / 2048) = 15 |
| // needed = 15 - 9 = 6 <= 6 (pre-allocated) → sufficient! |
| constexpr int64_t GS = ConditionCacheContext::GRANULE_SIZE; // 2048 |
| int64_t first_assigned_row = 20000; |
| int64_t total_rows = 10000; |
| |
| // Step 1: simulate pre-allocation with +1 (as FileScanner now does) |
| int64_t pre_alloc = (total_rows + GS - 1) / GS + 1; // ceil(10000/2048) + 1 = 6 |
| EXPECT_EQ(pre_alloc, 6); |
| std::vector<bool> cache(pre_alloc, false); |
| |
| // Step 2: compute base_granule (as set_condition_cache_context does) |
| int64_t base_granule = first_assigned_row / GS; // 9 |
| EXPECT_EQ(base_granule, 9); |
| |
| // Step 3: verify all granules are coverable |
| int64_t last_granule = (first_assigned_row + total_rows + GS - 1) / GS; // 15 |
| int64_t needed = last_granule - base_granule; // 6 |
| EXPECT_LE(static_cast<size_t>(needed), cache.size()); |
| |
| // Step 4: mark all granules as true and verify HIT keeps all rows |
| cache.assign(cache.size(), true); |
| RowRanges ranges; |
| ranges.add(RowRange(0, total_rows)); |
| auto result = |
| RowGroupReader::filter_ranges_by_cache(ranges, cache, first_assigned_row, base_granule); |
| EXPECT_EQ(result.count(), total_rows); |
| } |
| |
| // Verify the last granule (cache_idx = needed-1) is reachable after +1 allocation. |
| TEST_F(CachePreAllocTest, LastGranuleIsReachable) { |
| constexpr int64_t GS = ConditionCacheContext::GRANULE_SIZE; |
| int64_t first_assigned_row = 20000; |
| int64_t total_rows = 10000; |
| |
| int64_t pre_alloc = (total_rows + GS - 1) / GS + 1; |
| std::vector<bool> cache(pre_alloc, false); |
| int64_t base_granule = first_assigned_row / GS; |
| |
| // Simulate marking from _mark_condition_cache_granules for a row |
| // in the last granule. E.g., global row 29000 |
| int64_t global_row = 29000; |
| size_t granule = global_row / GS; // 29000 / 2048 = 14 |
| size_t cache_idx = granule - base_granule; // 14 - 9 = 5 |
| |
| // Without +1, cache.size() would be 5 and cache_idx=5 would be out of bounds |
| // With +1, cache.size() is 6 and cache_idx=5 is valid |
| EXPECT_LT(cache_idx, cache.size()); |
| cache[cache_idx] = true; |
| EXPECT_TRUE(cache[cache_idx]); |
| } |
| |
| // Verify +1 is sufficient for various misaligned first_row values. |
| TEST_F(CachePreAllocTest, PlusOneSufficientForVariousMisalignments) { |
| constexpr int64_t GS = ConditionCacheContext::GRANULE_SIZE; |
| |
| struct TestCase { |
| int64_t first_row; |
| int64_t total_rows; |
| }; |
| std::vector<TestCase> cases = { |
| {.first_row = 20000, .total_rows = 10000}, // original example |
| {.first_row = 1024, .total_rows = 4096}, // small offset |
| {.first_row = 3000, .total_rows = 8000}, // another misalignment |
| {.first_row = 2047, .total_rows = 2049}, // just before boundary, spans 3 granules |
| {.first_row = 0, .total_rows = 10000}, // aligned (+1 is wasted but harmless) |
| {.first_row = 1, .total_rows = 2048}, // off-by-one at start |
| {.first_row = 100000, .total_rows = 10000}, // large offset (second row group) |
| }; |
| |
| for (auto& tc : cases) { |
| int64_t last_row = tc.first_row + tc.total_rows; |
| int64_t pre_alloc = (tc.total_rows + GS - 1) / GS + 1; |
| int64_t base_granule = tc.first_row / GS; |
| int64_t last_granule = (last_row + GS - 1) / GS; |
| int64_t needed = last_granule - base_granule; |
| |
| EXPECT_LE(needed, pre_alloc) |
| << "first_row=" << tc.first_row << " total_rows=" << tc.total_rows; |
| |
| // Verify HIT correctness |
| std::vector<bool> cache(pre_alloc, true); |
| RowRanges ranges; |
| ranges.add(RowRange(0, tc.total_rows)); |
| auto result = |
| RowGroupReader::filter_ranges_by_cache(ranges, cache, tc.first_row, base_granule); |
| EXPECT_EQ(result.count(), tc.total_rows) |
| << "first_row=" << tc.first_row << " total_rows=" << tc.total_rows; |
| } |
| } |
| |
| // Extra +1 element beyond actual data range doesn't cause incorrect filtering. |
| TEST_F(CachePreAllocTest, ExtraElementDoesNotCauseIncorrectFiltering) { |
| // Aligned case: first_row=0, total_rows=4096 (exactly 2 granules) |
| // Pre-alloc = 2 + 1 = 3. The 3rd element (cache[2]) is beyond data range. |
| std::vector<bool> cache = {true, true, false}; // extra false at end |
| |
| RowRanges ranges; |
| ranges.add(RowRange(0, 4096)); |
| auto result = RowGroupReader::filter_ranges_by_cache(ranges, cache, 0, 0); |
| |
| // The extra false granule covers rg-relative [4096, 6144) which doesn't |
| // overlap [0, 4096), so all rows should be kept. |
| EXPECT_EQ(result.count(), 4096); |
| } |
| |
| // ============================================================ |
| // Mock / Testable reader classes |
| // ============================================================ |
| |
| // GenericReader whose has_delete_operations() result is configurable, |
| // used to test condition cache skip logic for various delete scenarios. |
| class MockFileFormatReader : public GenericReader { |
| public: |
| bool mock_has_deletes = false; |
| Status _do_get_next_block(Block*, size_t*, bool*) override { return Status::OK(); } |
| bool has_delete_operations() const override { return mock_has_deletes; } |
| }; |
| // ============================================================ |
| // These tests reproduce the logic from |
| // FileScanner::_init_reader_condition_cache() (file_scanner.cpp) |
| // using real ConditionCache + real reader instances. |
| // ============================================================ |
| |
// Fixture that owns a real ConditionCache instance and re-implements the
// scanner's cache-initialization decision logic so the skip conditions
// (zero digest, delete operations, missing cache) can be tested in isolation.
class ConditionCacheDeleteOpsTest : public testing::Test {
protected:
    void SetUp() override {
        // Creates a fresh global cache per test; arguments are presumably
        // capacity in bytes (10 MB) and shard count (4) — TODO confirm against
        // ConditionCache::create_global_cache's declaration.
        _cache.reset(segment_v2::ConditionCache::create_global_cache(10 * 1024 * 1024, 4));
    }

    void TearDown() override { _cache.reset(); }

    // Reproduces the exact logic from FileScanner::_init_reader_condition_cache().
    // Returns whether the condition cache context was created (i.e. cache was not skipped).
    // On skip, all three out-params are left as {false, nullptr, nullptr}.
    void simulate_init_condition_cache(GenericReader* reader, uint64_t digest,
                                       const std::string& path,
                                       /*out*/ bool& cache_hit,
                                       /*out*/ std::shared_ptr<std::vector<bool>>& cache,
                                       /*out*/ std::shared_ptr<ConditionCacheContext>& ctx) {
        cache_hit = false;
        cache = nullptr;
        ctx = nullptr;

        // Mirrors: if (_condition_cache_digest == 0 || _is_load) return;
        if (digest == 0) {
            return;
        }

        // Mirrors: if (_cur_reader && _cur_reader->has_delete_operations()) return;
        // Deletes invalidate cached filter results, so the cache must be bypassed.
        if (reader && reader->has_delete_operations()) {
            return;
        }

        auto* cc = _cache.get();
        if (cc == nullptr) {
            return;
        }

        // Key fields beyond path/digest use sentinel values (-1, 0, 0, -1);
        // presumably unused identifiers in this external-table scenario —
        // TODO confirm against ExternalCacheKey's constructor.
        segment_v2::ConditionCache::ExternalCacheKey key(path, -1, 0, digest, 0, -1);

        segment_v2::ConditionCacheHandle handle;
        cache_hit = cc->lookup(key, &handle);
        if (cache_hit) {
            // HIT: reuse the cached per-granule filter result.
            cache = handle.get_filter_result();
        } else {
            // MISS: start with an empty vector to be filled during the scan.
            cache = std::make_shared<std::vector<bool>>();
        }

        ctx = std::make_shared<ConditionCacheContext>();
        ctx->is_hit = cache_hit;
        ctx->filter_result = cache;
    }

    // Inserts a pre-populated entry into the cache for the given path/digest.
    // The 3-granule filter {true, false, true} lets tests recognize a HIT by size.
    void prepopulate_cache(const std::string& path, uint64_t digest) {
        segment_v2::ConditionCache::ExternalCacheKey key(path, -1, 0, digest, 0, -1);
        auto filter = std::make_shared<std::vector<bool>>(std::vector<bool> {true, false, true});
        _cache->insert(key, filter);
    }

    // Per-test cache instance; reset between tests so entries never leak across.
    std::unique_ptr<segment_v2::ConditionCache> _cache;
};
| |
| // -- ParquetReader: no deletes -> cache populated (MISS) -- |
| TEST_F(ConditionCacheDeleteOpsTest, ParquetNoDeletes_CachePopulated) { |
| TFileScanRangeParams params; |
| TFileRangeDesc range; |
| auto reader = ParquetReader::create_unique(params, range, nullptr, nullptr); |
| |
| bool hit = false; |
| std::shared_ptr<std::vector<bool>> cache; |
| std::shared_ptr<ConditionCacheContext> ctx; |
| simulate_init_condition_cache(reader.get(), 42, "/data/file.parquet", hit, cache, ctx); |
| |
| EXPECT_FALSE(hit); |
| EXPECT_NE(ctx, nullptr); |
| EXPECT_NE(cache, nullptr); |
| EXPECT_FALSE(ctx->is_hit); |
| } |
| |
| // -- ParquetReader: with position deletes -> cache skipped -- |
| TEST_F(ConditionCacheDeleteOpsTest, ParquetWithPositionDeletes_CacheSkipped) { |
| TFileScanRangeParams params; |
| TFileRangeDesc range; |
| auto reader = ParquetReader::create_unique(params, range, nullptr, nullptr); |
| std::vector<int64_t> deletes = {1, 5, 10}; |
| reader->set_delete_rows(&deletes); |
| |
| bool hit = false; |
| std::shared_ptr<std::vector<bool>> cache; |
| std::shared_ptr<ConditionCacheContext> ctx; |
| simulate_init_condition_cache(reader.get(), 42, "/data/file.parquet", hit, cache, ctx); |
| |
| EXPECT_EQ(ctx, nullptr); |
| EXPECT_EQ(cache, nullptr); |
| } |
| |
| // -- OrcReader: no deletes -> cache populated (MISS) -- |
| TEST_F(ConditionCacheDeleteOpsTest, OrcNoDeletes_CachePopulated) { |
| TFileScanRangeParams params; |
| TFileRangeDesc range; |
| auto reader = OrcReader::create_unique(params, range, 4064, "", nullptr); |
| |
| bool hit = false; |
| std::shared_ptr<std::vector<bool>> cache; |
| std::shared_ptr<ConditionCacheContext> ctx; |
| simulate_init_condition_cache(reader.get(), 99, "/data/file.orc", hit, cache, ctx); |
| |
| EXPECT_FALSE(hit); |
| EXPECT_NE(ctx, nullptr); |
| EXPECT_NE(cache, nullptr); |
| EXPECT_FALSE(ctx->is_hit); |
| } |
| |
| // -- OrcReader: with position deletes -> cache skipped -- |
| TEST_F(ConditionCacheDeleteOpsTest, OrcWithPositionDeletes_CacheSkipped) { |
| TFileScanRangeParams params; |
| TFileRangeDesc range; |
| auto reader = OrcReader::create_unique(params, range, 4064, "", nullptr); |
| std::vector<int64_t> pos_deletes = {0, 3, 7}; |
| reader->set_position_delete_rowids(&pos_deletes); |
| |
| bool hit = false; |
| std::shared_ptr<std::vector<bool>> cache; |
| std::shared_ptr<ConditionCacheContext> ctx; |
| simulate_init_condition_cache(reader.get(), 99, "/data/file.orc", hit, cache, ctx); |
| |
| EXPECT_EQ(ctx, nullptr); |
| EXPECT_EQ(cache, nullptr); |
| } |
| |
| // -- OrcReader: with ACID deletes -> cache skipped -- |
| TEST_F(ConditionCacheDeleteOpsTest, OrcWithAcidDeletes_CacheSkipped) { |
| TFileScanRangeParams params; |
| TFileRangeDesc range; |
| auto reader = OrcReader::create_unique(params, range, 4064, "", nullptr); |
| AcidRowIDSet acid_deletes; |
| acid_deletes.insert({1, 0, 5}); |
| reader->set_delete_rows(&acid_deletes); |
| |
| bool hit = false; |
| std::shared_ptr<std::vector<bool>> cache; |
| std::shared_ptr<ConditionCacheContext> ctx; |
| simulate_init_condition_cache(reader.get(), 99, "/data/file.orc", hit, cache, ctx); |
| |
| EXPECT_EQ(ctx, nullptr); |
| EXPECT_EQ(cache, nullptr); |
| } |
| |
| // -- MockReader: with deletes (simulating Iceberg/Hive with inner deletes) -> cache skipped -- |
| // In the new architecture, Iceberg readers inherit ParquetReader/OrcReader directly (CRTP), |
| // so has_delete_operations() is resolved through the base reader. We use MockFileFormatReader |
| // to test the generic condition cache skip logic. |
| TEST_F(ConditionCacheDeleteOpsTest, ReaderWithDeletes_CacheSkipped) { |
| auto reader = std::make_unique<MockFileFormatReader>(); |
| reader->mock_has_deletes = true; |
| |
| bool hit = false; |
| std::shared_ptr<std::vector<bool>> cache; |
| std::shared_ptr<ConditionCacheContext> ctx; |
| simulate_init_condition_cache(reader.get(), 42, "/data/iceberg.parquet", hit, cache, ctx); |
| |
| EXPECT_EQ(ctx, nullptr); |
| EXPECT_EQ(cache, nullptr); |
| } |
| |
| // -- MockReader: no deletes -> cache populated -- |
| TEST_F(ConditionCacheDeleteOpsTest, ReaderWithoutDeletes_CachePopulated) { |
| auto reader = std::make_unique<MockFileFormatReader>(); |
| reader->mock_has_deletes = false; |
| |
| bool hit = false; |
| std::shared_ptr<std::vector<bool>> cache; |
| std::shared_ptr<ConditionCacheContext> ctx; |
| simulate_init_condition_cache(reader.get(), 42, "/data/iceberg.parquet", hit, cache, ctx); |
| |
| EXPECT_FALSE(hit); |
| EXPECT_NE(ctx, nullptr); |
| EXPECT_NE(cache, nullptr); |
| EXPECT_FALSE(ctx->is_hit); |
| } |
| |
| // -- Pre-populated cache entry is NOT returned when deletes exist -- |
| TEST_F(ConditionCacheDeleteOpsTest, CacheHitSkippedWhenDeletesExist) { |
| const std::string path = "/data/cached_file.parquet"; |
| const uint64_t digest = 123; |
| |
| // Insert a cache entry |
| prepopulate_cache(path, digest); |
| |
| // Verify it would be a hit without deletes |
| { |
| TFileScanRangeParams params; |
| TFileRangeDesc range; |
| auto reader = ParquetReader::create_unique(params, range, nullptr, nullptr); |
| |
| bool hit = false; |
| std::shared_ptr<std::vector<bool>> cache; |
| std::shared_ptr<ConditionCacheContext> ctx; |
| simulate_init_condition_cache(reader.get(), digest, path, hit, cache, ctx); |
| |
| EXPECT_TRUE(hit); |
| EXPECT_NE(ctx, nullptr); |
| EXPECT_TRUE(ctx->is_hit); |
| EXPECT_NE(cache, nullptr); |
| EXPECT_EQ(cache->size(), 3); |
| } |
| |
| // Now with deletes: cache entry should NOT be returned |
| { |
| TFileScanRangeParams params; |
| TFileRangeDesc range; |
| auto reader = ParquetReader::create_unique(params, range, nullptr, nullptr); |
| std::vector<int64_t> deletes = {1, 2, 3}; |
| reader->set_delete_rows(&deletes); |
| |
| bool hit = false; |
| std::shared_ptr<std::vector<bool>> cache; |
| std::shared_ptr<ConditionCacheContext> ctx; |
| simulate_init_condition_cache(reader.get(), digest, path, hit, cache, ctx); |
| |
| EXPECT_EQ(ctx, nullptr); |
| EXPECT_EQ(cache, nullptr); |
| EXPECT_FALSE(hit); |
| } |
| } |
| |
| // -- Zero digest always skips cache, even without deletes -- |
| TEST_F(ConditionCacheDeleteOpsTest, ZeroDigest_CacheAlwaysSkipped) { |
| TFileScanRangeParams params; |
| TFileRangeDesc range; |
| auto reader = ParquetReader::create_unique(params, range, nullptr, nullptr); |
| |
| bool hit = false; |
| std::shared_ptr<std::vector<bool>> cache; |
| std::shared_ptr<ConditionCacheContext> ctx; |
| simulate_init_condition_cache(reader.get(), 0, "/data/file.parquet", hit, cache, ctx); |
| |
| EXPECT_EQ(ctx, nullptr); |
| EXPECT_EQ(cache, nullptr); |
| EXPECT_FALSE(hit); |
| } |
| |
| } // namespace doris::vectorized |