// blob: c08b0427847feb7cdde84c72ac7af70cddd176ff [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <gen_cpp/PlanNodes_types.h>
#include "common/status.h"
#include "exprs/vexpr_fwd.h"
#include "runtime/descriptors.h"
#include "storage/predicate/block_column_predicate.h"
#include "util/profile_collector.h"
namespace doris {
class ColumnPredicate;
} // namespace doris
namespace doris {
#include "common/compile_check_begin.h"
class Block;
class VSlotRef;
// Context passed from FileScanner to readers for condition cache integration.
// On MISS: readers populate filter_result per-granule during predicate evaluation.
// On HIT: readers skip granules where filter_result[granule] == false.
struct ConditionCacheContext {
    // Granule size constant shared by the users of this context.
    // NOTE(review): presumably the number of rows covered by one granule — confirm
    // against the readers that consume it.
    static constexpr int GRANULE_SIZE = 2048;

    // True when the condition cache lookup was a HIT; defaults to false (MISS).
    bool is_hit {false};

    // One flag per granule: true means the granule has surviving rows.
    // The pointer is null until a vector is assigned to it.
    std::shared_ptr<std::vector<bool>> filter_result {};

    // Global granule index of the first granule stored in filter_result.
    int64_t base_granule {0};
};
// This is the reader interface for all file readers.
// A GenericReader is responsible for reading a file and returning
// a set of blocks with the specified schema.
class GenericReader : public ProfileCollector {
public:
    /// All members are set up by their default member initializers below; in
    /// particular the push-down aggregation type starts as TPushAggOp::type::NONE.
    GenericReader() = default;

    /// Stores the push-down aggregation type in `_push_down_agg_type`.
    void set_push_down_agg_type(TPushAggOp::type push_down_agg_type) {
        _push_down_agg_type = push_down_agg_type;
    }

    /// Pure virtual: every concrete reader must implement it.
    /// NOTE(review): presumably fills `block` with the next batch, reports the row
    /// count through `read_rows` and end-of-file through `eof` — confirm against
    /// the implementations.
    virtual Status get_next_block(Block* block, size_t* read_rows, bool* eof) = 0;

    /// Reports column names and types through `name_to_type`, and missing columns
    /// through `missing_cols`. Types are always nullable so illegal values can be
    /// processed. The default implementation returns NotSupported.
    virtual Status get_columns(std::unordered_map<std::string, DataTypePtr>* name_to_type,
                               std::unordered_set<std::string>* missing_cols) {
        return Status::NotSupported("get_columns is not implemented");
    }

    /// Initializes the resources needed for parsing the schema. It is called
    /// before `get_parsed_schema`. The default implementation returns NotSupported.
    virtual Status init_schema_reader() {
        return Status::NotSupported("init_schema_reader is not implemented for this reader.");
    }

    /// Outputs the parsed schema as parallel `col_names` / `col_types` vectors.
    /// `col_types` is always nullable so illegal values can be processed.
    /// The default implementation returns NotSupported.
    virtual Status get_parsed_schema(std::vector<std::string>* col_names,
                                     std::vector<DataTypePtr>* col_types) {
        return Status::NotSupported("get_parsed_schema is not implemented for this reader.");
    }

    ~GenericReader() override = default;

    /// Returns `_fill_all_columns`. When it is true the underlying FileReader has
    /// already filled the partition & missing columns, so the FileScanner does not
    /// need to fill them.
    virtual bool fill_all_columns() const { return _fill_all_columns; }

    /// Tells the underlying FileReader about the partition & missing columns; the
    /// FileReader decides whether to fill them. An implementation that fills the
    /// columns should set `_fill_all_columns = true`.
    /// The default implementation ignores both arguments and returns OK.
    virtual Status set_fill_columns(
            const std::unordered_map<std::string, std::tuple<std::string, const SlotDescriptor*>>&
                    partition_columns,
            const std::unordered_map<std::string, VExprContextSPtr>& missing_columns) {
        return Status::OK();
    }

    /// Default implementation does nothing and returns OK.
    virtual Status close() { return Status::OK(); }

    /// Marks the reader as reading by row ids, keeps a copy of `row_ids`, and then
    /// delegates to `_set_read_one_line_impl()`, whose status is returned as-is.
    /// The two members are assigned before the hook runs, so they are already
    /// set while it executes.
    Status read_by_rows(const std::list<int64_t>& row_ids) {
        _read_by_rows = true;
        _row_ids = row_ids;
        return _set_read_one_line_impl();
    }

    /// The reader is responsible for counting the number of rows read, because
    /// some readers, such as parquet/orc, can skip some pages/rowgroups through
    /// indexes. The default implementation returns false.
    virtual bool count_read_rows() { return false; }

protected:
    /// Hook invoked by `read_by_rows()`. The default implementation returns
    /// NotSupported.
    virtual Status _set_read_one_line_impl() {
        return Status::NotSupported("read_by_rows is not implemented for this reader.");
    }

    const size_t _MIN_BATCH_SIZE = 4064; // 4096 - 32(padding)

    /// Whether the underlying FileReader has filled the partition & missing columns.
    bool _fill_all_columns {false};

    /// Last value given to `set_push_down_agg_type()`; NONE until then.
    TPushAggOp::type _push_down_agg_type = TPushAggOp::type::NONE;

public:
    /// Passes the condition cache context to the reader for HIT/MISS tracking.
    /// The default implementation ignores it.
    virtual void set_condition_cache_context(std::shared_ptr<ConditionCacheContext> ctx) {}

    /// Returns the total number of rows the reader will produce. Used to
    /// pre-allocate the condition cache with the correct number of granules.
    /// The default implementation returns 0.
    virtual int64_t get_total_rows() const { return 0; }

    /// Returns true if this reader has delete operations (e.g. Iceberg
    /// position/equality deletes, Hive ACID deletes). Used to disable the
    /// condition cache when deletes are present, since cached granule results may
    /// become stale if delete files change between queries.
    /// The default implementation returns false.
    virtual bool has_delete_operations() const { return false; }

protected:
    /// Set to true by `read_by_rows()`.
    bool _read_by_rows {false};

    /// Copy of the row ids most recently passed to `read_by_rows()`.
    std::list<int64_t> _row_ids;

    /// Cache to save some common parts such as the file footer.
    /// May be null if not used.
    FileMetaCache* _meta_cache {nullptr};
};
#include "common/compile_check_end.h"
} // namespace doris