| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| #include "storage/predicate_collector.h" |
| |
| #include <glog/logging.h> |
| |
| #include <vector> |
| |
| #include "exec/common/variant_util.h" |
| #include "exprs/vexpr.h" |
| #include "exprs/vexpr_context.h" |
| #include "exprs/vliteral.h" |
| #include "exprs/vsearch.h" |
| #include "exprs/vslot_ref.h" |
| #include "gen_cpp/Exprs_types.h" |
| #include "storage/index/index_reader_helper.h" |
| #include "storage/index/inverted/analyzer/analyzer.h" |
| #include "storage/index/inverted/util/string_helper.h" |
| #include "storage/tablet/tablet_schema.h" |
| |
| namespace doris { |
| |
| using namespace segment_v2; |
| |
| VSlotRef* PredicateCollector::find_slot_ref(const VExprSPtr& expr) const { |
| if (!expr) { |
| return nullptr; |
| } |
| |
| auto cur = VExpr::expr_without_cast(expr); |
| if (cur->node_type() == TExprNodeType::SLOT_REF) { |
| return static_cast<VSlotRef*>(cur.get()); |
| } |
| |
| for (const auto& ch : cur->children()) { |
| if (auto* s = find_slot_ref(ch)) { |
| return s; |
| } |
| } |
| |
| return nullptr; |
| } |
| |
| std::string PredicateCollector::build_field_name(int32_t col_unique_id, |
| const std::string& suffix_path) const { |
| std::string field_name = std::to_string(col_unique_id); |
| if (!suffix_path.empty()) { |
| field_name += "." + suffix_path; |
| } |
| return field_name; |
| } |
| |
| Status MatchPredicateCollector::collect(RuntimeState* state, const TabletSchemaSPtr& tablet_schema, |
| const VExprSPtr& expr, CollectInfoMap* collect_infos) { |
| DCHECK(collect_infos != nullptr); |
| |
| auto* left_slot_ref = find_slot_ref(expr->children()[0]); |
| if (left_slot_ref == nullptr) { |
| return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>( |
| "Index statistics collection failed: Cannot find slot reference in match predicate " |
| "left expression"); |
| } |
| |
| auto* right_literal = static_cast<VLiteral*>(expr->children()[1].get()); |
| DCHECK(right_literal != nullptr); |
| |
| const auto* sd = state->desc_tbl().get_slot_descriptor(left_slot_ref->slot_id()); |
| if (sd == nullptr) { |
| return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>( |
| "Index statistics collection failed: Cannot find slot descriptor for slot_id={}", |
| left_slot_ref->slot_id()); |
| } |
| |
| int32_t col_idx = tablet_schema->field_index(left_slot_ref->column_name()); |
| if (col_idx == -1) { |
| return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>( |
| "Index statistics collection failed: Cannot find column index for column={}", |
| left_slot_ref->column_name()); |
| } |
| |
| const auto& column = tablet_schema->column(col_idx); |
| auto index_metas = tablet_schema->inverted_indexs(column); |
| std::vector<std::shared_ptr<const TabletIndex>> owned_index_metas; |
| std::string index_suffix_path = column.suffix_path(); |
| |
| // Schema-only fallback for variant sub-columns. Collector runs at tablet |
| // level without segment context, so we cannot do nested-group inference |
| // or inherit_index runtime-type dispatch. Two paths cover what is |
| // resolvable from schema alone: |
| // 1. field_pattern templates (MATCH_NAME / MATCH_NAME_GLOB) via |
| // generate_sub_column_info. |
| // 2. Plain parent inverted index when the schema column is the dynamic |
| // path's VARIANT placeholder produced by _init_variant_columns. In |
| // that state inverted_indexs(column) misses because |
| // _path_set_info_map.subcolumn_indexes is only populated for typed |
| // paths / field_pattern outputs, not for plain parent indexes added |
| // by ALTER. Clone the parent's non-field-pattern indexes with the |
| // variant path as suffix so segment-side BM25 statistics can be |
| // collected. |
| if (index_metas.empty() && column.is_extracted_column()) { |
| TabletSchema::SubColumnInfo sub_column_info; |
| const std::string relative_path = column.path_info_ptr()->copy_pop_front().get_path(); |
| if (variant_util::generate_sub_column_info(*tablet_schema, column.parent_unique_id(), |
| relative_path, &sub_column_info) && |
| !sub_column_info.indexes.empty()) { |
| index_suffix_path = sub_column_info.column.suffix_path(); |
| for (auto& idx : sub_column_info.indexes) { |
| index_metas.push_back(idx.get()); |
| owned_index_metas.emplace_back(std::move(idx)); |
| } |
| } else if (column.is_variant_type()) { |
| const auto parent_indexes = tablet_schema->inverted_indexs(column.parent_unique_id()); |
| for (const auto* index : parent_indexes) { |
| if (!index->field_pattern().empty()) { |
| continue; |
| } |
| auto index_ptr = std::make_shared<TabletIndex>(*index); |
| index_ptr->set_escaped_escaped_index_suffix_path( |
| column.path_info_ptr()->get_path()); |
| index_metas.push_back(index_ptr.get()); |
| owned_index_metas.emplace_back(std::move(index_ptr)); |
| } |
| } |
| } |
| |
| #ifndef BE_TEST |
| if (index_metas.empty()) { |
| return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>( |
| "Index statistics collection failed: Score query is not supported without inverted " |
| "index for column={}", |
| left_slot_ref->column_name()); |
| } |
| #endif |
| |
| for (const auto* index_meta : index_metas) { |
| if (!InvertedIndexAnalyzer::should_analyzer(index_meta->properties())) { |
| continue; |
| } |
| |
| if (!IndexReaderHelper::is_need_similarity_score(expr->op(), index_meta)) { |
| continue; |
| } |
| |
| auto options = DataTypeSerDe::get_default_format_options(); |
| options.timezone = &state->timezone_obj(); |
| auto term_infos = InvertedIndexAnalyzer::get_analyse_result(right_literal->value(options), |
| index_meta->properties()); |
| |
| std::string field_name = |
| build_field_name(index_meta->col_unique_ids()[0], index_suffix_path); |
| std::wstring ws_field_name = StringHelper::to_wstring(field_name); |
| |
| auto iter = collect_infos->find(ws_field_name); |
| if (iter == collect_infos->end()) { |
| CollectInfo collect_info; |
| collect_info.term_infos.insert(term_infos.begin(), term_infos.end()); |
| collect_info.index_meta = index_meta; |
| for (const auto& owned_index_meta : owned_index_metas) { |
| if (owned_index_meta.get() == index_meta) { |
| collect_info.owned_index_meta = owned_index_meta; |
| break; |
| } |
| } |
| (*collect_infos)[ws_field_name] = std::move(collect_info); |
| } else { |
| iter->second.term_infos.insert(term_infos.begin(), term_infos.end()); |
| } |
| } |
| |
| return Status::OK(); |
| } |
| |
| Status SearchPredicateCollector::collect(RuntimeState* state, const TabletSchemaSPtr& tablet_schema, |
| const VExprSPtr& expr, CollectInfoMap* collect_infos) { |
| DCHECK(collect_infos != nullptr); |
| |
| auto* search_expr = dynamic_cast<VSearchExpr*>(expr.get()); |
| if (search_expr == nullptr) { |
| return Status::InternalError("SearchPredicateCollector: expr is not VSearchExpr type"); |
| } |
| |
| const TSearchParam& search_param = search_expr->get_search_param(); |
| |
| RETURN_IF_ERROR(collect_from_clause(search_param.root, state, tablet_schema, collect_infos)); |
| |
| return Status::OK(); |
| } |
| |
| Status SearchPredicateCollector::collect_from_clause(const TSearchClause& clause, |
| RuntimeState* state, |
| const TabletSchemaSPtr& tablet_schema, |
| CollectInfoMap* collect_infos) { |
| const std::string& clause_type = clause.clause_type; |
| ClauseTypeCategory category = get_clause_type_category(clause_type); |
| |
| if (category == ClauseTypeCategory::COMPOUND) { |
| if (clause.__isset.children) { |
| for (const auto& child_clause : clause.children) { |
| RETURN_IF_ERROR( |
| collect_from_clause(child_clause, state, tablet_schema, collect_infos)); |
| } |
| } |
| return Status::OK(); |
| } |
| |
| return collect_from_leaf(clause, state, tablet_schema, collect_infos); |
| } |
| |
| Status SearchPredicateCollector::collect_from_leaf(const TSearchClause& clause, RuntimeState* state, |
| const TabletSchemaSPtr& tablet_schema, |
| CollectInfoMap* collect_infos) { |
| if (!clause.__isset.field_name || !clause.__isset.value) { |
| return Status::InvalidArgument("Search clause missing field_name or value"); |
| } |
| |
| const std::string& field_name = clause.field_name; |
| const std::string& value = clause.value; |
| const std::string& clause_type = clause.clause_type; |
| |
| if (!is_score_query_type(clause_type)) { |
| return Status::OK(); |
| } |
| |
| int32_t col_idx = tablet_schema->field_index(field_name); |
| if (col_idx == -1) { |
| return Status::OK(); |
| } |
| |
| const auto& column = tablet_schema->column(col_idx); |
| |
| auto index_metas = tablet_schema->inverted_indexs(column.unique_id(), column.suffix_path()); |
| if (index_metas.empty()) { |
| return Status::OK(); |
| } |
| |
| ClauseTypeCategory category = get_clause_type_category(clause_type); |
| for (const auto* index_meta : index_metas) { |
| std::set<TermInfo, TermInfoComparer> term_infos; |
| |
| if (category == ClauseTypeCategory::TOKENIZED) { |
| if (InvertedIndexAnalyzer::should_analyzer(index_meta->properties())) { |
| auto analyzed_terms = |
| InvertedIndexAnalyzer::get_analyse_result(value, index_meta->properties()); |
| term_infos.insert(analyzed_terms.begin(), analyzed_terms.end()); |
| } else { |
| term_infos.insert(TermInfo(value)); |
| } |
| } else if (category == ClauseTypeCategory::NON_TOKENIZED) { |
| if (clause_type == "TERM" && |
| InvertedIndexAnalyzer::should_analyzer(index_meta->properties())) { |
| auto analyzed_terms = |
| InvertedIndexAnalyzer::get_analyse_result(value, index_meta->properties()); |
| term_infos.insert(analyzed_terms.begin(), analyzed_terms.end()); |
| } else { |
| term_infos.insert(TermInfo(value)); |
| } |
| } |
| |
| std::string lucene_field_name = |
| build_field_name(index_meta->col_unique_ids()[0], column.suffix_path()); |
| std::wstring ws_field_name = StringHelper::to_wstring(lucene_field_name); |
| |
| auto iter = collect_infos->find(ws_field_name); |
| if (iter == collect_infos->end()) { |
| CollectInfo collect_info; |
| collect_info.term_infos = std::move(term_infos); |
| collect_info.index_meta = index_meta; |
| (*collect_infos)[ws_field_name] = std::move(collect_info); |
| } else { |
| iter->second.term_infos.insert(term_infos.begin(), term_infos.end()); |
| } |
| } |
| |
| return Status::OK(); |
| } |
| |
| bool SearchPredicateCollector::is_score_query_type(const std::string& clause_type) const { |
| return clause_type == "TERM" || clause_type == "EXACT" || clause_type == "PHRASE" || |
| clause_type == "MATCH" || clause_type == "ANY" || clause_type == "ALL"; |
| } |
| |
| SearchPredicateCollector::ClauseTypeCategory SearchPredicateCollector::get_clause_type_category( |
| const std::string& clause_type) const { |
| if (clause_type == "AND" || clause_type == "OR" || clause_type == "NOT" || |
| clause_type == "OCCUR_BOOLEAN") { |
| return ClauseTypeCategory::COMPOUND; |
| } else if (clause_type == "TERM" || clause_type == "EXACT") { |
| return ClauseTypeCategory::NON_TOKENIZED; |
| } else if (clause_type == "PHRASE" || clause_type == "MATCH" || clause_type == "ANY" || |
| clause_type == "ALL") { |
| return ClauseTypeCategory::TOKENIZED; |
| } else { |
| LOG(WARNING) << "Unknown clause type '" << clause_type |
| << "', defaulting to NON_TOKENIZED category"; |
| return ClauseTypeCategory::NON_TOKENIZED; |
| } |
| } |
| |
| } // namespace doris |